<a href="https://colab.research.google.com/github/hakim3189/MachineLearning/blob/main/Pelatihan_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import re
from google.colab import drive

In [2]:
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
main_df = pd.read_csv('/content/drive/MyDrive/ulasan_aplikasi_mytelkomsel.csv')

In [4]:
main_df.head()

Unnamed: 0,Review
0,Makin hari makin BURUK! segala upaya biar lanc...
1,Perbaiki dulu jaringannya paketnya sudah mahal...
2,Aku gak ngerti ya! Lama2 aplikasinya makin nge...
3,"Sebenarnya sudah mulai bosan, sebab masalahnya..."
4,"makin kesini aplikasi makin berat, buka aplika..."


Case Folding

In [5]:
main_df.loc[:, 'Review'] = main_df.loc[:, 'Review'].str.lower()
main_df['Review']

Unnamed: 0,Review
0,makin hari makin buruk! segala upaya biar lanc...
1,perbaiki dulu jaringannya paketnya sudah mahal...
2,aku gak ngerti ya! lama2 aplikasinya makin nge...
3,"sebenarnya sudah mulai bosan, sebab masalahnya..."
4,"makin kesini aplikasi makin berat, buka aplika..."
...,...
121495,"aplikasi trs yg di update bkn kualitas sinyal,..."
121496,"aplikasi indihome di alihkan ke sini ,tapi log..."
121497,"sekarang semakin gk bermutu nih aplikasi, pake..."
121498,soal sinyal memang lebih dari yg lain tapi har...


Removal Special Characters

In [6]:
#Menghapus Angka
for i in range(len(main_df)):
    main_df.loc[i, 'Review'] = re.sub(r'\d+', '', str(main_df.loc[i, 'Review']))
main_df['Review']

# Menghapus Tanda Baca
punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
for i in range(len(main_df)):
    text = main_df.loc[i, 'Review']
    for char in punc:
        text = text.replace(char, '')
    main_df.loc[i, 'Review'] = text
main_df['Review']

# Menghapus Mention dan Hastag
for i in range(len(main_df)):
    main_df.loc[i, 'Review'] = re.sub(r'@[A-Za-z0-9]+', '', main_df.loc[i, 'Review'])
    main_df.loc[i, 'Review'] = re.sub(r'#[A-Za-z0-9]+', '', main_df.loc[i, 'Review'])
main_df['Review']

# Menghapus dan Spasi
main_df['Review'] = main_df.loc[:, 'Review'].str.strip()
main_df['Review']

Unnamed: 0,Review
0,makin hari makin buruk segala upaya biar lanca...
1,perbaiki dulu jaringannya paketnya sudah mahal...
2,aku gak ngerti ya lama aplikasinya makin ngele...
3,sebenarnya sudah mulai bosan sebab masalahnya ...
4,makin kesini aplikasi makin berat buka aplikas...
...,...
121495,aplikasi trs yg di update bkn kualitas sinyal ...
121496,aplikasi indihome di alihkan ke sini tapi logi...
121497,sekarang semakin gk bermutu nih aplikasi paket...
121498,soal sinyal memang lebih dari yg lain tapi har...


Slangword Subtitute

In [7]:
slangwords = {"@": "di", "abis": "habis", "wtb": "beli", "masi": "masih", "wts": "jual", "wtt": "tukar", "bgt": "banget", "maks": "maksimal", "yg": "yang", "bkn": "bukan"}

def fix_slangwords(text):
    # Check if text is a list and join it into a string if it is
    if isinstance(text, list):
        text = ' '.join(text)

    words = text.split()
    fixed_words = []

    for word in words:
        if word.lower() in slangwords:
            fixed_words.append(slangwords[word.lower()])
        else:
            fixed_words.append(word)

    fixed_text = ' '.join(fixed_words)
    return fixed_text

# Mengganti kata-kata slang dengan kata-kata standar dan menyimpannya di 'text_slangwords'
main_df['Review'] = main_df['Review'].apply(fix_slangwords)
main_df['Review']

Unnamed: 0,Review
0,makin hari makin buruk segala upaya biar lanca...
1,perbaiki dulu jaringannya paketnya sudah mahal...
2,aku gak ngerti ya lama aplikasinya makin ngele...
3,sebenarnya sudah mulai bosan sebab masalahnya ...
4,makin kesini aplikasi makin berat buka aplikas...
...,...
121495,aplikasi trs yang di update bukan kualitas sin...
121496,aplikasi indihome di alihkan ke sini tapi logi...
121497,sekarang semakin gk bermutu nih aplikasi paket...
121498,soal sinyal memang lebih dari yang lain tapi h...


Labeling

In [8]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download("vader_lexicon")

sentiments = SentimentIntensityAnalyzer()
main_df["Positive"] = [sentiments.polarity_scores(i)["pos"] for i in main_df["Review"]]
main_df["Negative"] = [sentiments.polarity_scores(i)["neg"] for i in main_df["Review"]]
main_df["Neutral"] = [sentiments.polarity_scores(i)["neu"] for i in main_df["Review"]]
main_df['Compound'] = [sentiments.polarity_scores(i)["compound"] for i in main_df["Review"]]
main_df.head()

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


Unnamed: 0,Review,Positive,Negative,Neutral,Compound
0,makin hari makin buruk segala upaya biar lanca...,0.044,0.0,0.956,0.3818
1,perbaiki dulu jaringannya paketnya sudah mahal...,0.0,0.0,1.0,0.0
2,aku gak ngerti ya lama aplikasinya makin ngele...,0.0,0.0,1.0,0.0
3,sebenarnya sudah mulai bosan sebab masalahnya ...,0.0,0.0,1.0,0.0
4,makin kesini aplikasi makin berat buka aplikas...,0.0,0.0,1.0,0.0


In [9]:
score = main_df["Compound"].values
sentiment = []
for i in score:
    if i >= 0.05 :
        sentiment.append('Positive')
    elif i <= -0.05 :
        sentiment.append('Negative')
    else:
        sentiment.append('Neutral')
main_df["Sentiment"] = sentiment
main_df.head()

print(main_df['Sentiment'].value_counts())

Sentiment
Neutral     102061
Negative     10472
Positive      8967
Name: count, dtype: int64


In [10]:
import csv
import requests
from io import StringIO

# Membaca data kamus kata-kata positif dari GitHub
lexicon_positive = dict()

response = requests.get('https://raw.githubusercontent.com/angelmetanosaa/dataset/main/lexicon_positive.csv')
# Mengirim permintaan HTTP untuk mendapatkan file CSV dari GitHub

if response.status_code == 200:
    # Jika permintaan berhasil
    reader = csv.reader(StringIO(response.text), delimiter=',')
    # Membaca teks respons sebagai file CSV menggunakan pembaca CSV dengan pemisah koma

    for row in reader:
        # Mengulangi setiap baris dalam file CSV
        lexicon_positive[row[0]] = int(row[1])
        # Menambahkan kata-kata positif dan skornya ke dalam kamus lexicon_positive
else:
    print("Failed to fetch positive lexicon data")

# Membaca data kamus kata-kata negatif dari GitHub
lexicon_negative = dict()

response = requests.get('https://raw.githubusercontent.com/angelmetanosaa/dataset/main/lexicon_negative.csv')
# Mengirim permintaan HTTP untuk mendapatkan file CSV dari GitHub

if response.status_code == 200:
    # Jika permintaan berhasil
    reader = csv.reader(StringIO(response.text), delimiter=',')
    # Membaca teks respons sebagai file CSV menggunakan pembaca CSV dengan pemisah koma

    for row in reader:
        # Mengulangi setiap baris dalam file CSV
        lexicon_negative[row[0]] = int(row[1])
        # Menambahkan kata-kata negatif dan skornya dalam kamus lexicon_negative
else:
    print("Failed to fetch negative lexicon data")

In [11]:
def sentiment_analysis_lexicon_indonesia(text):
    #for word in text:

    score = 0
    # Inisialisasi skor sentimen ke 0

    for word in text:
        # Mengulangi setiap kata dalam teks

        if (word in lexicon_positive):
            score = score + lexicon_positive[word]
            # Jika kata ada dalam kamus positif, tambahkan skornya ke skor sentimen

    for word in text:
        # Mengulangi setiap kata dalam teks (sekali lagi)

        if (word in lexicon_negative):
            score = score + lexicon_negative[word]
            # Jika kata ada dalam kamus negatif, kurangkan skornya dari skor sentimen

    polarity=''
    # Inisialisasi variabel polaritas

    if (score > 0):
        polarity = 'positive'
        # Jika skor sentimen lebih besar atau sama dengan 0, maka polaritas adalah positif
    elif (score < 0):
        polarity = 'negative'
        # Jika skor sentimen kurang dari 0, maka polaritas adalah negatif
    else:
        polarity = 'neutral'
    # Ini adalah bagian yang bisa digunakan untuk menentukan polaritas netral jika diperlukan

    return score, polarity
    # Mengembalikan skor sentimen dan polaritas teks

In [12]:
results = main_df['Review'].apply(sentiment_analysis_lexicon_indonesia)
results = list(zip(*results))
main_df['polarity_score'] = results[0]
main_df['polarity'] = results[1]
print(main_df['polarity'].value_counts())
main_df['polarity_score']

polarity
neutral    121500
Name: count, dtype: int64


Unnamed: 0,polarity_score
0,0
1,0
2,0
3,0
4,0
...,...
121495,0
121496,0
121497,0
121498,0


Ekstrasi Fitur

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

X = main_df['Review']
y = main_df['Sentiment']

tfidf = TfidfVectorizer(max_features=200, min_df=17, max_df=0.8 )
X_tfidf = tfidf.fit_transform(X)

features_df = pd.DataFrame(X_tfidf.toarray(), columns=tfidf.get_feature_names_out())
features_df

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC,SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [15]:
models={'LogisticRegression()':LogisticRegression(),
        'K-Nearest Neighbors': KNeighborsClassifier(),
        'Decision Tree':DecisionTreeClassifier(),
        'Support Vector Machine(Linear Kernel)':LinearSVC(),
        'Support Vector Machine(Non-Linear Kernal)':SVC(),
        'Neural Network':MLPClassifier(),
        'Random Forest':RandomForestClassifier(),
        'Gradient Boosting':GradientBoostingClassifier()}

In [None]:
for name, model in models.items():
    print(name)
    model.fit(X_train,y_train)
    print(model.score(X_test,y_test))

LogisticRegression()
0.8654732510288066
K-Nearest Neighbors
0.841440329218107
Decision Tree
0.7694650205761316
Support Vector Machine(Linear Kernel)
0.8656378600823045
Support Vector Machine(Non-Linear Kernal)
