# Proses Analisis Sentimen Pada Ulasan Aplikasi Peduli Lindungi

### Menginstall *package* yang dibutuhkan

In [None]:
!pip3 uninstall googletrans
!pip3 install googletrans==3.1.0a0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting googletrans==3.1.0a0
  Downloading googletrans-3.1.0a0.tar.gz (19 kB)
  Preparing metadata (setup.py) ... done
Collecting httpx==0.13.3 (from googletrans==3.1.0a0)
  Downloading httpx-0.13.3-py3-none-any.whl (55 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 55.1/55.1 kB 3.2 MB/s eta 0:00:00
Collecting hstspreload (from httpx==0.13.3->googletrans==3.1.0a0)
  Downloading hstspreload-2023.1.1-py3-none-any.whl (1.5 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.5/1.5 MB 26.0 MB/s eta 0:00:00
Collecting chardet==3.* (from httpx==0.13.3->googletrans==3.1.0a0)
  Downloading chardet-3.0.4-py2.py3-none-any.whl (133 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 133.4/133.4 kB 14.6 MB/s eta 0:00:00
Collecting idna==2.* (from httpx==0.13.3

### Mengimpor *library* yang dibutuhkan

In [None]:
from googletrans import Translator
from sklearn.pipeline import Pipeline
import numpy as np
import pandas as pd
import re
import string

# Filtering
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Stemming
from nltk.stem.porter import PorterStemmer
from sklearn.pipeline import Pipeline

# Labeling
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon')

# TF-IDF
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Splitting data
from sklearn.model_selection import train_test_split

# Model
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score, auc, roc_curve

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


### Mengimpor dan menampilkan data dalam bentuk tabel

In [None]:
# Load the scraped reviews from a CSV file expected in the working directory.
path = 'Scrapped_data.csv'
data = pd.read_csv(path)
# Preview the first rows (notebook display).
data.head()

FileNotFoundError: ignored

In [None]:
data.head(5827)

### Menampilkan jumlah baris dan kolom data

In [None]:
data.shape

### Menghilangkan variabel yang tidak dipakai

In [None]:
# Drop the reviewer-name, rating and review-date columns.
# DataFrame.drop returns a new frame and leaves `data` untouched, so no
# explicit copy is needed beforehand.
df_data = data.drop(columns=['Nama reviewer', 'Rating', 'Tanggal ulasan'])


### Menerjemahkan ke dalam bahasa Inggris

In [None]:
# Translate every review from Indonesian ('id') to English ('en').
# Series.apply forwards the src/dest keyword arguments to translator.translate;
# the second apply reads the `text` attribute of each returned object.
# NOTE(review): one translate call is made per row and nothing handles a
# failed call — presumably each call is a network request; confirm before
# running this on the full data set.
translator = Translator()
df_data['Ulasan'] = df_data['Ulasan'].apply(translator.translate, src='id', dest='en').apply(getattr, args=('text',))

In [None]:
# Work on a copy of the translated data and save it as a checkpoint file.
df_data_en = df_data.copy()
df_data_en.to_csv('data_en.csv', index=False)
# Preview the first rows (notebook display).
df_data_en.head()

### *Data Preprocessing*
#### *Case Folding*

In [None]:
def case_folding(content):
  """Normalise a review string for the later preprocessing steps.

  Lower-cases the text, removes every character that is not a letter or
  whitespace (punctuation, symbols, underscores and digits), collapses each
  run of whitespace into one space and trims both ends.

  Args:
    content: The raw review text.

  Returns:
    The cleaned, lower-cased text with single spaces between words.
  """
  content = content.lower()
  # [^\w\s] removes punctuation and symbols; \w also matches digits and "_",
  # so those are removed explicitly by the second alternative.
  content = re.sub(r'[^\w\s]|[\d_]', '', content)
  # Normalise whitespace last: deleting characters can create new runs of
  # spaces and new leading/trailing spaces.
  return re.sub(r'\s+', ' ', content).strip()
# Apply case folding to every review and preview the result.
df_data_en['Ulasan'] = df_data_en['Ulasan'].apply(case_folding)
df_data_en.head()

In [None]:
df_data_en.to_csv('data_CF.csv', index=False)

#### *Tokenizing*

In [None]:
def token(content):
  """Split a cleaned review into a list of word tokens.

  Args:
    content: Review text whose words are separated by whitespace.

  Returns:
    A list of the non-empty tokens in their original order; an empty or
    whitespace-only string yields an empty list.
  """
  # str.split() without an argument discards the empty strings that
  # split(' ') produces for leading, trailing or repeated spaces.
  return content.split()
# Replace each review string with its list of tokens and preview the result.
df_data_en['Ulasan'] = df_data_en['Ulasan'].apply(token)
df_data_en.head()

In [None]:
df_data_en.to_csv('data_Toke.csv', index=False)

#### *Filtering*

In [None]:
def stopwords_removal(content):
  """Remove English stop words from a sequence of tokens.

  Args:
    content: Iterable of word tokens.

  Returns:
    A new list holding the tokens that are not in NLTK's English stop-word
    list, in their original order.
  """
  # A set gives constant-time membership tests instead of scanning the whole
  # stop-word list once per token.
  filtering = set(stopwords.words('english'))
  return [word for word in content if word not in filtering]
# Filter the stop words out of every token list and preview the result.
df_data_en['Ulasan'] = df_data_en['Ulasan'].apply(stopwords_removal)
df_data_en.head()

In [None]:
df_data_en.to_csv('data_Filter.csv', index=False)

#### *Stemming*

In [None]:
def stemming(content):
  """Stem every token with the Porter stemmer and join them into one string.

  Args:
    content: Iterable of word tokens.

  Returns:
    The stemmed tokens joined by single spaces.
  """
  # Use the PorterStemmer class this notebook imports from nltk.stem.porter
  # rather than reaching it through the undocumented `nltk.porter` attribute.
  stemmer = PorterStemmer()
  return " ".join(stemmer.stem(w) for w in content)
# Stem every token list; each review becomes a single string again.
df_data_en['Ulasan'] = df_data_en['Ulasan'].apply(stemming)

# Save the fully preprocessed reviews and load them back as `data_cleans`.
df_data_en.to_csv('data_cleans.csv', index=False)
# to_csv writes UTF-8 by default, so the file must be decoded as UTF-8 too;
# reading it as latin1 would corrupt every non-ASCII character.
data_cleans = pd.read_csv('data_cleans.csv', encoding='utf-8')
data_cleans.head()

In [None]:
df_data_en.to_csv('data_Stem.csv', index=False)

#### Menampilkan hasil *data preprocessing*

In [None]:
# Drop the rows that contain missing values and save the result.
data_clean = data_cleans.dropna()
data_clean.to_csv('data_clean.csv', index=False)
# The file was just written as UTF-8 (the to_csv default), so decode it as
# UTF-8; latin1 would corrupt non-ASCII characters.
data_cleans = pd.read_csv('data_clean.csv', encoding='utf-8')

In [None]:
# Reload the cleaned reviews; the file is UTF-8 because to_csv wrote it with
# its default encoding.
data_cleans = pd.read_csv('data_clean.csv', encoding='utf-8')
data_cleans.head()

In [None]:
data_cleans.sample(6, random_state = 134)

#### Mengecek data *missing value*

In [None]:
data_cleans.info()

In [None]:
len(data_cleans)

#### Melakukan pelabelan pada data ulasan aplikasi Peduli Lindungi

In [None]:
# Create the NLTK sentiment analyser used for labelling and try it on a
# sample sentence (notebook display of the returned scores).
sia = SentimentIntensityAnalyzer()
sia.polarity_scores("The film was awesome")

In [None]:
sia.polarity_scores("I liked this music but it is not good as the other one")

In [None]:
data_cleans["Ulasan"][0:10].apply(lambda x: sia.polarity_scores(x))

In [None]:
data_cleans['Ulasan'][0:10].apply(lambda x: sia.polarity_scores(x)["compound"])

In [None]:
# Store the "compound" entry of the analyser's scores for every review.
# NOTE(review): the reviews were stemmed earlier, so stems such as "batteri"
# are what gets scored — presumably the lexicon holds unstemmed words; confirm
# that scoring stemmed text is intended.
data_cleans["polarity_score"] = data_cleans['Ulasan'].apply(lambda x: sia.polarity_scores(x)["compound"])
data_cleans.head()

In [None]:
data_cleans["Ulasan"][0:10].apply(lambda x: "pos" if sia.polarity_scores(x)["compound"] > 0 else "neg")

In [None]:
# Binary label: 1 when the compound score is above 0, otherwise 0, so a
# score of exactly 0 falls into class 0.
# The compound scores are already stored in the polarity_score column, so
# reuse them instead of scoring every review a second time.
data_cleans["sentiment_label"] = (data_cleans["polarity_score"] > 0).astype('int64')
data_cleans.head()

In [None]:
data_cleans["sentiment_label"].value_counts()

In [None]:
# Cast the label column to a categorical dtype and the review text to the
# pandas string dtype.
data_cleans = data_cleans.astype({'sentiment_label': 'category'})
data_cleans = data_cleans.astype({'Ulasan': 'string'})

#### Melakukan pembobotan *TF-IDF*

In [None]:
# Fit a TF-IDF vectoriser with default settings on every review and keep the
# resulting document-term matrix.
tf = TfidfVectorizer()
# astype('U') casts the column to unicode strings before vectorising.
text_tf = tf.fit_transform(data_cleans['Ulasan'].astype('U'))
# Show the row of the first review (notebook display).
text_tf[0]

In [None]:
# Count how often each word occurs over all reviews: one value_counts per
# review, summed column-wise, then turned into a two-column frame.
# Series.value_counts replaces the deprecated top-level pd.value_counts.
tf_count = data_cleans['Ulasan'].apply(lambda x: pd.Series(x.split(" ")).value_counts()).sum(axis=0).reset_index()

tf_count.columns = ["words", "tf"]
tf_count.head()

In [None]:
# Show the term frequency of a handful of sample words.
tf_count[tf_count['words'].isin(["drain", "batteri", "good", "better", "last", "year"])]

#### Membagi Data (*data training* sebanyak 80 persen dan *data testing* sebanyak 20 persen)

In [None]:
# Hold out 20% of the TF-IDF rows for testing; random_state fixes the shuffle.
# NOTE(review): text_tf was fitted on the full corpus before this split, so
# the vectoriser has already seen the test reviews — confirm this is acceptable.
x_train, x_test, y_train, y_test = train_test_split(text_tf, data_cleans['sentiment_label'], test_size=0.2, random_state=42)
# Alternative 75/25 and 70/30 splits, currently disabled:
# x1_train, x1_test, y1_train, y1_test = train_test_split(text_tf, data_cleans['sentiment_label'], test_size=0.25, random_state=42)
# x2_train, x2_test, y2_train, y2_test = train_test_split(text_tf, data_cleans['sentiment_label'], test_size=0.3, random_state=42)

In [None]:
# Same 80/20 split and seed, but over the raw review text instead of the
# TF-IDF matrix, so individual test reviews can be inspected.
x1_train, x1_test, y1_train, y1_test = train_test_split(data_cleans['Ulasan'], data_cleans['sentiment_label'], test_size=0.2, random_state=42)

In [None]:
x1_test.sample(1)

In [None]:
# Show one test review together with its label.
# NOTE(review): 5274 is a hard-coded index label — it presumably came from the
# random sample drawn in the previous cell; verify it exists in this split.
x1_test[5274], y1_test[5274]

In [None]:
# Turn the test labels into a frame whose "index" column keeps the original
# row labels. reset_index() is called without arguments: the earlier
# positional False was taken as the `level` parameter, not as `drop`.
Sentimen_testing = y1_test.to_frame().reset_index()
# Split the test rows by label: 1 = positive, 0 = negative.
testing_pos = Sentimen_testing[Sentimen_testing['sentiment_label']==1]
testing_neg = Sentimen_testing[Sentimen_testing['sentiment_label']==0]

In [None]:
# Row labels of the positive and negative test samples as plain lists.
index_test_pos = testing_pos['index'].tolist()
index_test_neg = testing_neg['index'].tolist()

In [None]:
# Collect the review text of every positive test sample.
# The previous loop called DataFrame.append and discarded its return value,
# so test_pos stayed empty (and append no longer exists in pandas 2.0).
# index_test_pos holds index labels of data_cleans, hence the label-based
# .loc selection.
test_pos = data_cleans.loc[index_test_pos, ['Ulasan']]

test_pos

In [None]:
y_train.value_counts()

In [None]:
y_test.value_counts()

#### Proses ADASYN

In [None]:
# Draw a reproducible 10-row sample for the worked ADASYN example and
# renumber its rows from 0 without keeping the old index.
data_ada = data_cleans.sample(10, random_state=46).reset_index(drop=True)
data_ada

In [None]:
data_ada['sentiment_label'].value_counts()

In [None]:
# TF-IDF matrix of the 10-row sample only.
# NOTE: this rebinds `tf`, replacing the vectoriser fitted on the full corpus.
tf = TfidfVectorizer()
text_tf_ada = tf.fit_transform(data_ada['Ulasan'].astype('U'))
text_tf_ada

In [None]:
from scipy.spatial.distance import euclidean
from sklearn.metrics.pairwise import euclidean_distances
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors

# Number of neighbours looked up for every sample document.
k = 5

# Build K Nearest Neighbors model
knn_model = NearestNeighbors(n_neighbors=k, metric="euclidean").fit(text_tf_ada)
# Called without a query, so the neighbours are those of the fitted rows
# themselves (per the scikit-learn docs each row is then excluded from its
# own neighbour list — confirm for the installed version).
distances, indices = knn_model.kneighbors()


# Print the 'k' nearest neighbors
# Rows 1, 6 and 9 of the sample are used as centres; `rank` is 1-based.
print("K Nearest Neighbors dg pusat D1:")
for rank, index in enumerate(indices[1][:k], start=1):
    print(str(rank) + " ==>", index)
print("K Nearest Neighbors dg pusat D6")
for rank, index in enumerate(indices[6][:k], start=1):
    print(str(rank) + " ==>", index)
print("K Nearest Neighbors dg pusat D9")
for rank, index in enumerate(indices[9][:k], start=1):
    print(str(rank) + " ==>", index)

In [None]:
from sklearn.metrics.pairwise import euclidean_distances

# Hard-coded row numbers whose distance to a centre row is printed below.
# NOTE(review): these lists are not derived from `indices`; presumably they
# were copied from an earlier run — verify they still match the KNN output.
a = [0, 8, 7, 4, 2]
b = [4, 8, 7, 0, 5]
c = [7, 8, 4, 2, 6]
# NOTE(review): the printed labels say D0, D1 and D2, but the distances are
# measured from rows 0, 6 and 9 — confirm which centres are intended.
print("Jarak euclidean K Nearest Neighbors dg pusat D0:")
for i in a:
  print(str(i), "==>", euclidean_distances(text_tf_ada[0],text_tf_ada[i]))
print("Jarak euclidean K Nearest Neighbors dg pusat D1:")
for i in b:
  print(str(i), "==>", euclidean_distances(text_tf_ada[6],text_tf_ada[i]))
print("Jarak euclidean K Nearest Neighbors dg pusat D2:")
for i in c:
  print(str(i), "==>", euclidean_distances(text_tf_ada[9],text_tf_ada[i]))

## Algoritma Multinomial Naive Bayes

#### Simulasi MNB

In [None]:
# data training
data_cleans.sample(30, random_state =134).drop('polarity_score', axis = 1)

In [None]:
# data testing
data_cleans.sample(1, random_state = 300).drop('polarity_score', axis = 1)

### Imbalanced Data Modelling





In [None]:
# Baseline: Multinomial Naive Bayes trained on the imbalanced 80/20 split.
clf = MultinomialNB().fit(x_train,y_train)
predicted = clf.predict(x_test)
# Build the ROC curve from the class-1 probabilities, as the other
# imbalanced-data cells do; hard 0/1 predictions give a curve with a single
# operating point and therefore a misleading AUC.
false_positive_rate, true_positive_rate, tresholds = roc_curve(y_test,clf.predict_proba(x_test)[:,1])
print('MultinomialNB Accuracy: ', accuracy_score(y_test,predicted))
# Precision, recall and F1 are reported for class 0 (pos_label=0).
print('MultinomialNB Precision: ', precision_score(y_test,predicted, average='binary', pos_label=0))
print('MultinomialNB Recall: ', recall_score(y_test,predicted, average='binary', pos_label=0))
print('MultinomialNB f1-score: ', f1_score(y_test,predicted, average='binary', pos_label=0))
print('MultinomialNB AUC: ', auc(false_positive_rate, true_positive_rate))

print(f'confusion matrix:\n {confusion_matrix(y_test,predicted)}')
print('======================================\n')
print(classification_report(y_test,predicted,zero_division=0))

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score, auc, roc_curve

# Second imbalanced-data experiment: a 75/25 split of the TF-IDF matrix.
# The split is created here because the x1_* names are otherwise bound to raw
# review text, which MultinomialNB cannot be fitted on.
x1_train, x1_test, y1_train, y1_test = train_test_split(text_tf, data_cleans['sentiment_label'], test_size=0.25, random_state=42)

clf = MultinomialNB().fit(x1_train,y1_train)
predicted = clf.predict(x1_test)
# ROC curve from the predicted probability of class 1.
false_positive_rate, true_positive_rate, tresholds = roc_curve(y1_test,clf.predict_proba(x1_test)[:,1])
print('MultinomialNB Accuracy: ', accuracy_score(y1_test,predicted))
# Precision, recall and F1 are reported for class 0 (pos_label=0).
print('MultinomialNB Precision: ', precision_score(y1_test,predicted, average='binary', pos_label=0))
print('MultinomialNB Recall: ', recall_score(y1_test,predicted, average='binary', pos_label=0))
print('MultinomialNB f1-score: ', f1_score(y1_test,predicted, average='binary', pos_label=0))
print('MultinomialNB AUC: ', auc(false_positive_rate, true_positive_rate))

print(f'confusion matrix:\n {confusion_matrix(y1_test,predicted)}')
print('======================================\n')
print(classification_report(y1_test,predicted,zero_division=0))

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score, auc, roc_curve

# Third imbalanced-data experiment: a 70/30 split of the TF-IDF matrix.
# The split is created here because no active statement in the notebook
# defines the x2_* names (they only appear in a commented-out line).
x2_train, x2_test, y2_train, y2_test = train_test_split(text_tf, data_cleans['sentiment_label'], test_size=0.3, random_state=42)

clf = MultinomialNB().fit(x2_train,y2_train)
predicted = clf.predict(x2_test)
# ROC curve from the predicted probability of class 1.
false_positive_rate, true_positive_rate, tresholds = roc_curve(y2_test,clf.predict_proba(x2_test)[:,1])
print('MultinomialNB Accuracy: ', accuracy_score(y2_test,predicted))
# Precision, recall and F1 are reported for class 0 (pos_label=0).
print('MultinomialNB Precision: ', precision_score(y2_test,predicted, average='binary', pos_label=0))
print('MultinomialNB Recall: ', recall_score(y2_test,predicted, average='binary', pos_label=0))
print('MultinomialNB f1-score: ', f1_score(y2_test,predicted, average='binary', pos_label=0))
print('MultinomialNB AUC: ', auc(false_positive_rate, true_positive_rate))



print(f'confusion matrix:\n {confusion_matrix(y2_test,predicted)}')
print('======================================\n')
print(classification_report(y2_test,predicted,zero_division=0))

### SMOTE Modelling

In [None]:
from imblearn.over_sampling import SMOTE

x = x_train
y = y_train

# Shape and class balance of the training split before oversampling.
print(x_train.shape)
print(y_train.shape)
print(y_train.value_counts())

# Oversample the minority class of the training split only. The seed matches
# the one used for the train/test splits so that repeated runs generate the
# same synthetic samples and report the same metrics.
smote = SMOTE(sampling_strategy='minority', random_state=42)
x_sm, y_sm = smote.fit_resample(x_train, y_train)

# Class balance after oversampling, next to the untouched test split.
print(y_sm.value_counts())
print(y_test.value_counts())

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score, auc, roc_curve

# Multinomial Naive Bayes trained on the SMOTE-balanced training data and
# evaluated on the original (not resampled) test split.
clf = MultinomialNB().fit(x_sm,y_sm)
predicted = clf.predict(x_test)
# Build the ROC curve from the class-1 probabilities; hard 0/1 predictions
# give a curve with a single operating point and a misleading AUC.
false_positive_rate, true_positive_rate, tresholds = roc_curve(y_test, clf.predict_proba(x_test)[:,1])
print('MultinomialNB Accuracy: ', accuracy_score(y_test, predicted))
# Precision, recall and F1 are reported for class 0 (pos_label=0).
print('MultinomialNB Precision: ', precision_score(y_test,predicted, average='binary', pos_label=0))
print('MultinomialNB Recall: ', recall_score(y_test,predicted, average='binary', pos_label=0))
print('MultinomialNB f1-score: ', f1_score(y_test,predicted, average='binary', pos_label=0))
print('MultinomialNB AUC: ', auc(false_positive_rate, true_positive_rate))

print(f'confusion matrix:\n {confusion_matrix(y_test,predicted)}')
print('======================================\n')
print(classification_report(y_test,predicted,zero_division=0))

In [None]:
#extracting true_positives, false_positives, true_negatives, false_negatives
# Flatten the confusion matrix of the latest `predicted` values into its four
# cells; for 0/1 labels the order is tn, fp, fn, tp per the scikit-learn docs.
tn, fp, fn, tp = confusion_matrix(y_test, predicted).ravel()
print("True Negatives: ",tn)
print("False Positives: ",fp)
print("False Negatives: ",fn)
print("True Positives: ",tp)

In [None]:
import matplotlib.pyplot as plt
import numpy
from sklearn import metrics

# Keep the matrix under its own name: binding it to `confusion_matrix` would
# shadow the function imported from sklearn.metrics and break every cell that
# calls confusion_matrix(...) afterwards or on a re-run.
cm = metrics.confusion_matrix(y_test,predicted)

cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = [False, True])

cm_display.plot()
plt.show()

### ADASYN Modelling

In [None]:
from imblearn.over_sampling import ADASYN

x = x_train
y = y_train

# Shape and class balance of the training split before oversampling.
print(x.shape)
print(y.shape)
print(y.value_counts())

# Oversample the minority class of the training split only. The seed matches
# the one used for the train/test splits so that repeated runs generate the
# same synthetic samples and report the same metrics.
ada = ADASYN(sampling_strategy='minority', random_state=42)
x_ada, y_ada = ada.fit_resample(x, y)

print(y_ada.value_counts())

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score,auc,roc_curve

# Multinomial Naive Bayes trained on the ADASYN-balanced training data and
# evaluated on the original (not resampled) test split.
clf = MultinomialNB().fit(x_ada,y_ada)
predicted = clf.predict(x_test)
# Build the ROC curve from the class-1 probabilities; hard 0/1 predictions
# give a curve with a single operating point and a misleading AUC.
false_positive_rate, true_positive_rate, tresholds = roc_curve(y_test,clf.predict_proba(x_test)[:,1])
print('MultinomialNB Accuracy: ', accuracy_score(y_test,predicted))
# Precision, recall and F1 are reported for class 0 (pos_label=0).
print('MultinomialNB Precision: ', precision_score(y_test,predicted, average='binary', pos_label=0))
print('MultinomialNB Recall: ', recall_score(y_test,predicted, average='binary', pos_label=0))
print('MultinomialNB f1-score: ', f1_score(y_test,predicted, average='binary', pos_label=0))
print('MultinomialNB AUC: ', auc(false_positive_rate, true_positive_rate))

print(f'confusion matrix:\n {confusion_matrix(y_test,predicted)}')
print('======================================\n')
print(classification_report(y_test,predicted,zero_division=0))

In [None]:
import matplotlib.pyplot as plt
import numpy
from sklearn import metrics

# Keep the matrix under its own name: binding it to `confusion_matrix` would
# shadow the function imported from sklearn.metrics and break every cell that
# calls confusion_matrix(...) afterwards or on a re-run.
cm = metrics.confusion_matrix(y_test,predicted)

cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = [False, True])

cm_display.plot()
plt.show()

In [None]:
#extracting true_positives, false_positives, true_negatives, false_negatives
# Flatten the confusion matrix of the latest `predicted` values into its four
# cells; for 0/1 labels the order is tn, fp, fn, tp per the scikit-learn docs.
# metrics.confusion_matrix is used because the bare name `confusion_matrix`
# is rebound to an array by the plotting cell.
tn, fp, fn, tp = metrics.confusion_matrix(y_test, predicted).ravel()
print("True Negatives: ",tn)
print("False Positives: ",fp)
print("False Negatives: ",fn)
print("True Positives: ",tp)

In [None]:
# Repeat the 80/20 split with the same seed over the raw review text so the
# test reviews can be exported next to their predictions.
# NOTE: this rebinds x_train / x_test to text; from here on they are no longer
# TF-IDF features.
x_train, x_test, y_train, y_test = train_test_split(data_cleans["Ulasan"], data_cleans['sentiment_label'], test_size=0.2, random_state=42)

In [None]:
#Write  predictions to file
# Line up the test reviews, their actual labels and the predictions by
# position (all three get a fresh 0..n-1 index) and export them to Excel.
# NOTE(review): `predicted` holds the output of whichever model cell ran
# last — confirm it belongs to the model you want to export.
x_test = pd.DataFrame(x_test).reset_index().drop('index', axis = 1)
test_y = pd.DataFrame(y_test).reset_index().drop('index', axis = 1) #Actual values
yhat = pd.DataFrame(predicted) #predicted values using the model

# test_y is built from a named Series, so its column is 'sentiment_label'
# (renaming column 0 did nothing); yhat is built from an array, so its only
# column is 0.
test_y.rename(columns= {'sentiment_label': 'Label Sentimen'}, inplace = True)
yhat.rename(columns = {0: 'Hasil Prediksi'}, inplace = True)

new = pd.concat([x_test, test_y, yhat], axis = 1)
new.to_excel('Hasil_testing.xlsx', index = False)

In [None]:
x_test.head()

### SMOTE + Tomek Modelling

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF matrix of all reviews for the SMOTE + Tomek experiment.
# The review text lives in the 'Ulasan' column, the column every other cell of
# this notebook reads; 'content' is not used anywhere else.
tf = TfidfVectorizer()
text_tf5 = tf.fit_transform(data_cleans['Ulasan'].astype('U'))
text_tf5

In [None]:
from imblearn.combine import SMOTETomek

# Resample the training portion only, as the SMOTE and ADASYN experiments do:
# resampling the whole corpus would leak the test reviews into training.
# The split repeats the notebook's 80/20, random_state=42 setup and rebinds
# x_test / y_test to TF-IDF features so the evaluation cell can score them.
x, x_test, y, y_test = train_test_split(text_tf5, data_cleans['sentiment_label'], test_size=0.2, random_state=42)

# Shape and class balance of the training split before resampling.
print(x.shape)
print(y.shape)
print(y.value_counts())

# Fixed seed so that repeated runs produce the same resampled data.
smtom = SMOTETomek(sampling_strategy='minority', random_state=42)
x_smtom, y_smtom = smtom.fit_resample(x, y)

print(y_smtom.value_counts())

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

# Multinomial Naive Bayes trained on the SMOTE + Tomek resampled data.
# The labels are passed as they are; Series.ravel is deprecated in pandas.
clf = MultinomialNB().fit(x_smtom,y_smtom)
predicted = clf.predict(x_test)
print('MultinomialNB Accuracy: ', accuracy_score(y_test,predicted))
# The labels are the integers 0/1, so the string "neg" is not a valid
# pos_label; report class 0 like the other evaluation cells.
print('MultinomialNB Precision: ', precision_score(y_test,predicted, average='binary', pos_label=0))
print('MultinomialNB Recall: ', recall_score(y_test,predicted, average='binary', pos_label=0))
print('MultinomialNB f1-score: ', f1_score(y_test,predicted, average='binary', pos_label=0))
print('MultinomialNB AUC: ', roc_auc_score(y_test,clf.predict_proba(x_test)[:,1]))

print(f'confusion matrix:\n {confusion_matrix(y_test,predicted)}')
print('======================================\n')
print(classification_report(y_test,predicted,zero_division=0))

### SMOTE + ENN Modelling

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF matrix of all reviews for the SMOTE + ENN experiment.
# The review text lives in the 'Ulasan' column, the column every other cell of
# this notebook reads; 'content' is not used anywhere else.
tf = TfidfVectorizer()
text_tf6 = tf.fit_transform(data_cleans['Ulasan'].astype('U'))
text_tf6

In [None]:
from imblearn.combine import SMOTEENN

# Resample the training portion only, as the SMOTE and ADASYN experiments do:
# resampling the whole corpus would leak the test reviews into training.
# The split repeats the notebook's 80/20, random_state=42 setup and rebinds
# x_test / y_test to TF-IDF features so the evaluation cell can score them.
x, x_test, y, y_test = train_test_split(text_tf6, data_cleans['sentiment_label'], test_size=0.2, random_state=42)

# Shape and class balance of the training split before resampling.
print(x.shape)
print(y.shape)
print(y.value_counts())

# Fixed seed so that repeated runs produce the same resampled data.
smenn = SMOTEENN(sampling_strategy='minority', random_state=42)
x_smenn, y_smenn = smenn.fit_resample(x, y)

print(y_smenn.value_counts())

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

# Multinomial Naive Bayes trained on the SMOTE + ENN resampled data.
# The labels are passed as they are; Series.ravel is deprecated in pandas.
clf = MultinomialNB().fit(x_smenn,y_smenn)
predicted = clf.predict(x_test)
print('MultinomialNB Accuracy: ', accuracy_score(y_test,predicted))
# The labels are the integers 0/1, so the string "neg" is not a valid
# pos_label; report class 0 like the other evaluation cells.
print('MultinomialNB Precision: ', precision_score(y_test,predicted, average='binary', pos_label=0))
print('MultinomialNB Recall: ', recall_score(y_test,predicted, average='binary', pos_label=0))
print('MultinomialNB f1-score: ', f1_score(y_test,predicted, average='binary', pos_label=0))
print('MultinomialNB AUC: ', roc_auc_score(y_test,clf.predict_proba(x_test)[:,1]))

print(f'confusion matrix:\n {confusion_matrix(y_test,predicted)}')
print('======================================\n')
print(classification_report(y_test,predicted,zero_division=0))

## Visualisasi 

In [None]:
!pip install wordcloud

In [None]:
data_cleans['sentiment_label'].value_counts()

In [None]:
data_cleans.info()

In [None]:
from textblob import Word, TextBlob
from wordcloud import WordCloud,STOPWORDS
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Split the labelled reviews by class (1 = positive, 0 = negative) and save
# each part to its own CSV file.
sentimen_pos = data_cleans[data_cleans['sentiment_label']==1]
sentimen_neg = data_cleans[data_cleans['sentiment_label']==0]
sentimen_pos.to_csv('sentimen_positif.csv', index=False)
sentimen_neg.to_csv('sentimen_negatif.csv', index=False)

In [None]:
import pandas as pd
# Load the positive reviews for visualisation. The file was written by to_csv
# with its default UTF-8 encoding, so decode it as UTF-8; latin1 would corrupt
# non-ASCII characters.
visual_pos = pd.read_csv('sentimen_positif.csv', encoding='utf-8')
visual_pos.head()

In [None]:
# Load the negative reviews for visualisation. The file was written by to_csv
# with its default UTF-8 encoding, so decode it as UTF-8; latin1 would corrupt
# non-ASCII characters.
visual_neg = pd.read_csv('sentimen_negatif.csv', encoding='utf-8')
visual_neg.head()

In [None]:
# Word frequencies over the positive reviews: one value_counts per review,
# summed column-wise, then turned into a two-column frame.
# Series.value_counts replaces the deprecated top-level pd.value_counts.
tf_pos = visual_pos['Ulasan'].apply(lambda x: pd.Series(x.split(" ")).value_counts()).sum(axis=0).reset_index()

tf_pos.columns = ["words", "tf"]
tf_pos.head()

In [None]:
# Word frequencies over the negative reviews: one value_counts per review,
# summed column-wise, then turned into a two-column frame.
# Series.value_counts replaces the deprecated top-level pd.value_counts.
tf_neg = visual_neg['Ulasan'].apply(lambda x: pd.Series(x.split(" ")).value_counts()).sum(axis=0).reset_index()

tf_neg.columns = ["words", "tf"]
tf_neg.head()

In [None]:
tf_pos[tf_pos["words"]=="good"]

In [None]:
tf_pos.shape, tf_neg.shape

In [None]:
tf_pos["words"].nunique(), tf_neg["words"].nunique()

In [None]:
tf_pos["tf"].describe([0.05, 0.10, 0.25, 0.50, 0.75, 0.80, 0.90, 0.95, 0.99]).T

In [None]:
tf_neg["tf"].describe([0.05, 0.10, 0.25, 0.50, 0.75, 0.80, 0.90, 0.95, 0.99]).T

In [None]:
# Barplot sentimen positif

# Bar chart of the words that occur more than 200 times in positive reviews.
tf_pos[tf_pos["tf"] > 200].plot.bar(x="words", y="tf")
plt.show()

In [None]:
# Barplot sentimen negatif

# Bar chart of the words that occur more than 310 times in negative reviews.
tf_neg[tf_neg["tf"] > 310].plot.bar(x="words", y="tf")
plt.show()

In [None]:
# Wordcloud sentimen positif

# Join all positive reviews into one string and render it as a word cloud.
text = " ".join(visual_pos['Ulasan'])
wordcloud = WordCloud(
    max_font_size=50,
    max_words=200,
    background_color="black",
    width=250,
    height=250,
).generate(text)
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
# Save the figure before plt.show() displays it.
plt.savefig('sentimen positif.png', format='png')
plt.show()

In [None]:
# Wordcloud sentimen negatif

# Join all negative reviews into one string and render it as a word cloud.
text = " ".join(visual_neg['Ulasan'])
wordcloud = WordCloud(
    max_font_size=50,
    max_words=200,
    background_color="black",
    width=250,
    height=250,
).generate(text)
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
# Save the figure before plt.show() displays it.
plt.savefig('sentimen negatif.png', format='png')
plt.show()