In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import collections
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import nltk


In [None]:
# Integer class ids used when counting the model's predictions further down.
label_positive, label_negative = 1, 0

In [None]:
# Read the preprocessed comments into a DataFrame and preview the first rows.
# NOTE(review): absolute, machine-specific path — the notebook only runs where this file exists.
preprocessed_csv = r'C:\Users\Kanzul Faisal\Documents\Proposal PI\Project PI\PreProcessing.csv'
df = pd.read_csv(preprocessed_csv)
df.head()

In [None]:
# Remove the columns at positions 0 and 4, then preview the result.
# NOTE(review): selection is positional and `df` is rebound, so re-running this
# cell drops further columns (or fails) — reload `df` before running it again.
unused_columns = df.columns[[0, 4]]
df = df.drop(columns=unused_columns)
df.head()

In [None]:
# Hold out 20% of the comments for testing.
# A fixed random_state (20) reproduces the same partition on every run.
train_X, test_X, train_Y, test_Y = train_test_split(
    df['Komentar_Final'],
    df['Label'],
    test_size=0.2,
    random_state=20,
)

In [None]:
# Pair each split's text with its label in one frame per split.
# Both Series of a split share the same index, so rows stay aligned.
df_train = pd.DataFrame({'Sentiment': train_X, 'Label': train_Y})
df_test = pd.DataFrame({'Sentiment': test_X, 'Label': test_Y})

In [None]:
def convert(label):
  """Encode a sentiment label as an integer.

  Returns 1 when ``label`` is exactly the string 'positif'; every other
  value (any other string, different casing, non-strings) maps to 0.
  """
  return 1 if label == 'positif' else 0

# Replace the textual labels with their integer encoding (element-wise via convert).
# The source is always the untouched split Series, so re-running is safe.
for frame, raw_labels in ((df_train, train_Y), (df_test, test_Y)):
    frame['Label'] = raw_labels.map(convert)

In [None]:
# TF-IDF
# Learn the vocabulary (capped at the 5000 most frequent terms) and the IDF
# weights from the TRAINING split only. Fitting on the full corpus would let
# test-set vocabulary and document frequencies leak into the features and
# make the reported test accuracy optimistic.
tfidf_vect = TfidfVectorizer(max_features = 5000)
tfidf_vect.fit(df_train['Sentiment'])



In [None]:
# Echo the vectorizer object so its repr is shown as the cell output.
tfidf_vect

In [None]:
# List the terms the vectorizer selected as features.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer get_feature_names_out() and fall back only on older releases.
# .tolist() yields plain Python strings, matching the old method's list output.
if hasattr(tfidf_vect, "get_feature_names_out"):
    feature_names = tfidf_vect.get_feature_names_out().tolist()
else:
    feature_names = tfidf_vect.get_feature_names()

print()
print("selected words as feature : ")
print("----------------------------")
print(feature_names)
print()

In [None]:
# Print the vocabulary learned during fit: `vocabulary_` maps each selected
# term to its column index in the TF-IDF matrix.
print(tfidf_vect.vocabulary_)

In [None]:
# Report how many documents ended up in each split (caption, count, blank line).
split_sizes = (
    ("jumlah data training : ", train_X),
    ("jumlah data test : ", test_X),
)
for caption, subset in split_sizes:
    print(caption)
    print(len(subset))
    print()

In [None]:
# Project both splits onto the fitted TF-IDF vocabulary (train first, then test).
train_X_tfidf, test_X_tfidf = (
    tfidf_vect.transform(frame['Sentiment']) for frame in (df_train, df_test)
)

In [None]:
# Total TF-IDF weight of every vocabulary term over the training documents.
# Swap df_train['Sentiment'] for df['Komentar_Final'] or df_test['Sentiment']
# to rank terms over the whole corpus or the test split instead.
# The matrix is kept sparse: summing columns does not need a dense copy.
tfidf_mat = tfidf_vect.transform(df_train['Sentiment'])

# get_feature_names() was removed in scikit-learn 1.2; use the replacement
# when available and fall back on older releases.
if hasattr(tfidf_vect, "get_feature_names_out"):
    terms = tfidf_vect.get_feature_names_out().tolist()
else:
    terms = tfidf_vect.get_feature_names()

# Column-wise sum over all documents. A sparse .sum(axis=0) returns a
# 1 x n_terms matrix, so flatten it to a 1-D array aligned with `terms`.
sums = np.asarray(tfidf_mat.sum(axis=0)).ravel()

# One row per term with its summed TF-IDF weight.
ranking = pd.DataFrame({'term': terms, 'TF-IDF': sums})


In [None]:
# Order the terms from highest to lowest summed TF-IDF weight and print them.
ranking_br = ranking.sort_values(by='TF-IDF', ascending=False)
print(ranking_br)

In [None]:
# Persist the ranked terms to CSV. The DataFrame index is written as well
# (pandas default), so the file gets an unnamed leading column when read back.
# NOTE(review): absolute, machine-specific output path.
ranking_br.to_csv(r'C:\Users\Kanzul Faisal\Documents\Proposal PI\Project PI\tfidf_train.csv')

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Train a Multinomial Naive Bayes classifier (default hyper-parameters) on the
# TF-IDF features of the training split and its integer labels.
model = MultinomialNB()
model.fit(X=train_X_tfidf, y=df_train['Label'])

In [None]:
# Predict the held-out split and report accuracy as a percentage (2 decimals).
predict = model.predict(test_X_tfidf)
acc = 100 * accuracy_score(df_test['Label'], predict)

print(round(acc, 2), '%')

In [None]:
# Confusion matrix of true labels (rows) against predicted labels (columns).
conf_matrix = confusion_matrix(y_true=df_test['Label'], y_pred=predict)
print("Confusion Matrix : ", conf_matrix, sep="\n")

In [None]:
import seaborn as sns

# Draw the confusion matrix as an annotated heatmap, save it as PNG, then show it.
# NOTE(review): absolute, machine-specific output path.
heatmap_path = r"C:\Users\Kanzul Faisal\Documents\Proposal PI\Project PI\visualisasi data\confusion matrix.png"

f, ax = plt.subplots(figsize=(8, 5))
sns.heatmap(
    confusion_matrix(df_test['Label'], predict),
    annot=True,   # write the count inside every cell
    fmt=".0f",    # counts shown without decimals
    ax=ax,
)
ax.set_xlabel("predict")
ax.set_ylabel("actual")
f.savefig(heatmap_path)
plt.show()

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and f1-score plus overall accuracy on the test split.
nb_report = classification_report(df_test['Label'], predict)
print("\nHere is the classification report:", nb_report, sep="\n")

In [None]:
# Count how many test documents the model predicted as positive and as negative.
test_after_nb_count_label = collections.Counter(predict)
juml_pos_nb, juml_neg_nb = (
    test_after_nb_count_label[class_id]  # a class that was never predicted counts as 0
    for class_id in (label_positive, label_negative)
)

In [None]:
# Pie chart of the predicted sentiment distribution on the test split.
# NOTE(review): absolute, machine-specific output path.
labels = ['Positive', 'Negative']
Category = [juml_pos_nb, juml_neg_nb]
color = ['blue', 'red']
pie_path = r"C:\Users\Kanzul Faisal\Documents\Proposal PI\Project PI\visualisasi data\pie_nb.png"

fig, ax = plt.subplots()
ax.pie(
    Category,
    labels=labels,
    colors=color,
    startangle=90,
    shadow=True,
    autopct='%1.2f%%',   # percentage with two decimals on each wedge
    explode=(0.1, 0),    # pull the first (positive) wedge out slightly
)
ax.set_title('Diagram Lingkar Data Hasil Prediksi Klasifikasi Naive Bayes')
ax.legend()
fig.savefig(pie_path)
plt.show()


In [None]:
# Load the saved term ranking back from disk and preview the first 15 rows.
# NOTE(review): absolute, machine-specific path.
ranking_csv = r'C:\Users\Kanzul Faisal\Documents\Proposal PI\Project PI\tfidf_train.csv'
kamus_kata = pd.read_csv(ranking_csv)
kamus_kata.head(15)

In [None]:
# Drop the index column that was written into the CSV ('Unnamed: 0').
# errors='ignore' keeps the cell idempotent: `del` raised KeyError when the
# cell was run a second time, after the column was already gone.
kamus_kata = kamus_kata.drop(columns=['Unnamed: 0'], errors='ignore')
kamus_kata.head(15)

In [None]:
import pickle

# Serialize the trained classifier to model_nb.pkl (pickle protocol 4).
# The context manager guarantees the file is flushed and closed; the bare
# open() used before left the handle to be closed by garbage collection.
with open('model_nb.pkl', 'wb') as model_file:
    pickle.dump(model, model_file, protocol=4)

In [None]:
# Serialize the fitted TF-IDF vectorizer to tfidf.pkl (pickle protocol 4) so
# new text can be transformed with the same vocabulary at inference time.
# The context manager guarantees the file is flushed and closed.
with open('tfidf.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf_vect, vectorizer_file, protocol=4)