In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style("whitegrid")

#set warning 
import warnings
warnings.filterwarnings('ignore')


pd.pandas.set_option('display.max_columns', None)

In [None]:
import os
os.getcwd()

In [None]:
filename = "datasetEvadosenR2.csv"
df = pd.read_csv(filename, sep=';', encoding = 'utf8')
df.head()
     

In [None]:
import string
import re

In [None]:
def clean_text(text):
    return re.sub('[^a-zA-Z]', ' ', text).lower()
df['cleaned_text'] = df['komentar'].apply(lambda x: clean_text(x))
df['label'] = df['sentiment']

In [None]:
def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(" ")), 3)*100
df['Review_len'] = df['komentar'].apply(lambda x: len(x) - x.count(" "))
df['punct'] = df['komentar'].apply(lambda x: count_punct(x))
df.head()

In [None]:
def tokenize_text(text):
    tokenized_text = text.split()
    return tokenized_text
df['tokens'] = df['cleaned_text'].apply(lambda x: tokenize_text(x))
df.head()

In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
all_stopwords = stopwords.words('indonesian','english')

In [None]:
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
def lemmatize_text(token_list):
    return " ".join([lemmatizer.lemmatize(token) for token in token_list if not token in set(all_stopwords)])

lemmatizer = nltk.stem.WordNetLemmatizer()
df['lemmatized_review'] = df['tokens'].apply(lambda x: lemmatize_text(x))
df.head(1105)

In [None]:
print(f"Input data has {len(df)} rows and {len(df.columns)} columns")
print(f"rating 1.0 = {len(df[df['sentiment']=='positif'])} rows")
print(f"rating 2.0 = {len(df[df['sentiment']=='negatif'])} rows")

In [None]:
print(f"Number of null in label: { df['sentiment'].isnull().sum() }")
sns.countplot(x='sentiment', data=df);

In [None]:
from wordcloud import WordCloud

In [None]:
df_negative = df[df['sentiment']=='negatif']
df_positive = df[df['sentiment']=='positif']
#convert to list
#df_negative['lemmatized_review']

negative_list=df_negative['lemmatized_review'].tolist()
positive_list= df_positive['lemmatized_review'].tolist()

filtered_negative = ("").join(str(negative_list)) #convert the list into a string of spam
filtered_negative = filtered_negative.lower()

filtered_positive = ("").join(str(positive_list)) #convert the list into a string of ham
filtered_positive = filtered_positive.lower()

In [None]:
wordcloud = WordCloud(max_font_size = 160, margin=0, background_color = "white", colormap="Greens").generate(filtered_positive)
plt.figure(figsize=[10,10])
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.margins(x=0, y=0)
plt.title("Positive Reviews Word Cloud")
plt.show()

In [None]:
wordcloud = WordCloud(max_font_size = 160, margin=0, background_color = "white", colormap="Reds").generate(filtered_negative)
plt.figure(figsize=[10,10])
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.margins(x=0, y=0)
plt.title("Negative Reviews Word Cloud")
plt.show()

In [None]:
X = df[['lemmatized_review', 'Review_len', 'punct']]
y = df['label']
print(X.shape)
print(y.shape)

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_df = 0.5, min_df = 2) # ignore terms that occur in more than 50% documents and the ones that occur in less than 2
tfidf_train = tfidf.fit_transform(X_train['lemmatized_review'])
tfidf_test = tfidf.transform(X_test['lemmatized_review'])

X_train_vect = pd.concat([X_train[['Review_len', 'punct']].reset_index(drop=True), 
           pd.DataFrame(tfidf_train.toarray())], axis=1)
X_test_vect = pd.concat([X_test[['Review_len', 'punct']].reset_index(drop=True), 
           pd.DataFrame(tfidf_test.toarray())], axis=1)

X_train_vect.head()  

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [26]:
print(X_train_vect)

     Review_len  punct    0    1    2    3    4    5         6    7    8    9  \
0            65    0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.000000  0.0  0.0  0.0   
1            31    0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.000000  0.0  0.0  0.0   
2           118    0.8  0.0  0.0  0.0  0.0  0.0  0.0  0.000000  0.0  0.0  0.0   
3           116    2.6  0.0  0.0  0.0  0.0  0.0  0.0  0.000000  0.0  0.0  0.0   
4            57    1.8  0.0  0.0  0.0  0.0  0.0  0.0  0.000000  0.0  0.0  0.0   
..          ...    ...  ...  ...  ...  ...  ...  ...       ...  ...  ...  ...   
872          69    0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.000000  0.0  0.0  0.0   
873          97    0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.000000  0.0  0.0  0.0   
874          91    0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.000000  0.0  0.0  0.0   
875         241    0.8  0.0  0.0  0.0  0.0  0.0  0.0  0.000000  0.0  0.0  0.0   
876          83    0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.425131  0.0  0.0  0.0   

      10   11   12        1

In [None]:
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()

classifier.fit(X_train_vect, y_train)
#classifier.fit(y_train, X_train_vect)
naive_bayes_pred = classifier.predict(X_test_vect)

# Classification Report
print(classification_report(y_test, naive_bayes_pred))

# Confusion Matrix
class_label = ["negatif", "positif"]
df_cm = pd.DataFrame(confusion_matrix(y_test, naive_bayes_pred), index=class_label, columns=class_label)
sns.heatmap(df_cm, annot=True, fmt='d')
plt.title("Confusion Matrix")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()