In [1]:
path = './zomato.csv'

In [2]:
import csv
import os
import glob
import numpy as np
import math
#scikit learn
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

#stemmer
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

In [3]:
def cleaning(sentence):
    sentence = sentence.replace('\ufeff','')
    sentence = sentence.replace('\n',' ')
    sentence = sentence.replace('\r',' ')
    sentence = sentence.replace('\n\n',' ')
    sentence = sentence.replace('\r\r',' ')
    sentence = sentence.replace('.',' ')
    sentence = sentence.replace(',',' ')
    sentence = sentence.replace('"',' ')
    sentence = sentence.replace('!',' ')
    sentence = sentence.replace('?',' ')
    sentence = sentence.replace('/',' ')
    sentence = sentence.replace('-',' ')
    sentence = sentence.lstrip()
    return sentence.lower()

In [4]:
def convert(sentences):
    import re
    cleanEmoticon = []
    for sentence in sentences:
        try:
            highpoints = re.compile(u'([\U00002600-\U000027BF])|([\U0001f300-\U0001f64F])|([\U0001f680-\U0001f6FF])')
        except re.error:
            highpoints = re.compile(u'([\u2600-\u27BF])|([\uD83C][\uDF00-\uDFFF])|([\uD83D][\uDC00-\uDE4F])|([\uD83D][\uDE80-\uDEFF])')
    
        cleanEmoticon.append(highpoints.sub(u'\u25FD', sentence))
    return cleanEmoticon

In [6]:
x = []
y = []
with open(path, encoding="utf8") as a:
    b = csv.reader(a)
    for row in b:
        x.append(cleaning(row[2]))
        y.append(row[3])
x = np.array(x)
y = np.array(y)

In [7]:
randomize = np.arange(len(x))
np.random.shuffle(randomize)
x = x[randomize]
y = y[randomize]

In [8]:
x_train = x[:150]
y_train = y[:150]

x_test = x[150:]
y_test = y[150:]

In [9]:
def preprocess(sentences): 
    factory = StopWordRemoverFactory()
    stopword = factory.create_stop_word_remover()
    
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    result =[]
    for sentence in sentences:
        result.append(stemmer.stem(stopword.remove(sentence)))
    return result

In [10]:
# stem_test = ['dia biasanya makan nasi padang', 
#              'aku menyukainya dengan tulus dan raisa', 
#              'ku tak kan pernah memilikinya sampai kapanpun', 
#              'karena dia sudah memilikinya & dimilikinya']

x_train = preprocess(x_train)
x_test = preprocess(x_test)

In [11]:
x_train = convert(x_train)
x_test = convert(x_test)

### Scikit-Learn Tweaks
Reference:
https://towardsdatascience.com/machine-learning-nlp-text-classification-using-scikit-learn-python-and-nltk-c52b92a7c73a

In [37]:
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA

text_clf = Pipeline([
                     ('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('pca', PCA(n_components=128)),
                     ('clf', KNeighborsClassifier(n_neighbors=9)),
])
text_clf = text_clf.fit(x_train, y_train)

In [38]:
y_pred = text_clf.predict(x_test)

In [39]:
accuracy_score(y_test, y_pred)

0.7

In [18]:
from sklearn.naive_bayes import MultinomialNB
text_clf = Pipeline([('vect', CountVectorizer()),
#                      ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),
])
text_clf = text_clf.fit(x_train, y_train)

In [19]:
y_pred = text_clf.predict(x_test)

In [20]:
accuracy_score(y_test, y_pred) 

0.76

In [21]:
s_l = classification_report(y_test, y_pred,target_names=['0','1'])

In [22]:
print(s_l)

             precision    recall  f1-score   support

          0       0.71      0.88      0.79        25
          1       0.84      0.64      0.73        25

avg / total       0.78      0.76      0.76        50



In [23]:
confusion_matrix(y_test, y_pred)

array([[22,  3],
       [ 9, 16]], dtype=int64)

In [24]:
from sklearn.model_selection import KFold

x = np.concatenate((x_train,x_test))
y = np.concatenate((y_train,y_test))
        
randomize = np.arange(len(x))
np.random.shuffle(randomize)

acc = []
folds = 5
kf = KFold(n_splits=folds)
for train, test in kf.split(x):
    x_train, x_test, y_train, y_test = x[train],x[test],y[train],y[test]
    
    text_clf = Pipeline([('vect', CountVectorizer()),
                     ('clf', MultinomialNB()),
    ])
    text_clf = text_clf.fit(x_train, y_train)
    y_pred = text_clf.predict(x_test)
    acc.append(accuracy_score(y_test, y_pred))
print("Accuracy:",acc)
print("Mean Acc of",folds,"Fold:",np.mean(acc))

Accuracy: [0.825, 0.75, 0.775, 0.7, 0.725]
Mean Acc of 5 Fold: 0.755
