In [31]:
import csv
import nltk
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')
import re

from nltk.corpus import stopwords
cachedStopWords = stopwords.words("english")

In [11]:
target = 'dataset-machine-learning-balanced-value.csv'
with open(target,'r',encoding='utf-8') as csvFile:
    csvReader = csv.reader(csvFile, delimiter=';')
    next(csvReader)
    words = []
    classes = []
    documents = []
    
    for row in csvReader:
        sentence = row[4].lower()
        sentence = re.sub(r"http\S+", "", sentence)
        sentence = re.sub(r"pic.twitter.com\S+", "",sentence)
        sentence = re.sub(r"[^a-zA-Z0-9]+", ' ', sentence)
        sentence = ' '.join([word for word in sentence.split() if word not in cachedStopWords])
        sentenceclass = row[3]
        cleanword = tokenizer.tokenize(sentence)
        words.extend(cleanword)
        documents.append((cleanword, sentenceclass))
        if sentenceclass not in classes:
            classes.append(sentenceclass)
        
    
    words = list(set(words))
    classes = list(set(classes))

In [12]:
print(len(documents))
print(classes)
print(documents[:1])
# print(words)

381
['1', '0', '2']
[(['fking', 'dumb', 'thinking', 'gojek', 'cheaper'], '0')]


### BOW Vector

In [13]:
training = []
output = []
outputclass = []
output_empty = [0] * len(classes)

for doc in documents:
    bag = []
    
    pattern_words = doc[0]
    
    for w in words:
        bag.append(1) if w in pattern_words else bag.append(0)

    training.append(bag)
    output_row = list(output_empty)
    output_row[classes.index(doc[1])] = 1
    output.append(output_row)
    outputclass.extend(doc[1])

### Training and Testing

In [32]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
import matplotlib.pyplot as plt
from yellowbrick.classifier import ConfusionMatrix

a_train, a_test, b_train, b_test = train_test_split(training, outputclass, test_size=0.33, random_state=42)

In [15]:
# Neural Network model
from sklearn.neural_network import MLPClassifier
model = MLPClassifier(hidden_layer_sizes=(13,13,13),max_iter=500)
model.fit(a_train,b_train)

predictions = model.predict(a_test)
print(classification_report(b_test,predictions))

scores = cross_val_score(model, training, outputclass, cv=5, scoring='accuracy')
print("Result : ", scores)
print("Mean : ", scores.mean())
print("Max : ", scores.max())

              precision    recall  f1-score   support

           0       0.66      0.55      0.60        49
           1       0.53      0.46      0.49        35
           2       0.42      0.55      0.47        42

    accuracy                           0.52       126
   macro avg       0.54      0.52      0.52       126
weighted avg       0.54      0.52      0.53       126

Result :  [0.57692308 0.5        0.53333333 0.61333333 0.26666667]
Mean :  0.498051282051282
Max :  0.6133333333333333


In [34]:
# Gaussian Naive Bayes model
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(a_train,b_train)

predictions = model.predict(a_test)
print(classification_report(b_test,predictions))

scores = cross_val_score(model, training, outputclass, cv=5, scoring='accuracy')
print("Result : ", scores)
print("Mean : ", scores.mean())
print("Max : ", scores.max())


#For Confution Matrix
# model = GaussianNB()
# cm = ConfusionMatrix(model)
# cm.fit(a_train, b_train)
# cm.score(a_test, b_test)
# cm.show()

              precision    recall  f1-score   support

           0       0.65      0.65      0.65        49
           1       0.45      0.40      0.42        35
           2       0.43      0.48      0.45        42

    accuracy                           0.52       126
   macro avg       0.51      0.51      0.51       126
weighted avg       0.52      0.52      0.52       126

Result :  [0.61538462 0.43589744 0.45333333 0.53333333 0.36      ]
Mean :  0.47958974358974354
Max :  0.6153846153846154


In [17]:
#Decision Tree model
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
model.fit(a_train,b_train)

predictions = model.predict(a_test)
print(classification_report(b_test,predictions))

scores = cross_val_score(model, training, outputclass, cv=5, scoring='accuracy')
print("Result : ", scores)
print("Mean : ", scores.mean())
print("Max : ", scores.max())

              precision    recall  f1-score   support

           0       0.64      0.61      0.62        49
           1       0.48      0.40      0.44        35
           2       0.52      0.62      0.57        42

    accuracy                           0.56       126
   macro avg       0.55      0.54      0.54       126
weighted avg       0.56      0.56      0.55       126

Result :  [0.35897436 0.52564103 0.62666667 0.53333333 0.12      ]
Mean :  0.43292307692307697
Max :  0.6266666666666667


In [18]:
#SVM model
from sklearn import svm
model = svm.SVC(kernel='linear')
model.fit(a_train,b_train)

predictions = model.predict(a_test)
print(classification_report(b_test,predictions))

scores = cross_val_score(model, training, outputclass, cv=5, scoring='accuracy')
print("Result : ", scores)
print("Mean : ", scores.mean())
print("Max : ", scores.max())

              precision    recall  f1-score   support

           0       0.62      0.53      0.57        49
           1       0.52      0.40      0.45        35
           2       0.47      0.64      0.55        42

    accuracy                           0.53       126
   macro avg       0.54      0.52      0.52       126
weighted avg       0.54      0.53      0.53       126

Result :  [0.46153846 0.53846154 0.54666667 0.65333333 0.05333333]
Mean :  0.45066666666666666
Max :  0.6533333333333333
