In [3]:
import csv
import nltk
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')
import re

from nltk.corpus import stopwords
cachedStopWords = stopwords.words("english")

In [4]:
target = 'dataset-machine-learning.csv'
with open(target,'r',encoding='utf-8') as csvFile:
    csvReader = csv.reader(csvFile, delimiter=',')
    next(csvReader)
    words = []
    classes = []
    documents = []
    
    for row in csvReader:
        sentence = row[4].lower()
        sentence = re.sub(r"http\S+", "", sentence)
        sentence = re.sub(r"pic.twitter.com\S+", "",sentence)
        sentence = re.sub(r"[^a-zA-Z0-9]+", ' ', sentence)
        sentence = ' '.join([word for word in sentence.split() if word not in cachedStopWords])
        sentenceclass = row[3]
        cleanword = tokenizer.tokenize(sentence)
        words.extend(cleanword)
        documents.append((cleanword, sentenceclass))
        if sentenceclass not in classes:
            classes.append(sentenceclass)
        
    
    words = list(set(words))
    classes = list(set(classes))

In [5]:
print(len(documents))
print(classes)
print(documents[:1])
# print(words)

2848
['2', '1', '0']
[(['asap', 'gojek', 'datang', 'malaysia'], '1')]


### BOW Vector

In [6]:
training = []
output = []
outputclass = []
output_empty = [0] * len(classes)

for doc in documents:
    bag = []
    
    pattern_words = doc[0]
    
    for w in words:
        bag.append(1) if w in pattern_words else bag.append(0)

    training.append(bag)
    output_row = list(output_empty)
    output_row[classes.index(doc[1])] = 1
    output.append(output_row)
    outputclass.extend(doc[1])

### Training and Testing

In [7]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix

a_train, a_test, b_train, b_test = train_test_split(training, outputclass, test_size=0.33, random_state=42)

In [8]:
# Neural Network model
from sklearn.neural_network import MLPClassifier
model = MLPClassifier(hidden_layer_sizes=(13,13,13),max_iter=500)
model.fit(a_train,b_train)

predictions = model.predict(a_test)
print(classification_report(b_test,predictions))

scores = cross_val_score(model, training, outputclass, cv=5, scoring='accuracy')
print("Result : ", scores)
print("Mean : ", scores.mean())
print("Max : ", scores.max())

              precision    recall  f1-score   support

           0       0.42      0.10      0.16        49
           1       0.88      0.89      0.89       673
           2       0.60      0.68      0.64       218

    accuracy                           0.80       940
   macro avg       0.63      0.56      0.56       940
weighted avg       0.79      0.80      0.79       940

Result :  [0.61996497 0.72631579 0.82425308 0.65905097 0.6344464 ]
Mean :  0.6928062405142585
Max :  0.8242530755711776


In [9]:
# Gaussian Naive Bayes model
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(a_train,b_train)

predictions = model.predict(a_test)
print(classification_report(b_test,predictions))

scores = cross_val_score(model, training, outputclass, cv=5, scoring='accuracy')
print("Result : ", scores)
print("Mean : ", scores.mean())
print("Max : ", scores.max())

              precision    recall  f1-score   support

           0       0.29      0.18      0.23        49
           1       0.77      0.47      0.58       673
           2       0.26      0.60      0.36       218

    accuracy                           0.48       940
   macro avg       0.44      0.42      0.39       940
weighted avg       0.63      0.48      0.51       940

Result :  [0.46059545 0.38070175 0.13181019 0.51318102 0.63620387]
Mean :  0.4244984560114039
Max :  0.6362038664323374


In [10]:
#Decision Tree model
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
model.fit(a_train,b_train)

predictions = model.predict(a_test)
print(classification_report(b_test,predictions))

scores = cross_val_score(model, training, outputclass, cv=5, scoring='accuracy')
print("Result : ", scores)
print("Mean : ", scores.mean())
print("Max : ", scores.max())

              precision    recall  f1-score   support

           0       0.33      0.10      0.16        49
           1       0.86      0.85      0.86       673
           2       0.54      0.64      0.59       218

    accuracy                           0.76       940
   macro avg       0.58      0.53      0.53       940
weighted avg       0.76      0.76      0.76       940

Result :  [0.55341506 0.66315789 0.79964851 0.66080844 0.58699473]
Mean :  0.6528049251257192
Max :  0.7996485061511424


In [11]:
#SVM model
from sklearn import svm
model = svm.SVC(kernel='linear')
model.fit(a_train,b_train)

predictions = model.predict(a_test)
print(classification_report(b_test,predictions))

scores = cross_val_score(model, training, outputclass, cv=5, scoring='accuracy')
print("Result : ", scores)
print("Mean : ", scores.mean())
print("Max : ", scores.max())

              precision    recall  f1-score   support

           0       0.44      0.14      0.22        49
           1       0.88      0.89      0.88       673
           2       0.60      0.65      0.62       218

    accuracy                           0.80       940
   macro avg       0.64      0.56      0.57       940
weighted avg       0.79      0.80      0.79       940

Result :  [0.61120841 0.7245614  0.82073814 0.70826011 0.55711775]
Mean :  0.6843771605567247
Max :  0.820738137082601
