In [18]:
import pandas as pd  #Read Data

def read_data(file_dir):
    """Load a CSV file into a pandas DataFrame.

    Parameters
    ----------
    file_dir : str, path-like or file-like
        Source of the CSV data; handed unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The table parsed with ``pd.read_csv`` defaults.
    """
    return pd.read_csv(file_dir)

In [40]:
#Convert label to 0 or 1
def convert_label(df, col_name):
    """Encode a 'pos'/'neg' label column as 1/0 and drop unlabeled rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; must contain the text column ``'Ulasan'`` and ``col_name``.
        It is not modified.
    col_name : str
        Name of the label column. ``'pos'`` becomes 1 and ``'neg'`` becomes 0.

    Returns
    -------
    pandas.DataFrame
        A new frame with only ``'Ulasan'`` and ``col_name``. Rows whose label
        is anything other than ``'pos'``/``'neg'`` (including NaN) are removed,
        the surviving rows keep their original index labels, and the label
        column has an integer dtype.

    Raises
    ------
    KeyError
        If ``'Ulasan'`` or ``col_name`` is not a column of ``df``.
    """
    # Vectorised lookup: every value outside the mapping (NaN included) maps to
    # NaN. This replaces the chained assignment `df2[col][i] = ...`, which is
    # unreliable in pandas and mixed positional counters with index labels.
    encoded = df[col_name].map({'pos': 1, 'neg': 0})
    has_label = encoded.notna()

    result = df.loc[has_label, ['Ulasan', col_name]].copy()
    # Assign by position (to_numpy) so duplicate index labels cannot break alignment.
    result[col_name] = encoded[has_label].astype(int).to_numpy()
    return result

In [41]:
import sklearn
from sklearn.model_selection import train_test_split

def split_data(df, col_name, test_size=0.1, random_state=10):
    """Split the review texts and their labels into train and test lists.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the text column ``'Ulasan'`` and the label column.
    col_name : str
        Name of the label column.
    test_size : float or int, default 0.1
        Forwarded to ``train_test_split``; the default keeps the split this
        notebook originally hard-coded.
    random_state : int or None, default 10
        Forwarded to ``train_test_split`` so the shuffle is reproducible.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)`` as returned by
        ``train_test_split`` for the text list and the label list.
    """
    texts = df['Ulasan'].tolist()
    labels = df[col_name].tolist()
    x_train, x_test, y_train, y_test = train_test_split(
        texts, labels, test_size=test_size, random_state=random_state
    )
    return x_train, x_test, y_train, y_test

In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf_vectorizer(x_train, x_test):
    """Fit a TF-IDF vectorizer on the training texts and transform both splits.

    Parameters
    ----------
    x_train : iterable of str
        Training documents; the vectorizer is fitted on these only.
    x_test : iterable of str
        Test documents; transformed with the already fitted vectorizer.

    Returns
    -------
    tuple
        ``(train_features, test_features, fitted_vectorizer)``, where both
        feature matrices are the ``.toarray()`` form of the vectorizer output.
    """
    tfidf = TfidfVectorizer(max_df=0.5, sublinear_tf=True)

    train_sparse = tfidf.fit_transform(x_train)
    train_dense = train_sparse.toarray()

    # The test split reuses the fitted vectorizer via transform(); it is never refitted.
    test_sparse = tfidf.transform(x_test)
    test_dense = test_sparse.toarray()

    return train_dense, test_dense, tfidf

In [43]:
from sklearn.naive_bayes import MultinomialNB
from time import time

def train_data_nb(x_train, x_test, y_train, y_test):
    """Fit a ``MultinomialNB`` classifier and print timings and scores.

    Parameters
    ----------
    x_train, x_test
        Feature matrices for the training and test splits.
    y_train, y_test
        Labels matching ``x_train`` and ``x_test``.

    Returns
    -------
    MultinomialNB
        The fitted classifier.

    Notes
    -----
    Prints the wall-clock time of fitting and of each ``score`` call (rounded
    to milliseconds), followed by the value ``score`` returned for each split.
    """
    started = time()
    classifier = MultinomialNB()
    classifier.fit(x_train, y_train)
    fit_seconds = round(time() - started, 3)
    print(f"\nTraining time: {fit_seconds}s")

    started = time()
    train_score = classifier.score(x_train, y_train)
    train_seconds = round(time() - started, 3)
    print(f"Prediction time (train): {train_seconds}s")

    started = time()
    test_score = classifier.score(x_test, y_test)
    test_seconds = round(time() - started, 3)
    print(f"Prediction time (test): {test_seconds}s")

    print("\nTrain set score:", train_score)
    print("Test set score:", test_score)
    return classifier

In [44]:
from sklearn import svm

def train_data_svm(x_train, x_test, y_train, y_test):
    """Fit a linear-kernel ``svm.SVC`` classifier and print timings and scores.

    Parameters
    ----------
    x_train, x_test
        Feature matrices for the training and test splits.
    y_train, y_test
        Labels matching ``x_train`` and ``x_test``.

    Returns
    -------
    svm.SVC
        The fitted classifier.

    Notes
    -----
    Prints the wall-clock time of fitting and of each ``score`` call (rounded
    to milliseconds), followed by the value ``score`` returned for each split.
    Relies on ``time`` having been imported by an earlier cell.
    """
    tick = time()
    classifier = svm.SVC(kernel='linear')
    classifier.fit(x_train, y_train)
    elapsed_fit = round(time() - tick, 3)
    print(f"\nTraining time: {elapsed_fit}s")

    tick = time()
    train_score = classifier.score(x_train, y_train)
    elapsed_train = round(time() - tick, 3)
    print(f"Prediction time (train): {elapsed_train}s")

    tick = time()
    test_score = classifier.score(x_test, y_test)
    elapsed_test = round(time() - tick, 3)
    print(f"Prediction time (test): {elapsed_test}s")

    print("\nTrain set score:", train_score)
    print("Test set score:", test_score)
    return classifier

In [45]:
def predict_data(text, model, vectorizer=None):
    """Predict the label of a single raw text with a trained model.

    Parameters
    ----------
    text : str
        The document to classify; it is wrapped in a one-element list before
        being vectorised.
    model
        A fitted estimator exposing ``predict``.
    vectorizer : optional
        Fitted vectorizer exposing ``transform``. When omitted, the notebook
        global named ``vectorizer`` is used, which keeps existing two-argument
        calls working. Passing it explicitly avoids depending on which cells
        have already been run.

    Returns
    -------
    The value returned by ``model.predict`` for the single vectorised document.

    Raises
    ------
    NameError
        If no vectorizer is passed and no global ``vectorizer`` exists.
    """
    if vectorizer is None:
        # Backward-compatible fallback to the notebook-level variable.
        try:
            vectorizer = globals()["vectorizer"]
        except KeyError:
            raise NameError(
                "name 'vectorizer' is not defined; fit one with "
                "tfidf_vectorizer() and pass it to predict_data()"
            ) from None

    vec_data = vectorizer.transform([text]).toarray()
    return model.predict(vec_data)

# Run Program!!!

In [49]:
# Earlier input path, left disabled:
#df = read_data("Data skripsi/fix hasil label/aspek_labeling_1.csv")
# Load the dataset used for this run (path is relative to the working directory).
df = read_data("preprocessed_data.csv")

In [50]:
# Column holding the 'pos'/'neg' labels that the classifiers are trained on.
col_name = 'Errors'
# NOTE(review): this rebinds `df` to the converted frame, so re-running this cell
# without first re-running the load cell passes already-converted labels back
# into convert_label (which only recognises 'pos' and 'neg').
df = convert_label(df, col_name)

In [51]:
# Split the texts and their labels into train and test lists.
x_train, x_test, y_train, y_test = split_data(df, col_name)

In [52]:
# Vectorise both splits; the fitted vectorizer is kept under the name
# `vectorizer` because predict_data() looks it up by that name.
vx_train, vx_test, vectorizer = tfidf_vectorizer(x_train, x_test)

In [53]:
# Train and evaluate the Naive Bayes classifier on the TF-IDF features.
# NOTE(review): the recorded train score 0.9285714285714286 equals 13/14, which
# suggests a very small training set — confirm; a test score of 1.0 on so few
# samples would not be a reliable estimate of generalisation.
model_nb = train_data_nb(vx_train, vx_test, y_train, y_test)


Training time: 0.006s
Prediction time (train): 0.001s
Prediction time (test): 0.0s

Train set score: 0.9285714285714286
Test set score: 1.0


In [54]:
# Train and evaluate the linear-kernel SVM on the same TF-IDF features.
model_svm = train_data_svm(vx_train, vx_test, y_train, y_test)


Training time: 0.002s
Prediction time (train): 0.001s
Prediction time (test): 0.001s

Train set score: 1.0
Test set score: 1.0


In [57]:
# Classify one raw string with the Naive Bayes model (uses the global `vectorizer`).
text = "email"
predict_data(text, model_nb)

array([0])

In [59]:
# Classify the same raw string with the SVM model for comparison.
text = "email"
predict_data(text, model_svm)

array([0])

In [77]:
# Source: https://towardsdatascience.com/training-a-naive-bayes-model-to-identify-the-author-of-an-email-or-document-17dc85fa630a