In [2]:
import pandas as pd  #Read Data

def read_data(file_dir):
    """Load a CSV file into a pandas DataFrame.

    Parameters
    ----------
    file_dir : str, path-like or file-like
        Location of the CSV data; handed unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table, using ``pd.read_csv`` defaults.
    """
    return pd.read_csv(file_dir)

In [3]:
#Convert label to 0 or 1
def convert_label(df):
    """Encode the sentiment label as an integer and drop unlabelled rows.

    Only the ``Ulasan`` and ``Satisfaction`` columns are kept. ``'pos'`` becomes
    ``1`` and ``'neg'`` becomes ``0``; every row whose label is anything else
    (including NaN) is removed. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Ulasan`` and ``Satisfaction``.

    Returns
    -------
    pandas.DataFrame
        A new two-column frame with an integer ``Satisfaction`` column.
        Surviving rows keep their original index labels.
    """
    # Vectorised mapping replaces the old ``df2.Satisfaction[i] = ...`` loop:
    # chained assignment is unreliable in pandas (SettingWithCopyWarning, and a
    # silent no-op under copy-on-write), and the positional ``i`` only matched
    # the row label when the index was a default RangeIndex.
    labels = df['Satisfaction'].map({'pos': 1, 'neg': 0})

    # Anything outside the mapping became NaN, so one mask selects valid rows.
    keep = labels.notna()
    df2 = df.loc[keep, ['Ulasan', 'Satisfaction']].copy()

    # to_numpy() sidesteps index alignment; row order already matches ``df2``.
    df2['Satisfaction'] = labels[keep].astype(int).to_numpy()
    return df2

In [4]:
import sklearn
from sklearn.model_selection import train_test_split

def split_data(df, test_size=0.1, random_state=10):
    """Split the review texts and labels into train and test portions.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a text column ``Ulasan`` and a label column ``Satisfaction``.
    test_size : float or int, optional
        Forwarded to ``train_test_split``. Defaults to ``0.1``, the value that
        was previously hard-coded.
    random_state : int or None, optional
        Seed forwarded to ``train_test_split``. Defaults to ``10``, the value
        that was previously hard-coded.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)`` as plain Python lists.
    """
    words = df['Ulasan'].tolist()
    label = df['Satisfaction'].tolist()
    x_train, x_test, y_train, y_test = train_test_split(
        words, label, test_size=test_size, random_state=random_state
    )
    return x_train, x_test, y_train, y_test

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf_vectorizer(x_train, x_test):
    """Turn raw texts into dense TF-IDF feature arrays.

    The vectorizer is fitted on ``x_train`` only and then applied to ``x_test``,
    so the test texts do not influence the learned vocabulary or weights.

    Parameters
    ----------
    x_train, x_test : iterable of str
        Training and test documents.

    Returns
    -------
    tuple
        ``(train_features, test_features, fitted_vectorizer)`` where both
        feature sets have been converted with ``.toarray()``.
    """
    tfidf = TfidfVectorizer(max_df=0.5, sublinear_tf=True)
    train_matrix = tfidf.fit_transform(x_train)
    test_matrix = tfidf.transform(x_test)
    return train_matrix.toarray(), test_matrix.toarray(), tfidf

In [6]:
from sklearn.naive_bayes import MultinomialNB
from time import time

def train_data_nb(x_train, x_test, y_train, y_test):
    """Fit a ``MultinomialNB`` classifier and print timing and scores.

    Parameters
    ----------
    x_train, x_test : array-like
        Feature matrices for the training and test sets.
    y_train, y_test : array-like
        Labels matching ``x_train`` and ``x_test``.

    Returns
    -------
    MultinomialNB
        The fitted model.

    Notes
    -----
    Prints the training time, the time taken by each ``model.score`` call, and
    the two scores returned by ``model.score``.
    """
    # perf_counter is the high-resolution clock intended for measuring short
    # durations; wall-clock time() can be too coarse for calls this fast.
    # Imported locally so the function does not depend on another cell's imports.
    from time import perf_counter

    t0 = perf_counter()
    model = MultinomialNB()
    model.fit(x_train, y_train)
    print(f"\nTraining time: {round(perf_counter()-t0, 3)}s")

    t0 = perf_counter()
    score_train = model.score(x_train, y_train)
    print(f"Prediction time (train): {round(perf_counter()-t0, 3)}s")

    t0 = perf_counter()
    score_test = model.score(x_test, y_test)
    print(f"Prediction time (test): {round(perf_counter()-t0, 3)}s")

    print("\nTrain set score:", score_train)
    print("Test set score:", score_test)
    return model

In [7]:
from sklearn import svm

def train_data_svm(x_train, x_test, y_train, y_test):
    """Fit a linear-kernel ``svm.SVC`` classifier and print timing and scores.

    Parameters
    ----------
    x_train, x_test : array-like
        Feature matrices for the training and test sets.
    y_train, y_test : array-like
        Labels matching ``x_train`` and ``x_test``.

    Returns
    -------
    sklearn.svm.SVC
        The fitted model.

    Notes
    -----
    Prints the training time, the time taken by each ``model.score`` call, and
    the two scores returned by ``model.score``.
    """
    # perf_counter is the high-resolution clock intended for measuring short
    # durations; wall-clock time() can be too coarse for calls this fast.
    # Imported locally so this cell no longer relies on `time` having been
    # imported by a different cell.
    from time import perf_counter

    t0 = perf_counter()
    model = svm.SVC(kernel='linear')
    model.fit(x_train, y_train)
    print(f"\nTraining time: {round(perf_counter()-t0, 3)}s")

    t0 = perf_counter()
    score_train = model.score(x_train, y_train)
    print(f"Prediction time (train): {round(perf_counter()-t0, 3)}s")

    t0 = perf_counter()
    score_test = model.score(x_test, y_test)
    print(f"Prediction time (test): {round(perf_counter()-t0, 3)}s")

    print("\nTrain set score:", score_train)
    print("Test set score:", score_test)
    return model

In [8]:
def predict_data(text, model, fitted_vectorizer=None):
    """Predict the label of a single text with a trained model.

    Parameters
    ----------
    text : str
        The document to classify.
    model : object
        Any object exposing ``predict(features)``.
    fitted_vectorizer : object, optional
        Object exposing ``transform(list_of_texts)`` whose result has
        ``.toarray()``. When omitted, the notebook-level global ``vectorizer``
        is used, which must already exist at call time.

    Returns
    -------
    The value returned by ``model.predict`` for the one-row feature array.
    """
    if fitted_vectorizer is None:
        # Backward-compatible default: fall back to the global created by the
        # vectorizing cell. Passing the vectorizer explicitly avoids this
        # hidden dependency on cell execution order.
        fitted_vectorizer = vectorizer
    vec_data = fitted_vectorizer.transform([text]).toarray()
    return model.predict(vec_data)

# Run Program!!!

In [9]:
# Load the dataset; the path is relative to the notebook's working directory.
# Alternative input file, kept for reference:
#df = read_data("Data skripsi/fix hasil label/aspek_labeling_1.csv")
df = read_data("preprocessed_data.csv")

In [10]:
# Encode 'pos'/'neg' as 1/0 and drop rows with any other label.
# NOTE: this rebinds `df`, so the cell is not idempotent. Re-running it passes
# already-encoded labels to convert_label, which matches none of them and drops
# every row. Re-run the loading cell first.
df = convert_label(df)

In [11]:
# Split texts and labels into train and test lists (test_size=0.1, random_state=10 inside split_data).
x_train, x_test, y_train, y_test = split_data(df)

In [12]:
# Fit TF-IDF on the training texts and apply it to the test texts.
# The fitted `vectorizer` must stay a notebook-level name: predict_data reads it as a global.
vx_train, vx_test, vectorizer = tfidf_vectorizer(x_train, x_test)

In [14]:
# Train the Naive Bayes model on the TF-IDF features; timings and scores are printed below.
model_nb = train_data_nb(vx_train, vx_test, y_train, y_test)


Training time: 0.003s
Prediction time (train): 0.0s
Prediction time (test): 0.0s

Train set score: 0.9318181818181818
Test set score: 0.7


In [15]:
# Train the linear-kernel SVM on the same TF-IDF features; timings and scores are printed below.
model_svm = train_data_svm(vx_train, vx_test, y_train, y_test)


Training time: 0.016s
Prediction time (train): 0.01s
Prediction time (test): 0.001s

Train set score: 1.0
Test set score: 0.7


In [16]:
# Classify one sample review with the Naive Bayes model (1 = 'pos', 0 = 'neg', per convert_label).
text = "tambah lagi dong fitur ketersediaan"
predict_data(text, model_nb)

array([0])

In [17]:
# Classify the same sample review with the SVM model (1 = 'pos', 0 = 'neg', per convert_label).
text = "tambah lagi dong fitur ketersediaan"
predict_data(text, model_svm)

array([1])

In [77]:
#Source: https://towardsdatascience.com/training-a-naive-bayes-model-to-identify-the-author-of-an-email-or-document-17dc85fa630a