# Text Classification

In [1]:
# import
import pandas as pd
import os
import pickle
import nltk
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import re
from sklearn.model_selection import train_test_split
import random
import numpy as np
from sklearn.model_selection import ParameterGrid
from tqdm import tqdm
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier # to make ann
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

In [2]:
target_fscore = [0.3, 0.75]

datasetName = {'vanila'        : r'dataset_vanila_2.bin',
               'undersampling' : r'dataset_undersampling_2.bin',
               'oversampling'  : r'dataset_oversampling_2.bin'}

In [3]:
# recuring function


####
# create and save dataset to file
def save_to_file(file_name, obj,folder=r'%s/CleanDataset'%(os.getcwd())):
    """
    Will save object to a file
    object anticipated is [(x_train, y_train), (x_test, y_test)]
    
    
    @param 
     file_name : nama file
     folder    : path folder dataset
                 @default -> current dir + /CleanDataset
     obj       : object , variable
                 @anticipated [(x_train, y_train), (x_test, y_test)]
                 @note : will work for any object actually
    """
    with open(r'%s\%s' % (folder, file_name), 'wb') as output_ :
        pickle.dump(obj, output_)
    print(r'%s is saved' % (file_name))
    print(r'full_path : %s/%s' % (folder, file_name))

####
# load object to variable_ will read file
def load_from_file(full_path) :
    """
    Will load an object from file to a variable
    
    @variable
     data   [obj] : a temporary container for loaded data
    @param
     full_path : full path to file
                @example : D:/file.txt
    @return 
     data   [obj] : object that will be returned
    
    @process
      Directly read data from file in full_path, then load it to variable data
    """
    data = None
    with open(full_path, 'rb') as input_ :
        data = pickle.load(input_)
    return data

## Stage 0 - Data Loading

Tujuan dari bagian ini adalah untuk membaca data folder **Kelas Gereflekter** dan **Kelas Non Gereflekter**.
<br>Data akan disimpan pada file **raw_text.df** untuk obyek *dataframe*, dan **raw_text.csv** untuk obyek *csv* .
<br>Semua text disimpan dalam encoding *utf-8*.

In [4]:
# this cell for saving to csv & object using pickle


def read_file (f_list:list, enc='utf-8') -> list :
    # Membaca file dengan path pada f_list
    # Mengembalikan list dari content pada file yang dibaca
    # -> text dalam file akan dibaca dalam bentuk raw bytes
    # -> di convert menjadi string dengan encoding utf-8
    
    """
    Will read all files in the list
    
    @variable:
     container  [list] : will hold value from files
    
    @param:
     f_list [list] : a list that contain paths to txt file
     enc    [str]  : default text encoding
     
    @return:
     container  [list] : contain data from files
     
    @process:
     Create variable container [list]
     Iterating through every element in f_list,
     Every iteration of element from f_list :
      Append data from file with path named path to container
     
     Then return container
    """
    
    
    container = []
    for path in f_list :
        with open(path, mode='rb') as f :
            container.append(str(f.read(), encoding=enc))

    return container


if not (os.path.isfile('raw_text.df') and os.path.isfile('raw_text.csv')) :
    print('Running _hopefully \_(Machine Spirit)_/')
    path_ger = os.getcwd()+'\Gereflekter (2019)\Kelas Gereflekter'
    path_n = os.getcwd()+'\Gereflekter (2019)\Kelas Non Gereflekter'


    # menghasilkan list yang urut
    ger_file_list = [r'%s\GL-%d.txt' % (path_ger, f) for f in range(1, len(os.listdir(path_ger))+1)]
    non_file_list = [r'%s\NonGL-%d.txt' % (path_n, f) for f in range(1, len(os.listdir(path_n))+1)]


    ger_content = read_file(ger_file_list)
    non_ger_content = read_file(non_file_list)

    # label
    # ger -> 1
    # non_ger -> 0 
    df = pd.DataFrame({'text':ger_content+non_ger_content , 'label':[1]*len(ger_content) + [0]*len(non_file_list)})
    save_to_file(folder=r'%s' % (os.getcwd()), file_name='raw_text.df', obj=df)
    df.to_csv('raw_text.csv', sep=',', encoding='utf-8', index=False)

## Stage 1 - Prepocessing

In [5]:
# test case stopword from nltk
# looks good enough
stop_words = set(nltk.corpus.stopwords.words('indonesian'))
test_sentence  = 'Akan tetapi ternyata Mandor Bacan tetap tidak kapok untuk membalas dendam kepada Murtado. Mandor Bacan dan Bek Lihun kemudian mendatangkan tiga pembunuh bayaran dari daerah Kebayoran Lama. ketiga pembunuh bayaran itu bernama Boseh, Kepleng, dan Boneng. Mereka dibayar untuk membunuh Murtado.'

In [6]:
# case folding
def case_folding(sent:str) :
    # lower case
    sent = sent.lower()
    
    # remove delimiter
    delim =r',.!?/&-:;@\'.'
    sent = re.sub(r'[.?|,?|!?|/??|/?|&?|/-?|:?|;?|@?|\?]','',sent)
    return sent  

In [7]:
# tokenizing
def tokenize(sent:str) -> list :
    return nltk.tokenize.word_tokenize(sent)

In [8]:
# filtering
def filtering(token_list : list, stop_words:set) -> list:
    return [w for w in token_list if not w in stop_words]

In [9]:
# stemming
# menggunakan Sastrawi stemmer
def stemming(token_list: list) -> list :
    # create stemmer
    ps = StemmerFactory().create_stemmer()
    return [ps.stem(w) for w in token_list]


In [10]:
# read data to preprocess
#file_df = open('raw_text.df', 'rb') 
#object_df = pickle.load(file_df)
object_df = load_from_file(full_path=r'raw_text.df')
#object_df.iloc[0,0] = 'rq' # replace
#print(object_df.iloc[0,0]) # get data

dt_clean = []
dt_label = []

for i in range(0,object_df.shape[0]) :
    #print(object_df.iloc[i,0])
    
    # 1. case_folding 
    txt = case_folding(object_df.iloc[i,0])
    # 2. tokenize
    list_txt = tokenize(txt)
    # 3. filtering
    list_txt = filtering(list_txt, stop_words)
    # 4. Stemming
    list_txt = stemming(list_txt)

    
    # membersihkan dari '' string kosong
    list_txt = [w for w in list_txt if w != '']
    
    dt_clean.append(list_txt)
    dt_label.append(object_df.iloc[i,1])


# will export to file 
df = pd.DataFrame({'text':dt_clean , 'label':dt_label})

save_to_file(folder=r'%s' % (os.getcwd()), file_name='clean_data.df', obj=df)
df.to_csv('clean_data.csv', sep=',', encoding='utf-8', index=False)


clean_data.df is saved
full_path : C:\Users\link\Music\textClass/clean_data.df


In [11]:
print(df['label'].tolist().count(1))

14


## Stage 1.5 - Datasets Split

In [12]:
def split_dataset(feature, label, test_size:float ,method='vanilla') :
    """
    will split data, based on method chosen
    vanila, undersampling, oversampling
    like [(x_train, y_train), (x_test, y_test)] known as format_a
    both x_train and x_test is a list of string to make it easier to vectorize 
    
    @variable
    
    @param
     feature   [list] : 
     label     [list] : 
     method    [str]  : 
     test_size [float]: 
    
    @return
     [(x_train, y_train), (x_test, y_test)]
     
    @process
    
     return
    """
    
    def coupling(list_a, list_b) :
        """
        @Description
        
        
        
        
        @variable
         temp        [list] : 
         len_min     [int]  : 
         
        @param
         temp        [list] : 
         temp        [list] : 
         
        @return
         temp        [list] : 
         
        @process
        """
        
        
        temp = []
        len_min = min([len(list_a), len(list_b)])
        for i in range(0, len_min) :
            temp.append((list_a[i], list_b[i]))
        #temp2= [(x,y) for x in list_a for y in list_b]
        #print(len(temp), len(temp2))
        return temp
        
    def decouple(list_a) :
        list_b, list_c = [[],[]]
        for i in range(len(list_a)) :
            list_b.append(list_a[i][0])
            list_c.append(list_a[i][1])
            
        return list_b, list_c
    
    # bagian ini menyambung data word token menjadi str kembali di pisah dengan " "
    print(type(feature[0]))
    feature = [' '.join(elm) for elm in feature]
    print(type(feature[0]))
    
    
    if method.lower() == 'undersampling' :
        x_train,x_test,y_train,y_test=train_test_split(feature,label,test_size=test_size)
        ger_count = y_train.count(1)
        ng_count = y_train.count(0)
        while ger_count > int(14*(1-test_size)) and ger_count < int(14*(1-test_size)) -1: 
            x_train,x_test,y_train,y_test=train_test_split(feature,label,test_size=test_size)
            ger_count = y_train.count(1)
            ng_count = y_train.count(0)
#             print(ger_count, ng_count)
#         print(ger_count, ng_count)
        
        ##### undersampling part
        # will remove target amount of an ng element
        couple = coupling(x_train, y_train)
        # remove 20 % element to under sample
        target = 0.2
        count = int(ng_count * (target))
        
        #print(count)
        #print(len(couple))
        for i in range(count) :
            obj = random.choice(couple)
            while obj[1] == 1 :
                obj = random.choice(couple)
                
#             print(obj)
            couple.remove(obj)
        random.shuffle(couple)
        x_train, y_train = decouple(couple)
        # print((len(x_train), len(y_train)))
        ger_count = y_train.count(1)
        ng_count = y_train.count(0)
        # print(ger_count, ng_count)

        # end
        return [(x_train, y_train), (x_test, y_test)]
        
        
    elif method.lower() == 'oversampling' :
        x_train,x_test,y_train,y_test=train_test_split(feature,label,test_size=test_size)
        ger_count = y_train.count(1)
        ng_count = y_train.count(0)
        while ger_count > int(14*(1-test_size)) and ger_count < int(14*(1-test_size)) -1: 
            x_train,x_test,y_train,y_test=train_test_split(feature,label,test_size=test_size)
            ger_count = y_train.count(1)
            ng_count = y_train.count(0)
#             print(ger_count, ng_count)
            
        
        
        
        ##### oversampling part
        # will remove target amount of an ng element
        couple = coupling(x_train, y_train)
        # add 50 % element to under sample
        target = 0.5
        count = int(ger_count * (1+target))
        
#         print(count)
#         print(len(couple))
        for i in range(count) :
            obj = random.choice(couple)
            while obj[1] == 0 :
                obj = random.choice(couple)
                
#             print(obj)
            couple.append(obj)
#         print(couple[0])
        random.shuffle(couple)
#         print(couple[0])
        x_train, y_train = decouple(couple)
#         print((len(x_train), len(y_train)))
        ger_count = y_train.count(1)
        ng_count = y_train.count(0)
#         print(ger_count, ng_count)

        # end
        return [(x_train, y_train), (x_test, y_test)]
    else :
        x_train,x_test,y_train,y_test=train_test_split(feature,label,test_size=test_size)
        ger_count = y_train.count(1)
        ng_count = y_train.count(0)
        while ger_count > int(14*0.8) and ger_count < int(14*0.8) -1: 
            x_train,x_test,y_train,y_test=train_test_split(feature,label,test_size=test_size)
            ger_count = y_train.count(1)
            ng_count = y_train.count(0)
        return [(x_train, y_train), (x_test, y_test)]


In [13]:
def create_data_set_object(df, test_size=0.2) :
    """
    Will create dictionary with dataset object with data structure
    like [(x_train, y_train), (x_test, y_test)] known as format_a
    both x_train and x_test is a list of string to make it easier to vectorize 
    
    @variable
     dataset_dict [dict]: dictionary to return
     method       [list]: list of method to use
     file_name    [str] : file_name template
     
    
    @param :
     df : data_frame of data set
         @expected format, dataframe with 2 column, label and text as column name
    
    @output :
     dataset_dict   : the object wil be returned
                      @format {'vanila'       : format_a satisfied object,
                               'undersampling' : format_a satisfied object,
                               'oversampling'  : format_a satisfied object}
    
    @process:
     The function will call split_dataset funtion with default parameter if none given,
     the function will be called three times for 'vanlla', 'undersampling' and 'oversampling' method.
     Each element in the dictionanry will be saved to a file. For backup purpose.
     return dataset_dict
    
    """
    
    dataset_dict = {}
    method = ['vanila', 'undersampling', 'oversampling']
    file_name = 'dataset_%s_%s.bin'
    for i,method_ in enumerate(method) :
        print('%d.%s' % (i+1, method_))
        dataset_dict[method_] = split_dataset(label=df['label'].tolist(), feature=df['text'].tolist(), test_size=test_size, method=method_)
        save_to_file(file_name=file_name % (method_ , str(int(10*test_size))), obj=dataset_dict[method_])

    
    return dataset_dict


def load_dataset(file_name) :
    
    
    path    = os.getcwd()+r'/CleanDataset/'+file_name
    dataset = load_from_file(path)
    return dataset




In [14]:
# check if dataset already created

dataset_path = os.getcwd() + r'\CleanDataset\\'
# datasetName = {'vanila'        : r'dataset_vanila_2.bin',
#                'undersampling' : r'dataset_undersampling_2.bin',
#                'oversampling'  : r'dataset_oversampling_2.bin'}

dataset_sts = True

for key,value in datasetName.items():
    if os.path.isfile(dataset_path+value) :
        dataset_sts = dataset_sts and True
    else : 
        dataset_sts = False

if not dataset_sts :
    s = create_data_set_object(df, test_size=0.2)

## Stage 2 - Learning

In [15]:
def sort_object(arr, best=0.3) :
    """
    Using bubble sort
    """
    leng = len(arr)
    for i in tqdm(range(0, leng)) :
        for j in range(0, leng-i-1) :
            if arr[j]['score']['fmeasure'] >  arr[j+1]['score']['fmeasure']:
#                 print(arr[j]['score']['fmeasure'],' > ' ,arr[j+1]['score']['fmeasure'])
                arr[j], arr[j+1] = arr[j+1], arr[j]
#     arr = arr[int(leng*best):]
    
    print('Result Len : ', len(arr))
    
    return arr


def get_score(prediction, actual) :
    
    #print(len(prediction), len(actual))
    # TP, FP
    # FN, TN
    confusion_matrix = [[0,0],[0,0]]
    for i in range(0, len(prediction)) :
        if actual[i] == 1 :
            #print('True_Value is 1')
            #print(actual[i],prediction[i])
            if prediction[i] == 1 :
                confusion_matrix[0][0] += 1
            else :
                confusion_matrix[0][1] += 1
        else :
            #print('True_Value is 0')
            #print(actual[i],prediction[i])
            if prediction[i] == 1 :
                confusion_matrix[1][0] += 1
            else :
                confusion_matrix[1][1] += 1
    
    
    # Ditambah 0.0001 untuk menghindari division by zero
    recall    = confusion_matrix[0][0] / ((confusion_matrix[0][0] + confusion_matrix[1][0])+0.0001)
    precision = confusion_matrix[0][0] / ((confusion_matrix[0][0] + confusion_matrix[0][1])+0.0001)
    
    fmeasure = 2*recall*precision/((recall+precision)+0.0001)
    #print(fmeasure)
    return {'conf_matrix' : confusion_matrix, 'fmeasure' : fmeasure, 'recall' : recall, 'precision' : precision}

# actual     = [1, 1, 0, 1, 0, 0, 1, 0, 0, 0] 
# prediction = [1, 0, 0, 1, 0, 0, 1, 1, 1, 0] 
# xt = get_score(prediction,actual)
# print(xt)

In [16]:

def create_tree_classifier(dataset, paramet,  dataset_flavour) :
    #### parameter menffunakan semua nya
    ####
    #### [(x_train, y_train), (x_test, y_test)]
    ####
    
#     text_clf= Pipeline([('vect', CountVectorizer()),
#                         ('tfidf', TfidfTransformer()),
#                         ('clf', MultinomialNB()),])
    clf = DecisionTreeClassifier()
    clf.set_params(**paramet)
    # vectorize
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(dataset[0][0])

    tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
    X_train_tf = tf_transformer.transform(X_train_counts)
    
    X_test_counts = count_vect.transform(dataset[1][0])
    #print('test size : ', len(dataset[1][0]))
    X_new_tfidf = tf_transformer.transform(X_test_counts)
    
    
    clf.fit(X_train_tf, dataset[0][1])
    
    predicted = clf.predict(X_new_tfidf)
    #print(len(predicted.tolist()))
    score = get_score(predicted.tolist(), dataset[1][1])
    
    
    return {'model': clf, 'param':paramet, 'pred' : predicted.tolist(),
            'score' : score , 'dataset' : dataset_flavour,
            'count_vector' : count_vect, 'tf_transformer' : tf_transformer}


def create_knn(dataset, paramet,  dataset_flavour) :
    #### parameter menffunakan semua nya
    ####
    #### [(x_train, y_train), (x_test, y_test)]
    ####
    
#     text_clf= Pipeline([('vect', CountVectorizer()),
#                         ('tfidf', TfidfTransformer()),
#                         ('clf', MultinomialNB()),])
    clf = KNeighborsClassifier()
    clf.set_params(**paramet)
    # vectorize
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(dataset[0][0])

    tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
    X_train_tf = tf_transformer.transform(X_train_counts)
    
    X_test_counts = count_vect.transform(dataset[1][0])
    #print('test size : ', len(dataset[1][0]))
    X_new_tfidf = tf_transformer.transform(X_test_counts)
    
    
    clf.fit(X_train_tf, dataset[0][1])
    
    predicted = clf.predict(X_new_tfidf)
    #print(len(predicted.tolist()))
    score = get_score(predicted.tolist(), dataset[1][1])
    
    
    return {'model': clf, 'param':paramet, 'pred' : predicted.tolist(),
            'score' : score , 'dataset' : dataset_flavour,
            'count_vector' : count_vect, 'tf_transformer' : tf_transformer}

def create_discriminant_analysis(dataset, paramet,  dataset_flavour) :
    #### parameter menffunakan semua nya
    ####
    #### [(x_train, y_train), (x_test, y_test)]
    ####
    
#     text_clf= Pipeline([('vect', CountVectorizer()),
#                         ('tfidf', TfidfTransformer()),
#                         ('clf', MultinomialNB()),])
    clf = LinearDiscriminantAnalysis()
    clf.set_params(**paramet)
    # vectorize
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(dataset[0][0])

    tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
    X_train_tf = tf_transformer.transform(X_train_counts)
    
    X_test_counts = count_vect.transform(dataset[1][0])
    #print('test size : ', len(dataset[1][0]))
    X_new_tfidf = tf_transformer.transform(X_test_counts)
    
    
    clf.fit(X_train_tf.toarray(), dataset[0][1])
    
    predicted = clf.predict(X_new_tfidf.toarray())
    #print(len(predicted.tolist()))
    score = get_score(predicted.tolist(), dataset[1][1])
    
    
    return {'model': clf, 'param':paramet, 'pred' : predicted.tolist(),
            'score' : score , 'dataset' : dataset_flavour,
            'count_vector' : count_vect, 'tf_transformer' : tf_transformer}

def create_NB(dataset, paramet,  dataset_flavour) :
    #### parameter menffunakan semua nya
    ####
    #### [(x_train, y_train), (x_test, y_test)]
    ####
    
#     text_clf= Pipeline([('vect', CountVectorizer()),
#                         ('tfidf', TfidfTransformer()),
#                         ('clf', MultinomialNB()),])
    clf = GaussianNB()
    clf.set_params(**paramet)
    # vectorize
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(dataset[0][0])

    tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
    X_train_tf = tf_transformer.transform(X_train_counts)
    
    X_test_counts = count_vect.transform(dataset[1][0])
    #print('test size : ', len(dataset[1][0]))
    X_new_tfidf = tf_transformer.transform(X_test_counts)
    
    
    clf.fit(X_train_tf.toarray(), dataset[0][1])
    
    predicted = clf.predict(X_new_tfidf.toarray())
    #print(len(predicted.tolist()))
    score = get_score(predicted.tolist(), dataset[1][1])
    
    return {'model': clf, 'param':paramet, 'pred' : predicted.tolist(),
            'score' : score , 'dataset' : dataset_flavour,
            'count_vector' : count_vect, 'tf_transformer' : tf_transformer}

def create_mlp(dataset, paramet,  dataset_flavour) :
    #### parameter menffunakan semua nya
    ####
    #### [(x_train, y_train), (x_test, y_test)]
    ####
    
#     text_clf= Pipeline([('vect', CountVectorizer()),
#                         ('tfidf', TfidfTransformer()),
#                         ('clf', MultinomialNB()),])
    clf = MLPClassifier()
    clf.set_params(**paramet)
    # vectorize
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(dataset[0][0])

    tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
    X_train_tf = tf_transformer.transform(X_train_counts)
    
    X_test_counts = count_vect.transform(dataset[1][0])
    #print('test size : ', len(dataset[1][0]))
    X_new_tfidf = tf_transformer.transform(X_test_counts)
    
    
    clf.fit(X_train_tf, dataset[0][1])
    
    predicted = clf.predict(X_new_tfidf)
    #print(len(predicted.tolist()))
    score = get_score(predicted.tolist(), dataset[1][1])
    
    
    return {'model': clf, 'param':paramet, 'pred' : predicted.tolist(),
            'score' : score , 'dataset' : dataset_flavour,
            'count_vector' : count_vect, 'tf_transformer' : tf_transformer}

def create_svc(dataset, paramet,  dataset_flavour) :
    #### parameter menffunakan semua nya
    ####
    #### [(x_train, y_train), (x_test, y_test)]
    ####
    
#     text_clf= Pipeline([('vect', CountVectorizer()),
#                         ('tfidf', TfidfTransformer()),
#                         ('clf', MultinomialNB()),])
    clf = SVC()
#     print(clf.get_params().keys())
    clf.set_params(**paramet)
    # vectorize
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(dataset[0][0])

    tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
    X_train_tf = tf_transformer.transform(X_train_counts)
    
    X_test_counts = count_vect.transform(dataset[1][0])
    #print('test size : ', len(dataset[1][0]))
    X_new_tfidf = tf_transformer.transform(X_test_counts)
    
    
    clf.fit(X_train_tf, dataset[0][1])
    
    predicted = clf.predict(X_new_tfidf)
    #print(len(predicted.tolist()))
    score = get_score(predicted.tolist(), dataset[1][1])
    
    
    return {'model': clf, 'param':paramet, 'pred' : predicted.tolist(),
            'score' : score , 'dataset' : dataset_flavour,
            'count_vector' : count_vect, 'tf_transformer' : tf_transformer}




# def create_discriminant_analysis(dataset, paramet,  dataset_flavour) :
    
#     return {'model': clf, 'param':{'max_depth' : max_depth}, 'pred' : prediction_list,
#             'score' : score }

# def create_naive_bayes(dataset, paramet,  dataset_flavour) :
    
#     return {'model': clf, 'param':{'max_depth' : max_depth}, 'pred' : prediction_list,
#             'score' : score }


## Stage 3 - Validation and Optimization

In [17]:
# datasetName = {'vanila'        : r'dataset_vanila_2.bin',
#              'undersampling' : r'dataset_undersampling_2.bin',
#              'oversampling'  : r'dataset_oversampling_2.bin'}


def get_best_decisionTree(d_setName=datasetName) :
    # configurasi decision Tree

    # Sekitar 69 juta configurasi parameter jika semua parameter di test
    # dengan mengurangi parameter diatas telah cukup dekat dengan target
    # akan menggurangi kombinasi parameter
    # target ~ 2k

    # jumalh variasi 4k
    parameter_decisionTreeClass = {'criterion'                : ['gini', 'entropy']
                                   ,'splitter'                 : ['best', 'random'] 
                                   ,'max_depth'                : [None, 2,3,4,5]     
                                   ,'min_samples_split'        : [2,3,4,5,6]
    #                                ,'min_samples_leaf'         : [1,2,3,4,5] 
    #                                ,'min_weight_fraction_leaf' : [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6]
                                   ,'max_features'             : [None, 'auto', 'sqrt', 'log2'] 
    #                                ,'random_state'             : [None, 3,4,5,6,7,8,9,10] 
    #                                ,'max_leaf_nodes'           : [None, 1,2,3,4,5,6,7,8,9,10] 
                                   ,'min_impurity_decrease'    : [0.0, 0.1, 0.2, 0.3, 0.4]
    #                                ,'min_impurity_split'       : [0]                                  # Deprecated 
                                   ,'class_weight'             : [None, 'balanced']
    #                                ,'presort'                  : ['deprecated']                       # Deprecated
    #                                ,'ccp_alpha'                : [0.0]
                                  } 

    list_param = ParameterGrid(parameter_decisionTreeClass)
    print("Jumlah variasi parameter decision tree : ", len(list_param))
    print('Akan melakukan %d iterasi' % (len(list_param) * 3))

    # decision tree list
    dec_list = []

    for key, value in d_setName.items():
        print('Dataset : ' , key)
        print('FileName: ' , value)
        for idx, param in tqdm(enumerate(list_param)) :
            #print(param)
            temp = create_tree_classifier(paramet=param,dataset_flavour=key ,dataset=load_dataset(value))
            if temp['score']['fmeasure'] > 0.4 :
                dec_list.append(temp)
                
    dec_list2 = sort_object(dec_list.copy(), best=0.3)
    return dec_list2



In [18]:
def get_best_Knn(d_setName=datasetName) :
    # configurasi Knn classifier
    # jumlah variasi 48
    parameter_knn                = {'n_neighbors'              : [3,4,5,6,7,8]
                                   ,'weights'                  : ['uniform', 'distance'] 
                                   ,'algorithm'                : ['auto','brute'] # tidak bisa tree untuk sparse matrix     
#                                    ,'leaf_size'                : [30,40,50]   # hanya untuk ball_tree, kd_tree
                                   ,'p'                        : [1,2] # tidak bisa menggunakan p > 2 untuk sparse matrix
#                                    ,'metric'                   : ['minkowski'] # hanya jika p > 2
#                                    ,'metric_params'            : [None] # jika metric != minkowski 
#                                    ,'n_jobs'                   : [1,2,3,4,5,6] # paralelization to speed it up
                                   } 

    list_param = ParameterGrid(parameter_knn)
    print("Jumlah variasi parameter KNN : ", len(list_param))
    print('Akan melakukan %d iterasi' % (len(list_param) * 3))

    # decision tree list
    knn_list = []

    for key, value in d_setName.items():
        print('Dataset : ' , key)
        print('FileName: ' , value)
        for idx, param in tqdm(enumerate(list_param)) :
            #print(param)
            temp = create_knn(paramet=param,dataset_flavour=key ,dataset=load_dataset(value))
            if temp['score']['fmeasure'] > 0.4 :
                knn_list.append(temp)
    knn_list2 = []
    knn_list2 = sort_object(knn_list.copy(), best=0.3)
    return knn_list2


In [19]:
def get_best_LDA(d_setName=datasetName) :
    # configurasi Linear Discriminant Analysis classifier
    # jumlah variasi ~~
    parameter_LDA                = {'solver'                : ['svd']
#                                     ,'shrinkage'             : ['auto', None, 0.1, 0.2] 
#                                    ,'priors'                : [None] 
                                   ,'n_components'          : [None]
                                   ,'store_covariance'      : [True, False]
                                   } 

    list_param = ParameterGrid(parameter_LDA)
    print("Jumlah variasi parameter Linear Discriminant Analysis : ", len(list_param))
    print('Akan melakukan %d iterasi' % (len(list_param) * 3))

    # decision tree list
    LDA_list = []

    for key, value in d_setName.items():
        print('Dataset : ' , key)
        print('FileName: ' , value)
        for idx, param in tqdm(enumerate(list_param)) :
            #print(param)
            temp = create_discriminant_analysis(paramet=param,dataset_flavour=key ,dataset=load_dataset(value))
            if temp['score']['fmeasure'] > -1:
                LDA_list.append(temp)
    LDA_list2 = []
    LDA_list2 = sort_object(LDA_list.copy(), best=0.3)
    return LDA_list2


In [20]:
def get_best_NB(d_setName=datasetName) :
    # configurasi GaussianNB classifier
    # jumlah variasi ~~
    parameter_clf                = {'priors' : [None]
                                   } 

    list_param = ParameterGrid(parameter_clf)
    print("Jumlah variasi parameter GaussianNB : ", len(list_param))
    print('Akan melakukan %d iterasi' % (len(list_param) * 3))

    # decision tree list
    clf_list = []

    for key, value in d_setName.items():
        print('Dataset : ' , key)
        print('FileName: ' , value)
        for idx, param in tqdm(enumerate(list_param)) :
            #print(param)
            temp = create_NB(paramet=param,dataset_flavour=key ,dataset=load_dataset(value))
            if temp['score']['fmeasure'] > 0.4 :
                clf_list.append(temp)
    clf_list2 = []
    clf_list2 = sort_object(clf_list.copy(), best=0.3)
    return clf_list
#MLPClassifier

In [21]:
def get_best_MLP(d_setName=datasetName) :
    # configurasi MLPClassifier classifier
    # jumlah variasi ~~
    parameter_clf                = {'hidden_layer_sizes' : [(4,), (5,), (3,)]
                                    ,'activation'        : ['identity', 'logistic', 'tanh', 'relu']
                                    ,'solver'            : ['lbfgs', 'sgd', 'adam']
                                    ,'alpha'             : [0.0001, 0.001, 0.01]
                                    ,'batch_size'        : ['auto']
                                    ,'learning_rate'     : ['constant', 'invscaling', 'adaptive']
                                    ,'learning_rate_init': [0.001]
                                    ,'power_t'           : [0.5]
#                                     ,'max_iter'          : [200, 150]
                                    ,'shuffle'           : [True]
                                    ,'random_state'      : [0]
                                    ,'verbose'           : [False]
                                    ,'warm_start'        : [False]
                                    ,'momentum'          : [0.9]
                                   } 

    list_param = ParameterGrid(parameter_clf)
    print("Jumlah variasi parameter MLPClassifier : ", len(list_param))
    print('Akan melakukan %d iterasi' % (len(list_param) * 3))

    # decision tree list
    clf_list = []

    for key, value in d_setName.items():
        print('Dataset : ' , key)
        print('FileName: ' , value)
        for idx, param in tqdm(enumerate(list_param)) :
            #print(param)
            temp = create_mlp(paramet=param,dataset_flavour=key ,dataset=load_dataset(value))
            if temp['score']['fmeasure'] > 0.4 :
                clf_list.append(temp)
    clf_list2 = []
    clf_list2 = sort_object(clf_list.copy(), best=0.3)
    return clf_list
#MLPClassifier

In [22]:
def get_best_svc(d_setName=datasetName) :
    # configurasi SV classifier
    # jumlah variasi ~~
    parameter_clf                = {'C'                       : [1.0,0.9, 0.8]
                                    ,'kernel'                 : ['linear', 'poly', 'rbf', 'sigmoid']
                                    ,'degree'                 : [2,3,4,5]
#                                     ,'gamma'                  : ['scale', 'auto']
#                                     ,'coef0'                  : [0]
#                                     ,'shrinking'              : [True, False]
#                                     ,'probability'            : [True, False]
#                                     ,'tol'                    : [0.001]
                                    ,'cache_size'             : [200]
                                    ,'class_weight'           : [None, 'balanced']
#                                     ,'verbose'                : [False]
#                                     ,'max_iter'               : [-1]
                                    ,'decision_function_shape': ['ovo', 'ovr']
#                                     ,'break_ties'             : [False, True]
                                    ,'random_state'           : [None]
                                   } 
#['C', 'cache_size', 'class_weight', 'coef0',
#'decision_function_shape',
#'degree', 'gamma', 'kernel', 'max_iter', 'probability',
#'random_state', 'shrinking', 'tol', 'verbose']
    list_param = ParameterGrid(parameter_clf)
    print("Jumlah variasi parameter MLPClassifier : ", len(list_param))
    print('Akan melakukan %d iterasi' % (len(list_param) * 3))

    # decision tree list
    clf_list = []

    for key, value in d_setName.items():
        print('Dataset : ' , key)
        print('FileName: ' , value)
        for idx, param in tqdm(enumerate(list_param)) :
            #print(param)
            temp = create_svc(paramet=param,dataset_flavour=key ,dataset=load_dataset(value))
            if temp['score']['fmeasure'] > 0.4 :
                clf_list.append(temp)
    clf_list2 = []
    clf_list2 = sort_object(clf_list.copy(), best=0.3)
    return clf_list

## Stage 4 - Selection

In [23]:
name = ['Decision Tree', 'Knn', 'Linear Discriminant Analysis', 'GaussianNB', 'MLP', 'SVC']

result = {}

def get_acceptable_value(target_fscore, list_object) :
    x = None
    acc = []
    for idx, val in enumerate(list_object) :
        if x != None :
            if val['score']['fmeasure'] > range[0] and val['score']['fmeasure'] < range[1] :
                x = val
                acc.append(val)
    
    if x == None :
        for key, val in enumerate(reversed(list_object)):
            if val['score']['fmeasure'] < 0.9799999999 :
                x = val
                acc.append(val)
    return acc[-1]


#def save_to_file(file_name, obj,folder=r'%s/CleanDataset'%(os.getcwd())):
for idx,nam in enumerate(name) :
    temp = []
    print('========================================START=======================================')
    print(idx, '  .  ', nam)
    if idx == 0: 
        temp = get_best_decisionTree()  
    elif idx == 1:
        temp = get_best_Knn()
    elif idx == 2:
        temp = get_best_LDA()
    elif idx == 3:
        temp = get_best_NB()
    elif idx == 4:
        temp = get_best_MLP()
    elif idx == 5:
        temp = get_best_svc()
    result[nam] = get_acceptable_value(target_fscore, temp)
    print('===++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++===')
    print('===                                                                              ===')
    print('===                              F_SCORE : %s                ===' % (result[nam]['score']['fmeasure'])) 
    print('===                                                                              ===')
    print('===++++++++++++++++++++++++++++++++++++++END+++++++++++++++++++++++++++++++++++++===')
    
save_to_file('best_model.bin', result, folder=r'%s' % (os.getcwd()))

0   .   Decision Tree
Jumlah variasi parameter decision tree :  4000
Akan melakukan 12000 iterasi
Dataset :  vanila
FileName:  dataset_vanila_2.bin


4000it [00:34, 117.25it/s]


Dataset :  undersampling
FileName:  dataset_undersampling_2.bin


4000it [00:32, 123.44it/s]


Dataset :  oversampling
FileName:  dataset_oversampling_2.bin


4000it [00:38, 104.23it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 438/438 [00:00<00:00, 14129.40it/s]


Result Len :  438
===++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++===
===                                                                              ===
===                              F_SCORE : 0.4137574340143662                           ===
===                                                                              ===
===++++++++++++++++++++++++++++++++++++++END+++++++++++++++++++++++++++++++++++++===
1   .   Knn
Jumlah variasi parameter KNN :  48
Akan melakukan 144 iterasi
Dataset :  vanila
FileName:  dataset_vanila_2.bin


48it [00:00, 98.75it/s]


Dataset :  undersampling
FileName:  dataset_undersampling_2.bin


48it [00:00, 105.49it/s]


Dataset :  oversampling
FileName:  dataset_oversampling_2.bin


48it [00:00, 91.22it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 86/86 [00:00<00:00, 85944.76it/s]


Result Len :  86
===++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++===
===                                                                              ===
===                              F_SCORE : 0.49992500624968744                           ===
===                                                                              ===
===++++++++++++++++++++++++++++++++++++++END+++++++++++++++++++++++++++++++++++++===
2   .   Linear Discriminant Analysis
Jumlah variasi parameter Linear Discriminant Analysis :  2
Akan melakukan 6 iterasi
Dataset :  vanila
FileName:  dataset_vanila_2.bin


2it [00:00, 11.43it/s]


Dataset :  undersampling
FileName:  dataset_undersampling_2.bin


2it [00:00, 14.71it/s]


Dataset :  oversampling
FileName:  dataset_oversampling_2.bin


2it [00:00,  9.22it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<?, ?it/s]


Result Len :  6
===++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++===
===                                                                              ===
===                              F_SCORE : 0.0                           ===
===                                                                              ===
===++++++++++++++++++++++++++++++++++++++END+++++++++++++++++++++++++++++++++++++===
3   .   GaussianNB
Jumlah variasi parameter GaussianNB :  1
Akan melakukan 3 iterasi
Dataset :  vanila
FileName:  dataset_vanila_2.bin


1it [00:00, 47.61it/s]


Dataset :  undersampling
FileName:  dataset_undersampling_2.bin


1it [00:00, 100.01it/s]


Dataset :  oversampling
FileName:  dataset_oversampling_2.bin


1it [00:00, 71.43it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<?, ?it/s]


Result Len :  2
===++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++===
===                                                                              ===
===                              F_SCORE : 0.6665777829629381                           ===
===                                                                              ===
===++++++++++++++++++++++++++++++++++++++END+++++++++++++++++++++++++++++++++++++===
4   .   MLP
Jumlah variasi parameter MLPClassifier :  324
Akan melakukan 972 iterasi
Dataset :  vanila
FileName:  dataset_vanila_2.bin














324it [00:34,  9.43it/s]


Dataset :  undersampling
FileName:  dataset_undersampling_2.bin














324it [00:34,  9.38it/s]


Dataset :  oversampling
FileName:  dataset_oversampling_2.bin














324it [00:35,  9.23it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 279/279 [00:00<00:00, 31012.93it/s]


Result Len :  279
===++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++===
===                                                                              ===
===                              F_SCORE : 0.4999468793941894                           ===
===                                                                              ===
===++++++++++++++++++++++++++++++++++++++END+++++++++++++++++++++++++++++++++++++===
5   .   SVC
Jumlah variasi parameter MLPClassifier :  192
Akan melakukan 576 iterasi
Dataset :  vanila
FileName:  dataset_vanila_2.bin












192it [00:02, 84.03it/s]


Dataset :  undersampling
FileName:  dataset_undersampling_2.bin












192it [00:02, 92.66it/s]


Dataset :  oversampling
FileName:  dataset_oversampling_2.bin












192it [00:02, 73.65it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:00<?, ?it/s]


Result Len :  48
===++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++===
===                                                                              ===
===                              F_SCORE : 0.6665777829629381                           ===
===                                                                              ===
===++++++++++++++++++++++++++++++++++++++END+++++++++++++++++++++++++++++++++++++===
best_model.bin is saved
full_path : C:\Users\link\Music\textClass/best_model.bin


In [30]:
# cell ini untuk mengeexplorasi data yg di hasilkan

def predict_output(input_text:str,complete_info:dict) :
    
    
#         return {'model': clf, 'param':paramet, 'pred' : predicted.tolist(),
#             'score' : score , 'dataset' : dataset_flavour,
#             'count_vector' : count_vect, 'tf_transformer' : tf_transformer}
    
    # 1. case_folding 
    txt = case_folding(input_text)
    # 2. tokenize
    list_txt = tokenize(txt)
    # 3. filtering
    list_txt = filtering(list_txt, stop_words)
    # 4. Stemming
    list_txt = stemming(list_txt)

    
    # membersihkan dari '' string kosong
    list_txt = [w for w in list_txt if w != '']
    
    input_text = [r' '.join(list_txt)]
    print(input_text)
    sx = complete_info['count_vector'].transform(input_text)
    x_tif = complete_info['tf_transformer'].transform(sx)
    predicted = complete_info['model'].predict(x_tif)
    return predicted
    

    
inp_tst = 'pembunuh bayaran'
#inp_tst = 'saya makan apel'
#inp_tst = ' '.join(['selesai', 'layan', 'pesan', 'bakso', 'siti', 'lani', 'tukang', 'bakso', 'lanjut', 'jalan', 'siti', 'lani', 'perhati', 'gerobak', 'gerak', 'maju', 'dorong', 'tukang', 'bakso', 'sepeda', 'sepeda', 'gerak', 'maju', 'tuntun', 'dorong', 'kayuh', 'dorong', 'kayuh', 'gaya', 'sebab', 'gerobak', 'sepeda', 'gerak', 'tahu', 'gaya', 'gerak'])

print(predict_output(inp_tst, result['Decision Tree']))

['bunuh bayar']
[1]


In [None]:
# Masalah yg dapat diamati :
# 1 . Kalimat untuk data latih panjang, sehingga susah untuk di implementasikan pada kalimat biasa 
# 2 . Jika data di potong maka karakteristik tidak dapet terlihat
# 3 . Perbaikan dari code dengan mengimplementasikan Pipeline
#     Dengan penggunaan pipeline 40% kode dapat dihilangkan
# 4 . Program berjalan secara linear, kurang modular
# 5 . Tidak ada user interface
# 6 . Interaksi pada model hanya dapat dilakukan pada cell diatas
# 7 . Tidak semua model dapat di interaksi dengan cell diatas
#     Karena beberapa model meng antisipasi dense data , bukan sparse matrix
# 8 . 