# Import Library

In [1]:
import numpy as np  
import six
import re  
import os
import sys
import nltk  
import pandas
import pickle  
import csv
import math
import time
import matplotlib.pyplot as plt 

import warnings
import numbers
import scipy.sparse as sp

from Lib.Contractions import CONTRACTION_MAP
from Lib.ProgressBar import ProgressBar

from scipy.sparse import csr_matrix
from sklearn.datasets import load_files
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.stem import LancasterStemmer
from itertools import zip_longest
from textblob import TextBlob
from collections import Counter

from math import log
from distutils.version import LooseVersion
from inspect import signature
from numpy.core.numeric import ComplexWarning
from scipy.sparse import issparse
from scipy.special import digamma

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split

# Load Dataset

In [2]:
'''
================================================================================================
                                  L O A D     D A T A S E T
================================================================================================

''' 

class Load_Data():
    """Read a directory-per-class text corpus into memory."""

    def make_corpus(self, root_dir):
        """Load every document below ``root_dir`` together with its class label.

        ``root_dir`` must contain one sub-directory per class. Sub-directories
        and the files inside them are visited in sorted order so that the
        result is reproducible across platforms (``os.listdir`` gives no
        ordering guarantee). Each document is labelled with the index of its
        sub-directory in that sorted order, so labels always line up with the
        documents actually read instead of assuming a fixed 1000/1000 split.

        Parameters
        ----------
        root_dir : str
            Path of the corpus root.

        Returns
        -------
        corpus : list of str
            Full text of every document.
        labels : numpy.ndarray of float
            Class index of every document, aligned with ``corpus``.
        """
        # Stray files next to the class folders are skipped: only directories
        # describe a class.
        polarity_dirs = sorted(
            path
            for path in (os.path.join(root_dir, name) for name in os.listdir(root_dir))
            if os.path.isdir(path)
        )

        corpus = []
        labels = []
        for label, polarity_dir in enumerate(polarity_dirs):
            for name in sorted(os.listdir(polarity_dir)):
                with open(os.path.join(polarity_dir, name)) as rev:
                    corpus.append(rev.read())
                labels.append(label)

        # Float dtype is kept because the previous implementation built the
        # labels with np.zeros().
        labels = np.array(labels, dtype=float)

        print("--- LOAD DATASET DONE \u2713 ----")

        return corpus, labels

    def __init__(self):
        print("----------------------------------------------------")
        print("               L O A D   D A T A S E T              ")
        print("----------------------------------------------------")
        
# Corpus root: make_corpus() lists the entries of this folder and reads every
# file found inside each of them.
root_dir = 'Data/txt_sentoken'
data_train_x, data_train_y = Load_Data().make_corpus(root_dir)

----------------------------------------------------
               L O A D   D A T A S E T              
----------------------------------------------------
--- LOAD DATASET DONE ✓ ----


# Preprocessing - All Functions

In [21]:
'''
================================================================================================
                                P R E - P R O C E S S I N G
================================================================================================

''' 

class Preprocessing():
    """Clean a list of raw documents into normalised, space-joined token strings.

    The constructor runs the whole pipeline; the cleaned output is then read
    with :meth:`get_result`.
    """

    # Fixed stop-word list used by stop_words_removal(). Stored as a frozenset
    # so membership tests are O(1).
    _STOP_WORDS = frozenset([
        'the', 'and', 'of', 'is', 'to', 'in', 'it', 'that', 'as', 'not', 'with',
        'for', 'his', 'this', 'film', 'he', 'but', 'are', 'on', 'by', 'be', 'have',
        'an', 'who', 'one', 'movie', 'you', 'was', 'from', 'at',
    ])

    def remove_prefix_b(self, data):
        """Strip a leading ``b`` followed by whitespace from ``data``."""
        return re.sub(r'^b\s+', '', data)

    def expand_contractions(self, data, contraction_mapping=None):
        """Replace contractions in ``data`` and drop remaining apostrophes.

        Parameters
        ----------
        data : str
            Text to process.
        contraction_mapping : dict, optional
            Maps a contraction to its expansion. Defaults to the module-level
            ``CONTRACTION_MAP``; it is looked up at call time so the class can
            be defined before that name is bound.

        Returns
        -------
        str
            Text with contractions expanded; the first character of each match
            is kept so the original capitalisation survives.
        """
        if contraction_mapping is None:
            contraction_mapping = CONTRACTION_MAP

        contractions_pattern = re.compile(
            '({})'.format('|'.join(contraction_mapping.keys())),
            flags=re.IGNORECASE | re.DOTALL)

        def expand_match(contraction):
            match = contraction.group(0)
            expanded = (contraction_mapping.get(match)
                        or contraction_mapping.get(match.lower()))
            if not expanded:
                # The match is case-insensitive, so a key that is not lower
                # case may not be found; leave the text untouched in that case
                # instead of failing on None.
                return match
            return match[0] + expanded[1:]

        expanded_text = contractions_pattern.sub(expand_match, data)
        return re.sub("'", "", expanded_text)

    def remove_special_characters(self, data, remove_digits=False):
        """Remove everything except ASCII letters, whitespace and, unless
        ``remove_digits`` is true, digits."""
        # 'A-Z', not 'A-z': the latter also spans the characters [ \ ] ^ _ `
        pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
        return re.sub(pattern, '', data)

    def spelling_correction(self, data, status):
        """Return ``data`` spell-corrected with TextBlob when ``status`` is true."""
        if not status:
            return data
        # correct() returns a TextBlob; convert back to str so the regex based
        # steps that follow keep working.
        return str(TextBlob(data).correct())

    def remove_single_characters(self, data):
        """Remove one-letter words inside the text and at its very start."""
        # Single letters surrounded by whitespace.
        document = re.sub(r'\s+[a-zA-Z]\s+', ' ', data)
        # Single letter at the start: '^' must be an anchor, not a literal caret.
        document = re.sub(r'^[a-zA-Z]\s+', ' ', document)
        return document

    def remove_multiplespace(self, data):
        """Collapse every run of whitespace into a single space."""
        return re.sub(r'\s+', ' ', data)

    def split_document(self, data):
        """Split ``data`` on whitespace into a list of tokens."""
        return data.split()

    def convert_to_lowercase(self, data):
        """Return ``data`` in lower case."""
        return data.lower()

    def stop_words_removal(self, data, status):
        """Drop the tokens of ``data`` that are in the fixed stop-word list.

        ``data`` is returned unchanged when ``status`` is false.
        """
        if not status:
            return data
        return [word for word in data if word not in self._STOP_WORDS]

    def remove_punctuation(self, data):
        """Remove every character that is neither a word character nor whitespace."""
        return re.sub(r'[^\w\s]', '', data)

    def stemming(self, data, status):
        """Stem every token with the English Snowball stemmer when ``status`` is true."""
        if not status:
            return data
        stemmer = SnowballStemmer("english", ignore_stopwords=True)
        return [stemmer.stem(word) for word in data]

    def lemmatization(self, data, status):
        """Lemmatise every token with WordNet when ``status`` is true."""
        if not status:
            return data
        lemma = WordNetLemmatizer()
        return [lemma.lemmatize(word) for word in data]

    def join_word(self, data):
        """Join a token list back into one space-separated string."""
        return ' '.join(data)

    def get_result(self):
        """Return the cleaned data produced by the constructor.

        Returns
        -------
        list of str or dict
            The cleaned documents alone when no labels were supplied,
            otherwise ``{'review': documents, 'class': labels}``.
        """
        print("--- PREPROCESSING DONE \u2713 ---")
        return self.result

    def _clean_document(self, text, stopword, stem):
        """Run the full cleaning pipeline on one raw document."""
        text = self.remove_prefix_b(text)
        text = self.expand_contractions(text)
        text = self.remove_special_characters(text, remove_digits=True)
        text = self.remove_single_characters(text)
        text = self.remove_multiplespace(text)
        text = self.convert_to_lowercase(text)
        text = self.spelling_correction(text, status=False)
        text = self.remove_punctuation(text)
        tokens = self.split_document(text)
        tokens = self.stop_words_removal(tokens, stopword)
        tokens = self.stemming(tokens, stem)
        return self.join_word(tokens)

    def __init__(self, stopword, stem, x, y=None):
        """Clean every document in ``x``.

        Parameters
        ----------
        stopword : bool
            Remove the fixed stop-word list when true.
        stem : bool
            Apply Snowball stemming when true.
        x : sequence of str
            Raw documents.
        y : sequence, optional
            Labels aligned with ``x``. When omitted or empty, the result is
            just the list of cleaned documents.
        """
        print("----------------------------------------------------")
        print("             P R E - P R O C E S S I N G            ")
        print("----------------------------------------------------")
        documents_cleaned = []

        progress = ProgressBar(len(x), fmt=ProgressBar.FULL)
        for index in range(progress.total):
            progress.current += 1
            progress()
            documents_cleaned.append(self._clean_document(x[index], stopword, stem))
        progress.done()

        # y defaults to None, so it must be checked before len() is called.
        if y is None or len(y) == 0:
            self.result = documents_cleaned
        else:
            self.result = {}
            self.result['review'] = documents_cleaned
            self.result['class'] = y
  
        
'''
##  Test Function Preprocessing  ##
'''
# documents_cleaned = Preprocessing(stopword = True, stem = True, x=data_train_x, y=data_train_y).get_result()

'\n##  Test Function Preprocessing  ##\n'

# Preprocessing - Stopword

In [56]:
from sklearn.feature_extraction.text import CountVectorizer 

class Stopword():
    """Helpers for choosing how many of the most frequent words to drop."""

    def get_top_n_words(self, corpus, n=None):
        """Return the ``n`` most frequent words of ``corpus``.

        Parameters
        ----------
        corpus : list of str
            Documents to count words in.
        n : int, optional
            Number of entries to return; ``None`` returns all of them.

        Returns
        -------
        list of (str, count)
            Words with their total count, most frequent first.
        """
        vec = CountVectorizer().fit(corpus)
        bag_of_words = vec.transform(corpus)
        sum_words = bag_of_words.sum(axis=0)
        words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
        words_freq.sort(key=lambda item: item[1], reverse=True)
        return words_freq[:n]

    def stop_words_removal(self, data, stop_words):
        """Return ``data`` with every token listed in ``stop_words`` removed.

        Parameters
        ----------
        data : list of str
            Whitespace-separated documents.
        stop_words : iterable of str
            Tokens to remove.

        Returns
        -------
        list of str
            Documents re-joined with single spaces.
        """
        # Built once: a set lookup per token instead of a list scan.
        blocked = set(stop_words)
        return [
            ' '.join(word for word in document.split() if word not in blocked)
            for document in data
        ]

    @staticmethod
    def _accuracy_from_confusion(matrix):
        """Return correct predictions / all predictions for a confusion matrix."""
        total = matrix.sum()
        if total == 0:
            return 0.0
        return np.trace(matrix) / total

    def SVM(self, new_document, documents_cleaned):
        """Score ``new_document`` with a TF-IDF + LinearSVC 10-fold cross-validation.

        Parameters
        ----------
        new_document : list of str
            Documents to classify.
        documents_cleaned : dict
            Must provide the labels under the key ``'class'``.

        Returns
        -------
        float
            Accuracy over all folds, taken from the summed confusion matrix.
        """
        kf = StratifiedKFold(n_splits=10)

        corpus = new_document
        # asarray so that plain lists can be indexed with the fold index arrays.
        labels = np.asarray(documents_cleaned['class'])
        total_confusion = np.zeros((2, 2))

        for train_index, test_index in kf.split(corpus, labels):
            X_train = [corpus[i] for i in train_index]
            X_test = [corpus[i] for i in test_index]
            y_train, y_test = labels[train_index], labels[test_index]

            # The vectorizer is fitted on the training fold only.
            vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=True)
            train_corpus_tf_idf = vectorizer.fit_transform(X_train)
            test_corpus_tf_idf = vectorizer.transform(X_test)

            model = LinearSVC()
            model.fit(train_corpus_tf_idf, y_train)
            predicted = model.predict(test_corpus_tf_idf)

            total_confusion = total_confusion + confusion_matrix(y_test, predicted)

        # Divide by the number of samples actually scored rather than a
        # hard-coded corpus size.
        acc = self._accuracy_from_confusion(total_confusion)
        print (" --> Akurasi SVM : {0}".format(acc))
        return acc

    def check_max_accuracy(self, list_stopword, list_num, stopword_acc):
        """Print the first configuration that reached the highest accuracy.

        Parameters
        ----------
        list_stopword : list of list of str
            Stop-word list used by each run.
        list_num : list of int
            Stop-word count of each run.
        stopword_acc : list of float
            Accuracy of each run.
        """
        if not stopword_acc:
            # Nothing was evaluated, so there is no best run to report.
            return
        max_acc = max(stopword_acc)
        for idx, count in enumerate(list_num):
            if max_acc == stopword_acc[idx]:
                print("\n--- Best Stopword ---")
                print("Jumlah Stopword:", count, end=" ")
                print(", Akurasi :", max_acc)
                print("List Word :")
                print(list_stopword[idx])
                break

'''
##  Test Stopword  ##
'''

# Clean the corpus once, with stop-word removal and stemming both switched off,
# so that each run below can remove its own candidate list.
documents_cleaned = Preprocessing(stopword=False, stem=False, x=data_train_x, y=data_train_y).get_result()

# Candidate stop-word counts: 0, 5, 10, ..., 100.
list_num = list(range(0, 101, 5))

list_stopword = []
stopword_acc = []

print("--- Result Training Stopword --")
for num in list_num:
    print('stopword =', num, end=" ")

    # The `num` most frequent words of the cleaned corpus are the candidates.
    common_words = Stopword().get_top_n_words(documents_cleaned['review'], num)
    stopword = [word for word, freq in common_words]
    list_stopword.append(stopword)

    # Remove them and record the cross-validated accuracy for this count.
    new_document = Stopword().stop_words_removal(documents_cleaned['review'], stopword)
    stopword_acc.append(Stopword().SVM(new_document, documents_cleaned))

print("Training Stopword Done \u2713 ")
Stopword().check_max_accuracy(list_stopword, list_num, stopword_acc)

----------------------------------------------------
             P R E - P R O C E S S I N G            
----------------------------------------------------




--- Result Test Stopword --
stopword = 0  --> Akurasi SVM : 0.884
stopword = 5  --> Akurasi SVM : 0.883
stopword = 10  --> Akurasi SVM : 0.8835
stopword = 15  --> Akurasi SVM : 0.884
stopword = 20  --> Akurasi SVM : 0.8835
stopword = 25  --> Akurasi SVM : 0.8835
stopword = 30  --> Akurasi SVM : 0.8845
stopword = 35  --> Akurasi SVM : 0.8815
stopword = 40  --> Akurasi SVM : 0.8815
stopword = 45  --> Akurasi SVM : 0.8815
stopword = 50  --> Akurasi SVM : 0.882
stopword = 55  --> Akurasi SVM : 0.882
stopword = 60  --> Akurasi SVM : 0.883
stopword = 65  --> Akurasi SVM : 0.88
stopword = 70  --> Akurasi SVM : 0.88
stopword = 75  --> Akurasi SVM : 0.878
stopword = 80  --> Akurasi SVM : 0.879
stopword = 85  --> Akurasi SVM : 0.878
stopword = 90  --> Akurasi SVM : 0.8735
stopword = 95  --> Akurasi SVM : 0.8775
stopword = 100  --> Akurasi SVM : 0.876

--- Best Stopword ---
Jumlah Stopword: 30 , Akurasi : 0.8845
List Word :
['the', 'and', 'of', 'is', 'to', 'in', 'it', 'that', 'as', 'not', 'with',

## Preprocessing with Stopword

In [8]:
# Re-run the cleaning pipeline with stop-word removal enabled and stemming disabled.
documents_cleaned = Preprocessing(stopword = True, stem = False, x=data_train_x, y=data_train_y).get_result()

----------------------------------------------------
             P R E - P R O C E S S I N G            
----------------------------------------------------




--- PREPROCESSING DONE ✓ ---




# Split Dataset

In [9]:

'''
================================================================================================
                                    S P L I T     D A T A
================================================================================================

'''

class Split_Data():
    """Write stratified k-fold train/test splits of the cleaned corpus to CSV."""

    def sort_score_feature(self, X, y):
        """Return ``X`` and ``y`` as arrays, both reordered by ascending ``y``."""
        X = np.array(X)
        y = np.array(y)
        order = np.argsort(y)
        return X[order], y[order]

    def do_split_data(self, data_cleaned, n):
        """Split the data into ``n`` stratified folds and write each one to disk.

        Parameters
        ----------
        data_cleaned : dict
            Documents under ``'review'`` and their labels under ``'class'``.
        n : int
            Number of folds.
        """
        kf = StratifiedKFold(n_splits=n, shuffle=False)

        corpus = data_cleaned['review']
        # asarray so that plain lists can be indexed with the fold index arrays.
        labels = np.asarray(data_cleaned['class'])

        # Fold files are numbered from 1.
        for k, (train_index, test_index) in enumerate(kf.split(corpus, labels), start=1):
            X_train = [corpus[i] for i in train_index]
            X_test = [corpus[i] for i in test_index]
            y_train, y_test = labels[train_index], labels[test_index]
            self.output_split_data(X_train, y_train, k, 'train/datatrain_seleksi_fitur')
            self.output_split_data(X_test, y_test, k, 'test/datatest_seleksi_fitur')
        print("--- SPLIT DATASET DONE \u2713 ---")

    def output_split_data(self, review, class_score, k, filename):
        """Write one fold to ``<cwd>/Data/Preprocessing/<filename><k>.csv``.

        Parameters
        ----------
        review : sequence of str
            Documents of the fold.
        class_score : sequence
            Labels aligned with ``review``.
        k : int
            Fold number appended to the file name.
        filename : str
            Path prefix relative to ``Data/Preprocessing/``.
        """
        filepath = os.getcwd()+'/Data/Preprocessing/' +filename+str(k)+'.csv'
        # to_csv does not create missing folders, so make sure the target
        # directory exists before writing.
        os.makedirs(os.path.dirname(filepath), exist_ok=True)
        raw_data = {'Review': review,
                   'Class': class_score}
        df = pandas.DataFrame(raw_data, columns = ['Review', 'Class'])
        df.to_csv(filepath, index=False)

    def __init__(self, data_cleaned, n):
        """Print the banner and immediately write the ``n`` folds of ``data_cleaned``."""
        print("----------------------------------------------------")
        print("              S P L I T    D A T A S E T            ")
        print("----------------------------------------------------")

        self.do_split_data(data_cleaned, n)

        
'''
----------------------------------------------------
            Test Function Split Data
----------------------------------------------------
'''
# Constructing Split_Data runs the split: its __init__ calls do_split_data(),
# which writes the 10 train/test fold CSV files.
Split_Data(documents_cleaned, 10)

----------------------------------------------------
              S P L I T    D A T A S E T            
----------------------------------------------------
--- SPLIT DATASET DONE ✓ ---


<__main__.Split_Data at 0x25bc078d7f0>

# Feature Selection

In [10]:
'''
================================================================================================
                            F E A T U R E       S E L E C T I O N
================================================================================================

'''

class Feature_Selection():
    """Score the vocabulary of every training fold and write the ranking to CSV.

    Two scores are available: information gain (``"IG"``) and mutual
    information (any other value of ``feature_selection``).
    """

    def input_vocabulary(self, data):
        """Return the distinct words of ``data['review']`` in first-seen order."""
        print('# Input Vocabulary #')
        vocabulary = []
        # Companion set: O(1) membership test while the list keeps the order.
        seen = set()

        progress = ProgressBar(len(data['review']), fmt=ProgressBar.FULL)
        for index in range(progress.total):
            progress.current += 1
            progress()
            for word in data['review'][index]:
                if word not in seen:
                    seen.add(word)
                    vocabulary.append(word)
        progress.done()
        return vocabulary

    def num_class(self, feature_data, data):
        """Store the number of class-0 and non-class-0 documents in ``feature_data``.

        Returns the same ``feature_data`` dict with ``'num_class_0'`` and
        ``'num_class_1'`` set.
        """
        print('# num_class #')
        feature_data['num_class_1'] = 0
        feature_data['num_class_0'] = 0
        for class_val in data['class']:
            if class_val == 0:
                feature_data['num_class_0'] += 1
            else:
                feature_data['num_class_1'] += 1
        return feature_data

    def sort_score_feature(self, feature_data):
        """Sort vocabulary and scores together, highest score first.

        Ties are broken by the word itself, in descending order. Both entries
        are replaced by tuples; an empty vocabulary is returned unchanged.
        """
        if not feature_data['vocabulary']:
            return feature_data
        scores, words = zip(*sorted(
            zip(feature_data['score_feature'], feature_data['vocabulary']),
            reverse=True))

        feature_data['score_feature'] = scores
        feature_data['vocabulary'] = words

        return feature_data

    @staticmethod
    def _binary_entropy(prob_0, prob_1):
        """Return the base-2 entropy of two probabilities, treating 0*log(0) as 0."""
        log_0 = math.log(prob_0, 2) if prob_0 > 0 else 0
        log_1 = math.log(prob_1, 2) if prob_1 > 0 else 0
        return -(prob_1 * log_1) - (prob_0 * log_0)

    def information_gain(self, feature_data, data, k):
        """Compute the information gain of every vocabulary word.

        Parameters
        ----------
        feature_data : dict
            Must hold ``'vocabulary'``, ``'num_class_0'`` and ``'num_class_1'``.
        data : dict
            Tokenised documents under ``'review'`` (each a list of words) and
            labels under ``'class'``.
        k : int
            Fold number; not used by the computation.

        Returns
        -------
        dict
            ``feature_data`` with ``'score_feature'`` filled in and both lists
            sorted by descending score.
        """
        print("# information_gain #")
        reviews = data['review']
        classes = data['class']
        total_docs = len(reviews)

        prob_0_total = feature_data['num_class_0'] / total_docs
        prob_1_total = feature_data['num_class_1'] / total_docs
        entropy_total = self._binary_entropy(prob_0_total, prob_1_total)
        print(entropy_total)

        # Count the words of every document once instead of calling
        # list.count() again for each (word, value, document) combination.
        doc_counts = [Counter(review) for review in reviews]

        feature_data['score_feature'] = []

        progress = ProgressBar(len(feature_data['vocabulary']), fmt=ProgressBar.FULL)
        for index in range(progress.total):
            progress.current += 1
            progress()

            word = feature_data['vocabulary'][index]

            # Occurrences of the word in each document; this count is the
            # feature value the documents are partitioned by.
            counts = [doc[word] for doc in doc_counts]
            docs_by_value = Counter(counts)
            class0_by_value = Counter(
                count for count, label in zip(counts, classes) if label == 0)
            class1_by_value = Counter(
                count for count, label in zip(counts, classes) if label == 1)

            sigma_v = 0
            for value in set(counts):
                both = docs_by_value[value]
                nol = class0_by_value[value]
                satu = class1_by_value[value]
                prob_0_value = nol / both if nol > 0 else 0
                prob_1_value = satu / both if satu > 0 else 0
                # Entropy of this partition, weighted by its share of documents.
                sigma_v += (both / total_docs) * self._binary_entropy(prob_0_value, prob_1_value)

            gain_level = entropy_total - sigma_v
            feature_data['score_feature'].append(gain_level)

        progress.done()
        feature_data = self.sort_score_feature(feature_data)
        return feature_data

    @staticmethod
    def _mutual_info_score(labels_true, labels_pred):
        """Return the mutual information (natural log) between two label arrays.

        ``labels_true`` holds the feature value of every sample and
        ``labels_pred`` the class of every sample.
        """
        labels_true = np.asarray(labels_true)
        labels_pred = np.asarray(labels_pred)
        # classes: distinct feature values; class_idx: index of each sample's value.
        classes, class_idx = np.unique(labels_true, return_inverse=True)
        # clusters: distinct class labels; cluster_idx: index of each sample's class.
        clusters, cluster_idx = np.unique(labels_pred, return_inverse=True)

        # Contingency table: number of samples per (feature value, class).
        # np.int64 replaces the np.int alias that newer NumPy no longer has.
        contingency = sp.coo_matrix(
            (np.ones(class_idx.shape[0]), (class_idx, cluster_idx)),
            shape=(classes.shape[0], clusters.shape[0]),
            dtype=np.int64)
        contingency = contingency.tocsr()
        contingency.sum_duplicates()

        # Non-zero cells as (feature value index, class index, count).
        nzx, nzy, nz_val = sp.find(contingency)
        contingency_sum = contingency.sum()

        # pi: samples per feature value; pj: samples per class.
        pi = np.ravel(contingency.sum(axis=1))
        pj = np.ravel(contingency.sum(axis=0))
        log_contingency_nm = np.log(nz_val)

        # Joint probability of every non-zero cell.
        contingency_nm = nz_val / contingency_sum

        outer = (pi.take(nzx).astype(np.int64, copy=False)
                 * pj.take(nzy).astype(np.int64, copy=False))
        log_outer = -np.log(outer) + math.log(pi.sum()) + math.log(pj.sum())

        mi_score = (contingency_nm * (log_contingency_nm - math.log(contingency_sum)) +
                    contingency_nm * log_outer)
        return mi_score.sum()

    def mutual_information(self, feature_data, data):
        """Score every CountVectorizer term by mutual information with the class.

        Parameters
        ----------
        feature_data : dict
            Receives ``'vocabulary'`` and ``'score_feature'``.
        data : dict
            Tokenised documents under ``'review'`` and labels under ``'class'``.

        Returns
        -------
        dict
            ``feature_data`` with both lists sorted by descending score.
        """
        print("# mutual_information #")

        data_review = [' '.join(words) for words in data['review']]

        cv = CountVectorizer()
        X = cv.fit_transform(data_review).asformat('csc')
        y = np.ravel(data['class'])

        mi = []
        # Expand one sparse column at a time so that only a single dense
        # column is held in memory.
        for col in range(X.shape[1]):
            x = np.zeros(X.shape[0])
            start_ptr, end_ptr = X.indptr[col], X.indptr[col + 1]
            x[X.indices[start_ptr:end_ptr]] = X.data[start_ptr:end_ptr]
            mi.append(self._mutual_info_score(x, y))

        # get_feature_names() is gone from newer scikit-learn releases; prefer
        # the replacement and fall back for older installations.
        if hasattr(cv, 'get_feature_names_out'):
            feature_names = list(cv.get_feature_names_out())
        else:
            feature_names = list(cv.get_feature_names())

        feature_data['vocabulary'] = feature_names
        feature_data['score_feature'] = mi

        feature_data = self.sort_score_feature(feature_data)
        return feature_data

    def output_data_feature_selection(self, feature_data, filename, k):
        """Write the ranking to ``<cwd>/Data/Data_Feature_Selection/<filename><k>.csv``."""
        filepath = os.getcwd()+'/Data/Data_Feature_Selection/'+filename+str(k)+'.csv'
        # to_csv does not create missing folders.
        os.makedirs(os.path.dirname(filepath), exist_ok=True)
        raw_data = {'Feature': feature_data['vocabulary'],
                   'Score': feature_data['score_feature']}
        df = pandas.DataFrame(raw_data, columns = ['Feature', 'Score'])
        df.to_csv(filepath, index=False)

    def load_data_set_split(self, iteration):
        """Load the training fold ``iteration`` written to ``Data/Preprocessing/train``.

        Returns
        -------
        data_review : list of list of str
            Every review split on whitespace.
        data_class : list
            Label of every review.
        """
        filename = os.getcwd()+'/Data/Preprocessing/train/datatrain_seleksi_fitur'+str(iteration)+'.csv'
        names = ['Review', 'Class']
        # The header row is skipped and the column names are set explicitly.
        data = pandas.read_csv(filename, names=names, header=None, skiprows=1)
        data_review = [review.split() for review in data.Review]
        data_class = list(data.Class)

        return data_review, data_class

    def __init__(self, feature_selection, n_splits=10):
        """Run feature selection on every training fold and write the results.

        Parameters
        ----------
        feature_selection : str
            ``"IG"`` for information gain; any other value selects mutual
            information.
        n_splits : int, optional
            Number of fold files to process, numbered from 1. Defaults to 10.
        """
        print("----------------------------------------------------")
        print("        F E A T U R E       S E L E C T I O N       ")
        print("----------------------------------------------------")

        for i in range(1, n_splits + 1):
            data_review, data_class = self.load_data_set_split(i)

            data_set_split = {}
            data_set_split['review'] = data_review
            data_set_split['class'] = data_class

            print("K-",i," Jumlah Data : ", len(data_review))

            feature_data = {}

            if feature_selection == "IG":
                feature_data['vocabulary'] = self.input_vocabulary(data_set_split)
                feature_data = self.num_class(feature_data, data_set_split)
                feature_data = self.information_gain(feature_data, data_set_split, i)
                filename = 'Information_Gain/hasil_seleksi_fitur_'
            else:
                feature_data = self.mutual_information(feature_data, data_set_split)
                filename = 'Mutual_Information/hasil_seleksi_fitur_'

            self.output_data_feature_selection(feature_data, filename, i)
        print("--- FEATURE SELECTION DONE \u2713 ---")
       
'''
----------------------------------------------------
            Test Function Feature Selection
----------------------------------------------------
'''
# feature_selection: "IG" runs information gain; any other value (here "MI")
# runs mutual information.
Feature_Selection(feature_selection="MI")

----------------------------------------------------
        F E A T U R E       S E L E C T I O N       
----------------------------------------------------
K- 1  Jumlah Data :  1800
# mutual_information #
K- 2  Jumlah Data :  1800
# mutual_information #
K- 3  Jumlah Data :  1800
# mutual_information #
K- 4  Jumlah Data :  1800
# mutual_information #
K- 5  Jumlah Data :  1800
# mutual_information #
K- 6  Jumlah Data :  1800
# mutual_information #
K- 7  Jumlah Data :  1800
# mutual_information #
K- 8  Jumlah Data :  1800
# mutual_information #
K- 9  Jumlah Data :  1800
# mutual_information #
K- 10  Jumlah Data :  1800
# mutual_information #
--- FEATURE SELECTION DONE ✓ ---


<__main__.Feature_Selection at 0x25bc078d978>

# Classification

In [16]:


'''
================================================================================================
                                C L A S S I F I C A T I O N
================================================================================================

'''

class Classification():
    """Train and evaluate SVM sentiment classifiers on pre-split K-fold data.

    Constructing an instance runs the whole experiment: for every
    hyper-parameter combination allowed by ``kernel`` it calls
    :meth:`classification_training`, which reads the numbered train/test CSV
    folds under ``Data/Preprocessing``, optionally restricts the TF-IDF
    vocabulary to the top-ranked features written by the feature-selection
    step, fits an SVM per fold, logs the scores to
    ``Data/Classification_Result`` and pickles the best fold's models.

    All paths are resolved relative to the current working directory.
    """

    # Fold files on disk are numbered 1..N_FOLDS.
    N_FOLDS = 10
    SUPPORTED_KERNELS = ('rbf', 'linear', 'poly')

    @staticmethod
    def _empty_results():
        """Return a fresh container with one empty list per recorded field."""
        return {
            'accuracy': [],
            'kernel': [],
            'c_value': [],
            'gamma': [],
            'degree': [],
            'max_features': [],
        }

    @staticmethod
    def _ratio(numerator, denominator):
        """Return ``numerator / denominator`` as a float, NaN if the denominator is 0."""
        if denominator == 0:
            return float('nan')
        return float(numerator) / float(denominator)

    def _result_path(self, selection_feature, n):
        """Return the path of the text report for this feature-selection/percentage pair."""
        filename = "Output_Classification_{0}_{1}".format(selection_feature, n)
        return os.getcwd()+'/Data/Classification_Result/'+filename+'.txt'

    def _log(self, text, selection_feature, n):
        """Print ``text`` and append it to the matching report file."""
        print(text)
        self.output_to_text(text, selection_feature, n)

    def _build_classifier(self, kernel, c_value, gamma_value, degree_value):
        """Return an unfitted SVM for ``kernel``; raise ValueError if unsupported."""
        if kernel == 'rbf':
            return SVC(kernel='rbf', gamma=gamma_value, C=c_value)
        if kernel == 'linear':
            # C must be forwarded, otherwise sweeping c_variant retrains the
            # exact same model every time.
            return LinearSVC(random_state=0, dual=False, C=c_value)
        if kernel == 'poly':
            return SVC(kernel='poly', degree=degree_value, C=c_value)
        raise ValueError("Unsupported kernel: {0!r}".format(kernel))

    def load_dataset(self, filename):
        """Read ``Data/Preprocessing/<filename>.csv`` and return (reviews, labels).

        The first row of the file is skipped (treated as a header) and the two
        columns are read as ``Review`` and ``Class``. Both results are plain
        Python lists.
        """
        filepath = os.getcwd()+'/Data/Preprocessing/'+filename+'.csv'
        names = ['Review', 'Class']
        data = pandas.read_csv(filepath, names=names, header=None, skiprows=1)
        return data.Review.tolist(), data.Class.tolist()

    def select_best_feature(self, type, feature_vocab, feature_score, max_features, threshold):
        """Pick the features to keep from a ranked vocabulary.

        Parameters
        ----------
        type : str
            ``'Threshold'`` keeps every feature whose score is >= ``threshold``;
            any other value keeps the first ``max_features`` entries in the
            given order. (The name shadows the builtin but is part of the
            existing signature.)
        feature_vocab, feature_score : sequences of equal length
            Feature names and their scores, aligned by position.
        max_features : number
            Maximum number of features kept in the non-threshold mode.
        threshold : number
            Minimum score in threshold mode.

        Returns
        -------
        list
            The selected feature names, in their original order.
        """
        vocabulary = list(feature_vocab)
        if type == 'Threshold':
            return [term for term, score in zip(vocabulary, feature_score)
                    if score >= threshold]
        # ceil keeps the original "append while len < max_features" semantics
        # for non-integer limits; negative limits select nothing.
        limit = max(0, math.ceil(max_features))
        return vocabulary[:limit]

    def make_file_output(self, selection_feature, n):
        """Create (or truncate) the text report for this run."""
        with open(self._result_path(selection_feature, n), "w") as text_file:
            text_file.write("")

    def output_to_text(self, data, selection_feature, n):
        """Append ``data`` to the text report for this run."""
        with open(self._result_path(selection_feature, n), "a+") as text_file:
            text_file.write(data)

    def save_object(self, obj, filename):
        """Pickle ``obj`` to ``Data/Classification_Result/Pickle/<filename>.pkl``.

        Any existing file with the same name is overwritten.
        """
        filepath = os.getcwd()+'/Data/Classification_Result/Pickle/'+filename+'.pkl'
        with open(filepath, 'wb') as output:
            pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL)

    def load_object(self, filename):
        """Unpickle ``Data/Classification_Result/Pickle/<filename>.pkl``.

        NOTE: unpickling executes code embedded in the file; only load
        pickles produced by this project.
        """
        filepath = os.getcwd()+'/Data/Classification_Result/Pickle/'+filename+'.pkl'
        with open(filepath, 'rb') as pickle_file:
            return pickle.load(pickle_file)

    def classification_training(self, preprocessing, feature_selection, threshold, max_features, c_value, gamma_value, degree_value, kernel):
        """Run one K-fold experiment for a single hyper-parameter setting.

        Parameters
        ----------
        preprocessing : str
            Label of the preprocessing variant; only used in the report header.
        feature_selection : str
            ``'No'`` fits TF-IDF on the full vocabulary, ``'IG'`` reads the
            information-gain ranking, anything else the mutual-information one.
        threshold : number
            Forwarded to :meth:`select_best_feature` (unused in the
            ``'Num Feature'`` mode applied here).
        max_features : float
            Fraction of the ranked vocabulary to keep (e.g. ``0.1`` for 10%).
        c_value, gamma_value, degree_value
            SVM hyper-parameters; which ones apply depends on ``kernel``.
        kernel : str
            One of ``'rbf'``, ``'linear'`` or ``'poly'``.

        Raises
        ------
        ValueError
            If ``kernel`` is not supported.

        Side effects: rewrites the text report, pickles the TF-IDF model and
        classifier of the best-scoring fold, and appends the pooled accuracy
        (as a fraction) and the settings used to ``self.results``.
        """
        # Fail before touching any file instead of crashing on an unbound
        # classifier inside the first fold.
        if kernel not in self.SUPPORTED_KERNELS:
            raise ValueError("Unsupported kernel: {0!r}".format(kernel))

        percent = round(max_features*100)

        total_correct = 0
        totalDataset = 0
        totalMatSvm = np.zeros((2,2))

        list_accuracy = []
        TFIDF_Model = []
        SVM_Model = []

        self.make_file_output(feature_selection, percent)

        text_to_save = ""
        text_to_save += "\n=============================\n"
        text_to_save += "Preprocessing: {0}, Fitur Seleksi: {1}, Max Feature: {2}%".format(preprocessing, feature_selection, percent)
        text_to_save += "\n--- K-FOLD CROSS VALIDATION ---"
        self._log(text_to_save, feature_selection, percent)

        for index in range(1, self.N_FOLDS + 1):
            X_train, train_y = self.load_dataset('train/datatrain_seleksi_fitur'+str(index))
            X_test, test_y = self.load_dataset('test/datatest_seleksi_fitur'+str(index))

            # Reported as "Jumlah Data"; every fold is a split of the same data.
            totalDataset = len(X_train)+len(X_test)

            self._log("\n" + "----  K-Fold: {0} ----".format(index), feature_selection, percent)

            if feature_selection != 'No':
                # Load the ranked vocabulary produced by the feature-selection step.
                if feature_selection == "IG":
                    filename = 'Information_Gain/hasil_seleksi_fitur_'
                else:
                    filename = 'Mutual_Information/hasil_seleksi_fitur_'
                filepath = os.getcwd()+'/Data/Data_Feature_Selection/'+filename+str(index)+'.csv'
                names = ['Feature', 'Score']
                data = pandas.read_csv(filepath, names=names, header=None, skiprows=1, delimiter=",")
                feature_vocab = data.Feature
                feature_score = data.Score

                # Keep the first max_features share of the ranked vocabulary.
                best_features = self.select_best_feature('Num Feature', feature_vocab, feature_score, round(max_features*len(feature_vocab)), threshold=threshold)

                self._log("\n" + "Features : {0}, Used : {1}".format(len(feature_vocab), len(best_features)), feature_selection, percent)

                tfidf = TfidfVectorizer(sublinear_tf=True, use_idf=True, vocabulary=best_features)
            else:
                tfidf = TfidfVectorizer(sublinear_tf=True, use_idf=True)

            train_X = tfidf.fit_transform(X_train)
            test_X = tfidf.transform(X_test)
            TFIDF_Model.append(tfidf)

            # Train the classifier for this fold.
            svmClf = self._build_classifier(kernel, c_value, gamma_value, degree_value)
            svmClf.fit(train_X, train_y)
            SVM_Model.append(svmClf)

            resultSVM = svmClf.predict(test_X)
            total_correct += int(np.sum(np.asarray(test_y) == resultSVM))

            acc = accuracy_score(test_y, resultSVM)
            list_accuracy.append(acc)
            self._log("\n" + "Accuracy SVM : {0}".format(acc), feature_selection, percent)

            # Pin the label order to the trained classes so the matrix keeps
            # its shape even when a fold happens to miss one class.
            totalMatSvm += confusion_matrix(test_y, resultSVM, labels=svmClf.classes_)

        # Pooled metrics over all folds.
        tn, fp, fn, tp = totalMatSvm.ravel()
        # Divide by the number of predictions actually made rather than by the
        # size of the last fold's train+test split.
        evaluated = totalMatSvm.sum()
        metrics = {
            'accuracy_score': self._ratio(tp + tn, evaluated),
            'precision_score': self._ratio(tp, tp + fp),
            'recall_score': self._ratio(tp, tp + fn),
            'f1_score': self._ratio(2*tp, 2*tp + fp + fn),
        }

        text_to_save = "\n"
        text_to_save += "=============================\n"
        text_to_save += "Parameter : C={0}, Gamma={1}, Degree={2}\n".format(c_value, gamma_value, degree_value)
        text_to_save += "Jumlah Data : {0}\n".format(totalDataset)
        text_to_save += "Akurasi SVM : {0}\n".format(self._ratio(total_correct, evaluated)*100)
        text_to_save += "Metrics : {0}".format(metrics)
        self._log(text_to_save, feature_selection, percent)

        # Persist the models of the first fold that reached the best accuracy.
        best_index = list_accuracy.index(max(list_accuracy))
        filename = "TF-IDF/TF-IDF_Train_{0}_N_{1}".format(feature_selection, percent)
        self.save_object(TFIDF_Model[best_index], filename)
        filename = "SVM/SVM_{0}_Classifier_N_{1}".format(feature_selection, percent)
        self.save_object(SVM_Model[best_index], filename)

        # Record the run so get_result() has something to return.
        if not hasattr(self, 'results'):
            self.results = self._empty_results()
        self.results['accuracy'].append(metrics['accuracy_score'])
        self.results['kernel'].append(kernel)
        self.results['c_value'].append(c_value)
        self.results['gamma'].append(gamma_value)
        self.results['degree'].append(degree_value)
        self.results['max_features'].append(max_features)

    def get_result(self):
        """Return the pooled accuracy (fraction) of every training run, in run order."""
        return self.results['accuracy']

    def __init__ (self, preprocessing, feature_selection, threshold, max_features, c_variant, gamma_variant, degree_variant, kernel):
        """Run the experiment for every hyper-parameter combination of ``kernel``.

        ``rbf`` sweeps ``c_variant`` x ``gamma_variant``, ``linear`` sweeps
        ``c_variant`` only, ``poly`` sweeps ``degree_variant`` x ``c_variant``.

        Raises
        ------
        ValueError
            If ``kernel`` is not one of the supported kernels.
        """
        print("----------------------------------------------------")
        print("         S V M   C L A S S I F I C A T I O N        ")
        print("----------------------------------------------------")

        self.results = self._empty_results()

        if kernel == 'rbf':
            for c_value in c_variant:
                for gamma_value in gamma_variant:
                    self.classification_training(preprocessing, feature_selection, threshold, max_features, c_value=c_value, gamma_value=gamma_value, degree_value=3, kernel=kernel)
        elif kernel == 'linear':
            # gamma does not apply to the linear model; the value only shows
            # up in the report.
            gamma = 'scale'
            for c_value in c_variant:
                self.classification_training(preprocessing, feature_selection, threshold, max_features, c_value=c_value, gamma_value=gamma, degree_value=3, kernel=kernel)
        elif kernel == 'poly':
            for degree_value in degree_variant:
                for c_value in c_variant:
                    self.classification_training(preprocessing, feature_selection, threshold, max_features, c_value=c_value, gamma_value=0, degree_value=degree_value, kernel=kernel)
        else:
            # Previously an unknown kernel trained nothing and said nothing.
            raise ValueError("Unsupported kernel: {0!r}".format(kernel))
        
        

'''
##  Test Function Classification  ##
'''

'\n##  Test Function Classification  ##\n'

### Test SVM Classification

In [17]:
# Fractions of the ranked vocabulary to keep: 0.1, 0.2, ..., 1.0.
list_max_features = [step / 10 for step in range(1, 11)]

# One full K-fold run of the linear SVM per fraction; `result` ends up holding
# the value returned by the last run.
for max_features in list_max_features:
    result = Classification(
        kernel='linear',
        preprocessing='Stopword',
        feature_selection='MI',
        max_features=max_features,
        threshold=1,
        c_variant=[1],
        gamma_variant=[1],
        degree_variant=[1],
    ).get_result()

----------------------------------------------------
         S V M   C L A S S I F I C A T I O N        
----------------------------------------------------

Preprocessing: Stopword, Fitur Seleksi: MI, Max Feature: 10%
--- K-FOLD CROSS VALIDATION ---

----  K-Fold: 1 ----

Features : 44700, Used : 4470

Accuracy SVM : 0.865

----  K-Fold: 2 ----

Features : 44833, Used : 4483

Accuracy SVM : 0.865

----  K-Fold: 3 ----

Features : 44754, Used : 4475

Accuracy SVM : 0.88

----  K-Fold: 4 ----

Features : 45072, Used : 4507

Accuracy SVM : 0.855

----  K-Fold: 5 ----

Features : 44868, Used : 4487

Accuracy SVM : 0.855

----  K-Fold: 6 ----

Features : 44659, Used : 4466

Accuracy SVM : 0.9

----  K-Fold: 7 ----

Features : 44466, Used : 4447

Accuracy SVM : 0.88

----  K-Fold: 8 ----

Features : 44644, Used : 4464

Accuracy SVM : 0.865

----  K-Fold: 9 ----

Features : 44781, Used : 4478

Accuracy SVM : 0.845

----  K-Fold: 10 ----

Features : 44625, Used : 4462

Accuracy SVM : 0.905



Accuracy SVM : 0.88

----  K-Fold: 5 ----

Features : 44868, Used : 31408

Accuracy SVM : 0.87

----  K-Fold: 6 ----

Features : 44659, Used : 31261

Accuracy SVM : 0.9

----  K-Fold: 7 ----

Features : 44466, Used : 31126

Accuracy SVM : 0.895

----  K-Fold: 8 ----

Features : 44644, Used : 31251

Accuracy SVM : 0.905

----  K-Fold: 9 ----

Features : 44781, Used : 31347

Accuracy SVM : 0.86

----  K-Fold: 10 ----

Features : 44625, Used : 31237

Accuracy SVM : 0.925

Parameter : C=1, Gamma=scale, Degree=3
Jumlah Data : 2000
Akurasi SVM : 88.9
Metrics : {'accuracy_score': 0.889, 'precision_score': 0.8851485148514852, 'recall_score': 0.894, 'f1_score': 0.8895522388059701}
----------------------------------------------------
         S V M   C L A S S I F I C A T I O N        
----------------------------------------------------

Preprocessing: Stopword, Fitur Seleksi: MI, Max Feature: 80%
--- K-FOLD CROSS VALIDATION ---

----  K-Fold: 1 ----

Features : 44700, Used : 35760

Accuracy S

# Classification by User Input

In [24]:
class Classification_Input():
    """Classify a review typed by the user with previously pickled models.

    Constructing an instance prompts for one review on stdin, cleans it with
    ``Preprocessing`` and prints the predicted sentiment using the TF-IDF
    model and SVM classifier stored under
    ``Data/Classification_Result/Pickle`` (relative to the working directory).
    """

    def load_object(self, filename):
        """Unpickle ``Data/Classification_Result/Pickle/<filename>.pkl``.

        NOTE: unpickling executes code embedded in the file; only load
        pickles produced by this project.
        """
        filepath = os.getcwd()+'/Data/Classification_Result/Pickle/'+filename+'.pkl'
        with open(filepath, 'rb') as pickle_file:
            return pickle.load(pickle_file)

    def do_classification(self, X_test, feature_selection, best_num_feature):
        """Predict and print the sentiment of every document in ``X_test``.

        Parameters
        ----------
        X_test : list of str
            Already-preprocessed documents.
        feature_selection : str
            Feature-selection tag used in the pickle file names (e.g. ``"MI"``).
        best_num_feature : int
            Feature percentage used in the pickle file names (e.g. ``80``).

        Returns
        -------
        The predictions returned by the classifier, one per document.
        """
        print("----------------------------------------------------")
        print("         S V M   C L A S S I F I C A T I O N        ")
        print("----------------------------------------------------")

        # Load TF-IDF Model
        filename = "TF-IDF/TF-IDF_Train_{0}_N_{1}".format(feature_selection, best_num_feature)
        tfidf = self.load_object(filename)
        test_X = tfidf.transform(X_test)

        # Load SVM Classifier Model
        filename = "SVM/SVM_{0}_Classifier_N_{1}".format(feature_selection, best_num_feature)
        svmClf = self.load_object(filename)

        resultSVM = svmClf.predict(test_X)

        # predict() returns an array; truth-testing `array == 1` only works for
        # exactly one sample, so report each prediction on its own.
        for predicted_label in resultSVM:
            if predicted_label == 1:
                print("Hasil Sentimen : [+] Positif")
            else:
                print("Hasil Sentimen : [-] Negatif")

        return resultSVM

    def input_data(self, feature_selection, best_num_feature):
        """Prompt for one review, preprocess it and classify it."""
        data_input = input("Enter your data: ")
        new_data = [data_input]
        data_cleaned = Preprocessing(stopword = True, stem = False, x=new_data, y=[]).get_result()
        self.do_classification(data_cleaned, feature_selection, best_num_feature)

    def __init__ (self, feature_selection, best_num_feature):
        """Start the interactive classification right away."""
        self.input_data(feature_selection, best_num_feature)

# Interactive run: prompts for a review on stdin and prints its predicted
# sentiment using the pickled "MI" models saved for the 80% feature setting.
Classification_Input("MI", 80)

Enter your data: a secret society so powerful it can get away with murder . a secret society so exclusive it firebrands everyone who joins with its mark . a secret society so secret . . . it has a big logo up on top of the building ! ? you know something is rotten with the skulls right from the get-go . i mean , what self-respecting prep school-ivy league snob would join an organization with a name as stupid as " the skulls " ? well , luke ( joshua jackson ) would be , for one . only he's no preppie . he's a " townie " with no money , but even though he's of the lower classes , since he's such a good rower ( yes , " the skulls , " i get it ) , he's a shoo-in for the secret society . a mysterious invitation arrives , and luke is whisked into a world of power and money , where men in red robes usher in beautiful women for the taking at tuxedoed parties . before you can utter " fidelio , " luke has become one of them . luke is soon partnered with a " soul mate " ( not making that up ) , c



<__main__.Classification_Input at 0x25bc4fa0dd8>