In [1]:
import numpy as np
import pandas as pd
import scorecardpy as sc
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from googletrans import Translator, LANGUAGES
import re
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score
from googletrans import Translator, LANGUAGES
from imblearn.combine import SMOTEENN
np.seterr(divide = 'ignore')

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

In [2]:
class preprocessing:
    def __init__(self):
        def normalisasi(norm):
            dt = {}
            for i in range(len(norm)):
                dt[norm.loc[i,"kata typo"]] = norm.loc[i,"kata normal"]
            return dt
        self.__kamusNorm = normalisasi(pd.read_csv("./data/components/word-normalization.csv", delimiter=";"))
        self.__stopword = list(pd.read_csv("./data/components/stopword.csv",header=None).loc[:,0])

    def token(self,text:list):
        output = []
        for i in range(len(text)):
            output.append(text[i].split(" "))
        return output
    
    def stem(self,text: list):
        factory = StemmerFactory()
        stemmer = factory.create_stemmer()

        output = []
        # stemming process
        for i in range(len(text)):
            sentence = text[i]
            is_list = False
            if isinstance(sentence, list):
                sentence = " ".join(sentence)
                is_list = True
            temp   = stemmer.stem(sentence)
            if is_list:
                temp = temp.split(" ")
            output.append(temp)
        return output

    def filtering(self, text: list):
        output = []
        for i in range(len(text)):
            temp = []
            for j in range(len(text[i])):
                if text[i][j] not in self.__stopword:
                    temp.append(text[i][j])
            output.append(temp)
        return output
    
    def removeLink(self, data:list):
        regex = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
        for i in range(len(data)):
            data[i] = re.sub(regex, " ", data[i])
        return data
    
    def normalization(self,data):
        length_data = np.array(data).shape
        for i in range(length_data[0]):
            is_str= isinstance(data[i], str)
            temp = data[i]
            if is_str:
                temp = data[i].split(" ")
                
            for j in range(len(temp)):
                temp[j] = self.__kamusNorm.get(temp[j],temp[j])
            if is_str:
                temp = " ".join(temp)
            data[i] = temp
        return data
    
    def prep_data(self,data):
        data = self.removeLink(data)
        translate_data = self.translate(data)
        token_data = self.token(translate_data)
        filter_data = self.filtering(token_data)
        stem_data = self.stem(filter_data)
        return stem_data
    
    def getTerm(self, text:list):
        output=[]
        for i in range(len(text)):
            for j in range(len(text[i])):
                if text[i][j] not in output:
                    output.append(text[i][j])
        return output
    
    def translate(self,text:list):
        trans = Translator(service_urls=['translate.google.com',"translate.google.co.id"])
        output = []
        for i in range(len(text)):           
            sentence = text[i]
            is_arr = isinstance(sentence, list)
            if is_arr:
                sentence = " ".join(sentence)
            temp = trans.translate(sentence, dest='id').text
            if is_arr:
                temp = temp.split(" ")
            output.append(temp)
        return output

In [3]:
class extraction:
    def __init__(self):
        self.__data = []
        
    def getTF(self,text:list, term:list, tfWeight=True):
        output = []
        if not (isinstance(text, list) or isinstance(text, np.array)):
            raise " "
        for i in range(len(text)):
            doc = text[i]
            if isinstance(text[i], list):
                doc = " ".join(doc)
            tf_doc = []
            for j in range(len(term)):
                temp = doc.count(term[j])
                tf_doc.append(temp)
            output.append(tf_doc)
        output = np.array(output)
        #w_tf=1+log(tf) tf>0, tf=0: 0
        if tfWeight:
            output = np.where(output>0,1+np.log10(output),0)
        return output

    def getIDF(self,tf_doc:list):
        # makan | minum | tidur
        # 1     | 0     | 1      
        # 0     | 0     | 0
        countNZero = np.count_nonzero(tf_doc, axis=0)
        n_doc = tf_doc.shape[0]
        return np.where(countNZero != 0 ,np.log10(n_doc/countNZero),0)

    def __normalized_vector(self,tfidf, train=False):
        def get_length(tfidf):
            pow_tfidf = np.power(tfidf,2)
            sum_tfidf = np.sum(pow_tfidf, axis=1)
            return np.sqrt(sum_tfidf)
        length_tfidf = get_length(tfidf)
        norm = tfidf.T/np.where(length_tfidf==0,1,length_tfidf)
        where_are_NaNs = np.isnan(norm)
        norm[where_are_NaNs] = 0
        return norm.T
    
    def getTFIDF(self,tf, idf):
        tfidf = tf*idf
        return self.__normalized_vector(tfidf,True)

class knn:
    def __init__(self):
        self.__data = []
        self.__target = []
    
    def fit(self, x:list,y:list):
        if x.ndim!=2:
            raise "Error data Train is not 2dimention"
        self.__data = x
        self.__target = y
    
    def __cossim(self, tfidf_train, tfidf_test):
        if tfidf_test.ndim == 2:
            res = []
            for i in range(tfidf_test.shape[0]):
                res.append(np.dot(tfidf_train, tfidf_test[i]))
            return np.array(res)
        return np.array([np.dot(tfidf_train, tfidf_test)])
    
    def predict(self, x_norm:list, k:int=3, norm=True):
        if len(self.__data)==0:
            raise "Error please input data train"
        result = self.__cossim(self.__data, x_norm)
        target = []
        for i in range(len(result)):
            hasil = sorted(zip(result[i],self.__target), key=lambda pair: pair[0], reverse=True)[:k]
            hsl = dict(Counter([j for i,j in hasil]))
            if hsl.get(1,0) > hsl.get(-1,0):
                target.append(1)
            elif hsl.get(1,0) < hsl.get(-1,0):
                target.append(-1)
            elif hsl.get("1",0) > hsl.get("-1",0):
                target.append(1)
            elif hsl.get("1",0) < hsl.get("-1",0):
                target.append(-1)
            else:
                target.append(0)
        return target

In [4]:
def getIVnew(tfidf, term,target):
    df = pd.DataFrame(tfidf,columns=term)
    df = df.assign(sentimen_i = target)
    info_val = sc.iv(df, y="sentimen_i")
    return info_val
    
def feature_selection(term, iv, select = 0.02):
    feature_selections = iv.variable[iv.info_value>=select]
    index_feature = []
    for i in feature_selections:
        terms = term.index(i)
        index_feature.append(terms)
    return index_feature

In [5]:
prep = preprocessing()

In [8]:
# data_read = pd.read_csv("./data/tests/test.csv", delimiter=";")
# data_read_train = pd.read_csv("./data/tests/train.csv", delimiter=";")
# data_read_test = pd.read_csv("./data/tests/test.csv", delimiter=";")
data_read_train = pd.read_csv("./data/tests/tests-combine-train/train_data_kecepatan.csv", delimiter=";")
data_train = data_read_train

In [9]:
data_read_test = pd.read_csv("./data/tests/tests-combine-train/test_data_kecepatan.csv", delimiter=";")
data_test = data_read_test

In [10]:
result_prep_train = prep.prep_data(list(data_train.tweet))
result_prep_test = prep.prep_data(list(data_test.tweet))

In [11]:
term_train = prep.getTerm(result_prep_train)
term_test = prep.getTerm(result_prep_test)

In [12]:
ex = extraction()
term = list(set(term_train+term_test))
tf_train = ex.getTF(result_prep_train, term)
tf_test = ex.getTF(result_prep_test, term)

NameError: name 'term_train' is not defined

In [None]:
idf = ex.getIDF(np.concatenate((tf_train, tf_test), axis=0))

In [None]:
tf_idf_train = ex.getTFIDF(tf_train, idf)
tf_idf_test = ex.getTFIDF(tf_test, idf)

In [48]:
# np.concatenate((tf_idf_test, tf_idf_train), axis=0).shape

In [15]:
# np.concatenate((data_test.sentimen, data_train.sentimen), axis=0).shape

(811,)

In [21]:
# tf_idf_train.shape tf_idf_test.shape

((30, 394), (9, 394))

In [145]:
target = np.concatenate((data_train.sentimen, data_test.sentimen), axis=0)
concat_tfidf = np.concatenate((tf_idf_test, tf_idf_train), axis=0)
information_value = getIVnew(concat_tfidf, term, target)

In [18]:
# concat_tfidf.shape

(39, 394)

In [146]:
indeks = feature_selection(term, information_value)

In [147]:
# X_train, X_test, y_train, y_test = train_test_split(tf_idf_train, data_train.loc[:,["jasa","sentimen"]], test_size=1, random_state=42)
# X_train, y_train = train_test_split(tf_idf_train, data_train.loc[:,["jasa","sentimen"]], test_size=1, random_state=42)
# X_test, y_test = train_test_split(tf_idf_test, data_test.loc[:,["jasa","sentimen"]], test_size=1, random_state=42)
X_train = tf_idf_train[:,indeks]
X_test = tf_idf_test[:,indeks]
y_train = data_train.loc[:,["jasa","sentimen"]]
y_test = data_test.loc[:,["jasa","sentimen"]]

In [148]:
sm = SMOTEENN()
X_data_sampling, y_data_sampling = sm.fit_resample(X_train, y_train.sentimen)

In [149]:
knn_model = knn()
knn_model.fit(X_train, y_train.sentimen)

In [150]:
# y_pred = knn_model.predict(X_test,k=3)
y_pred = knn_model.predict(X_test,k=3)

In [151]:
y_pred_arr = np.array(y_pred)
sum_of_negative = np.count_nonzero(y_pred_arr == -1)
sum_of_positive = np.count_nonzero(y_pred_arr == 1)
prcentage_negative = sum_of_negative/len(y_pred_arr)*100
percentage_positive = sum_of_positive/len(y_pred_arr)*100

In [152]:
percentage_positive, prcentage_negative

(1.8633540372670807, 98.13664596273291)

In [139]:
confusion_matrix(np.array(y_test.sentimen), np.array(y_pred)).ravel()

(array([-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1]),
 array([-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1]))

In [166]:
tn, fp, fn, tp = confusion_matrix(np.array(y_test.sentimen), np.array(y_pred)).ravel()
conf_matriks = confusion_matrix(np.array(y_test.sentimen), np.array(y_pred))
# conf_matriks.shape
conf_matriks_positive = tp/(np.sum([[90, 0], [0, 0]]))
conf_matriks_negative = tn/(np.sum([[90, 0], [0, 0]]))

In [167]:
conf_matriks

array([[158,   3],
       [  0,   0]])

In [168]:
tn, fp, fn, tp

(158, 3, 0, 0)

In [54]:
np.array(y_test.sentimen)

array([ 1,  1, -1, -1, -1,  1,  1,  1, -1, -1])

In [23]:
np.array(y_pred)

array([ 1, -1,  1,  1, -1, -1, -1, -1,  1, -1])

In [130]:
conf_matriks_positive, conf_matriks_negative

(0.03333333333333333, 0.9333333333333333)

In [156]:
precision =  precision_score(y_test.sentimen, y_pred) 

In [157]:
recall = recall_score(y_test.sentimen, y_pred)

In [158]:
f_measure = 2 * ( (precision * recall) / (precision + recall) )
# f_measure = 2*tp / ((tp+fp) * (tp+fn))

In [159]:
f_measure

nan

In [160]:
precision

0.0

In [161]:
recall

0.0

In [164]:
accuracy = (tp+tn)/(tp+tn+fp+fn)

In [165]:
accuracy

0.9813664596273292