In [134]:
import numpy as np
import pandas as pd
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from googletrans import Translator, LANGUAGES
import re
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score
from googletrans import Translator, LANGUAGES
np.seterr(divide = 'ignore')

{'divide': 'ignore', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

In [135]:
class preprocessing:
    def __init__(self):
        def normalisasi(norm):
            dt = {}
            for i in range(len(norm)):
                dt[norm.loc[i,"kata typo"]] = norm.loc[i,"kata normal"]
            return dt
        self.__kamusNorm = normalisasi(pd.read_csv("./data/components/word-normalization.csv", delimiter=";"))
        self.__stopword = list(pd.read_csv("./data/components/stopword.csv",header=None).loc[:,0])

    def token(self,text:list):
        output = []
        for i in range(len(text)):
            output.append(text[i].split(" "))
        return output
    
    def stem(self,text: list):
        factory = StemmerFactory()
        stemmer = factory.create_stemmer()

        output = []
        # stemming process
        for i in range(len(text)):
            sentence = text[i]
            is_list = False
            if isinstance(sentence, list):
                sentence = " ".join(sentence)
                is_list = True
            temp   = stemmer.stem(sentence)
            if is_list:
                temp = temp.split(" ")
            output.append(temp)
        return output

    def filtering(self, text: list):
        output = []
        for i in range(len(text)):
            temp = []
            for j in range(len(text[i])):
                if text[i][j] not in self.__stopword:
                    temp.append(text[i][j])
            output.append(temp)
        return output
    
    def removeLink(self, data:list):
        regex = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
        for i in range(len(data)):
            data[i] = re.sub(regex, " ", data[i])
        return data
    
    def normalization(self,data):
        length_data = np.array(data).shape
        for i in range(length_data[0]):
            is_str= isinstance(data[i], str)
            temp = data[i]
            if is_str:
                temp = data[i].split(" ")
                
            for j in range(len(temp)):
                temp[j] = self.__kamusNorm.get(temp[j],temp[j])
            if is_str:
                temp = " ".join(temp)
            data[i] = temp
        return data
    
    def prep_data(self,data):
        data = self.removeLink(data)
        token_data = self.token(data)
        filter_data = self.filtering(token_data)
        stem_data = self.stem(filter_data)
        return stem_data
    
    def getTerm(self, text:list):
        output=[]
        for i in range(len(text)):
            for j in range(len(text[i])):
                if text[i][j] not in output:
                    output.append(text[i][j])
        return output
    
    def translate(self,text:list):
        trans = Translator(service_urls=['translate.google.com',"translate.google.co.id"])
        output = []
        for i in range(len(text)):
            sentence = text[i]
            is_arr = isinstance(sentence, list)
            if is_arr:
                sentence = " ".join(sentence)
            temp = trans.translate(sentence, dest='id').text
            if is_arr:
                temp = temp.split(" ")
            output.append(temp)
        return output

In [136]:
class extraction:
    def __init__(self):
        self.__data = []
        
    def getTF(self,text:list, term:list, tfWeight=True):
        output = []
        if not (isinstance(text, list) or isinstance(text, np.array)):
            raise " "
        for i in range(len(text)):
            doc = text[i]
            if isinstance(text[i], list):
                doc = " ".join(doc)
            tf_doc = []
            for j in range(len(term)):
                temp = doc.count(term[j])
                tf_doc.append(temp)
            output.append(tf_doc)
        output = np.array(output)
        #w_tf=1+log(tf) tf>0, tf=0: 0
        if tfWeight:
            output = np.where(output>0,1+np.log10(output),0)
        return output

    def getIDF(self,tf_doc:list):
        # makan | minum | tidur
        # 1     | 0     | 1      
        # 0     | 0     | 0
        countNZero = np.count_nonzero(tf_doc, axis=0)
        n_doc = tf_doc.shape[0]
        return np.where(countNZero != 0 ,np.log10(n_doc/countNZero),0)

    def __normalized_vector(self,tfidf, train=False):
        def get_length(tfidf):
            pow_tfidf = np.power(tfidf,2)
            sum_tfidf = np.sum(pow_tfidf, axis=1)
            return np.sqrt(sum_tfidf)
        length_tfidf = get_length(tfidf)
        norm = tfidf.T/np.where(length_tfidf==0,1,length_tfidf)
        where_are_NaNs = np.isnan(norm)
        norm[where_are_NaNs] = 0
        return norm.T
    
    def getTFIDF(self,tf, idf):
        tfidf = tf*idf
        return self.__normalized_vector(tfidf,True)

class knn:
    def __init__(self):
        self.__data = []
        self.__target = []
    
    def fit(self, x:list,y:list):
        if x.ndim!=2:
            raise "Error data Train is not 2dimention"
        self.__data = x
        self.__target = y
    
    def __cossim(self,tfidf_train, tfidf_test):
        if tfidf_test.ndim == 2:
            res = []
            for i in range(tfidf_test.shape[0]):
                res.append(np.dot(tfidf_train, tfidf_test[i]))
            return np.array(res)
        return np.array([np.dot(tfidf_train, tfidf_test)])
    
    def predict(self, x_norm:list, k:int=3, norm=True):
        if len(self.__data)==0:
            raise "Error please input data train"
        result = self.__cossim(self.__data, x_norm)
        target = []
        for i in range(len(result)):
            hasil = sorted(zip(result[i],self.__target), key=lambda pair: pair[0], reverse=True)[:k]
            hsl = dict(Counter([j for i,j in hasil]))
            if hsl.get(1,0) > hsl.get(-1,0):
                target.append(1)
            elif hsl.get(1,0) < hsl.get(-1,0):
                target.append(-1)
            elif hsl.get("1",0) > hsl.get("-1",0):
                target.append(1)
            elif hsl.get("1",0) < hsl.get("-1",0):
                target.append(-1)
            else:
                target.append(0)
        return target

In [137]:
prep = preprocessing()

In [138]:
data_read = pd.read_csv("./data/tests/posindonesia_kualitas.csv", delimiter=";")

In [139]:
#kecepatan_tiki = kecepatan_data.loc[kecepatan_data.loc[:,"jasa"] =="tiki",]
#kecepatan_jne = kecepatan_data.loc[kecepatan_data.loc[:,"jasa"] =="jne",]
data = data_read

In [144]:
result_prep = prep.prep_data(list(data.tweet))
term = prep.getTerm(result_prep)

TypeError: expected string or bytes-like object

In [121]:
ex = extraction()
tf = ex.getTF(result_prep, term)

In [122]:
idf = ex.getIDF(tf)

In [123]:
tf_idf = ex.getTFIDF(tf,idf)
tf_idf

array([[0.21981901, 0.09820078, 0.11805237, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.51952328],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [124]:
X_train, X_test, y_train, y_test = train_test_split(tf_idf, data.loc[:,["jasa","sentimen"]], test_size=0.30, random_state=42)

In [125]:
knn_model = knn()
knn_model.fit(X_train, y_train.sentimen)

In [126]:
y_pred = knn_model.predict(X_test,k=3)

In [127]:
conf_matriks = confusion_matrix(y_test.sentimen, y_pred)
conf_matriks
# conf_matriks.shape
percentage_positive = conf_matriks[0][0]/(np.sum(conf_matriks))
if (conf_matriks.shape == (2, 2)):
    percentage_negative = conf_matriks[1][1]/(np.sum(conf_matriks))
else:
    percentage_negative = 0

In [128]:
percentage_positive, percentage_negative

(0.9555555555555556, 0.0)

In [129]:
precision_score(y_test.sentimen, y_pred) 

0.0

In [130]:
recall_score(y_test.sentimen, y_pred)

0.0

In [131]:
conf_matriks

array([[86,  0],
       [ 4,  0]])

In [132]:
y_test

Unnamed: 0,jasa,sentimen
281,PosIndonesia,-1
265,PosIndonesia,-1
164,PosIndonesia,-1
9,PosIndonesia,-1
77,PosIndonesia,-1
...,...,...
132,PosIndonesia,-1
72,PosIndonesia,-1
15,PosIndonesia,-1
10,PosIndonesia,-1


In [133]:
y_train

Unnamed: 0,jasa,sentimen
224,PosIndonesia,-1
68,PosIndonesia,-1
222,PosIndonesia,-1
37,PosIndonesia,-1
16,PosIndonesia,-1
...,...,...
188,PosIndonesia,-1
71,PosIndonesia,-1
106,PosIndonesia,-1
270,PosIndonesia,1
