# Import Libraries

In [1]:
# Import Libraries
import os
import pickle
import warnings
import pandas as pd
import numpy as np
import ast
import re
import string
import preprocessor as p
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

from gensim.models.fasttext import FastText


# Session-wide switches: turn off pandas' chained-assignment check and
# suppress every warning raised from here on.
pd.options.mode.chained_assignment = None
warnings.filterwarnings("ignore")

# NLTK resources used below: the stopword corpus and the 'punkt' tokenizer data.
for nltk_resource in ('stopwords', 'punkt'):
    nltk.download(nltk_resource)

# Sastrawi stemmer instance and the Indonesian stopword set used by preprocessing().
factory = StemmerFactory()
stemmer = factory.create_stemmer()
listStopword = set(stopwords.words('indonesian'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jpawitro/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jpawitro/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Configs

In [2]:
# Sibling directories of the notebook folder: input data and saved model artifacts.
data, model = (os.path.join("..", folder) for folder in ("data", "model"))

In [3]:
# Text emoticons grouped as "happy".
emoticons_happy = {
    ':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
    ':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
    '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
    'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
    '<3',
}

# Text emoticons grouped as "sad".
emoticons_sad = {
    ':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
    ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
    ':c', ':{', '>:\\', ';(',
}

# Code-point ranges matched by emoji_pattern. The last range is very wide
# (U+24C2 through U+1F251), so it matches far more than pictographs.
_EMOJI_RANGES = (
    "\U0001F600-\U0001F64F",
    "\U0001F300-\U0001F5FF",
    "\U0001F680-\U0001F6FF",
    "\U0001F1E0-\U0001F1FF",
    "\U00002702-\U000027B0",
    "\U000024C2-\U0001F251",
)
# One character class; '+' makes a run of consecutive matches a single match.
emoji_pattern = re.compile("[" + "".join(_EMOJI_RANGES) + "]+", flags=re.UNICODE)

# Every known emoticon, regardless of group.
emoticons = emoticons_happy | emoticons_sad

# Load the slang lookup used by preprocessing(). The file content is parsed
# as a Python literal; preprocessing() uses the result as a mapping from a
# token to its replacement.
# The context manager closes the handle as soon as the text has been read
# (the previous bare open() left it open for the life of the kernel).
# NOTE(review): no encoding is passed, so the platform default is used —
# confirm the file's encoding if this notebook is run on another OS.
with open(os.path.join("..","data","cleaning_source","update_combined_slang_words.txt"), "r") as file_slang:
    content = file_slang.read()

# literal_eval only accepts Python literals, so the file cannot execute code.
slang_words = ast.literal_eval(content)

# Load Model

In [4]:
# SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file.
# Only load .sav artifacts that were produced by a trusted training run.
# Each file is opened in a context manager so the handle is closed right
# after unpickling instead of being leaked.
with open(os.path.join(model,"randomforest.sav"),"rb") as clf_file:
    clf = pickle.load(clf_file)  # estimator; its .predict() is called on the feature frame below
with open(os.path.join(model,"le.sav"),"rb") as le_file:
    le = pickle.load(le_file)  # its .inverse_transform() decodes the predictions below

# The index of dfeat is used later to pick columns out of the flattened
# embedding vectors. Presumably these are the features kept at training
# time — verify against the training notebook.
dfeat = pd.read_json(os.path.join(model,"feature.json")).sort_index()

# FastText model; its .wv lookup is passed to Sequencer as the embedding source.
ftmod = FastText.load(os.path.join(model,"fasttext.bin"))

# Read Data

In [5]:
# Comments to classify, one per row.
# NOTE(review): sequences such as "â€¦" in the Comment column look like UTF-8
# text decoded as cp1252 — confirm the file's real encoding before relying
# on the raw text for anything other than this pipeline.
inference_csv = os.path.join(data, "Data_inferences.csv")
df = pd.read_csv(inference_csv, sep=",", encoding='cp1252')
df

Unnamed: 0,User,Comment
0,fida310,MaasyaaAllah si kaseep
1,nrxzra_,"Buat yang komen jahat, semoga balik ke diri se..."
2,xxyouria_,"kok orang2 jahat banget ya ketikannya, ga nger..."
3,astriivo08,Kok lama lama mirip Tukul ya
4,taldv_,"Ksini krn Twitter, I'm not a leslar fans, tapi..."
5,somedayssi17,Ih warga ig serem serem trnyata komenannya smg...
6,mirnha10_,Ku kira monyet ternayata dajal
7,vn_rajjjj,Muka lesty kayak hantu di film IT
8,murni_sulistiani26,Kok tambah punya anak tambah jelek sihâ€¦â€¦.
9,fanykandow,Mirip daus mini anaknya


# Class & Functions

In [6]:
def preprocessing(data, listtoberemoved = []):
    """Clean, tokenize, filter and stem a collection of raw comment strings.

    Pipeline, applied to every element of ``data``:

    1. ``p.clean`` followed by lower-casing and a fixed chain of regex
       substitutions that leaves only ASCII letters separated by spaces.
    2. ``word_tokenize`` followed by slang replacement through ``slang_words``.
    3. Removal of tokens found in ``listtoberemoved``, ``listStopword``,
       ``emoticons`` or ``string.punctuation``.
    4. Stemming of the re-joined tokens with ``stemmer`` and splitting the
       result on single spaces.

    Parameters
    ----------
    data : iterable of str
        Raw texts to process.
    listtoberemoved : container of str, optional
        Extra tokens to drop in step 3. The default list is only read,
        never mutated.

    Returns
    -------
    list of list of str
        One token list per input text, in input order. Because the stemmed
        string is split with ``split(' ')``, an empty stemmed string yields
        ``['']`` rather than ``[]``.
    """
    cleaned = []
    for n in data:
        n = p.clean(n)
        n = n.lower()
        n = re.sub(r':', '', n)
        n = re.sub(r'‚Ä¶', '', n)
        # Any run of non-ASCII characters becomes a single space.
        n = re.sub(r'[^\x00-\x7F]+',' ', n)
        n = emoji_pattern.sub(r'', n)
        # Everything except ASCII letters becomes a space.
        n = re.sub('[^a-zA-Z]', ' ', n)
        # NOTE(review): after the two substitutions above the text holds only
        # letters and spaces, so the '&lt;…&gt;', 'â', '€' and '¦' patterns
        # below cannot match. They are kept unchanged so the cleaning chain
        # stays identical to what the saved model was presumably trained on.
        n = re.sub("&lt;/?.*?&gt;","&lt;&gt;",n)
        # Collapses each run of whitespace into one space.
        n = re.sub("(\\d|\\W)+"," ",n)
        n = re.sub(r'â', '', n)
        n = re.sub(r'€', '', n)
        n = re.sub(r'¦', '', n)
        cleaned.append(n)

    tokenized = []
    for n in cleaned:
        # Replace each token by position. The earlier version rewrote
        # n[n.index(w)] while iterating over n, which hit the wrong slot
        # whenever a replacement was equal to a slang token appearing later
        # in the same text.
        tokenized.append([slang_words.get(w, w) for w in word_tokenize(n)])

    removed = []
    for ts in tokenized:
        n = []
        for t in ts:
            if t not in listtoberemoved and t not in listStopword and t not in emoticons and t not in string.punctuation:
                n.append(t)
        removed.append(n)

    stemmed = []
    for n in removed:
        # The stemmer works on a whole string, so join, stem, then split again.
        n = ' '.join(n)
        n = stemmer.stem(n)
        n = n.split(' ')
        stemmed.append(n)
    return stemmed

In [7]:
class Sequencer():
    """Convert whitespace-separated text into a fixed-length, flattened embedding vector.

    On construction the word frequencies of ``all_words`` are counted and the
    ``max_words`` most frequent words are stored in ``self.vocab``.
    ``textToVector`` itself only uses ``seq_len`` and the embedding lookup.

    Attributes
    ----------
    seq_len : int
        Number of embedding slots in every output vector.
    embed_matrix : mapping-like
        Object indexed as ``embed_matrix[token]`` to obtain a token vector.
    word_cnts : dict
        Word -> number of occurrences in ``all_words``.
    vocab : list
        Words ordered by descending frequency, truncated to ``max_words``.
    """

    def __init__(self,
                 all_words,
                 max_words,
                 seq_len,
                 embedding_matrix
                ):
        """Store the configuration and build the frequency-ranked vocabulary.

        Parameters
        ----------
        all_words : iterable of str
            Every token of the corpus, repeats included. May be empty.
        max_words : int
            Upper bound on the size of ``self.vocab``.
        seq_len : int
            Number of embedding slots produced by ``textToVector``.
        embedding_matrix : mapping-like
            Token -> vector lookup (e.g. the ``wv`` attribute of a FastText model).
        """
        self.seq_len = seq_len
        self.embed_matrix = embedding_matrix

        # Single-pass frequency count. This replaces a per-word rescan of
        # all_words and also works for an empty corpus, where the old loop
        # never bound the variables it used afterwards.
        self.word_cnts = {}
        for word in all_words:
            self.word_cnts[word] = self.word_cnts.get(word, 0) + 1

        # Descending by frequency. sorted() is stable, so words with equal
        # counts keep their first-seen order.
        ranked = sorted(self.word_cnts, key=self.word_cnts.get, reverse=True)
        self.vocab = ranked[:max_words]

    def textToVector(self,text):
        """Embed ``text`` into a 1-D array of ``seq_len * dim`` values.

        Tokens are obtained with ``text.split()``. Tokens the embedding lookup
        rejects with ``KeyError`` are skipped, and the remaining slots are
        filled with zero vectors. ``dim`` is the lookup's ``vector_size`` when
        it has one, otherwise 100.

        Parameters
        ----------
        text : str
            Whitespace-separated tokens.

        Returns
        -------
        numpy.ndarray
            Flattened concatenation of ``seq_len`` vectors.
        """
        tokens = text.split()
        # NOTE(review): this keeps at most seq_len-1 tokens and, for shorter
        # texts, drops the LAST token (a one-word text embeds nothing). It looks
        # like an off-by-one, but it is preserved on purpose: the saved
        # classifier was presumably trained on vectors built the same way —
        # confirm against the training notebook before changing it.
        len_v = len(tokens)-1 if len(tokens) < self.seq_len else self.seq_len-1
        vec = []
        for tok in tokens[:len_v]:
            try:
                vec.append(self.embed_matrix[tok])
            except KeyError:
                # Token unknown to the embedding: skip it; padding fills the gap.
                continue

        # Pad with zero vectors of the embedding's own width so every output
        # has the same length.
        dim = getattr(self.embed_matrix, "vector_size", 100)
        vec.extend(np.zeros(dim) for _ in range(self.seq_len - len(vec)))

        return np.asarray(vec).flatten()

# Predict

In [8]:
# Token lists for every comment in the frame.
prep = preprocessing(df['Comment'].tolist())

# Flat token stream used to build the Sequencer's frequency vocabulary.
all_tokens = [token for seq in prep for token in seq]
sequencer = Sequencer(
    all_words = all_tokens,
    max_words = 1200,
    seq_len = 15,
    embedding_matrix = ftmod.wv,
)

# One flattened embedding vector per comment, restricted to the columns
# listed in dfeat's index.
x_vecs = np.asarray([sequencer.textToVector(" ".join(seq)) for seq in prep])
X = pd.DataFrame(x_vecs[:, dfeat.index])
pred = clf.predict(X)

# Results

In [9]:
# Attach the decoded labels by position. Passing the array directly avoids
# the index-label alignment that happens when the labels are wrapped in a
# DataFrame, which would yield NaN for any row whose index is not 0..n-1.
df.assign(Class = le.inverse_transform(pred))

Unnamed: 0,User,Comment,Class
0,fida310,MaasyaaAllah si kaseep,Non-bully
1,nrxzra_,"Buat yang komen jahat, semoga balik ke diri se...",Non-bully
2,xxyouria_,"kok orang2 jahat banget ya ketikannya, ga nger...",Non-bully
3,astriivo08,Kok lama lama mirip Tukul ya,Bully
4,taldv_,"Ksini krn Twitter, I'm not a leslar fans, tapi...",Non-bully
5,somedayssi17,Ih warga ig serem serem trnyata komenannya smg...,Bully
6,mirnha10_,Ku kira monyet ternayata dajal,Bully
7,vn_rajjjj,Muka lesty kayak hantu di film IT,Bully
8,murni_sulistiani26,Kok tambah punya anak tambah jelek sihâ€¦â€¦.,Bully
9,fanykandow,Mirip daus mini anaknya,Bully


In [13]:
def detect(text):
    """Classify one raw comment string and return its decoded label.

    The text goes through the same steps as the batch prediction: it is
    preprocessed, embedded with a Sequencer over ``ftmod.wv``, restricted to
    the columns in ``dfeat.index``, passed to ``clf.predict``, and the
    prediction is decoded with ``le.inverse_transform``.
    """
    token_lists = preprocessing([text])
    vocabulary = [tok for toks in token_lists for tok in toks]
    seq_builder = Sequencer(
        all_words = vocabulary,
        max_words = 1200,
        seq_len = 15,
        embedding_matrix = ftmod.wv,
    )

    vectors = [seq_builder.textToVector(" ".join(toks)) for toks in token_lists]
    features = pd.DataFrame(np.asarray(vectors)[:, dfeat.index])
    labels = le.inverse_transform(clf.predict(features))
    return labels[0]
    

In [14]:
# Try the single-comment helper on one hand-written example.
detect("jelek amat muka lu")

'Bully'