# Sentiment Analysis ID
## Import Global Library

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re, string

## Import Dataset
### Meta Data
- **Nama Instagram** - Comment owner
- **Komentar** - Comment from comment owner, this will be feature predictor
- **Kategori** - Label comment for data training
- **Nama Akun IG Artis** - Commented person

In [6]:
df = pd.read_excel('Instagram Cyber Bullying.xlsx', sheet_name='Sheet1')
df.head(10)

Unnamed: 0,No.,Nama Instagram,Komentar,Kategori,Tanggal Posting,Nama Akun IG Artis/Selebgram,Unnamed: 6,Unnamed: 7
0,1,@delliananda,"""Kaka tidur yaa, udah pagi, gaboleh capek2""",Non-bullying,14 Oktober 2019,@isyanasarasvati,,
1,2,@fenninbl,"""makan nasi padang aja begini badannya""",Non-bullying,14 Oktober 2019,@isyanasarasvati,,
2,3,@abdurahmanshq,"""yang aku suka dari dia adalah selalu cukur je...",Bullying,14 Oktober 2019,@isyanasarasvati,,
3,4,@najla.yoo,"""Hai kak Isyana aku ngefans banget sama kak Is...",Non-bullying,14 Oktober 2019,@isyanasarasvati,,
4,5,@dessy_______,"""Manusia apa bidadari sih herann deh cantik te...",Non-bullying,14 Oktober 2019,@isyanasarasvati,,
5,6,@e.fril,"""@ayu.kinantii isyan skrg berubah ya:( baju ny...",Bullying,14 Oktober 2019,@isyanasarasvati,,
6,7,@bahasa.bayi.planet,"""Gemesnya isyan kayak tango, berlapis lapis ci...",Non-bullying,16 September 2019,@isyanasarasvati,,
7,8,@khanayarudinita,"""Makin jelek aja anaknya, padahal ibu ayahnya ...",Bullying,22 Juni 2019,@tasyakamila,,
8,9,@reniaulia225,"""Kok anaknya kayak udah tua gitu ya mukanya kk...",Bullying,22 Juni 2019,@tasyakamila,,
9,10,@nurjanah.hani,"""Muka anak nya ko tua banget yaa.. GK ngegemes...",Bullying,22 Juni 2019,@tasyakamila,,


In [7]:
# Drop unused feature
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
df.drop(['No.', 'Tanggal Posting'], axis=1, inplace=True)

# Show info dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 650 entries, 0 to 649
Data columns (total 4 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   Nama Instagram                 650 non-null    object
 1   Komentar                       650 non-null    object
 2   Kategori                       650 non-null    object
 3   Nama Akun IG  Artis/Selebgram  650 non-null    object
dtypes: object(4)
memory usage: 20.4+ KB


In [8]:
# Show freq of Categorical feature
df.describe()

Unnamed: 0,Nama Instagram,Komentar,Kategori,Nama Akun IG Artis/Selebgram
count,650,650,650,650
unique,631,650,2,19
top,@rubybee_16,"""Kaka tidur yaa, udah pagi, gaboleh capek2""",Non-bullying,@eeericko
freq,5,1,325,266


Non-bullying and Bullying have same freq 50:50

In [9]:
df

Unnamed: 0,Nama Instagram,Komentar,Kategori,Nama Akun IG Artis/Selebgram
0,@delliananda,"""Kaka tidur yaa, udah pagi, gaboleh capek2""",Non-bullying,@isyanasarasvati
1,@fenninbl,"""makan nasi padang aja begini badannya""",Non-bullying,@isyanasarasvati
2,@abdurahmanshq,"""yang aku suka dari dia adalah selalu cukur je...",Bullying,@isyanasarasvati
3,@najla.yoo,"""Hai kak Isyana aku ngefans banget sama kak Is...",Non-bullying,@isyanasarasvati
4,@dessy_______,"""Manusia apa bidadari sih herann deh cantik te...",Non-bullying,@isyanasarasvati
...,...,...,...,...
645,@_sigesrek,"""aku memutuskan untuk menjadi fans isyana. gil...",Non-bullying,@isyanasarasvati
646,@safronlux.id,"""AMZING ISYANAA!! Jujur aku amazed banget deng...",Non-bullying,@isyanasarasvati
647,@rikzikmuktyana,"""paling ngiri liat orang keren maen alat musik...",Non-bullying,@isyanasarasvati
648,@antoniusbennys,"""Sampe ga bisa berkata2 lagi buat isyana, sang...",Non-bullying,@isyanasarasvati


## Text Pre-processing
### Import Library and Function

In [10]:
import nltk

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.Dictionary.ArrayDictionary import ArrayDictionary
from Sastrawi.StopWordRemover.StopWordRemover import StopWordRemover
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

class DataPreprocessing:
    def __init__(self, dataframe):
        self.dataframe = dataframe

    # THIS FUNCTION IS FOR MANIPULATING VALUE OF FEATURE
    def binarizer(self, feature, positive, new = None):
        def toBinary(text, positive):
            if text == positive:
                text = 1
            else:
                text = 0
            return text

        sentence = []
        for index, row in self.dataframe.iterrows():
            sentence.append(toBinary(row[feature], positive))

        if new == None:
            self.dataframe[feature] = sentence
        else:
            self.dataframe[new] = sentence

    # THIS FUNCTION FOR PREPROCESSING TEXT IN DATAFRAME
    def remove_signs(self, feature, new = None):
        def delSign(text):
            # Remove number in string
            text = re.sub(r'[0-9]+', '', text)
            # Remove tab, new line, double space and back slice
            text = text.replace('\\t'," ").replace('\\n'," ").replace('\\u'," ").replace('\\',"").replace('\s+', " ")
            # Remove non ASCII (emoticon, chinese word, .etc)
            text = text.encode('ascii', 'replace').decode('ascii')
            # Remove mention, link, hashtag
            text = ' '.join(re.sub("([@#][A-Za-z0-9]+)|(\w+:\/\/\S+)"," ", text).split())
            # Remove incomplete URL
            text = text.replace("http://", " ").replace("https://", " ")
            # Remove doublespace and doubletick
            return text.replace('"', "").replace("'", "").replace("  ", " ")

        sentence = []
        for index, row in self.dataframe.iterrows():
            sentence.append(delSign(row[feature]))
        if new == None:
            self.dataframe[feature] = sentence
        else:
            self.dataframe[new] = sentence
    
    def remove_stopwords(self, feature, new = None):
        factory = StopWordRemoverFactory()

        # You can custom stopwords list below, we will use stopword custom for remove stopword
        stopword_custom =["yg", "dg", "rt", "dgn", "ny", "d", 'klo', 'kalo', 'amp', 'biar', 'bikin', 'bilang', 'gak', 'ga', 'krn', 'nya', 'nih', 'sih', 'si', 'tau', 'tdk', 'tuh', 'utk', 'ya', 'jd', 'jgn', 'sdh', 'aja', 'nyg', 'hehe', 'pen', 'nan', 'loh','&amp', 'yah']
        stopword_extend = pd.read_csv("stopwordsID.csv")
        stopword_custom.extend(stopword_extend)

        # Add custom stopword to sastrawi and convert to dictionary
        stopword_sastrawi = factory.get_stop_words()+stopword_custom
        dictionary = ArrayDictionary(stopword_sastrawi)

        # Create StopWordRemover Function and add custom stopwords list
        stopword = StopWordRemover(dictionary)

        sentence = []
        for index, row in self.dataframe.iterrows():
            sentence.append(stopword.remove(row[feature]))

        if new == None:
            self.dataframe[feature] = sentence
        else:
            self.dataframe[new] = sentence

    def text_stemming(self, feature, new = None):
        factory = StemmerFactory()
        stemmer = factory.create_stemmer()

        def stemming(text):
            text = stemmer.stem(text)
            return text
        
        sentence = []
        for index, row in self.dataframe.iterrows():
            sentence.append(stemming(row[feature]))

        if new == None:
            self.dataframe[feature] = sentence
        else:
            self.dataframe[new] = sentence
    
    # Lazy Preprocessing
    def text_preprocessing(self, feature, new = None):
        self.remove_signs(feature, new)
        self.remove_stopwords(new, new)
        self.text_stemming(new, new)


In [11]:
# Init
comment_preprocessing = DataPreprocessing(df)

# Change value of df['Kategori'] to binary
comment_preprocessing.binarizer('Kategori', 'Non-bullying', 'Kategori Binary')

# Remove sign or punctuation in df['Komentar']
comment_preprocessing.text_preprocessing('Komentar', 'Clean Komentar')

In [12]:
df

Unnamed: 0,Nama Instagram,Komentar,Kategori,Nama Akun IG Artis/Selebgram,Kategori Binary,Clean Komentar
0,@delliananda,"""Kaka tidur yaa, udah pagi, gaboleh capek2""",Non-bullying,@isyanasarasvati,1,kaka tidur yaa udah pagi gaboleh capek
1,@fenninbl,"""makan nasi padang aja begini badannya""",Non-bullying,@isyanasarasvati,1,makan nasi padang badan
2,@abdurahmanshq,"""yang aku suka dari dia adalah selalu cukur je...",Bullying,@isyanasarasvati,0,suka cukur jembut manggung
3,@najla.yoo,"""Hai kak Isyana aku ngefans banget sama kak Is...",Non-bullying,@isyanasarasvati,1,hai kak isyana ngefans banget kak isyana aku s...
4,@dessy_______,"""Manusia apa bidadari sih herann deh cantik te...",Non-bullying,@isyanasarasvati,1,manusia bidadari herann deh cantik
...,...,...,...,...,...,...
645,@_sigesrek,"""aku memutuskan untuk menjadi fans isyana. gil...",Non-bullying,@isyanasarasvati,1,putus fans isyana gila keren banget ya otak du...
646,@safronlux.id,"""AMZING ISYANAA!! Jujur aku amazed banget deng...",Non-bullying,@isyanasarasvati,1,amzing isyanaa jujur amazed banget skill minat...
647,@rikzikmuktyana,"""paling ngiri liat orang keren maen alat musik...",Non-bullying,@isyanasarasvati,1,ngiri liat keren maen alat musik dalem banget ...
648,@antoniusbennys,"""Sampe ga bisa berkata2 lagi buat isyana, sang...",Non-bullying,@isyanasarasvati,1,sampe isyana sempurna org


## Data Training
### Import Library and Function

In [13]:
from sklearn.model_selection import train_test_split

### Split data train an data test

In [14]:
X_train, X_test, y_train, y_test = train_test_split(df['Clean Komentar'], df['Kategori Binary'], shuffle=True, test_size=0.25, stratify=df['Kategori Binary'], random_state=30)

### Count Vectorize and TF-IDF

In [15]:
# Count Vectorize
count_vector = CountVectorizer(max_features=10000)
count_vector.fit_transform(X_train)

# TF-IDF
tfidf_vector = TfidfVectorizer(max_features=10000)
tfidf_vector.fit_transform(X_train)

# I will choose TF-IDF for training and testing
X_train = tfidf_vector.transform(X_train)
X_test = tfidf_vector.transform(X_test)

## Modelling
### Import Library and Function

In [16]:
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc

# Init for SVM
svm = svm.SVC(kernel="linear")

def eval_classification(model, pred, xtrain, ytrain, xtest, ytest):
    # Init List
    fpr, tpr, thresholds = roc_curve (ytest, pred, pos_label=1) # pos_label is positive label for metric, default is 1
    acc = "%.4f" % accuracy_score(ytest, pred)
    prc = "%.4f" % precision_score(ytest, pred)
    rec = "%.4f" % recall_score(ytest, pred)
    f1s = "%.4f" % f1_score(ytest, pred)
    aucs = "%.4f" % auc(fpr, tpr)
    data = [['Accuracy', acc], ['Precision', prc], ['Recall', rec], ['F1-Score', f1s], ['AUC', aucs]]

    # Create DataFrame
    summary = pd.DataFrame(data, columns=['Metric', 'Value'])

    return summary

### Logistic Regression

In [17]:
# Init
logreg = LogisticRegression()

imp_logreg = logreg.fit(X_train, y_train)
logreg_pred = logreg.predict(X_test)

summary = eval_classification(imp_logreg, logreg_pred, X_train, y_train, X_test, y_test)
summary

Unnamed: 0,Metric,Value
0,Accuracy,0.8282
1,Precision,0.8462
2,Recall,0.8049
3,F1-Score,0.825
4,AUC,0.8284


### KNN

In [18]:
k=10
knn = KNeighborsClassifier(n_neighbors=k)

imp_knn = knn.fit(X_train, y_train)
knn_pred = imp_knn.predict(X_test)

summary = eval_classification(imp_knn, knn_pred, X_train, y_train, X_test, y_test)
summary

Unnamed: 0,Metric,Value
0,Accuracy,0.7853
1,Precision,0.8615
2,Recall,0.6829
3,F1-Score,0.7619
4,AUC,0.7859


### SVM

In [19]:
imp_svm = svm.fit(X_train, y_train)
svm_pred = imp_svm.predict(X_test)

summary = eval_classification(imp_svm, svm_pred, X_train, y_train, X_test, y_test)
summary

Unnamed: 0,Metric,Value
0,Accuracy,0.816
1,Precision,0.8171
2,Recall,0.8171
3,F1-Score,0.8171
4,AUC,0.8159


## Data Testing
In here you can input whatever you want, machine will predicting are you bullying or non-bullying
### Import Library and Function

In [23]:
class TextPreprocessing:
    def __init__(self, text, vector):
        self.text = text
        self.vector = vector

    # THIS FUNCTION FOR PREPROCESSING TEXT IN DATAFRAME
    def remove_signs(self, text):
        # Remove number in string
        text = re.sub(r'[0-9]+', '', text)
        # Remove tab, new line, double space and back slice
        text = text.replace('\\t'," ").replace('\\n'," ").replace('\\u'," ").replace('\\',"").replace('\s+', " ")
        # Remove non ASCII (emoticon, chinese word, .etc)
        text = text.encode('ascii', 'replace').decode('ascii')
        # Remove mention, link, hashtag
        text = ' '.join(re.sub("([@#][A-Za-z0-9]+)|(\w+:\/\/\S+)"," ", text).split())
        # Remove incomplete URL
        text = text.replace("http://", " ").replace("https://", " ")
        # Remove doublespace and doubletick
        return text.replace('"', "").replace("'", "").replace("  ", " ")
    
    def remove_stopwords(self, text):
        factory = StopWordRemoverFactory()

        # You can custom stopwords list below, we will use stopword custom for remove stopword
        stopword_custom =["yg", "dg", "rt", "dgn", "ny", "d", 'klo', 'kalo', 'amp', 'biar', 'bikin', 'bilang', 'gak', 'ga', 'krn', 'nya', 'nih', 'sih', 'si', 'tau', 'tdk', 'tuh', 'utk', 'ya', 'jd', 'jgn', 'sdh', 'aja', 'nyg', 'hehe', 'pen', 'nan', 'loh','&amp', 'yah']
        stopword_extend = pd.read_csv("stopwordsID.csv")
        stopword_custom.extend(stopword_extend)

        # Add custom stopword to sastrawi and convert to dictionary
        stopword_sastrawi = factory.get_stop_words()+stopword_custom
        dictionary = ArrayDictionary(stopword_sastrawi)

        # Create StopWordRemover Function and add custom stopwords list
        stopword = StopWordRemover(dictionary)

        text = stopword.remove(text)
        return text

    def text_stemming(self, text):
        factory = StemmerFactory()
        stemmer = factory.create_stemmer()

        text = stemmer.stem(text)
        return text

    def vectorizer(self, text, vector = tfidf_vector) :
        output = vector.transform([text])
        return output

    # Lazy Preprocessing
    def text_preprocessing(self):
        proc_sign = self.remove_signs(self.text)
        proc_stop = self.remove_stopwords(proc_sign)
        proc_stem = self.text_stemming(proc_stop)
        proc_vect = self.vectorizer(proc_stem, self.vector)
        return proc_vect

### Testing Logistic Regression

In [24]:
komentar = input("Comment: ")

# Init Preprocessing
process = TextPreprocessing(komentar, tfidf_vector)
text = process.text_preprocessing()

# Predicting Text
result_logreg = imp_logreg.predict(text)
if result_logreg[0] == 0:
    print('Bullying')
else:
    print('Non-Bullying')

Bullying


### Testing KNN

In [25]:
komentar = input("Comment: ")

# Init Preprocessing
process = TextPreprocessing(komentar, tfidf_vector)
text = process.text_preprocessing()

# Predicting Text
result_knn = imp_knn.predict(text)
if result_knn[0] == 0:
    print('Bullying')
else:
    print('Non-Bullying')

Non-Bullying


### Testing SVM

In [26]:
komentar = input("Comment: ")

# Init Preprocessing
process = TextPreprocessing(komentar, tfidf_vector)
text = process.text_preprocessing()

# Predicting Text
result_svm = imp_svm.predict(text)
if result_svm[0] == 0:
    print('Bullying')
else:
    print('Non-Bullying')

Bullying


Sometime predict can be false because machine just learn minidataset. I will try to add some data next time. Happy learning! :D