# Text Classification Model

## Load Data

In [2]:
import pandas as pd

dataset = pd.read_csv('../data/dataset_tweet_sentimen_tayangan_tv.csv')

dataset

Unnamed: 0,Id,Sentiment,Acara TV,Text Tweet
0,1,positive,HitamPutihTransTV,"Undang @N_ShaniJKT48 ke hitamputih, pemenang S..."
1,2,positive,HitamPutihTransTV,Selamat berbuka puasa Semoga amal ibadah hari ...
2,3,positive,HitamPutihTransTV,"Ada nih di trans7 hitam putih, dia dpt penghar..."
3,4,positive,HitamPutihTransTV,selamat ya mas @adietaufan masuk hitamputih
4,5,positive,HitamPutihTransTV,Asiknya nonton Hitam Putih Trans7
...,...,...,...,...
395,396,negative,MataNajwaMetroTV,ini apa banget deh gw paling kesel klo orang2 ...
396,397,negative,MataNajwaMetroTV,Orang miskin semakin miskin klo sekolah melaku...
397,398,negative,MataNajwaMetroTV,"ga boLeh emosi, cepat tua, nonton #matanajwame..."
398,399,negative,MataNajwaMetroTV,dr penampilan saja kyk preman taunya bkin kisr...


In [3]:
dataset['Acara TV'].unique()

array(['HitamPutihTransTV', 'IndonesiaLawyersClubTvOne',
       'KickAndyMetroTV', 'MataNajwaMetroTV'], dtype=object)

In [5]:
dataset.groupby(['Acara TV', 'Sentiment'])['Id'].count()

Acara TV                   Sentiment
HitamPutihTransTV          negative     50
                           positive     50
IndonesiaLawyersClubTvOne  negative     50
                           positive     50
KickAndyMetroTV            negative     50
                           positive     50
MataNajwaMetroTV           negative     50
                           positive     50
Name: Id, dtype: int64

## Data Preparation & Pre-processing

In [6]:
dataset.isnull().sum()

Id            0
Sentiment     0
Acara TV      0
Text Tweet    0
dtype: int64

In [7]:
import preprocessor as p

clean_text = []

for text in dataset['Text Tweet']:
    clean_text.append(p.clean(text))

### Lowercasing

Mengubah semua huruf menjadi huruf kecil semua untuk mengurangi variansi data.

In [8]:
lower_text=[]

for text in clean_text:
    lower_text.append(text.lower())

In [9]:
import re

def remove_punct(text):
    clean_text = re.sub(r'[^\w\s]', '', text)
    return clean_text

In [10]:
no_punct_text=[]

for text in lower_text:
    no_punct_text.append(remove_punct(text))

In [12]:
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, ArrayDictionary, StopWordRemover

# membuat object untuk menghilankan stopwords
stop_factory = StopWordRemoverFactory()

# membuat objek stopwords
stopword = stop_factory.create_stop_word_remover()

# membuat list kosong untuk menyimpan hasil
no_stopwords_text = []

# membuit loop untuk menghilangkan stopwords
for text in no_punct_text:
    no_stopwords_text.append(stopword.remove(text))

# melihat hasil
no_stopwords_text[0]

'undang hitamputih pemenang ssk jkt48 harusnya mjkt48 lebih layak undang prestasinya'

In [13]:
# import library untuk mengembalikan ke dalam bentuk kata dasar
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# membuat fungsi untuk stemming
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# membuat list kosong untuk menyimpan hasil
stemmed_text = []

# melakukan looping untuk melakukan stemming pada setiap elemen
for text in no_stopwords_text:
    stemmed_text.append(stemmer.stem(text))

# melihat hasil stemming
stemmed_text[0]

'undang hitamputih menang ssk jkt48 harus mjkt48 lebih layak undang prestasi'

In [17]:
dataset['cleaned_text'] = stemmed_text

dataset.head()

Unnamed: 0,Id,Sentiment,Acara TV,Text Tweet,cleaned_text
0,1,positive,HitamPutihTransTV,"Undang @N_ShaniJKT48 ke hitamputih, pemenang S...",undang hitamputih menang ssk jkt48 harus mjkt4...
1,2,positive,HitamPutihTransTV,Selamat berbuka puasa Semoga amal ibadah hari ...,selamat buka puasa moga amal ibadah hari ni te...
2,3,positive,HitamPutihTransTV,"Ada nih di trans7 hitam putih, dia dpt penghar...",nih trans7 hitam putih dpt harga di norwegia
3,4,positive,HitamPutihTransTV,selamat ya mas @adietaufan masuk hitamputih,selamat mas masuk hitamputih
4,5,positive,HitamPutihTransTV,Asiknya nonton Hitam Putih Trans7,asiknya nonton hitam putih trans7


In [15]:
token=[]

for text in stemmed_text:
    token.extend(text.split())

In [16]:
token

['undang',
 'hitamputih',
 'menang',
 'ssk',
 'jkt48',
 'harus',
 'mjkt48',
 'lebih',
 'layak',
 'undang',
 'prestasi',
 'selamat',
 'buka',
 'puasa',
 'moga',
 'amal',
 'ibadah',
 'hari',
 'ni',
 'terima',
 'allah',
 'nih',
 'trans7',
 'hitam',
 'putih',
 'dpt',
 'harga',
 'di',
 'norwegia',
 'selamat',
 'mas',
 'masuk',
 'hitamputih',
 'asiknya',
 'nonton',
 'hitam',
 'putih',
 'trans7',
 'acara',
 'paling',
 'komplit',
 'tarik',
 'ada',
 'hitam',
 'putih',
 'hitam',
 'putih',
 't7',
 'inspiratif',
 'banget',
 'suka',
 'banget',
 'acara',
 'hitam',
 'putih',
 'keren',
 'lu',
 'bro',
 'tadi',
 'yg',
 'liat',
 'hitam',
 'putih',
 'trans7',
 'ga',
 'sanggu',
 'ganteng',
 'cinta',
 'ikat',
 'silaturahmi',
 'hati',
 'terima',
 'kasih',
 'pak',
 'mau',
 'bantu',
 'untuk',
 'sekolah',
 'adik',
 'moga',
 'lancar',
 'hitamputihtrans7',
 'trans7',
 'hitam',
 'putih',
 'baik',
 'acara',
 'hitam',
 'putih',
 'paling',
 'bagus',
 'buat',
 'lihat',
 'undang',
 'acara',
 'hitam',
 'putih',
 'yadia'

## Data Exploration

In [19]:
import itertools

all_words = list(itertools.chain(token))

all_words

['undang',
 'hitamputih',
 'menang',
 'ssk',
 'jkt48',
 'harus',
 'mjkt48',
 'lebih',
 'layak',
 'undang',
 'prestasi',
 'selamat',
 'buka',
 'puasa',
 'moga',
 'amal',
 'ibadah',
 'hari',
 'ni',
 'terima',
 'allah',
 'nih',
 'trans7',
 'hitam',
 'putih',
 'dpt',
 'harga',
 'di',
 'norwegia',
 'selamat',
 'mas',
 'masuk',
 'hitamputih',
 'asiknya',
 'nonton',
 'hitam',
 'putih',
 'trans7',
 'acara',
 'paling',
 'komplit',
 'tarik',
 'ada',
 'hitam',
 'putih',
 'hitam',
 'putih',
 't7',
 'inspiratif',
 'banget',
 'suka',
 'banget',
 'acara',
 'hitam',
 'putih',
 'keren',
 'lu',
 'bro',
 'tadi',
 'yg',
 'liat',
 'hitam',
 'putih',
 'trans7',
 'ga',
 'sanggu',
 'ganteng',
 'cinta',
 'ikat',
 'silaturahmi',
 'hati',
 'terima',
 'kasih',
 'pak',
 'mau',
 'bantu',
 'untuk',
 'sekolah',
 'adik',
 'moga',
 'lancar',
 'hitamputihtrans7',
 'trans7',
 'hitam',
 'putih',
 'baik',
 'acara',
 'hitam',
 'putih',
 'paling',
 'bagus',
 'buat',
 'lihat',
 'undang',
 'acara',
 'hitam',
 'putih',
 'yadia'

In [22]:
import collections

count_words = collections.Counter(all_words)

count_words

Counter({'undang': 14,
         'hitamputih': 5,
         'menang': 3,
         'ssk': 1,
         'jkt48': 1,
         'harus': 4,
         'mjkt48': 1,
         'lebih': 16,
         'layak': 1,
         'prestasi': 2,
         'selamat': 6,
         'buka': 4,
         'puasa': 1,
         'moga': 13,
         'amal': 1,
         'ibadah': 4,
         'hari': 7,
         'ni': 4,
         'terima': 8,
         'allah': 3,
         'nih': 9,
         'trans7': 18,
         'hitam': 40,
         'putih': 39,
         'dpt': 1,
         'harga': 5,
         'di': 10,
         'norwegia': 1,
         'mas': 3,
         'masuk': 5,
         'asiknya': 1,
         'nonton': 29,
         'acara': 36,
         'paling': 8,
         'komplit': 1,
         'tarik': 5,
         'ada': 13,
         't7': 1,
         'inspiratif': 5,
         'banget': 18,
         'suka': 12,
         'keren': 29,
         'lu': 4,
         'bro': 1,
         'tadi': 2,
         'yg': 54,
         'liat': 10,
 

In [24]:
df_word_freq = pd.DataFrame(count_words.most_common(30),
                            columns = ['words', 'count'])

df_word_freq

Unnamed: 0,words,count
0,mata,63
1,najwa,60
2,yg,54
3,hitam,40
4,putih,39
5,acara,36
6,orang,30
7,nonton,29
8,keren,29
9,pak,29


### Negative Document

In [34]:
negative_doc = dataset[dataset['Sentiment'] == 'negative']['cleaned_text']

token_neg = []

for text in negative_doc:
    token_neg.extend(text.split())

all_words_neg = list(itertools.chain(token_neg))

all_words_neg

['hitam',
 'putih',
 'bego',
 'haruka',
 'undang',
 'bentar',
 'biar',
 'pd',
 'liat',
 'younglex',
 'udah',
 'gt',
 'malah',
 'hina',
 'bego',
 'miris',
 'liat',
 'perintah',
 'juara',
 'angkat',
 'berat',
 'seasia',
 'tp',
 'tdk',
 'biaya',
 'ikut',
 'kejurnas',
 'juara',
 'angkat',
 'berat',
 'seasia',
 'tdk',
 'terima',
 'hadiah',
 'tidak',
 'ikut',
 'kejurnas',
 'kendala',
 'biaya',
 'haruka',
 'kali',
 'kalau',
 'undang',
 'hitam',
 'putih',
 'tolak',
 'aja',
 'hostnya',
 'ga',
 'harga',
 'kamu',
 'asa',
 'aneh',
 'nonton',
 'acara',
 'hitam',
 'putih',
 'tonton',
 'nya',
 'anak',
 'yg',
 'sekolah',
 'sekolah',
 'gua',
 'sedih',
 'dengar',
 'curhatanmu',
 'roxana',
 'moga',
 'tuhan',
 'tunjuk',
 'jalan',
 'titik',
 'terang',
 'utk',
 'roxana',
 'miris',
 'anak',
 'indonesia',
 'sulit',
 'guna',
 'bahasa',
 'indonesa',
 'lebih',
 'nyaman',
 'guna',
 'bahasa',
 'asing',
 'sombong',
 'mentang2',
 'kaya',
 'pake',
 'umum',
 'host',
 'kaya',
 'hitam',
 'putih',
 'trans7',
 'parah',
 '

In [35]:
count_words = collections.Counter(all_words_neg)

df_word_freq = pd.DataFrame(count_words.most_common(30), 
                            columns=['words', 'count'])

df_word_freq

Unnamed: 0,words,count
0,yg,34
1,mata,31
2,najwa,31
3,ga,24
4,orang,24
5,jadi,17
6,tdk,15
7,lihat,15
8,aja,14
9,acara,14


### Positive Document

In [36]:
pos_doc = dataset[dataset['Sentiment']=='positive']['cleaned_text']

token_pos = []

for text in pos_doc:
    token_pos.extend(text.split())

all_words_pos = list(itertools.chain(token_pos))

count_words = collections.Counter(all_words_pos)

count_words

Counter({'undang': 9,
         'hitamputih': 5,
         'menang': 3,
         'ssk': 1,
         'jkt48': 1,
         'harus': 2,
         'mjkt48': 1,
         'lebih': 4,
         'layak': 1,
         'prestasi': 2,
         'selamat': 6,
         'buka': 4,
         'puasa': 1,
         'moga': 12,
         'amal': 1,
         'ibadah': 1,
         'hari': 6,
         'ni': 3,
         'terima': 7,
         'allah': 3,
         'nih': 8,
         'trans7': 16,
         'hitam': 31,
         'putih': 32,
         'dpt': 1,
         'harga': 2,
         'di': 3,
         'norwegia': 1,
         'mas': 3,
         'masuk': 3,
         'asiknya': 1,
         'nonton': 18,
         'acara': 22,
         'paling': 7,
         'komplit': 1,
         'tarik': 4,
         'ada': 4,
         't7': 1,
         'inspiratif': 5,
         'banget': 13,
         'suka': 10,
         'keren': 29,
         'lu': 1,
         'bro': 1,
         'tadi': 1,
         'yg': 20,
         'liat': 2,
      

In [37]:
df_word_freq = pd.DataFrame(count_words.most_common(30), 
                            columns=['words', 'count'])

df_word_freq

Unnamed: 0,words,count
0,putih,32
1,mata,32
2,hitam,31
3,keren,29
4,najwa,29
5,acara,22
6,yg,20
7,nonton,18
8,inspirasi,17
9,trans7,16


In [39]:
dataset['label'] = list(map(lambda x: 1 if x == 'positive' else 0, dataset['Sentiment']))

## Feature Engineering

In [40]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(dataset['cleaned_text'], 
                                                    dataset['label'],
                                                    random_state = 14,
                                                    test_size=0.2)

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()

X_train_counts = tfidf.fit_transform(X_train)
X_test_counts = tfidf.transform(X_test)

## Modelling

### Naive Bayes

In [46]:
from sklearn.naive_bayes import BernoulliNB

BNBclf = BernoulliNB()
BNBclf.fit(X_train_counts, y_train)

y_pred = BNBclf.predict(X_test_counts)

y_pred

array([0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1])

In [47]:
metric = {'model': [], 
          'confusion_matrix' : [], 
          'auc' : [], 
          'accuracy' : []}

metric['model'].append('NaiveBayes')

In [48]:
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score

conf_ = confusion_matrix(y_test, y_pred)

metric['confusion_matrix'].append(conf_)
conf_

array([[25, 18],
       [ 1, 36]])

In [50]:
_auc = roc_auc_score(y_test, y_pred)

metric['auc'].append(_auc)

_auc

0.7771841609050911

In [51]:
acc_ = accuracy_score(y_test, y_pred)

metric['accuracy'].append(acc_)

acc_

0.7625

### Logistics Regression

In [52]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
clf.fit(X_train_counts, y_train)

LogisticRegression()

In [58]:
y_pred = clf.predict(X_test_counts)

metric['model'].append("LogisticRegression")


conf_ = confusion_matrix(y_test, y_pred)
metric["confusion_matrix"].append(conf_)

_auc = roc_auc_score(y_test, y_pred)
metric["auc"].append(_auc)

acc_ = accuracy_score(y_test, y_pred)
metric["accuracy"].append(acc_)

print('Confusion Matrix: \n {}'.format(conf_))
print('Area Under Curve (AUC): {:,.4f}'.format(_auc))
print('Accuracy : {}'.format(acc_))

Confusion Matrix: 
 [[35  8]
 [12 25]]
Area Under Curve (AUC): 0.7448
Accuracy : 0.75


### Support Vector Machine

In [63]:
from sklearn import svm

svclf = svm.SVC()
svclf.fit(X_train_counts, y_train)

SVC()

In [64]:
y_pred = svclf.predict(X_test_counts)

metric['model'].append("SVM")


conf_ = confusion_matrix(y_test, y_pred)
metric["confusion_matrix"].append(conf_)

_auc = roc_auc_score(y_test, y_pred)
metric["auc"].append(_auc)

acc_ = accuracy_score(y_test, y_pred)
metric["accuracy"].append(acc_)

print('Confusion Matrix: \n {}'.format(conf_))
print('Area Under Curve (AUC): {:,.4f}'.format(_auc))
print('Accuracy : {}'.format(acc_))

Confusion Matrix: 
 [[35  8]
 [14 23]]
Area Under Curve (AUC): 0.7178
Accuracy : 0.725


### Random Forest

In [65]:
from sklearn.ensemble import RandomForestClassifier

rfclf = RandomForestClassifier()
rfclf.fit(X_train_counts, y_train)

RandomForestClassifier()

In [66]:
y_pred = rfclf.predict(X_test_counts)

metric['model'].append("RandomForest")


conf_ = confusion_matrix(y_test, y_pred)
metric["confusion_matrix"].append(conf_)

_auc = roc_auc_score(y_test, y_pred)
metric["auc"].append(_auc)

acc_ = accuracy_score(y_test, y_pred)
metric["accuracy"].append(acc_)

print('Confusion Matrix: \n {}'.format(conf_))
print('Area Under Curve (AUC): {:,.4f}'.format(_auc))
print('Accuracy : {}'.format(acc_))

Confusion Matrix: 
 [[32 11]
 [ 9 28]]
Area Under Curve (AUC): 0.7505
Accuracy : 0.75
