# Text Classification Model

## Load Data

In [None]:
import pandas as pd

# Load the tweet dataset from CSV; the relative path is resolved against the
# notebook's current working directory.
dataset = pd.read_csv('../data/dataset_tweet_sentimen_tayangan_tv.csv')

# Show the loaded DataFrame for a quick visual check.
dataset

In [None]:
# List the distinct values found in the 'Acara TV' column.
dataset['Acara TV'].unique()

In [None]:
# Count non-null 'Id' values for every ('Acara TV', 'Sentiment') combination.
dataset.groupby(['Acara TV', 'Sentiment'])['Id'].count()

## Data Preparation & Pre-processing

In [None]:
# Number of missing values in each column.
dataset.isnull().sum()

In [None]:
import preprocessor as p

# Pass every tweet through the preprocessor module's clean() and keep the
# results in the original row order.
clean_text = [p.clean(tweet) for tweet in dataset['Text Tweet']]

### Lowercasing

Mengubah semua huruf menjadi huruf kecil untuk mengurangi variasi data.

In [None]:
# Lowercase every cleaned tweet.
lower_text = [tweet.lower() for tweet in clean_text]

In [None]:
import re

# Anything that is neither a word character nor whitespace, plus the underscore:
# `\w` matches "_" as well, so it has to be listed explicitly to be stripped.
_PUNCT_PATTERN = re.compile(r'[^\w\s]|_')


def remove_punct(text):
    """Return *text* with all punctuation characters removed.

    Letters, digits and whitespace are kept untouched; every other character,
    including the underscore, is deleted (not replaced by a space).

    Args:
        text: The string to strip punctuation from.

    Returns:
        A new string without punctuation.
    """
    return _PUNCT_PATTERN.sub('', text)

In [None]:
# Strip punctuation from each lowercased tweet.
no_punct_text = [remove_punct(tweet) for tweet in lower_text]

In [None]:
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, ArrayDictionary, StopWordRemover

# Build a stop-word remover through Sastrawi's factory.
stop_factory = StopWordRemoverFactory()
stopword = stop_factory.create_stop_word_remover()

# Run the remover over every punctuation-free tweet.
no_stopwords_text = [stopword.remove(tweet) for tweet in no_punct_text]

# Peek at the first result.
no_stopwords_text[0]

In [None]:
# import the library used to reduce words to their root form
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Create the Sastrawi stemmer.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Stem every stop-word-free tweet.
stemmed_text = [stemmer.stem(tweet) for tweet in no_stopwords_text]

# Peek at the first stemmed tweet.
stemmed_text[0]

In [None]:
# Store the fully pre-processed text next to the raw tweets as a new column.
dataset['cleaned_text'] = stemmed_text

# Preview the first rows, now including 'cleaned_text'.
dataset.head()

In [None]:
# Flatten all stemmed tweets into one list of whitespace-separated tokens.
token = [word for tweet in stemmed_text for word in tweet.split()]

In [None]:
# Display the flat token list.
token

## Data Exploration

In [None]:
import itertools

# `token` is already flat, so chaining it by itself is just a shallow copy.
all_words = list(token)

all_words

In [None]:
import collections

# Tally how often each token occurs across the whole corpus.
count_words = collections.Counter()
count_words.update(all_words)

count_words

In [None]:
# Tabulate the 30 most frequent tokens together with their counts.
df_word_freq = pd.DataFrame(
    data=count_words.most_common(30),
    columns=['words', 'count'],
)

df_word_freq

### Negative Document

In [None]:
# Pre-processed text of the tweets whose sentiment is 'negative'.
negative_doc = dataset.loc[dataset['Sentiment'] == 'negative', 'cleaned_text']

# Flatten those tweets into a single token list.
token_neg = [word for tweet in negative_doc for word in tweet.split()]

# The list is already flat; keep a separate copy under the name used below.
all_words_neg = list(token_neg)

all_words_neg

In [None]:
# Token frequencies for negative tweets only (rebinds the shared `count_words`).
count_words = collections.Counter()
count_words.update(all_words_neg)

# Top-30 table for the negative class (rebinds the shared `df_word_freq`).
df_word_freq = pd.DataFrame(
    data=count_words.most_common(30),
    columns=['words', 'count'],
)

df_word_freq

### Positive Document

In [None]:
# Pre-processed text of the tweets whose sentiment is 'positive'.
pos_doc = dataset.loc[dataset['Sentiment'] == 'positive', 'cleaned_text']

# Flatten those tweets into a single token list.
token_pos = [word for tweet in pos_doc for word in tweet.split()]

# The list is already flat; keep a separate copy under the name used below.
all_words_pos = list(token_pos)

# Token frequencies for positive tweets only (rebinds the shared `count_words`).
count_words = collections.Counter()
count_words.update(all_words_pos)

count_words

In [None]:
# Top-30 table built from whatever `count_words` currently holds.
df_word_freq = pd.DataFrame(
    data=count_words.most_common(30),
    columns=['words', 'count'],
)

df_word_freq

In [None]:
# Binary target: 1 where the sentiment is 'positive', 0 for every other value.
dataset['label'] = (dataset['Sentiment'] == 'positive').astype('int64')

## Feature Engineering

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    dataset['cleaned_text'],
    dataset['label'],
    test_size=0.2,
    random_state=14,
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer with default settings.
tfidf = TfidfVectorizer()

# Fit on the training text only, then reuse the fitted vectorizer on the test
# text so that no test information reaches the fitted vectorizer.
# NOTE: despite the `_counts` suffix these matrices hold TF-IDF weights.
X_train_counts = tfidf.fit_transform(X_train)
X_test_counts = tfidf.transform(X_test)

## Modelling

### Naive Bayes

In [None]:
from sklearn.naive_bayes import BernoulliNB

# Train Bernoulli Naive Bayes on the training features; fit() returns the
# estimator itself, so construction and training can be chained.
BNBclf = BernoulliNB().fit(X_train_counts, y_train)

# Hard class predictions for the held-out tweets.
y_pred = BNBclf.predict(X_test_counts)

y_pred

In [None]:
# Per-model evaluation results: every list receives one entry per model, in the
# order the models are evaluated. Naive Bayes is registered first.
metric = dict(
    model=['NaiveBayes'],
    confusion_matrix=[],
    auc=[],
    accuracy=[],
)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score

# Confusion matrix of the current predictions (rows: true class, columns:
# predicted class), recorded for the model comparison.
conf_ = confusion_matrix(y_true=y_test, y_pred=y_pred)
metric['confusion_matrix'].append(conf_)

conf_

In [None]:
# ROC AUC ranks samples by a continuous score. Feeding it the hard 0/1
# predictions leaves a single threshold and yields balanced accuracy instead,
# so use the predicted probability of the positive class (label 1).
_auc = roc_auc_score(y_test, BNBclf.predict_proba(X_test_counts)[:, 1])

metric['auc'].append(_auc)

_auc

In [None]:
# Share of held-out tweets whose predicted label equals the true label.
acc_ = accuracy_score(y_true=y_test, y_pred=y_pred)
metric['accuracy'].append(acc_)

acc_

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with scikit-learn's default hyper-parameters, trained on
# the TF-IDF features of the training split.
clf = LogisticRegression()
clf.fit(X_train_counts, y_train)

In [None]:
# Hard class predictions drive the confusion matrix and the accuracy.
y_pred = clf.predict(X_test_counts)
# ROC AUC needs a continuous score; hard labels would reduce it to balanced
# accuracy. Column 1 is the probability of the positive class (label 1).
y_score = clf.predict_proba(X_test_counts)[:, 1]

metric['model'].append("LogisticRegression")

conf_ = confusion_matrix(y_test, y_pred)
metric["confusion_matrix"].append(conf_)

_auc = roc_auc_score(y_test, y_score)
metric["auc"].append(_auc)

acc_ = accuracy_score(y_test, y_pred)
metric["accuracy"].append(acc_)

print('Confusion Matrix: \n {}'.format(conf_))
print('Area Under Curve (AUC): {:,.4f}'.format(_auc))
print('Accuracy : {}'.format(acc_))

### Support Vector Machine

In [None]:
from sklearn import svm

# Support vector classifier with scikit-learn's default hyper-parameters,
# trained on the TF-IDF features of the training split.
svclf = svm.SVC()
svclf.fit(X_train_counts, y_train)

In [None]:
# Hard class predictions drive the confusion matrix and the accuracy.
y_pred = svclf.predict(X_test_counts)
# ROC AUC needs a continuous score; hard labels would reduce it to balanced
# accuracy. The SVC is built without probability=True, so predict_proba is not
# available; the signed decision-function value serves as the ranking score.
y_score = svclf.decision_function(X_test_counts)

metric['model'].append("SVM")

conf_ = confusion_matrix(y_test, y_pred)
metric["confusion_matrix"].append(conf_)

_auc = roc_auc_score(y_test, y_score)
metric["auc"].append(_auc)

acc_ = accuracy_score(y_test, y_pred)
metric["accuracy"].append(acc_)

print('Confusion Matrix: \n {}'.format(conf_))
print('Area Under Curve (AUC): {:,.4f}'.format(_auc))
print('Accuracy : {}'.format(acc_))

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with scikit-learn's default hyper-parameters, trained on the
# TF-IDF features of the training split.
# NOTE(review): no random_state is passed, so results can differ between runs.
rfclf = RandomForestClassifier()
rfclf.fit(X_train_counts, y_train)

In [None]:
# Hard class predictions drive the confusion matrix and the accuracy.
y_pred = rfclf.predict(X_test_counts)
# ROC AUC needs a continuous score; hard labels would reduce it to balanced
# accuracy. Column 1 is the probability of the positive class (label 1).
y_score = rfclf.predict_proba(X_test_counts)[:, 1]

metric['model'].append("RandomForest")

conf_ = confusion_matrix(y_test, y_pred)
metric["confusion_matrix"].append(conf_)

_auc = roc_auc_score(y_test, y_score)
metric["auc"].append(_auc)

acc_ = accuracy_score(y_test, y_pred)
metric["accuracy"].append(acc_)

print('Confusion Matrix: \n {}'.format(conf_))
print('Area Under Curve (AUC): {:,.4f}'.format(_auc))
print('Accuracy : {}'.format(acc_))