# Text Classification Model

## Load Data

In [None]:
import pandas as pd
import pickle

In [None]:
# Load the tweet sentiment dataset from a CSV file in the working directory.
dataset = pd.read_csv('dataset_tweet_sentimen_tayangan_tv.csv')

In [None]:
# Display the loaded DataFrame for visual inspection.
dataset

In [None]:
# List the distinct values found in the 'Acara TV' column.
dataset['Acara TV'].unique()

In [None]:
# Count the non-null 'Id' values for every ('Acara TV', 'Sentiment') pair and
# turn the grouped index back into ordinary columns.
ids_by_show_and_sentiment = dataset.groupby(['Acara TV', 'Sentiment'])['Id']
ids_by_show_and_sentiment.count().reset_index()

## Data Preparation & Pre-processing

In [None]:
# Number of missing values per column (isna is the same method as isnull).
dataset.isna().sum()

In [None]:
import preprocessor as p

In [None]:
# First cleaning stage: pass every raw tweet through preprocessor's clean().
clean_text = [p.clean(raw_tweet) for raw_tweet in dataset['Text Tweet']]

In [None]:
# Case-fold each cleaned tweet to lowercase.
lower_text = [tweet.lower() for tweet in clean_text]

In [None]:
import re
def remove_punct(text):
    """Return *text* with every character that is neither a word character
    (letters, digits, underscore) nor whitespace removed."""
    return re.sub(r'[^\w\s]', '', text)

In [None]:
# Strip punctuation from every lowercased tweet.
no_punct_text = list(map(remove_punct, lower_text))

In [None]:
# Sastrawi factory object; the next cell reads its stop-word list via get_stop_words().
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, StopWordRemover, ArrayDictionary
stop_factory = StopWordRemoverFactory()

In [None]:
# Extra tokens to drop in addition to the stop words supplied by Sastrawi.
more_stopword = ['lu', 'gua', 'yg']

# Combined stop-word list: Sastrawi's own words followed by the extras.
data = [*stop_factory.get_stop_words(), *more_stopword]

# Wrap the list in Sastrawi's dictionary type and build the remover on top of it.
dictionary = ArrayDictionary(data)
stopwords = StopWordRemover(dictionary)

In [None]:
# Persist the stop-word remover to 'stopwords.pkl'. The context manager closes
# the file handle (and flushes its buffer) as soon as the dump finishes, rather
# than leaving an open handle behind.
with open('stopwords.pkl', 'wb') as stopwords_file:
    pickle.dump(stopwords, stopwords_file)

In [None]:
# Remove stop words from every punctuation-free tweet.
no_stopwords_text = [stopwords.remove(tweet) for tweet in no_punct_text]

In [None]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
# Build a Sastrawi stemmer through the library's factory.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [None]:
# Persist the stemmer to 'stemmer.pkl'. The context manager closes the file
# handle as soon as the dump finishes, rather than leaving it open.
with open('stemmer.pkl', 'wb') as stemmer_file:
    pickle.dump(stemmer, stemmer_file)

In [None]:
# Final cleaning stage: stem every stop-word-free tweet.
stemmed_text = list(map(stemmer.stem, no_stopwords_text))

In [None]:
# Display the stemmed tweets.
stemmed_text

In [None]:
# Attach the fully pre-processed (stemmed) text to the DataFrame as a new column.
dataset['cleaned_text'] = stemmed_text

In [None]:
# Flat list of every whitespace-separated word across all stemmed tweets.
token = [word for tweet in stemmed_text for word in tweet.split()]

In [None]:
# Display the flat token list.
token

## Data Exploration

### All Documents

In [None]:
import itertools

# `token` is already a flat list of words, so chaining that single iterable
# yields exactly its elements; a plain copy produces the same list.
all_words = list(token)

In [None]:
# Display every word collected from all tweets.
all_words

In [None]:
import collections
# Tally how many times each word occurs across all tweets.
counts_words = collections.Counter(all_words)

In [None]:
# Display the word -> occurrence-count mapping.
counts_words

In [None]:
# The 30 most frequent words over all tweets, shown as a two-column table.
top_words = counts_words.most_common(30)
df_word_freq = pd.DataFrame(top_words, columns=['words', 'count'])

df_word_freq

### Negative Documents

In [None]:
# Cleaned text of the tweets whose sentiment is "negative".
is_negative = dataset['Sentiment'] == "negative"
negative_doc = dataset.loc[is_negative, 'cleaned_text']

In [None]:
# Flat list of every word appearing in the negative tweets.
token_neg = [word for tweet in negative_doc for word in tweet.split()]

In [None]:
# `token_neg` is already flat, so chaining that single iterable is just a copy.
all_words_neg = list(token_neg)

In [None]:
# Display every word collected from the negative tweets.
all_words_neg

In [None]:
# Tally word occurrences in the negative tweets (rebinds the earlier counts_words).
counts_words = collections.Counter(all_words_neg)

In [None]:
# The 50 most frequent words in the negative tweets, shown as a two-column table.
top_negative_words = counts_words.most_common(50)
df_word_freq = pd.DataFrame(top_negative_words, columns=['words', 'count'])

df_word_freq

### Positive Documents

In [None]:
# Cleaned text of the tweets whose sentiment is "positive".
is_positive = dataset['Sentiment'] == "positive"
pos_doc = dataset.loc[is_positive, 'cleaned_text']

In [None]:
# Flat list of every word appearing in the positive tweets.
token_pos = [word for tweet in pos_doc for word in tweet.split()]

In [None]:
# `token_pos` is already flat, so chaining that single iterable is just a copy.
all_words_pos = list(token_pos)

In [None]:
# Tally word occurrences in the positive tweets (rebinds the earlier counts_words).
counts_words = collections.Counter(all_words_pos)

In [None]:
# The 50 most frequent words in the positive tweets, shown as a two-column table.
top_positive_words = counts_words.most_common(50)
df_word_freq = pd.DataFrame(top_positive_words, columns=['words', 'count'])

df_word_freq

In [None]:
# Binary target: 1 for tweets labelled "positive", 0 for every other value.
dataset['label'] = [
    1 if sentiment == "positive" else 0
    for sentiment in dataset['Sentiment']
]

## Feature Transformation

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    dataset['cleaned_text'],
    dataset['label'],
    test_size=0.2,
    random_state=0,
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer constructed with scikit-learn's default settings.
tfidf = TfidfVectorizer()

In [None]:
# Fit the vectorizer on the training split only, then apply that same fitted
# transform to the test split.
# NOTE(review): despite the "_counts" suffix, these hold TF-IDF matrices.
X_train_counts = tfidf.fit_transform(X_train)
X_test_counts = tfidf.transform(X_test)

In [None]:
# Persist the fitted TF-IDF vectorizer to 'feature_transformation.pkl'. The
# context manager closes the file handle as soon as the dump finishes.
with open('feature_transformation.pkl', 'wb') as tfidf_file:
    pickle.dump(tfidf, tfidf_file)

In [None]:
# Display the transformed test matrix.
X_test_counts

In [None]:
# Display the transformed training matrix.
X_train_counts

## Modelling

### Naive Bayes

In [None]:
from sklearn.naive_bayes import BernoulliNB

# fit() returns the estimator itself, so construction and training can be
# chained; the trailing expression keeps the fitted model as the cell output.
BNBclf = BernoulliNB().fit(X_train_counts, y_train)
BNBclf

In [None]:
# Persist the trained Naive Bayes model to 'naive_bayes_classifier_model.pkl'.
# The context manager closes the file handle as soon as the dump finishes.
with open('naive_bayes_classifier_model.pkl', 'wb') as model_file:
    pickle.dump(BNBclf, model_file)

In [None]:
# Predict labels for the held-out test split with the Naive Bayes model.
y_pred = BNBclf.predict(X_test_counts)

In [None]:
# Display the predicted labels.
y_pred

In [None]:
import numpy as np
# Show the true test labels as a NumPy array.
np.array(y_test)

In [None]:
# Accumulator for evaluation results: each key maps to its own list, and every
# evaluated model appends one entry per key (same index across all lists).
metric = {
    column: []
    for column in ("model", "confusion_matrix", "auc", "accuracy")
}

In [None]:
# Record the model name for the Naive Bayes row of the results table.
metric['model'].append("NaiveBayes")

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score

# Confusion matrix of true vs. predicted test labels; stored and displayed.
conf_ = confusion_matrix(y_test, y_pred)
metric["confusion_matrix"].append(conf_)
conf_

In [None]:
# ROC AUC must be computed from continuous scores: with hard 0/1 predictions
# the ROC curve collapses to a single operating point. Column 1 of
# predict_proba is the probability of the positive class (label 1).
y_score = BNBclf.predict_proba(X_test_counts)[:, 1]
_auc = roc_auc_score(y_test, y_score)
metric["auc"].append(_auc)
_auc

In [None]:
# Fraction of test tweets whose predicted label matches the true label.
acc_ = accuracy_score(y_test, y_pred)
metric["accuracy"].append(acc_)
acc_

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so construction and training can be
# chained; the trailing expression keeps the fitted model as the cell output.
clf = LogisticRegression().fit(X_train_counts, y_train)
clf

In [None]:
# Persist the trained logistic regression model to
# 'logistic_regression_classifier_model.pkl'. The context manager closes the
# file handle as soon as the dump finishes.
with open('logistic_regression_classifier_model.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file)

In [None]:
# Predict labels for the held-out test split with the logistic regression model.
y_pred = clf.predict(X_test_counts)

In [None]:
# Record the model name for the logistic regression row of the results table.
metric['model'].append("LogisticRegression")

In [None]:
# Confusion matrix of true vs. predicted test labels; stored and displayed.
conf_ = confusion_matrix(y_test, y_pred)
metric["confusion_matrix"].append(conf_)
conf_

In [None]:
# ROC AUC must be computed from continuous scores: with hard 0/1 predictions
# the ROC curve collapses to a single operating point. Column 1 of
# predict_proba is the probability of the positive class (label 1).
y_score = clf.predict_proba(X_test_counts)[:, 1]
_auc = roc_auc_score(y_test, y_score)
metric["auc"].append(_auc)
_auc

In [None]:
# Fraction of test tweets whose predicted label matches the true label.
acc_ = accuracy_score(y_test, y_pred)
metric["accuracy"].append(acc_)
acc_

### Support Vector Machine

In [None]:
from sklearn import svm

# fit() returns the estimator itself, so construction and training can be
# chained; the trailing expression keeps the fitted model as the cell output.
svclf = svm.SVC().fit(X_train_counts, y_train)
svclf

In [None]:
# Persist the trained SVM model to 'SVM_classifier_model.pkl'. The context
# manager closes the file handle as soon as the dump finishes.
with open('SVM_classifier_model.pkl', 'wb') as model_file:
    pickle.dump(svclf, model_file)

In [None]:
# Predict labels for the held-out test split with the SVM model.
y_pred = svclf.predict(X_test_counts)

In [None]:
# Record the model name for the SVM row of the results table.
metric['model'].append("SVM")

In [None]:
# Confusion matrix of true vs. predicted test labels; stored and displayed.
conf_ = confusion_matrix(y_test, y_pred)
metric["confusion_matrix"].append(conf_)
conf_

In [None]:
# ROC AUC must be computed from continuous scores: with hard 0/1 predictions
# the ROC curve collapses to a single operating point. SVC() is built without
# probability estimates, so the signed decision-function values (larger means
# more confidently the positive class) serve as the ranking score.
y_score = svclf.decision_function(X_test_counts)
_auc = roc_auc_score(y_test, y_score)
metric["auc"].append(_auc)
_auc

In [None]:
# Fraction of test tweets whose predicted label matches the true label.
acc_ = accuracy_score(y_test, y_pred)
metric["accuracy"].append(acc_)
acc_

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forests draw random bootstrap samples and feature subsets, so an
# unseeded model (and the metrics saved for it) differs on every run. Seed it
# with the same value the train/test split uses to make results reproducible.
rfclf = RandomForestClassifier(random_state=0)
rfclf.fit(X_train_counts, y_train)

In [None]:
# Persist the trained random forest to 'random_forest_classifier_model.pkl'.
# The context manager closes the file handle as soon as the dump finishes.
with open('random_forest_classifier_model.pkl', 'wb') as model_file:
    pickle.dump(rfclf, model_file)

In [None]:
# Predict labels for the held-out test split with the random forest model.
y_pred = rfclf.predict(X_test_counts)

In [None]:
# Record the model name for the random forest row of the results table.
metric['model'].append("Random Forest")

In [None]:
# Confusion matrix of true vs. predicted test labels; stored and displayed.
conf_ = confusion_matrix(y_test, y_pred)
metric["confusion_matrix"].append(conf_)
conf_

In [None]:
# ROC AUC must be computed from continuous scores: with hard 0/1 predictions
# the ROC curve collapses to a single operating point. Column 1 of
# predict_proba is the probability of the positive class (label 1).
y_score = rfclf.predict_proba(X_test_counts)[:, 1]
_auc = roc_auc_score(y_test, y_score)
metric["auc"].append(_auc)
_auc

In [None]:
# Fraction of test tweets whose predicted label matches the true label.
acc_ = accuracy_score(y_test, y_pred)
metric["accuracy"].append(acc_)
acc_

In [None]:
# Persist the collected evaluation results to 'classifier_model_metric.pkl'.
# The context manager closes the file handle as soon as the dump finishes.
with open('classifier_model_metric.pkl', 'wb') as metric_file:
    pickle.dump(metric, metric_file)