# Libraries

In [4]:
import warnings
warnings.filterwarnings("ignore")

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

import random
import numpy                         as np
import pandas                        as pd

from sklearn.model_selection         import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

from utils.load_dataset import *
from utils.utils import evaluate_model
from utils.text_preprocessing import *

In [5]:
# Single seed reused by every stochastic step in this notebook.
random_state = 42

# Seed NumPy's global generator and Python's built-in `random` module.
np.random.seed(random_state)
random.seed(random_state)

# NOTE(review): hash randomization is normally fixed at interpreter start-up,
# so this presumably only affects processes spawned from this kernel -- confirm.
os.environ["PYTHONHASHSEED"] = f"{random_state}"

# Dataset

In [6]:
# UCI "Sentiment Labelled Sentences" data set.
# Labelled reviews gathered from three sources:
#   - IMDb
#   - Amazon
#   - Yelp
# Every review carries a binary sentiment score:
# 0 = negative, 1 = positive.
df = load_dataset()

# Preview the first three rows.
df.head(n=3)

Unnamed: 0,sentence,label,source
0,Wow... Loved this place.,1,yelp
1,Crust is not good.,0,yelp
2,Not tasty and the texture was just nasty.,0,yelp


## Create train/test datasets

In [7]:
# Hold out 10% of the rows for testing; the shared seed keeps the split reproducible.
df_train, df_test = train_test_split(df, test_size=0.1, random_state=random_state)

# A `topic` column is assigned to both frames further down the notebook.
# Writing a column onto a row selection of `df` is the usual trigger for pandas'
# SettingWithCopyWarning (hidden here by the global warnings filter), so take
# explicit copies to make both splits independent DataFrames.
df_train = df_train.copy()
df_test = df_test.copy()

# Review text of each split; these Series feed the preprocessing step.
Sentences_train = df_train['sentence']
Sentences_test = df_test['sentence']

## Preprocess data
1. Lowercase the text
2. Remove punctuation
3. Remove stop words
4. Lemmatization (or stemming)

In [8]:
# Apply the same (silent) preprocessing to both splits: training first, then test.
Sentences_train, Sentences_test = (
    preprocess(Series=sentences, verbose=False)
    for sentences in (Sentences_train, Sentences_test)
)

# Topic modeling using Latent Dirichlet Allocation (LDA)

In [9]:
# Bag-of-words vectorization.
# The vocabulary is learned from the training sentences only; terms must occur
# in at least 2 documents (min_df) and in at most 90% of them (max_df).
vectorizer = CountVectorizer(min_df=2, max_df=0.9).fit(Sentences_train)

# Encode both splits with the fitted vocabulary (train first, then test).
trainX, testX = (
    vectorizer.transform(sentences)
    for sentences in (Sentences_train, Sentences_test)
)

## Setup model & train

In [10]:
from sklearn.decomposition import LatentDirichletAllocation

# Two latent topics. The seed comes from the notebook-wide `random_state`
# instead of a repeated literal, so changing the seed in the configuration
# cell changes it here as well.
LDA = LatentDirichletAllocation(n_components=2, random_state=random_state)
LDA.fit(trainX)

LatentDirichletAllocation(n_components=2, random_state=42)

## Model evaluation

In [11]:
# Resolve the vocabulary once, outside the loop. `get_feature_names()` was
# deprecated in scikit-learn 1.0 and removed in 1.2 in favour of
# `get_feature_names_out()`; fall back to the old name on older installs.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

for i, topic in enumerate(LDA.components_):
    print(f'Top-15 words in Topic: {i}')
    # argsort is ascending, so the last 15 indices are the highest-weighted
    # terms of this topic (strongest term printed last).
    print([str(feature_names[idx]) for idx in topic.argsort()[-15:]])
    print('\n')

Top-15 words in Topic: 0
['get', 'look', 'could', 'battery', 'sound', 'great', 'headset', 'ive', 'one', 'time', 'well', 'quality', 'work', 'phone', 'good']


Top-15 words in Topic: 1
['bad', 'go', 'would', 'back', 'time', 'best', 'dont', 'one', 'like', 'place', 'service', 'food', 'film', 'great', 'movie']




In [12]:
# Per-document topic weights from the fitted LDA model, one row per sentence.
topic_results_train = LDA.transform(trainX)
topic_results_test = LDA.transform(testX)

# Assign each sentence to the topic with the largest weight.
df_train['topic'] = np.argmax(topic_results_train, axis=1)
df_test['topic'] = np.argmax(topic_results_test, axis=1)

# NOTE(review): topic ids are arbitrary component indices; scoring them directly
# against the sentiment label assumes topic k lines up with label k -- confirm.
for split_name, split_df in (('Training dataset', df_train), ('Testing dataset', df_test)):
    print(split_name)
    evaluate_model(split_df['label'], split_df['topic'])

Training dataset
Accuracy = 50.51%
AUC      = 0.50449
F1       = 0.52262
[[579 630]
 [594 670]]


Testing dataset
Accuracy = 51.27%
AUC      = 0.52057
F1       = 0.51799
[[69 84]
 [50 72]]




# Topic modeling using Non-negative Matrix Factorization

In [13]:
# TF-IDF vectorization (replaces the bag-of-words features used for LDA).
from sklearn.feature_extraction.text import TfidfVectorizer

# Same document-frequency cut-offs as before: a term must appear in at least
# 2 documents and in no more than 90% of them. Fitted on the training split only.
vectorizer = TfidfVectorizer(min_df=2, max_df=0.9).fit(Sentences_train)

# Encode train, then test, with the fitted vectorizer.
trainX, testX = map(vectorizer.transform, (Sentences_train, Sentences_test))

## Setup model & train

In [14]:
from sklearn.decomposition import NMF

# Two components, mirroring the LDA setup. The seed comes from the
# notebook-wide `random_state` instead of a repeated literal, so changing the
# seed in the configuration cell changes it here as well.
nmf_model = NMF(n_components=2, random_state=random_state)
nmf_model.fit(trainX)

NMF(n_components=2, random_state=42)

## Model evaluation

In [15]:
# Resolve the vocabulary once, outside the loop. `get_feature_names()` was
# deprecated in scikit-learn 1.0 and removed in 1.2 in favour of
# `get_feature_names_out()`; fall back to the old name on older installs.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

for i, topic in enumerate(nmf_model.components_):
    print(f'Top-15 words in Topic: {i}')
    # argsort is ascending, so the last 15 indices are the highest-weighted
    # terms of this component (strongest term printed last).
    print([str(feature_names[idx]) for idx in topic.argsort()[-15:]])
    print('\n')

Top-15 words in Topic: 0
['deal', 'movie', 'one', 'love', 'time', 'film', 'place', 'battery', 'product', 'price', 'food', 'service', 'work', 'phone', 'great']


Top-15 words in Topic: 1
['dont', 'product', 'film', 'bad', 'pretty', 'excellent', 'time', 'quality', 'really', 'movie', 'place', 'price', 'service', 'food', 'good']




In [16]:
# Per-document component weights from the fitted NMF model, one row per sentence.
topic_results_train = nmf_model.transform(trainX)
topic_results_test = nmf_model.transform(testX)

# Assign each sentence to its dominant component; this overwrites the `topic`
# column written by the LDA section.
df_train['topic'] = np.argmax(topic_results_train, axis=1)
df_test['topic'] = np.argmax(topic_results_test, axis=1)

# NOTE(review): component ids are arbitrary; scoring them directly against the
# sentiment label assumes component k lines up with label k -- confirm.
for split_name, split_df in (('Training dataset', df_train), ('Testing dataset', df_test)):
    print(split_name)
    evaluate_model(split_df['label'], split_df['topic'])

Training dataset
Accuracy = 46.30%
AUC      = 0.45838
F1       = 0.55910
[[303 906]
 [422 842]]


Testing dataset
Accuracy = 44.73%
AUC      = 0.47088
F1       = 0.52201
[[ 40 113]
 [ 39  83]]


