# The impact of dimensionality reduction in sentiment analysis classification

## Initialization

In [1]:
# Basic libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Time library
import time

# Dataset manipulation/vectorization libraries
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Text processing libraries
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Dimensionality Reduction Algorithms
from sklearn.decomposition import TruncatedSVD

# Supervised classifiers.
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Evaluation libraries
from sklearn.metrics import classification_report

# Dimensionality Reduction algorithms
from sklearn.decomposition import TruncatedSVD


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Leo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Leo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Leo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Classifiers

In [2]:
# Classifiers compared in every experiment below. Each estimator whose
# training is randomized is given a fixed seed so that a fresh
# "Restart & Run All" reproduces the same scores.
models = [
        KNeighborsClassifier(10),
        LogisticRegression(max_iter=300),
        DecisionTreeClassifier(random_state=42),
        RandomForestClassifier(n_estimators=100, random_state=42),
        SVC(kernel='rbf', C = 1),
        MLPClassifier(activation='relu', hidden_layer_sizes=(128, 16), random_state=42),
]
models

[KNeighborsClassifier(n_neighbors=10),
 LogisticRegression(max_iter=300),
 DecisionTreeClassifier(),
 RandomForestClassifier(),
 SVC(C=1),
 MLPClassifier(hidden_layer_sizes=(128, 16), random_state=42)]

## Dataset: IMDB


In [3]:
# Load the review dataset. The relative path uses forward slashes because
# they resolve on Windows, Linux and macOS alike, whereas a
# backslash-separated path only works on Windows.
df = pd.read_csv("datasets/movie_data.csv", encoding = "utf-8")

# Rename the columns to the names the rest of the notebook reads.
# errors='raise' makes a missing "review" or "sentiment" column fail here
# instead of surfacing later as a KeyError.
df = df.rename(columns={"review": "text", "sentiment": "source"}, errors='raise')

df.head(10)


Unnamed: 0,text,source
0,I finally managed to sit through a whole episo...,0
1,Just what the world needed-another superficial...,0
2,I have to say despite it's reviews Angels in t...,1
3,Only the chosen ones will appreciate the quali...,1
4,Anne (Natalie Portman) tells us about how much...,1
5,"When John Singleton is on, he's *on*!! And thi...",1
6,The `plot' of this film contains a few holes y...,0
7,I tried restarting the movie twice. I put it i...,0
8,The best Treasure Island ever made. They just ...,1
9,"Granted, HOTD 2 is better than the Uwe Boll cr...",0


## Text cleaning

In [4]:
def textcleaner_lemmas(text):
    ''' Normalize a raw review into a space-separated string of lemmas.

    The text is lowercased, every run of characters other than letters and
    spaces is replaced by a single space, and the result is tokenized.
    Tokens found in the global `stopWords` collection and tokens of two
    characters or fewer are dropped; each remaining token is passed through
    the global `lemmatizer`.

    Returns the surviving lemmas joined by single spaces.
    '''
    letters_only = re.sub(r"[^a-zA-Z ]+", ' ', text.lower())
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in word_tokenize(letters_only)
        # stop-word test first, then the length filter
        if token not in stopWords and len(token) > 2
    )


In [5]:
# Shared resources that textcleaner_lemmas looks up when it is called.
lemmatizer = WordNetLemmatizer()
stopWords = set(stopwords.words('english'))

# Clean every review and time the whole pass. The function is handed to
# apply directly; wrapping it in a lambda adds nothing.
t0 = time.time()
df['clean_text'] = df.text.apply(textcleaner_lemmas)
t1 = time.time()

print("Text cleaning duration:", t1 - t0)


Text cleaning duration: 53.50149989128113


## Text vectorization - Training/Test Set construction

In [6]:
# Features are the cleaned reviews; the target is the sentiment column.
X = df['clean_text']
y = df['source']

# Stratified 70/30 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=1,
)

# The vectorizer is fitted on the training reviews only; the test reviews
# are transformed with that same fitted vectorizer.
tfidf = TfidfVectorizer()
t0 = time.time()
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)
t1 = time.time()

for caption, value in (
    ("Text vectorization duration:", t1 - t0),
    ("Training set dimensionality: ", X_train_vec.shape),
    ("Test set dimensionality: ", X_test_vec.shape),
):
    print(caption, value)



Text vectorization duration: 3.6735000610351562
Training set dimensionality:  (35000, 77026)
Test set dimensionality:  (15000, 77026)


## Models on the original dataset

In [7]:
# Baseline experiment: every classifier is trained and scored on the
# unreduced TF-IDF matrices.
rule = "================================================================================================="
for clf in models:
    # Only the fit call is timed; prediction is not included.
    t0 = time.time()
    clf.fit(X_train_vec, y_train)
    t1 = time.time()

    print(rule)
    print("\t === CLASSIFIER:", clf, "- TARGET SPACE: original")
    print("\t === Model training:", t1 - t0, "sec")

    y_predicted = clf.predict(X_test_vec)

    print("\t === Classification Report")
    report = classification_report(y_test, y_predicted)
    print(report)
    print(rule)


	 === CLASSIFIER: KNeighborsClassifier(n_neighbors=10) - TARGET SPACE: original
	 === Model training: 0.015000104904174805 sec
	 === Classification Report
              precision    recall  f1-score   support

           0       0.80      0.78      0.79      7500
           1       0.78      0.80      0.79      7500

    accuracy                           0.79     15000
   macro avg       0.79      0.79      0.79     15000
weighted avg       0.79      0.79      0.79     15000

	 === CLASSIFIER: LogisticRegression(max_iter=300) - TARGET SPACE: original
	 === Model training: 1.5359997749328613 sec
	 === Classification Report
              precision    recall  f1-score   support

           0       0.91      0.88      0.89      7500
           1       0.88      0.91      0.90      7500

    accuracy                           0.89     15000
   macro avg       0.89      0.89      0.89     15000
weighted avg       0.89      0.89      0.89     15000

	 === CLASSIFIER: DecisionTreeClassifier()

## Models on the reduced dimensional spaces

In [8]:
# Repeat the experiment after projecting the TF-IDF features onto
# lower-dimensional spaces with truncated SVD, one run per target size.
reduced_spaces = [ 10, 100, 1000, 10000 ]
for target_space in reduced_spaces:
    print("========== Working with target space of", target_space, "dimensions ============")
    # The projection is fitted on the training matrix only; the test matrix
    # is transformed with the same fitted components.
    SVD = TruncatedSVD(n_components=target_space, random_state=42)
    X_train_red = SVD.fit_transform(X_train_vec)
    X_test_red = SVD.transform(X_test_vec)

    # Reduced shape first, original shape second.
    print("\tInput space dimensionality (training):", X_train_red.shape, X_train_vec.shape)
    print("\tInput space dimensionality (testing):", X_test_red.shape, X_test_vec.shape)

    for clf in models:
        # Only the fit call is timed; prediction is not included.
        t0 = time.time()
        clf.fit(X_train_red, y_train)
        t1 = time.time()
        print("=================================================================================================")
        print("\t === CLASSIFIER:", clf, "- TARGET SPACE:", target_space)
        print("\t === Model training:", t1 - t0, "sec")

        y_predicted = clf.predict(X_test_red)

        print("\t === Classification Report")
        # The report is a multi-line string; prefixing it with "\t" would
        # indent only its first line and misalign the header with the rows.
        print(classification_report(y_test, y_predicted))
        print("=================================================================================================")


	Input space dimensionality (training): (35000, 10) (35000, 77026)
	Input space dimensionality (testing): (15000, 10) (15000, 77026)
	 === CLASSIFIER: KNeighborsClassifier(n_neighbors=10) - TARGET SPACE: 10
	 === Model training: 0.11800026893615723 sec
	 === Classification Report
	               precision    recall  f1-score   support

           0       0.78      0.82      0.80      7500
           1       0.81      0.76      0.79      7500

    accuracy                           0.79     15000
   macro avg       0.79      0.79      0.79     15000
weighted avg       0.79      0.79      0.79     15000

	 === CLASSIFIER: LogisticRegression(max_iter=300) - TARGET SPACE: 10
	 === Model training: 0.045999765396118164 sec
	 === Classification Report
	               precision    recall  f1-score   support

           0       0.83      0.78      0.81      7500
           1       0.80      0.84      0.82      7500

    accuracy                           0.81     15000
   macro avg       0.81  



	 === CLASSIFIER: MLPClassifier(hidden_layer_sizes=(128, 16), random_state=42) - TARGET SPACE: 100
	 === Model training: 112.2810001373291 sec
	 === Classification Report
	               precision    recall  f1-score   support

           0       0.82      0.85      0.83      7500
           1       0.84      0.81      0.82      7500

    accuracy                           0.83     15000
   macro avg       0.83      0.83      0.83     15000
weighted avg       0.83      0.83      0.83     15000

	Input space dimensionality (training): (35000, 1000) (35000, 77026)
	Input space dimensionality (testing): (15000, 1000) (15000, 77026)
	 === CLASSIFIER: KNeighborsClassifier(n_neighbors=10) - TARGET SPACE: 1000
	 === Model training: 0.05299997329711914 sec
	 === Classification Report
	               precision    recall  f1-score   support

           0       0.64      0.71      0.67      7500
           1       0.67      0.60      0.63      7500

    accuracy                           0.65    

	 === CLASSIFIER: MLPClassifier(hidden_layer_sizes=(128, 16), random_state=42) - TARGET SPACE: 10000
	 === Model training: 291.1710000038147 sec
	 === Classification Report
	               precision    recall  f1-score   support

           0       0.88      0.87      0.88      7500
           1       0.88      0.88      0.88      7500

    accuracy                           0.88     15000
   macro avg       0.88      0.88      0.88     15000
weighted avg       0.88      0.88      0.88     15000

