# The impact of dimensionality reduction in sentiment analysis classification

## Initialization

In [1]:
# Basic libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import gzip
import json

# Time library
import time

# Dataset manipulation/vectorization libraries
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Text processing libraries
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Dimensionality Reduction Algorithms
from sklearn.decomposition import TruncatedSVD

# Supervised classifiers.
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Evaluation libraries
from sklearn.metrics import classification_report

from sklearn import preprocessing


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Leo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Leo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Leo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Classifiers

In [2]:
models = [
        KNeighborsClassifier(10),
        LogisticRegression(max_iter=300),
        DecisionTreeClassifier(),
        RandomForestClassifier(n_estimators=100),
        SVC(kernel='rbf', C = 1),
        MLPClassifier(activation='relu', hidden_layer_sizes=(128, 16), random_state=42),
]
models

[KNeighborsClassifier(n_neighbors=10),
 LogisticRegression(max_iter=300),
 DecisionTreeClassifier(),
 RandomForestClassifier(),
 SVC(C=1),
 MLPClassifier(hidden_layer_sizes=(128, 16), random_state=42)]

## Dataset: Amazon Product Reviews


In [3]:
def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield json.loads(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

df = getDF('datasets\\Amazon_reviews_Office_Products_5.json.gz')

df = df.rename(columns={"reviewText": "text", "overall": "source"}, errors='raise')

le = preprocessing.LabelEncoder()
df['source'] = le.fit_transform(df['source'])

df.head(10)


Unnamed: 0,reviewerID,asin,reviewerName,helpful,text,source,summary,unixReviewTime,reviewTime
0,A32T2H8150OJLU,B00000JBLH,ARH,"[3, 4]","I bought my first HP12C in about 1984 or so, a...",4,"A solid performer, and long time friend",1094169600,"09 3, 2004"
1,A3MAFS04ZABRGO,B00000JBLH,"Let it Be ""Alan""","[7, 9]",WHY THIS BELATED REVIEW? I feel very obliged t...,4,"Price of GOLD is up, so don't bury the golden ...",1197676800,"12 15, 2007"
2,A1F1A0QQP2XVH5,B00000JBLH,Mark B,"[3, 3]",I have an HP 48GX that has been kicking for mo...,1,"Good functionality, but not durable like old HPs",1293840000,"01 1, 2011"
3,A49R5DBXXQDE5,B00000JBLH,R. D Johnson,"[7, 8]",I've started doing more finance stuff recently...,4,One of the last of an almost extinct species,1145404800,"04 19, 2006"
4,A2XRMQA6PJ5ZJ8,B00000JBLH,Roger J. Buffington,"[0, 0]",For simple calculations and discounted cash fl...,4,Still the best,1375574400,"08 4, 2013"
5,A2JFOHC9W629IE,B00000JBLH,scott_from_dallas,"[10, 12]","While I don't have an MBA, it's hard to believ...",4,Every MBA student and grad should get one,1011744000,"01 23, 2002"
6,A38NELQT98S4H8,B00000JBLH,W. B. Halper,"[3, 4]",I've had an HP 12C ever since they were first ...,4,A workhorse of a calculator,1168992000,"01 17, 2007"
7,AA8M6331NI1EN,B00000JBLH,ZombieMom,"[0, 0]",Bought this for my boss because he lost his. ...,4,Fast shipping & great price for this awesome c...,1384387200,"11 14, 2013"
8,A25C2M3QF9G7OQ,B00000JBLU,Comdet,"[3, 3]","This is a well-designed, simple calculator tha...",4,"Nice design, works well, great value",1291680000,"12 7, 2010"
9,A1RTVWTWZSIC94,B00000JBLU,"Hb ""Black Beauty""","[0, 0]","I love this calculator, big numbers and calcul...",4,Love It!!!!!!!!!,1385942400,"12 2, 2013"


## Text cleaning

In [4]:
def textcleaner_lemmas(text):
    ''' Takes in raw unformatted text and strips punctuation, removes whitespace,
    strips numbers, tokenizes and stems.
    Returns string of processed text to be used into CountVectorizer
    '''
    # Lowercase and strip everything except words
    cleaner = re.sub(r"[^a-zA-Z ]+", ' ', text.lower())
    # Tokenize
    cleaner = word_tokenize(cleaner)
    clean = []
    for w in cleaner:
        # filter out stopwords
        if w not in stopWords:
            # filter out short words
            if len(w)>2:
                # lemmatizer 
                clean.append(lemmatizer.lemmatize(w))
    return ' '.join(clean)


In [5]:
lemmatizer = WordNetLemmatizer()

stopWords = set(stopwords.words('english'))

t0 = time.time()
'''
label = []
for i in df['source']:
    if df['source'] == 1:
        label.append('positive')
    else:
        label.append('negative')
#    label.append(dataset.target_names[i])

df['label'] = label
'''
df['clean_text'] = df.text.apply(lambda x: textcleaner_lemmas(x))
t1 = time.time()

df.head()
print("Text cleaning duration:", t1 - t0)


Text cleaning duration: 39.274500131607056


# Text vectorization - Training/Test Set construction

In [6]:
X = df['clean_text']
y = df['source']

tfidf = TfidfVectorizer()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, stratify = y)

t0 = time.time()
X_train_vec = tfidf.fit_transform(X_train)
#print("== TF-IDF VOCAB X_TRAIN =============================================================\n")
#print(tfidf.vocabulary_)

X_test_vec = tfidf.transform(X_test)
#print("== TF-IDF VOCAB X_TEST  =============================================================\n")
#print(tfidf.vocabulary_)

t1 = time.time()

print("Text vectorization duration:", t1 - t0)
print("Training set dimensionality: ", X_train_vec.shape)
print("Test set dimensionality: ", X_test_vec.shape)


Text vectorization duration: 2.567000150680542
Training set dimensionality:  (37280, 35229)
Test set dimensionality:  (15978, 35229)


# Models on the original dataset

In [7]:
for clf in models:
    t0 = time.time()
    clf.fit(X_train_vec, y_train)
    t1 = time.time()
    print("=================================================================================================")
    print("\t === CLASSIFIER:", clf, "- TARGET SPACE: original")
    print("\t === Model training:", t1 - t0, "sec")

    y_predicted = clf.predict(X_test_vec)

    print("\t === Classification Report")
    print(classification_report(y_test, y_predicted))
    print("=================================================================================================")


	 === CLASSIFIER: KNeighborsClassifier(n_neighbors=10) - TARGET SPACE: original
	 === Model training: 0.012999773025512695 sec
	 === Classification Report
              precision    recall  f1-score   support

           0       0.75      0.01      0.02       339
           1       0.12      0.00      0.00       518
           2       0.18      0.01      0.02      1518
           3       0.28      0.78      0.42      4505
           4       0.59      0.23      0.33      9098

    accuracy                           0.35     15978
   macro avg       0.39      0.21      0.16     15978
weighted avg       0.45      0.35      0.31     15978

	 === CLASSIFIER: LogisticRegression(max_iter=300) - TARGET SPACE: original
	 === Model training: 16.731000185012817 sec
	 === Classification Report
              precision    recall  f1-score   support

           0       0.65      0.20      0.31       339
           1       0.34      0.02      0.04       518
           2       0.41      0.14      0.21 

# Models on the reduced dimensional spaces

In [8]:
reduced_spaces = [ 10, 100, 1000, 10000 ]
for target_space in reduced_spaces:
    print("========== Working with target space of", target_space, "dimensions ============")
    SVD = TruncatedSVD(n_components=target_space, random_state=42)
    X_train_red = SVD.fit_transform(X_train_vec)
    X_test_red = SVD.transform(X_test_vec)

    print("\tInput space dimensionality (training):", X_train_red.shape, X_train_vec.shape)
    print("\tInput space dimensionality (testing):", X_test_red.shape, X_test_vec.shape)

    for clf in models:
        t0 = time.time()
        clf.fit(X_train_red, y_train)
        t1 = time.time()
        print("=================================================================================================")
        print("\t === CLASSIFIER:", clf, "- TARGET SPACE:", target_space)
        print("\t === Model training:", t1 - t0, "sec")

        y_predicted = clf.predict(X_test_red)

        print("\t === Classification Report")
        print("\t", classification_report(y_test, y_predicted))
        print("=================================================================================================")


	Input space dimensionality (training): (37280, 10) (37280, 35229)
	Input space dimensionality (testing): (15978, 10) (15978, 35229)
	 === CLASSIFIER: KNeighborsClassifier(n_neighbors=10) - TARGET SPACE: 10
	 === Model training: 0.1399998664855957 sec
	 === Classification Report
	               precision    recall  f1-score   support

           0       0.14      0.01      0.02       339
           1       0.03      0.00      0.00       518
           2       0.14      0.03      0.05      1518
           3       0.32      0.25      0.28      4505
           4       0.59      0.79      0.67      9098

    accuracy                           0.52     15978
   macro avg       0.24      0.22      0.20     15978
weighted avg       0.44      0.52      0.47     15978

	 === CLASSIFIER: LogisticRegression(max_iter=300) - TARGET SPACE: 10
	 === Model training: 1.4800000190734863 sec
	 === Classification Report
	               precision    recall  f1-score   support

           0       0.00      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


	 === CLASSIFIER: DecisionTreeClassifier() - TARGET SPACE: 10
	 === Model training: 0.6499998569488525 sec
	 === Classification Report
	               precision    recall  f1-score   support

           0       0.07      0.08      0.08       339
           1       0.04      0.05      0.05       518
           2       0.10      0.10      0.10      1518
           3       0.30      0.31      0.30      4505
           4       0.59      0.57      0.58      9098

    accuracy                           0.42     15978
   macro avg       0.22      0.22      0.22     15978
weighted avg       0.43      0.42      0.43     15978

	 === CLASSIFIER: RandomForestClassifier() - TARGET SPACE: 10
	 === Model training: 12.130000114440918 sec
	 === Classification Report
	               precision    recall  f1-score   support

           0       1.00      0.01      0.03       339
           1       0.67      0.00      0.01       518
           2       0.17      0.01      0.01      1518
           3       0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


	 === CLASSIFIER: MLPClassifier(hidden_layer_sizes=(128, 16), random_state=42) - TARGET SPACE: 10
	 === Model training: 62.75474977493286 sec
	 === Classification Report
	               precision    recall  f1-score   support

           0       0.00      0.00      0.00       339
           1       0.00      0.00      0.00       518
           2       0.00      0.00      0.00      1518
           3       0.40      0.08      0.13      4505
           4       0.58      0.96      0.72      9098

    accuracy                           0.57     15978
   macro avg       0.20      0.21      0.17     15978
weighted avg       0.44      0.57      0.45     15978

	Input space dimensionality (training): (37280, 100) (37280, 35229)
	Input space dimensionality (testing): (15978, 100) (15978, 35229)
	 === CLASSIFIER: KNeighborsClassifier(n_neighbors=10) - TARGET SPACE: 100
	 === Model training: 0.008999824523925781 sec
	 === Classification Report
	               precision    recall  f1-score   suppor

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


	 === CLASSIFIER: DecisionTreeClassifier() - TARGET SPACE: 100
	 === Model training: 5.358999967575073 sec
	 === Classification Report
	               precision    recall  f1-score   support

           0       0.09      0.09      0.09       339
           1       0.06      0.06      0.06       518
           2       0.13      0.13      0.13      1518
           3       0.32      0.33      0.32      4505
           4       0.62      0.60      0.61      9098

    accuracy                           0.45     15978
   macro avg       0.24      0.24      0.24     15978
weighted avg       0.46      0.45      0.45     15978

	 === CLASSIFIER: RandomForestClassifier() - TARGET SPACE: 100
	 === Model training: 35.11399984359741 sec
	 === Classification Report
	               precision    recall  f1-score   support

           0       1.00      0.02      0.03       339
           1       1.00      0.00      0.01       518
           2       0.67      0.01      0.01      1518
           3       0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


	 === CLASSIFIER: MLPClassifier(hidden_layer_sizes=(128, 16), random_state=42) - TARGET SPACE: 100
	 === Model training: 70.48000001907349 sec
	 === Classification Report
	               precision    recall  f1-score   support

           0       0.28      0.22      0.25       339
           1       0.17      0.07      0.10       518
           2       0.25      0.16      0.20      1518
           3       0.38      0.41      0.39      4505
           4       0.68      0.72      0.70      9098

    accuracy                           0.55     15978
   macro avg       0.35      0.32      0.33     15978
weighted avg       0.53      0.55      0.53     15978

	Input space dimensionality (training): (37280, 1000) (37280, 35229)
	Input space dimensionality (testing): (15978, 1000) (15978, 35229)
	 === CLASSIFIER: KNeighborsClassifier(n_neighbors=10) - TARGET SPACE: 1000
	 === Model training: 0.04999995231628418 sec
	 === Classification Report
	               precision    recall  f1-score   sup

	 === CLASSIFIER: RandomForestClassifier() - TARGET SPACE: 10000
	 === Model training: 429.5810000896454 sec
	 === Classification Report
	               precision    recall  f1-score   support

           0       1.00      0.01      0.03       339
           1       1.00      0.00      0.01       518
           2       0.80      0.00      0.01      1518
           3       0.36      0.02      0.04      4505
           4       0.57      0.99      0.72      9098

    accuracy                           0.57     15978
   macro avg       0.75      0.21      0.16     15978
weighted avg       0.56      0.57      0.43     15978

	 === CLASSIFIER: SVC(C=1) - TARGET SPACE: 10000
	 === Model training: 16606.861500024796 sec
	 === Classification Report
	               precision    recall  f1-score   support

           0       0.77      0.13      0.22       339
           1       0.33      0.00      0.00       518
           2       0.45      0.05      0.09      1518
           3       0.49      0.