# The impact of dimensionality reduction in sentiment analysis classification

## Initialization

In [1]:
# Basic libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Time library
import time

# Dataset manipulation/vectorization libraries
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Text processing libraries
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Dimensionality Reduction Algorithms
from sklearn.decomposition import TruncatedSVD

# Supervised classifiers.
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Evaluation libraries
from sklearn.metrics import classification_report


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Leo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Leo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Leo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Classifiers

In [2]:
models = [
        KNeighborsClassifier(10),
        LogisticRegression(max_iter=300),
        DecisionTreeClassifier(),
        RandomForestClassifier(n_estimators=100),
        SVC(kernel='rbf', C = 1),
        MLPClassifier(activation='relu', hidden_layer_sizes=(128, 16), random_state=42),
]
models

[KNeighborsClassifier(n_neighbors=10),
 LogisticRegression(max_iter=300),
 DecisionTreeClassifier(),
 RandomForestClassifier(),
 SVC(C=1),
 MLPClassifier(hidden_layer_sizes=(128, 16), random_state=42)]

## Dataset: Sentiment140


In [3]:
df = pd.read_csv("datasets\\sentiment140.csv", encoding='latin-1', header=None)

df = df.rename(columns={5: "text", 0: "source"}, errors='raise')

df.head(10)


Unnamed: 0,source,1,2,3,4,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
5,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew
6,0,1467811592,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,mybirch,Need a hug
7,0,1467811594,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,coZZ,@LOLTrish hey long time no see! Yes.. Rains a...
8,0,1467811795,Mon Apr 06 22:20:05 PDT 2009,NO_QUERY,2Hood4Hollywood,@Tatiana_K nope they didn't have it
9,0,1467812025,Mon Apr 06 22:20:09 PDT 2009,NO_QUERY,mimismo,@twittera que me muera ?


## Text cleaning

In [4]:
def textcleaner_lemmas(text):
    ''' Takes in raw unformatted text and strips punctuation, removes whitespace,
    strips numbers, tokenizes and stems.
    Returns string of processed text to be used into CountVectorizer
    '''
    # Lowercase and strip everything except words
    cleaner = re.sub(r"[^a-zA-Z ]+", ' ', text.lower())
    # Tokenize
    cleaner = word_tokenize(cleaner)
    clean = []
    for w in cleaner:
        # filter out stopwords
        if w not in stopWords:
            # filter out short words
            if len(w)>2:
                # lemmatizer 
                clean.append(lemmatizer.lemmatize(w))
    return ' '.join(clean)


In [5]:
lemmatizer = WordNetLemmatizer()

stopWords = set(stopwords.words('english'))

t0 = time.time()
'''
label = []
for i in df['source']:
    if df['source'] == 1:
        label.append('positive')
    else:
        label.append('negative')
#    label.append(dataset.target_names[i])

df['label'] = label
'''
df['clean_text'] = df.text.apply(lambda x: textcleaner_lemmas(x))
t1 = time.time()

df.head()
print("Text cleaning duration:", t1 - t0)


Text cleaning duration: 258.8762502670288


# Text vectorization - Training/Test Set construction

In [6]:
X = df['clean_text']
y = df['source']

tfidf = TfidfVectorizer()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, stratify = y)

t0 = time.time()
X_train_vec = tfidf.fit_transform(X_train)
#print("== TF-IDF VOCAB X_TRAIN =============================================================\n")
#print(tfidf.vocabulary_)

X_test_vec = tfidf.transform(X_test)
#print("== TF-IDF VOCAB X_TEST  =============================================================\n")
#print(tfidf.vocabulary_)

t1 = time.time()

print("Text vectorization duration:", t1 - t0)
print("Training set dimensionality: ", X_train_vec.shape)
print("Test set dimensionality: ", X_test_vec.shape)


Text vectorization duration: 18.882500171661377
Training set dimensionality:  (1120000, 455257)
Test set dimensionality:  (480000, 455257)


# Models on the original dataset

In [None]:
for clf in models:
    t0 = time.time()
    clf.fit(X_train_vec, y_train)
    t1 = time.time()
    print("=================================================================================================")
    print("\t === CLASSIFIER:", clf, "- TARGET SPACE: original")
    print("\t === Model training:", t1 - t0, "sec")

    y_predicted = clf.predict(X_test_vec)

    print("\t === Classification Report")
    print(classification_report(y_test, y_predicted))
    print("=================================================================================================")


	 === CLASSIFIER: KNeighborsClassifier(n_neighbors=10) - TARGET SPACE: original
	 === Model training: 0.12750005722045898 sec
	 === Classification Report
              precision    recall  f1-score   support

           0       0.57      0.76      0.65    240000
           4       0.64      0.42      0.51    240000

    accuracy                           0.59    480000
   macro avg       0.60      0.59      0.58    480000
weighted avg       0.60      0.59      0.58    480000



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


	 === CLASSIFIER: LogisticRegression(max_iter=300) - TARGET SPACE: original
	 === Model training: 59.36199975013733 sec
	 === Classification Report
              precision    recall  f1-score   support

           0       0.79      0.76      0.78    240000
           4       0.77      0.80      0.78    240000

    accuracy                           0.78    480000
   macro avg       0.78      0.78      0.78    480000
weighted avg       0.78      0.78      0.78    480000

	 === CLASSIFIER: DecisionTreeClassifier() - TARGET SPACE: original
	 === Model training: 6551.956000089645 sec
	 === Classification Report
              precision    recall  f1-score   support

           0       0.71      0.72      0.71    240000
           4       0.72      0.70      0.71    240000

    accuracy                           0.71    480000
   macro avg       0.71      0.71      0.71    480000
weighted avg       0.71      0.71      0.71    480000

	 === CLASSIFIER: RandomForestClassifier() - TARGET SPACE:

# Models on the reduced dimensional spaces

In [None]:
reduced_spaces = [ 10, 100, 1000, 10000 ]
for target_space in reduced_spaces:
    print("========== Working with target space of", target_space, "dimensions ============")
    SVD = TruncatedSVD(n_components=target_space, random_state=42)
    X_train_red = SVD.fit_transform(X_train_vec)
    X_test_red = SVD.transform(X_test_vec)

    print("\tInput space dimensionality (training):", X_train_red.shape, X_train_vec.shape)
    print("\tInput space dimensionality (testing):", X_test_red.shape, X_test_vec.shape)

    for clf in models:
        t0 = time.time()
        clf.fit(X_train_red, y_train)
        t1 = time.time()
        print("=================================================================================================")
        print("\t === CLASSIFIER:", clf, "- TARGET SPACE:", target_space)
        print("\t === Model training:", t1 - t0, "sec")

        y_predicted = clf.predict(X_test_red)

        print("\t === Classification Report")
        print("\t", classification_report(y_test, y_predicted))
        print("=================================================================================================")
