Coding Assignment1 - Sentiment analysis

In [2]:
import pandas as pd
import numpy as np
import re
import string
import nltk
import unidecode
import sys  
import contractions
import spacy
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from textblob import TextBlob
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import spacy
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
import tensorflow_hub as hub
import gensim.downloader as api
import concurrent.futures
from sklearn import svm

In [3]:
nlp = spacy.load("en_core_web_sm")
use_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/e.d.i.t.h/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/e.d.i.t.h/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

After importing the required libraries start reading the csv file dataset

In [5]:
df = pd.read_csv('dataset.csv')
df.columns.tolist()

['id',
 'text',
 'text_sentiment',
 'username',
 'hashtags',
 'created_at',
 'user followers count',
 'replycount',
 'retweetcount',
 'likecount',
 'quotecount',
 'language',
 'media',
 'retweetedTweet',
 'quotedtweet',
 'inReplyToTweetId',
 'inReplyToUser',
 'mentionedUsers']

Dropping the unwanted columns and cleaning the data

In [7]:
df = df.drop(['id','username', 'hashtags','created_at', 'user followers count', 'replycount','retweetcount','likecount', 'quotecount', 'language', 'media', 'retweetedTweet', 'quotedtweet', 'inReplyToTweetId', 'inReplyToUser', 'mentionedUsers'], axis=1)
df['text'] = df['text'].str.lower()
df.head()

Unnamed: 0,text,text_sentiment
0,when will the #nyse #stockmarketcrash happen?,Neutral
1,aaj ka gyan:\n\nif a company isn't a quality c...,Negative
2,the stock market needs to crash hard to make i...,Negative
3,"those who are ""buying on dip"" will very soon b...",Neutral
4,@rdrhwke i wish our so-called president were t...,Positive


In [8]:
df['text'] = df['text'].str.replace(f"[{string.punctuation}]", "", regex=True)
df.head()

Unnamed: 0,text,text_sentiment
0,when will the nyse stockmarketcrash happen,Neutral
1,aaj ka gyan\n\nif a company isnt a quality com...,Negative
2,the stock market needs to crash hard to make i...,Negative
3,those who are buying on dip will very soon bec...,Neutral
4,rdrhwke i wish our socalled president were tra...,Positive


In [9]:
def basic_removal(text):
    # Remove URLs, mentions, hashtags, and special characters
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', '', text)
    return text

In [10]:
df['text'] = df['text'].apply(basic_removal)
df.head()

Unnamed: 0,text,text_sentiment
0,when will the nyse stockmarketcrash happen,Neutral
1,aaj ka gyan\n\nif a company isnt a quality com...,Negative
2,the stock market needs to crash hard to make i...,Negative
3,those who are buying on dip will very soon bec...,Neutral
4,rdrhwke i wish our socalled president were tra...,Positive


Removing Emojis

In [12]:
def remove_emoji(text):
    emoji_pattern=re.compile("["
                             u"\U0001F600-\U0001F64F" #emoticons
                             u"\U0001F300-\U0001F5FF" #symbols, pictograph
                              u"\U0001F680-\U0001F6FF" #transport and map symbol
                              u"\U0001F1E0-\U0001F1FF" # flags(IOS)
                              u"\U00002702-\U000027B0"
                              u"\U00002FC2-\U0001F251"
                             "]+",flags=re.UNICODE)
    return emoji_pattern.sub(r'',text)

In [13]:
df['clean_text'] = df['text'].apply(remove_emoji)
df.head()

Unnamed: 0,text,text_sentiment,clean_text
0,when will the nyse stockmarketcrash happen,Neutral,when will the nyse stockmarketcrash happen
1,aaj ka gyan\n\nif a company isnt a quality com...,Negative,aaj ka gyan\n\nif a company isnt a quality com...
2,the stock market needs to crash hard to make i...,Negative,the stock market needs to crash hard to make i...
3,those who are buying on dip will very soon bec...,Neutral,those who are buying on dip will very soon bec...
4,rdrhwke i wish our socalled president were tra...,Positive,rdrhwke i wish our socalled president were tra...


Removing words with accents like è --> e

In [15]:
def remove_accents(keyword):
    cleaned = unidecode.unidecode(keyword)
    return cleaned

In [16]:
df['clean_text'] = df['text'].apply(remove_accents)
df.head()

Unnamed: 0,text,text_sentiment,clean_text
0,when will the nyse stockmarketcrash happen,Neutral,when will the nyse stockmarketcrash happen
1,aaj ka gyan\n\nif a company isnt a quality com...,Negative,aaj ka gyan\n\nif a company isnt a quality com...
2,the stock market needs to crash hard to make i...,Negative,the stock market needs to crash hard to make i...
3,those who are buying on dip will very soon bec...,Neutral,those who are buying on dip will very soon bec...
4,rdrhwke i wish our socalled president were tra...,Positive,rdrhwke i wish our socalled president were tra...


To expand words like wouldn't to wouldnot

In [18]:
def expand_words(text):
    expanded_words = []    
    for word in text.split():
        # using contractions.fix to expand the shortened words
        expanded_words.append(contractions.fix(word))   
           
    expanded_text = ' '.join(expanded_words)
    return expanded_text

In [19]:
df['clean_text'] = df['clean_text'].apply(expand_words)
df.head(10)

Unnamed: 0,text,text_sentiment,clean_text
0,when will the nyse stockmarketcrash happen,Neutral,when will the nyse stockmarketcrash happen
1,aaj ka gyan\n\nif a company isnt a quality com...,Negative,aaj ka gyan if a company is not a quality comp...
2,the stock market needs to crash hard to make i...,Negative,the stock market needs to crash hard to make i...
3,those who are buying on dip will very soon bec...,Neutral,those who are buying on dip will very soon bec...
4,rdrhwke i wish our socalled president were tra...,Positive,rdrhwke i wish our socalled president were tra...
5,paulkrugman paul you wrote the stockmarketcras...,Negative,paulkrugman paul you wrote the stockmarketcras...
6,checkmatey stockmarketcrash best explanation\...,Negative,checkmatey stockmarketcrash best explanation d...
7,stockmarketcrash best explanation\ndamn mask ...,Negative,stockmarketcrash best explanation damn mask cr...
8,stocks for fathers day\n\naapl\nko\npep\nhd\n...,Neutral,stocks for fathers day aapl ko pep hd low swk ...
9,this may push us toward ww \ngreatdepression s...,Negative,this may push us toward ww greatdepression sto...


Remove Stop Words

In [21]:
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/e.d.i.t.h/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [22]:
def remove_stopwords(text):
    new_stop_words = ["banknifty","stocks","rdrhwke","stockmarket","stock market","tictoctick"]
    text = str(text).lower()
    stop_words = stopwords.words('english')
    # stop_words2 = stopwords.words('hinglish')
    stop_words.extend(new_stop_words)
    # stop_words.extend(stop_words2)
    stop_words = set(stop_words)
    new_text = []
    for word in text.split():
        if word in stop_words:
            new_text.append('')  # Replace stopword with empty string
        else:
            new_text.append(word)
    x = new_text[:]
    new_text.clear()
    return " ".join(x)


In [23]:
df['clean_text'] = df['clean_text'].apply(remove_stopwords)
df.head()

Unnamed: 0,text,text_sentiment,clean_text
0,when will the nyse stockmarketcrash happen,Neutral,nyse stockmarketcrash happen
1,aaj ka gyan\n\nif a company isnt a quality com...,Negative,aaj ka gyan company quality company buy...
2,the stock market needs to crash hard to make i...,Negative,stock market needs crash hard make realist...
3,those who are buying on dip will very soon bec...,Neutral,buying dip soon become promoters comp...
4,rdrhwke i wish our socalled president were tra...,Positive,wish socalled president transitory mean ...


Tokenization and Lemmatization for converting the word to their root form

In [25]:
def tokenize_lemma(text):
    doc = nlp(text)
    # return doc
    doc = nlp(text)
    # Return only the lemmas of tokens that are not spaces or punctuation
    tokens = [token.lemma_ for token in doc if not token.is_punct and not token.is_space]
    return ' '.join(tokens)


df['clean_text_lemma'] = df['clean_text'].apply(tokenize_lemma)
df.head()

Unnamed: 0,text,text_sentiment,clean_text,clean_text_lemma
0,when will the nyse stockmarketcrash happen,Neutral,nyse stockmarketcrash happen,nyse stockmarketcrash happen
1,aaj ka gyan\n\nif a company isnt a quality com...,Negative,aaj ka gyan company quality company buy...,aaj ka gyan company quality company buy price ...
2,the stock market needs to crash hard to make i...,Negative,stock market needs crash hard make realist...,stock market need crash hard make realistic ca...
3,those who are buying on dip will very soon bec...,Neutral,buying dip soon become promoters comp...,buy dip soon become promoter company stockmark...
4,rdrhwke i wish our socalled president were tra...,Positive,wish socalled president transitory mean ...,wish socalle president transitory mean really ...


Splitting of dataset into training and testing sets. Have done twice one for bonus part which is USE.

In [27]:
X = df['clean_text_lemma']
y = df['text_sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=65)

In [28]:
X = df['clean_text_lemma']
y = df['text_sentiment']
X_train_text, X_test_text, y_train_text, y_test_text = train_test_split(X, y, test_size=0.2, stratify=y, random_state=65)

In [29]:
# Label encoding
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)

In [30]:
# Bag of Words
count_vec = CountVectorizer()
X_train_counts = count_vec.fit_transform(X_train)
X_test_counts = count_vec.transform(X_test)

chi2_selector_bow = SelectKBest(chi2, k=100)
X_train_counts = chi2_selector_bow.fit_transform(X_train_counts, y_train_encoded)
X_test_counts = chi2_selector_bow.transform(X_test_counts)

In [31]:
# TF-IDF
tfidf_vec = TfidfVectorizer()
X_train_tfidf = tfidf_vec.fit_transform(X_train)
X_test_tfidf = tfidf_vec.transform(X_test)

In [32]:
chi2_selector = SelectKBest(chi2, k=100)
X_train_chi2 = chi2_selector.fit_transform(X_train_tfidf, y_train_encoded)
X_test_chi2 = chi2_selector.transform(X_test_tfidf)

In [33]:
# Convert training and test data into sentence vectors # Encode sentences using USE
X_train_use = use_model(X_train_text)
X_test_use = use_model(X_test_text)

In [34]:
w2v_model = api.load('word2vec-google-news-300')

In [35]:
def average_word_vectors(words, model, num_features):
    feature_vector = np.zeros((num_features,), dtype='float32')
    n_words = 0
    
    for word in words:
        if word in model.key_to_index:
            n_words += 1
            feature_vector += model[word] 
    
    if n_words > 0:
        feature_vector /= n_words
    
    return feature_vector

In [36]:
num_features = w2v_model.vector_size
def process_document(text):
    words = word_tokenize(text)
    return average_word_vectors(words, w2v_model, num_features)

# Use concurrent.futures to parallelize the processing
with concurrent.futures.ThreadPoolExecutor() as executor:
    X_train_w2v = list(executor.map(process_document, X_train))
    X_test_w2v = list(executor.map(process_document, X_test))

X_train_w2v = np.array(X_train_w2v)
X_test_w2v = np.array(X_test_w2v)

In [37]:
# GloVe
glove_model = api.load('glove-wiki-gigaword-100')

In [38]:
num_features_glove = glove_model.vector_size
X_train_glove = np.array([
    average_word_vectors(word_tokenize(text), glove_model, num_features_glove)
    for text in X_train
])
X_test_glove = np.array([
    average_word_vectors(word_tokenize(text), glove_model, num_features_glove)
    for text in X_test
])

In [39]:
# SVM Classifier - PRIMARY
svm_clf = svm.SVC(kernel='linear', random_state=55)

In [40]:
# Logistic Regression - SECONDARY
logreg_clf = LogisticRegression(random_state=42, max_iter=1000)

In [41]:
def train_evaluate(clf, X_train, y_train, X_test, y_test):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    return accuracy

In [42]:
# Evaluate models with SVM
print("SVM Classifier Results:")

print("Results with Bag of Words:")
accuracy_bow_svm = train_evaluate(svm_clf, X_train_counts, y_train, X_test_counts, y_test)
print(f"Accuracy: {accuracy_bow_svm:.4f}\n")

print("Results with TF-IDF and Chi-Square Feature Selection:")
accuracy_tfidf_svm = train_evaluate(svm_clf, X_train_chi2, y_train, X_test_chi2, y_test)
print(f"Accuracy: {accuracy_tfidf_svm:.4f}\n")

print("Results with Word2Vec Embeddings:")
accuracy_w2v_svm = train_evaluate(svm_clf, X_train_w2v, y_train, X_test_w2v, y_test)
print(f"Accuracy: {accuracy_w2v_svm:.4f}\n")

print("Results with GloVe Embeddings:")
accuracy_glove_svm = train_evaluate(svm_clf, X_train_glove, y_train, X_test_glove, y_test)
print(f"Accuracy: {accuracy_glove_svm:.4f}\n")

SVM Classifier Results:
Results with Bag of Words:
              precision    recall  f1-score   support

    Negative       0.75      0.56      0.64      1981
     Neutral       0.63      0.93      0.75      2300
    Positive       0.83      0.65      0.73      2509

    accuracy                           0.72      6790
   macro avg       0.74      0.71      0.71      6790
weighted avg       0.74      0.72      0.71      6790

Confusion Matrix:
[[1104  603  274]
 [ 107 2131   62]
 [ 259  625 1625]]
Accuracy: 0.7158

Results with TF-IDF and Chi-Square Feature Selection:
              precision    recall  f1-score   support

    Negative       0.76      0.56      0.65      1981
     Neutral       0.65      0.93      0.77      2300
    Positive       0.83      0.68      0.75      2509

    accuracy                           0.73      6790
   macro avg       0.75      0.72      0.72      6790
weighted avg       0.75      0.73      0.72      6790

Confusion Matrix:
[[1115  575  291]
 [ 115

In [43]:
# Train and evaluate with USE sentence vectors
print("Results with Universal Sentence Encoder Embeddings:")
accuracy_use = train_evaluate(svm_clf, X_train_use, y_train_text, X_test_use, y_test_text)
print(f"Accuracy: {accuracy_use:.4f}\n")

Results with Universal Sentence Encoder Embeddings:
              precision    recall  f1-score   support

    Negative       0.65      0.64      0.64      1981
     Neutral       0.72      0.72      0.72      2300
    Positive       0.67      0.68      0.67      2509

    accuracy                           0.68      6790
   macro avg       0.68      0.68      0.68      6790
weighted avg       0.68      0.68      0.68      6790

Confusion Matrix:
[[1258  260  463]
 [ 249 1662  389]
 [ 430  381 1698]]
Accuracy: 0.6801



In [44]:
# Evaluate models with Logistic Regression
print("Results with Logistic Regression:")

print("Results with Bag of Words:")
accuracy_bow_logreg = train_evaluate(logreg_clf, X_train_counts, y_train, X_test_counts, y_test)
print(f"Accuracy: {accuracy_bow_logreg:.4f}\n")

print("Results with TF-IDF and Chi-Square Feature Selection:")
accuracy_tfidf_logreg = train_evaluate(logreg_clf, X_train_chi2, y_train, X_test_chi2, y_test)
print(f"Accuracy: {accuracy_tfidf_logreg:.4f}\n")

print("Results with Word2Vec Embeddings:")
accuracy_w2v_logreg = train_evaluate(logreg_clf, X_train_w2v, y_train, X_test_w2v, y_test)
print(f"Accuracy: {accuracy_w2v_logreg:.4f}\n")

print("Results with GloVe Embeddings:")
accuracy_glove_logreg = train_evaluate(logreg_clf, X_train_glove, y_train, X_test_glove, y_test)
print(f"Accuracy: {accuracy_glove_logreg:.4f}\n")

Results with Logistic Regression:
Results with Bag of Words:
              precision    recall  f1-score   support

    Negative       0.74      0.56      0.64      1981
     Neutral       0.65      0.91      0.76      2300
    Positive       0.80      0.67      0.73      2509

    accuracy                           0.72      6790
   macro avg       0.73      0.71      0.71      6790
weighted avg       0.73      0.72      0.71      6790

Confusion Matrix:
[[1105  546  330]
 [ 125 2089   86]
 [ 255  572 1682]]
Accuracy: 0.7181

Results with TF-IDF and Chi-Square Feature Selection:
              precision    recall  f1-score   support

    Negative       0.75      0.57      0.65      1981
     Neutral       0.67      0.91      0.77      2300
    Positive       0.81      0.70      0.75      2509

    accuracy                           0.73      6790
   macro avg       0.75      0.73      0.72      6790
weighted avg       0.75      0.73      0.73      6790

Confusion Matrix:
[[1125  526  3

In [45]:
# Train and evaluate with USE sentence vectors with Logistic Regression
print("Results with Universal Sentence Encoder Embeddings:")
accuracy_use = train_evaluate(logreg_clf, X_train_use, y_train_text, X_test_use, y_test_text)
print(f"Accuracy: {accuracy_use:.4f}\n")

Results with Universal Sentence Encoder Embeddings:
              precision    recall  f1-score   support

    Negative       0.65      0.63      0.64      1981
     Neutral       0.72      0.72      0.72      2300
    Positive       0.66      0.68      0.67      2509

    accuracy                           0.68      6790
   macro avg       0.68      0.68      0.68      6790
weighted avg       0.68      0.68      0.68      6790

Confusion Matrix:
[[1241  257  483]
 [ 251 1660  389]
 [ 427  382 1700]]
Accuracy: 0.6776

