# ML Model

In [1]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [2]:
import pandas as pd
import joblib
import pickle

model_tweets = pd.read_csv('../data/model_ready_data.csv')
model_tweets = model_tweets.fillna("")
model_tweets.head()

Unnamed: 0,full_text,sentiment,hashtags,lang
0,pelosi airplane landed safely taiwan amp playi...,1,pelosi,en
1,hobipalooza laacademiaexpulsion weuro jhopeatl...,1,hobipalooza,en
2,v intro logo animation blue smoke looking prof...,1,yk,en
3,youre missing shes far taiwan couplegoals http...,0,taiwan,en
4,twitter make laugh scared pelosi taiwan visit,1,pelosi,en


In [3]:
model_tweets.shape

(3945, 4)

In [4]:
# 4492 1925
sentiment_analysis_tweet_data = model_tweets.copy(deep=True)
sentiment_analysis_tweet_data.drop(sentiment_analysis_tweet_data[sentiment_analysis_tweet_data['sentiment'] == -1].index, inplace=True)
sentiment_analysis_tweet_data.reset_index(drop=True, inplace=True)
tweet_train = sentiment_analysis_tweet_data.iloc[:4492,]
tweet_test = sentiment_analysis_tweet_data.iloc[4493:,]

In [5]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from joblib import dump, load # used for saving and loading sklearn objects
from scipy.sparse import save_npz, load_npz # used for saving and loading sparse matrices
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [7]:
unigram_vectorizer = CountVectorizer(ngram_range=(1, 1))
unigram_vectorizer.fit(tweet_train['full_text'].values)

CountVectorizer()

In [8]:
X_train_unigram = unigram_vectorizer.transform(tweet_train['full_text'].values)

In [9]:
unigram_tf_idf_transformer = TfidfTransformer()
unigram_tf_idf_transformer.fit(X_train_unigram)

TfidfTransformer()

In [10]:
X_train_unigram_tf_idf = unigram_tf_idf_transformer.transform(X_train_unigram)

In [11]:
bigram_vectorizer = CountVectorizer(ngram_range=(1, 2))
bigram_vectorizer.fit(tweet_train['full_text'].values)

CountVectorizer(ngram_range=(1, 2))

In [12]:
X_train_bigram = bigram_vectorizer.transform(tweet_train['full_text'].values)

In [13]:
bigram_tf_idf_transformer = TfidfTransformer()
bigram_tf_idf_transformer.fit(X_train_bigram)

TfidfTransformer()

In [14]:
X_train_bigram_tf_idf = bigram_tf_idf_transformer.transform(X_train_bigram)

In [15]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix
import numpy as np

In [16]:
def train_and_show_scores(X: csr_matrix, y: np.array, title: str) -> None:
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y,train_size=0.75, stratify=y
    )

    clf = SGDClassifier()
    clf.fit(X_train, y_train)
    train_score = clf.score(X_train, y_train)
    valid_score = clf.score(X_valid, y_valid)

    global_vars = globals()
    if(valid_score > global_vars['best_score']):
        global_vars['best_model'] = clf
        global_vars['best_model_name'] = title
        global_vars['best_score'] = valid_score

    print(f'{title}\nTrain score: {round(train_score, 2)} ; Validation score: {round(valid_score, 2)}\n')

In [17]:
y_train = tweet_train['sentiment'].values
y_train

array([1, 1, 1, ..., 1, 1, 0])

In [18]:
best_model = ""
best_model_name = ""
best_score = 0

train_and_show_scores(X_train_unigram, y_train, 'Unigram Counts')
train_and_show_scores(X_train_unigram_tf_idf, y_train, 'Unigram Tf-Idf')
train_and_show_scores(X_train_bigram, y_train, 'Bigram Counts')
train_and_show_scores(X_train_bigram_tf_idf, y_train, 'Bigram Tf-Idf')

Unigram Counts
Train score: 1.0 ; Validation score: 0.86

Unigram Tf-Idf
Train score: 1.0 ; Validation score: 0.86

Bigram Counts
Train score: 1.0 ; Validation score: 0.85

Bigram Tf-Idf
Train score: 1.0 ; Validation score: 0.86



In [19]:
print(f'The best Model is {best_model_name} with a Validation score of: {round(best_score, 2)}')

The best Model is Unigram Tf-Idf with a Validation score of: 0.86


# Topic Modeling

In [20]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/abel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [24]:
import re
import numpy as np
import pandas as  pd
from pprint import pprint
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import spacy
import pyLDAvis
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt
%matplotlib inline

In [25]:
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

In [26]:
topic_model_data = model_tweets.copy(deep=True)
topic_model_data

Unnamed: 0,full_text,sentiment,hashtags,lang
0,pelosi airplane landed safely taiwan amp playi...,1,pelosi,en
1,hobipalooza laacademiaexpulsion weuro jhopeatl...,1,hobipalooza,en
2,v intro logo animation blue smoke looking prof...,1,yk,en
3,youre missing shes far taiwan couplegoals http...,0,taiwan,en
4,twitter make laugh scared pelosi taiwan visit,1,pelosi,en
...,...,...,...,...
3940,american aggression stopped america wants war ...,0,taiwan,en
3941,america bully always instigates aggression mc ...,1,pelosi,en
3942,sudans military coup regime supported russias ...,1,sudan,en
3943,download youtube golden v apk android devices ...,1,youtube,en


In [28]:
def get_hastags_words_list():
    hashtagList = []
    for hashtags in topic_model_data.hashtags:
        if(hashtags != ""):
            hashtagList += hashtags.split(',')

    return hashtagList

hashtag = get_hastags_words_list()

data = [word for sentence in topic_model_data.full_text for word in sentence.split(' ')]

In [29]:
hashtag[:5]

['pelosi', 'hobipalooza', 'yk', 'taiwan', 'pelosi']

In [30]:
data[:10]

['pelosi',
 'airplane',
 'landed',
 'safely',
 'taiwan',
 'amp',
 'playing',
 'win',
 'win',
 'financial']

In [31]:
data_words = data + hashtag
data_words = [word for word in data_words if word != '']
data_words[:5]

['pelosi', 'airplane', 'landed', 'safely', 'taiwan']

In [32]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)
# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

['p', 'e', 'l', 'o', 's', 'i']
