In [1]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction import text
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegressionCV as logreg
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, plot_confusion_matrix
from sklearn.preprocessing import StandardScaler

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, GRU
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer

import gensim.downloader as api
from gensim.models.word2vec import Word2Vec
from transformers import pipeline

pd.set_option('display.max_colwidth', 255)
plt.style.use("dark_background")

In [2]:
poli_pos = pd.read_csv('./data/poli_pos.csv')
poli_neg = pd.read_csv('./data/poli_neg.csv')
poli_val = pd.read_csv('./data/poli_val.csv')
topics_neg = pd.read_csv('./data/topics_neg.csv')
topics_pos = pd.read_csv('./data/topics_pos.csv')

In [3]:
cvec = CountVectorizer()

In [4]:
lemmatizer = WordNetLemmatizer()

In [5]:
tvec = TfidfVectorizer()

In [6]:
poli_neg['target'] = 0 
poli_pos['target'] = 1 
topics_neg['target'] = 0
topics_pos['target'] = 1

In [7]:
print(poli_neg.columns)
print(poli_pos.columns)
print(topics_neg.columns)
print(topics_pos.columns)

Index(['Unnamed: 0', 'datetime', 'tweet_id', 'text', 'user_name', 'target'], dtype='object')
Index(['Unnamed: 0', 'datetime', 'tweet_id', 'text', 'user_name', 'target'], dtype='object')
Index(['Unnamed: 0', 'datetime', 'tweet_id', 'text', 'user_name', 'target'], dtype='object')
Index(['Unnamed: 0', 'datetime', 'tweet_id', 'text', 'keyword', 'target'], dtype='object')


In [8]:
# note: I will have to exclude the keyword column until I have added appropriate
# keywords throughout the data.  Right now there are too few, and they are too predictive.
poli_neg.drop(columns=['Unnamed: 0', 'tweet_id', 'user_name', 'datetime'], inplace=True)
poli_pos.drop(columns=['Unnamed: 0', 'tweet_id', 'user_name', 'datetime'], inplace=True)
topics_neg.drop(columns=['Unnamed: 0', 'tweet_id', 'user_name', 'datetime'], inplace=True)
topics_pos.drop(columns=['Unnamed: 0', 'tweet_id', 'datetime', 'keyword'], inplace=True)

In [9]:
df_topics = pd.concat([topics_pos, topics_neg], ignore_index=True)
df_poli = pd.concat([poli_pos, poli_neg], ignore_index=True)

In [10]:
X_poli = df_poli.drop(columns=['target'])
y_poli = df_poli['target']
X_topics = df_topics.drop(columns=['target'])
y_topics = df_topics['target']

In [11]:
# Split the data into the training and testing sets.
X_train_poli, X_test_poli, y_train_poli, y_test_poli = train_test_split(X_poli,
                                                                        y_poli,
                                                                        test_size=0.2,
                                                                        stratify=y_poli,
                                                                        random_state=42)

X_train_topics, X_test_topics, y_train_topics, y_test_topics = train_test_split(X_topics,
                                                                                y_topics,
                                                                                test_size=0.2,
                                                                                stratify=y_topics,
                                                                                random_state=42)

In [12]:
# custom stopwords to manually zero in on more predictive words

custom = ['https', 'trump', 'realdonaldtrump', 'rt', 'health', 'country', 
          'just', 'help', 'thank', 'time', 'senate', 'http', 'american', 
          'americans', 'act', 'vote', 'pandemic', 'like', 'america', 'state', 
          'support', 'day', 'workers', 'right', 'years', 'working', 'good', 
          'thanks', 'families', 'covid', 'crisis', 'election', 'big', 
          'congress', 'let', 'want', 'national', 'going', 'know', 'relief',  
          'house', 'public', 'year', 'federal', 'continue', 've', 
          'states', 'justice', 'way', 'ensure', 'jobs', 'law', 'businesses', 
          'proud', 'administration', 'small', 'world', 'stop', 'job', 'safe',
          'best', 'important', 'biden', 'doing', 'economy', 'better', 'needs',
          'week', 'join', 'funding', 'forward', 'economic']


combined_words = text.ENGLISH_STOP_WORDS.union(custom)

In [13]:
# implement countvectorizer
cvec = CountVectorizer(analyzer="word",
                       encoding='utf-8',
                       decode_error='ignore',
                       strip_accents='unicode',
                       lowercase=True,
                       max_df=0.95,
                       min_df=0.01,
                       stop_words=combined_words,
                       max_features=5000,
                       ngram_range=(1, 3))

In [28]:
X_train_topics.isna().sum()

text    0
dtype: int64

In [29]:
X_test_topics.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_topics.dropna(inplace=True)


In [30]:
# apply cvec to train and test sets
poli_X = pd.DataFrame(cvec.fit_transform(X_train_poli['text']).todense(),
                      columns=cvec.get_feature_names())

poli_X_test = pd.DataFrame(cvec.fit_transform(X_test_poli['text']).todense(),
                           columns=cvec.get_feature_names())

In [31]:
topics_X = pd.DataFrame(cvec.fit_transform(X_train_topics['text']).todense(),
                      columns=cvec.get_feature_names())

topics_X_test = pd.DataFrame(cvec.fit_transform(X_test_topics['text']).todense(),
                           columns=cvec.get_feature_names())

In [32]:
X_train_topics.dropna(subset=['text'], inplace=True)
X_test_topics.dropna(subset=['text'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_topics.dropna(subset=['text'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_topics.dropna(subset=['text'], inplace=True)


In [33]:
# # re-add the username and datetime columns to the temporary df, then replace the original with the vectorized version

# X_train_poli = pd.concat([poli_X, X_train_poli['datetime']], axis=1)
# X_test_poli = pd.concat([poli_X_test, X_test_poli['datetime']], axis=1)
# X_train_topics = pd.concat([topics_X, X_train_topics['datetime']], axis=1)
# X_test_topics = pd.concat([topics_X_test, X_test_topics['datetime']], axis=1)

In [34]:
l1 = logreg(cv=5,
            penalty='l1',
            scoring=None,
            solver='liblinear',
            tol=0.001,
            max_iter=200,
            class_weight='balanced',
            n_jobs=-2,
            verbose=1,
            refit=True,
            intercept_scaling=1.0,
            multi_class='auto',
            random_state=42,
            l1_ratios=None,)

l2 = logreg(cv=5,
            penalty='l2',
            scoring=None,
            solver='lbfgs',
            tol=0.001,
            max_iter=200,
            class_weight='balanced',
            n_jobs=-2,
            verbose=1,
            refit=True,
            intercept_scaling=1.0,
            multi_class='auto',
            random_state=42,
            l1_ratios=None,)

lr_net = logreg(cv=5,
                penalty='elasticnet',
                scoring=None,
                solver='saga',
                tol=0.001,
                max_iter=200,
                class_weight='balanced',
                n_jobs=-2,
                verbose=1,
                refit=True,
                intercept_scaling=1.0,
                multi_class='auto',
                random_state=42,
                l1_ratios=.5,)

In [39]:
poli_sc = StandardScaler()

poli_pipe1 = Pipeline([
    ('scaler', poli_sc),
    ('model', l1)
])

Establishing null model:

In [40]:
y_train_poli.value_counts(normalize=True)

1    0.50144
0    0.49856
Name: target, dtype: float64

In [41]:
y_test_poli.value_counts(normalize=True)

1    0.501418
0    0.498582
Name: target, dtype: float64

In [42]:
poli_pipe1.fit(poli_X, y_train_poli)

poli_train1 = poli_pipe1.score(poli_X, y_train_poli)
poli_test1 = poli_pipe1.score(poli_X_test, y_test_poli)
print(f'LogReg L1, score on training set: {poli_train1}, score on test set: {poli_test1}.')

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 11 concurrent workers.
[Parallel(n_jobs=-2)]: Done   2 out of   5 | elapsed:   17.6s remaining:   26.5s
[Parallel(n_jobs=-2)]: Done   5 out of   5 | elapsed:   18.8s finished


ValueError: X has 122 features, but this StandardScaler is expecting 116 features as input.

In [None]:
poli_X.columns

In [None]:
set(zip(poli_X.columns, l1.coef_))

In [None]:
poli_X.sum().sort_values(ascending=False).head(50)

In [None]:
# plot top occuring words
poli_X.sum().sort_values(ascending=False).head(50).plot(kind='barh', figsize=(13,13));

In [None]:
poli_pipe2 = Pipeline([
    ('scaler', poli_sc),
    ('model', l2)
])

In [None]:
poli_pipe2.fit(poli_X, y_train_poli)

poli_train2 = poli_pipe2.score(poli_X, y_train_poli)
poli_test2 = poli_pipe2.score(poli_X_test, y_test_poli)
print(f'LogReg L2, score on training set: {poli_train2}, score on test set: {poli_test2}.')

In [None]:
set(zip(poli_X.columns, l2.coef_))

In [None]:
poli_pipe3 = Pipeline([
    ('scaler', poli_sc),
    ('model', lr_net)
])

In [None]:
poli_pipe3.fit(poli_X, y_train_poli)

poli_train3 = poli_pipe3.score(poli_X, y_train_poli)
poli_test3 = poli_pipe3.score(poli_X_test, y_test_poli)
print(f'LogReg elasticnet, score on training set: {poli_train3}, score on test set: {poli_test3}.')

In [None]:
set(zip(poli_X.columns, lr_net.coef_))

In [None]:
poli_cvec = pd.DataFrame(cvec.fit_transform(df_poli['text']).todense(), 
                         columns=cvec.get_feature_names())

poli_cvec.sum().sort_values(ascending=False).head(10).plot(kind='barh');

In [None]:
# convert training data to dataframe
poli_tiff = pd.DataFrame(tvec.fit_transform(df_poli['text']).todense(), 
                          columns=tvec.get_feature_names())

# plot top occuring words
poli_tiff.sum().sort_values(ascending=False).head(10).plot(kind='barh');