# Model building

In [1]:
import numpy as np
import pandas as pd
import pylab as plt
import seaborn as sns


from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix, make_scorer
from sklearn.metrics import plot_roc_curve, precision_recall_curve
from sklearn.metrics import plot_precision_recall_curve

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer


from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, KFold, StratifiedKFold
from sklearn.linear_model import LogisticRegression

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the per-tweet GME dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("../Twitter Data/GME_TWEETS_individual.csv")

In [3]:
# Sanity check: frame dimensions and column names, then the first two rows.
print ( df.shape, df.columns)
df.head(2)

(72833, 11) Index(['author_id', 'text', 'id', 'Date', 'retweet', 'likes', 'reply', 'num',
       'image', 'video', 'gif'],
      dtype='object')


Unnamed: 0,author_id,text,id,Date,retweet,likes,reply,num,image,video,gif
0,1286967581337726981,GameStop $GME is more of a momentum play than ...,1342263553156284416,2020-12-25 00:00:00,0.0,9.0,1.0,1.0,0.0,0.0,0.0
1,914214567152160768,$GME GameStop Corporation Comm Top stock up 63...,1342264009693798401,2020-12-25 00:00:00,0.0,2.0,0.0,1.0,0.0,0.0,0.0


In [4]:
# Number of missing values in each column.
df.isna().sum()

author_id    0
text         0
id           1
Date         1
retweet      1
likes        1
reply        1
num          1
image        1
video        1
gif          2
dtype: int64

In [5]:
# Drop the rows that contain missing values; this mutates `df` in place.
df.dropna(inplace=True)

In [6]:
# Summary statistics of the engagement and media columns.
df.describe()

Unnamed: 0,retweet,likes,reply,num,image,video,gif
count,72831.0,72831.0,72831.0,72831.0,72831.0,72831.0,72831.0
mean,2.793646,12.198226,1.148014,1.0,0.24782,0.017438,0.0
std,63.30858,177.063479,21.835789,0.0,0.43175,0.130896,0.0
min,0.0,0.0,0.0,1.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,1.0,0.0,0.0,0.0
50%,0.0,1.0,0.0,1.0,0.0,0.0,0.0
75%,1.0,3.0,0.0,1.0,0.0,0.0,0.0
max,10887.0,22552.0,4617.0,1.0,1.0,1.0,0.0


In [7]:
# 95th percentile of the retweet counts.
np.percentile(df.retweet, 95)

6.0

In [8]:
# How many tweets have more than 6 retweets (6 is the 95th percentile shown above).
int((df.retweet > 6).sum())

3185

## Processing Tweet Data

In [9]:
from prepare_text import prepare_text

In [10]:
# Run every tweet through the project-local prepare_text helper.
df['text_processed'] = df['text'].apply(prepare_text)
# Initialise the binary target to 0; a later cell sets it to 1 for highly retweeted tweets.
df['popular'] = 0

In [11]:
# Same 95th-percentile retweet count as computed earlier; it is the popularity threshold used next.
np.percentile(df.retweet, 95)

6.0

In [12]:
# A tweet is "popular" when its retweet count is strictly above the 95th percentile.
retweet_cutoff = np.percentile(df['retweet'], 95)
is_popular = df['retweet'] > retweet_cutoff
df.loc[is_popular, 'popular'] = 1

In [13]:
# Non-null counts of every column, split by the `popular` flag.
df.groupby('popular').count()

Unnamed: 0_level_0,author_id,text,id,Date,retweet,likes,reply,num,image,video,gif,text_processed
popular,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,69646,69646,69646,69646,69646,69646,69646,69646,69646,69646,69646,69646
1,3185,3185,3185,3185,3185,3185,3185,3185,3185,3185,3185,3185


In [14]:
# Per-class row counts, read from the first column of the grouped counts
# (despite the name, `labels` holds class sizes, not class labels).
labels = df.groupby('popular').count().iloc[:2,0].values
# Show the counts next to each class's percentage share of the data.
print (labels, np.round(100*labels/sum(labels), 2))

[69646  3185] [95.63  4.37]


## TFIDF Vectorizer

In [15]:
def prepare_vectors(df, feature_col, target_col, tfidf=True):
    """Split ``df`` into train/test sets and vectorise the text column.

    The split is 80/20, stratified on ``target_col`` and seeded with 8848.
    The vectoriser is fitted on the training text only and then applied to
    the test text, so no test vocabulary leaks into the features.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding at least ``feature_col`` and ``target_col``.
    feature_col : str
        Name of the column with the text to vectorise.
    target_col : str
        Name of the column with the class labels.
    tfidf : bool, default True
        Use ``TfidfVectorizer`` when True, ``CountVectorizer`` otherwise.
        Both ignore terms that appear in more than 95% of training documents.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, vectorizer)`` where the ``X``
        entries are the vectorised texts, the ``y`` entries are the label
        arrays and ``vectorizer`` is the fitted vectoriser.
    """
    df = df[[feature_col, target_col]]

    df_train, df_test = train_test_split(df, test_size=0.2,
                                         stratify=df[target_col],
                                         random_state=8848)

    if tfidf:
        vectorizer = TfidfVectorizer(use_idf=True, max_df=0.95)
    else:
        vectorizer = CountVectorizer(max_df=0.95)

    # Fit and transform the training text in one pass instead of fitting,
    # throwing the matrix away and transforming the same text again.
    X_train = vectorizer.fit_transform(df_train[feature_col].values)
    X_test = vectorizer.transform(df_test[feature_col].values)

    y_train = df_train[target_col].values
    y_test = df_test[target_col].values

    return X_train, X_test, y_train, y_test, vectorizer

## Logistic Regression models

In [16]:
# Empty results table: one column for accuracy plus precision / recall / F1
# for class 0 and class 1. Rows are added per model later on.
columns = ["Accuracy"] + [f"{metric}_{cls}"
                          for metric in ("Pr", "Re", "F1")
                          for cls in (0, 1)]
df_res = pd.DataFrame([], columns=columns)

In [17]:
def update_results(df_res, model, X, y, model_name):
    """Score ``model`` on ``(X, y)`` and store the metrics in row ``model_name``.

    Accuracy and the per-class precision, recall and F1 (classes 0 and 1)
    are written as percentages rounded to two decimals.

    Parameters
    ----------
    df_res : pandas.DataFrame
        Results table with the columns "Accuracy", "Pr_0", "Pr_1", "Re_0",
        "Re_1", "F1_0" and "F1_1". It is modified in place.
    model : object
        Fitted estimator exposing ``predict``.
    X : array-like
        Features handed to ``model.predict``.
    y : array-like
        True 0/1 labels.
    model_name : hashable
        Row label under which the metrics are stored.

    Returns
    -------
    pandas.DataFrame
        The same ``df_res`` object that was passed in.
    """
    # np.ravel accepts lists, Series and arrays alike and always yields a flat ndarray.
    y_true = np.ravel(y)
    y_pred = np.ravel(model.predict(X))

    ac = accuracy_score(y_true, y_pred)
    # Pin the label order so each metric always has exactly two entries
    # (class 0, class 1). Without it, a class absent from both y and the
    # predictions shortens the arrays and one value is silently broadcast
    # into both columns.
    pr, re, f1, _ = precision_recall_fscore_support(y_true, y_pred, labels=[0, 1])

    df_res.loc[model_name, "Accuracy"] = np.round(100*ac, 2)
    df_res.loc[model_name, ["Pr_0", "Pr_1"]] = np.round(100*pr, 2)
    df_res.loc[model_name, ["Re_0", "Re_1"]] = np.round(100*re, 2)
    df_res.loc[model_name, ["F1_0", "F1_1"]] = np.round(100*f1, 2)
    return df_res

In [18]:
def print_metrics(model, X, y):
    """Print the confusion matrix and headline metrics of ``model`` on ``(X, y)``.

    Accuracy is printed as a single percentage; precision, recall and F1 are
    printed as per-class percentage arrays. All values are rounded to two
    decimals. Nothing is returned.
    """
    y_hat = model.predict(X)
    precision, recall, fscore, _support = precision_recall_fscore_support(y, y_hat)
    accuracy = accuracy_score(y, y_hat)

    print ("Confusion Matrix:\n", confusion_matrix(y, y_hat))

    report = (("Accuracy Score", accuracy),
              ("Precision Score", precision),
              ("Recall Score", recall),
              ("F1 Score", fscore))
    for title, value in report:
        print (f"{title}: {np.round(100*value,2)}")


In [None]:
def grid_search_CV_LR(params, df=df,
                      xcol="text_processed",
                      ycol="popular",
                      N_cv=5,
                      wt_factor=1,
                      scorer = "precision",
                      verbose = 0,
                      tfidf=True
                     ):
    """Grid-search a logistic regression on vectorised text.

    The data is split and vectorised by ``prepare_vectors``; the search runs
    on the training part with shuffled, stratified ``N_cv``-fold CV (seed 8848).

    Parameters
    ----------
    params : dict or list of dict
        Parameter grid forwarded to ``GridSearchCV``.
    df : pandas.DataFrame
        Source frame; defaults to the notebook-level ``df`` as it existed
        when this function was defined.
    xcol, ycol : str
        Text column and label column.
    N_cv : int
        Number of CV folds.
    wt_factor : float
        Positive samples are weighted ``wt_factor / mean(y_train)``;
        negative samples keep weight 1.
    scorer : {"precision", "recall", "accuracy", "f1"}
        Metric optimised by the search; any other key raises ``KeyError``.
    verbose : int
        Verbosity forwarded to ``GridSearchCV``.
    tfidf : bool
        Forwarded to ``prepare_vectors`` (TF-IDF vs. raw counts).

    Returns
    -------
    tuple
        ``(model_grid, X_train, X_test, y_train, y_test, grid_res)`` where
        ``grid_res`` is the value returned by ``model_grid.fit``.
    """
    X_train, X_test, y_train, y_test, vectorizer = prepare_vectors(
        df, xcol, ycol, tfidf=tfidf)

    metric_lookup = {"precision": precision_score,
                     "recall"   : recall_score,
                     "accuracy" : accuracy_score,
                     "f1"       : f1_score}
    custom_score = make_scorer(metric_lookup[scorer])

    folds = StratifiedKFold(n_splits=N_cv, shuffle=True, random_state=8848)
    # NOTE(review): the estimator keeps scikit-learn's default solver; confirm
    # it supports every penalty listed in `params`.
    model_grid = GridSearchCV(estimator=LogisticRegression(max_iter=1000),
                              param_grid=params,
                              cv=folds,
                              verbose=verbose,
                              scoring=custom_score)

    # Up-weight the positive class; every other sample keeps weight 1.
    w = np.where(y_train == 1, wt_factor / y_train.mean(), 1.0)

    grid_res = model_grid.fit(X_train, y_train, sample_weight=w)
    print ( "Best Score:", grid_res.best_score_, grid_res.best_params_)
    return model_grid, X_train, X_test, y_train, y_test, grid_res

In [None]:
# Show the first row of the working frame.
df.head(1)

In [None]:
# Each sub-grid pairs a penalty with a solver able to fit it, so every
# candidate is really trained instead of failing inside the grid search:
#   * 'l2' and the unpenalised model run on the default solver;
#   * 'l1' is not supported by the default solver, so it uses 'liblinear'.
# The former 'None' string is not a penalty name. 'none' is the spelling
# accepted by the scikit-learn releases that still ship
# plot_precision_recall_curve (newer releases expect the Python None).
params = [{'penalty': ['l2'], 'C': [1, 10, 20]},
          {'penalty': ['none']},  # C has no effect without a penalty
          {'penalty': ['l1'], 'solver': ['liblinear'], 'C': [1, 10, 20]}]

# TF-IDF features, model selection on precision.
model_grid1, _, X_test1, _, y_test1, grid_res1 = grid_search_CV_LR(params,
                                                                   df=df,
                                                                   xcol='text_processed',
                                                                   ycol='popular',
                                                                   scorer = "precision",
                                                                   tfidf=True)
                                                                           

In [None]:
# Each sub-grid pairs a penalty with a solver able to fit it, so every
# candidate is really trained instead of failing inside the grid search:
#   * 'l2' and the unpenalised model run on the default solver;
#   * 'l1' is not supported by the default solver, so it uses 'liblinear'.
# The former 'None' string is not a penalty name. 'none' is the spelling
# accepted by the scikit-learn releases that still ship
# plot_precision_recall_curve (newer releases expect the Python None).
params = [{'penalty': ['l2'], 'C': [1, 10, 20]},
          {'penalty': ['none']},  # C has no effect without a penalty
          {'penalty': ['l1'], 'solver': ['liblinear'], 'C': [1, 10, 20]}]

# Bag-of-words (raw count) features, model selection on precision.
model_grid2, _, X_test2, _, y_test2, grid_res2 = grid_search_CV_LR(params,
                                                                   df=df,
                                                                   xcol='text_processed',
                                                                   ycol='popular',
                                                                   scorer = "precision",
                                                                   tfidf=False)
                                                                           

In [None]:
# Precision-recall curves of both logistic-regression searches on their own
# held-out test sets, drawn on one shared axis.
sns.set_style('whitegrid')
fig, ax = plt.subplots(1, 1, figsize=(12,5))
for fitted, X_eval, y_eval, tag in ((model_grid1, X_test1, y_test1, "TFIDF"),
                                    (model_grid2, X_test2, y_test2, "BOW")):
    plot_precision_recall_curve(fitted, X_eval, y_eval, ax=ax, label=tag)
ax.set_xlabel("Recall (Positive Label:1)", fontsize=16)
ax.set_ylabel("Precision (Positive Label:1)", fontsize=16)

ax.legend(fontsize=16)
plt.show()

## Neural Net

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
xcol = 'text_processed'
ycol = 'popular'
# Keep the two modelling columns under their own name instead of overwriting
# `df`, so cells that need the full frame still work after this one has run.
df_model = df[[xcol, ycol]]

# Hold out 20% of the data for testing, stratified on the target.
df_train_, df_test = train_test_split(df_model, test_size=0.2,
                                      stratify=df_model[ycol],
                                      random_state = 8848)
# Split a further 20% off the remaining rows for validation.
df_train, df_val = train_test_split(df_train_, test_size=0.2,
                                      stratify=df_train_[ycol],
                                      random_state = 8848)

(X_train, y_train) = df_train[xcol], df_train[ycol]
(X_val, y_val) = df_val[xcol], df_val[ycol]
(X_test, y_test) = df_test[xcol], df_test[ycol]

In [None]:
# Sizes of the training and validation text splits.
X_train.shape, X_val.shape

In [None]:
MAX_WORDS = 1000
def prepare_data_for_model(X_train, X_val, X_test):
    """Convert the three text splits to matrices with one shared tokenizer.

    The tokenizer is limited to ``MAX_WORDS`` words and fitted on the
    training texts only; the same fitted tokenizer then converts every split
    with ``texts_to_matrix`` (Keras' default matrix mode).

    Returns
    -------
    tuple
        ``(X_train, X_val, X_test, tokenizer)`` with the converted matrices
        and the fitted tokenizer.
    """
    tokenizer = Tokenizer(num_words=MAX_WORDS)
    tokenizer.fit_on_texts(X_train)
    train_mat, val_mat, test_mat = (tokenizer.texts_to_matrix(texts)
                                    for texts in (X_train, X_val, X_test))
    return (train_mat, val_mat, test_mat, tokenizer)

In [None]:
# Replace the raw-text splits with their tokenised matrices.
# NOTE(review): this rebinds X_train / X_val / X_test, so running the cell a
# second time would feed matrices back in as texts; re-run the split cell first.
(X_train, X_val, X_test, tokenizer) = prepare_data_for_model(X_train, X_val, X_test)

## Dense Layer

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten, Activation
from tensorflow.keras.layers import LSTM, Bidirectional, Conv1D
from tensorflow.keras.layers import MaxPooling1D, GlobalMaxPool1D, BatchNormalization
#from tensorflow.keras.layers.embeddings import Embedding

In [None]:
from tensorflow.keras import backend as K

In [None]:
def recall_m(y_true, y_pred):
    """Recall on the given tensors: rounded true positives / rounded actual positives.

    ``K.epsilon()`` is added to the denominator to avoid division by zero.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual_positives = K.round(K.clip(y_true, 0, 1))
    return K.sum(hits) / (K.sum(actual_positives) + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision on the given tensors: rounded true positives / rounded predicted positives.

    ``K.epsilon()`` is added to the denominator to avoid division by zero.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    flagged = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(flagged) + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 score built from ``precision_m`` and ``recall_m`` on the same tensors.

    ``K.epsilon()`` in the denominator keeps the result finite when both
    precision and recall are zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    ratio = (p*r) / (p + r + K.epsilon())
    return 2*ratio

In [None]:
def make_model(X):
    """Build and compile a one-hidden-layer dense binary classifier.

    Architecture: Dense(32, relu) -> Dropout(0.2) -> Dense(1, sigmoid).
    Only ``X.shape[1]`` is read, to size the input layer. The model is
    compiled with binary cross-entropy, Adam, and the accuracy plus custom
    precision / recall / F1 metrics.
    """
    n_features = X.shape[1]
    layers = [
        Dense(32, input_shape=(n_features,), activation='relu'),
        Dropout(0.2),
        Dense(1, activation='sigmoid'),
    ]
    model = Sequential(layers)
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['acc', precision_m, recall_m, f1_m])
    return model

In [None]:
model1 = make_model(X_train)
# summary() prints the table itself and returns None, so it is called
# directly; wrapping it in display() only added a stray "None" to the output.
model1.summary()
# Train for 20 epochs, tracking the validation split after each epoch.
history1 = model1.fit(X_train, y_train,
                      validation_data=(X_val, y_val),
                      batch_size=128,
                      epochs=20,
                      verbose=0)

In [None]:
# evaluate() returns the loss followed by the metrics in the order they were
# passed to compile(): accuracy, precision_m, recall_m, f1_m.
loss, acc, pr, re, f1 = model1.evaluate(X_test, y_test, verbose=0)
print (f"Loss: {loss}, Accuracy: {acc}, Precision: {pr}, Recall:{re}, F1 {f1}")

In [None]:
# Names of the series recorded during training.
history1.history.keys()

In [None]:
# Left panel: training vs. validation loss per epoch.
plt.figure(figsize=(12,4))
plt.subplot(121)
plt.plot(history1.history['loss'], label='Training loss')
plt.plot(history1.history['val_loss'], label='validation loss')
plt.grid()
plt.legend()

# Right panel: training vs. validation precision (custom precision_m metric) per epoch.
plt.subplot(122)
plt.plot(history1.history['precision_m'], label='Training Precision')
plt.plot(history1.history['val_precision_m'], label='validation Precision')
plt.grid()
plt.legend()

In [None]:
def plot_history(history, metric="precision_m"):
    """Plot training and validation curves for the loss and one tracked metric.

    Defined here because nothing earlier in the notebook defines or imports
    ``plot_history``, so the bare call raised ``NameError``.

    Parameters
    ----------
    history : object with a ``history`` dict (e.g. the value returned by ``model.fit``)
        The dict must contain "loss", "val_loss", ``metric`` and
        ``"val_" + metric``.
    metric : str, default "precision_m"
        Key of the metric drawn in the right-hand panel.
    """
    fig, (ax_loss, ax_metric) = plt.subplots(1, 2, figsize=(12, 4))
    for ax, key in ((ax_loss, "loss"), (ax_metric, metric)):
        ax.plot(history.history[key], label=f"Training {key}")
        ax.plot(history.history[f"val_{key}"], label=f"Validation {key}")
        ax.set_xlabel("Epoch")
        ax.set_ylabel(key)
        ax.grid(True)
        ax.legend()
    plt.show()


plot_history(history1)

In [None]:
# Turn the sigmoid outputs into hard 0/1 labels with a 0.5 cut-off.
th = 0.5
pred = (model1.predict(X_test) > th).ravel().astype('int')
# Peek at the first ten predicted labels.
pred[:10]

In [None]:
# Accuracy, precision and recall of the thresholded predictions on the test split.
ac = accuracy_score(y_test, pred)
pr = precision_score(y_test, pred)
re = recall_score(y_test, pred)
print (ac, pr, re)

## LSTM Layers

In [None]:
def make_model2(X, embedding_length = 16):
    """Build and compile an Embedding -> LSTM -> Dense binary classifier.

    Parameters
    ----------
    X : array-like with a ``shape`` attribute
        Only ``X.shape[1]`` is read; it is used as the input sequence length.
    embedding_length : int, default 16
        Size of each embedding vector.

    Returns
    -------
    A compiled Keras ``Sequential`` model (binary cross-entropy, Adam,
    accuracy metric).
    """
    # Embedding is not among the layers imported earlier in the notebook
    # (that import line is commented out), so import it here.
    from tensorflow.keras.layers import Embedding

    model = Sequential()
    model.add(Embedding(MAX_WORDS, embedding_length, input_length=X.shape[1]))
    model.add(LSTM(16, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Flatten())
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model

# Build the LSTM model. The cell previously called make_model with two
# arguments, which is the dense builder and only accepts one.
# NOTE(review): X_train is the output of texts_to_matrix; an Embedding layer
# normally expects sequences of integer token ids. Confirm the input
# encoding before training this model.
model2 = make_model2(X_train)
model2.summary()

In [None]:
print ("BOW: vs TFIDF")
fig, ax = plt.subplots(1, 1, figsize=(6,5))
# Each grid search is scored on the test matrix built by its own vectoriser.
# By this point X_test / y_test hold the Keras inputs, `model_grid` was never
# defined, and model_grid1 is the TF-IDF model while model_grid2 is the
# bag-of-words one.
plot_precision_recall_curve(model_grid2, X_test2, y_test2, ax=ax, label="BOW")
plot_precision_recall_curve(model_grid1, X_test1, y_test1, ax=ax, label="TFIDF")
plt.legend(fontsize=16)
plt.show()