In [1]:
import sqlite3
import pandas as pd
import numpy as np
import re
import string
import datetime as dt
import matplotlib.pyplot as plt

In [2]:
# Open the SQLite database of scraped posts and create a cursor for ad-hoc queries.
DB_PATH = '../redditPosts.sqlite'
conn = sqlite3.connect(DB_PATH)
cur = conn.cursor()

## Get the list of existing subreddits in the database.

In [18]:
# Subreddit names, ordered from the fewest to the most stored posts.
cur.execute('''SELECT subreddit FROM Posts GROUP BY subreddit ORDER BY (COUNT(id))''')
subreddits = [row[0] for row in cur.fetchall()]

subreddits

['Superstonks',
 'finance',
 'options',
 'pennystocks',
 'GME',
 'CryptoCurrency',
 'dogecoin',
 'wallstreetbets']

In [4]:
# For each subreddit, print the average score and the number of posts, grouped by
# a flag that is 0 when the body is '[deleted]'/'[removed]' and 1 otherwise.
score_by_flag_sql = '''SELECT AVG(score), COUNT(score), flag FROM
                    (SELECT B.score, (CASE
                                    WHEN B.body == '[deleted]' OR B.body == '[removed]' THEN 0
                                    ELSE 1
                                    END) AS flag
                    FROM PostBodyAndScore B
                    JOIN (SELECT id FROM Posts WHERE subreddit ==?) P
                    On P.id = B.id) comb
                    GROUP BY flag'''

for subreddit in subreddits:
    rows = cur.execute(score_by_flag_sql, (subreddit,)).fetchall()
    print(subreddit, rows)

Superstonks [(1.4615384615384615, 26, 0), (12.1, 120, 1)]
finance [(16.512359550561797, 445, 0), (49.87940705128205, 2496, 1)]
options [(5.675890340763515, 7806, 0), (27.118068996925878, 8783, 1)]
pennystocks [(5.620451248495324, 32399, 0), (72.21061123177164, 16115, 1)]
GME [(45.454079474483066, 40189, 0), (137.4685982088646, 105408, 1)]
CryptoCurrency [(3.595744680851064, 282, 0), (67.27064220183486, 218, 1)]
dogecoin [(3.3076923076923075, 26, 0), (49.43283582089552, 67, 1)]
wallstreetbets [(16.355170764287518, 221592, 0), (214.87820079930034, 90329, 1)]


## Retrieve data from the database based on the subreddit.

In [3]:
# Load every post of one subreddit whose body is still available, with its score.
subreddit = 'GME'

# The subreddit is bound as a query parameter (`?`) instead of being formatted into
# the SQL text, so quoting problems and SQL injection cannot occur. String literals
# use single quotes: double quotes denote identifiers in standard SQL and are only
# accepted as strings by SQLite through a compatibility fallback.
cmd = """SELECT P.title, P.author, P.ext_link, P.created, B.body, B.score
            FROM (SELECT * FROM Posts WHERE subreddit = ?) P
            JOIN PostBodyAndScore B
            ON P.id = B.id
            WHERE B.body != '[removed]' AND B.body != '[deleted]'"""

df = pd.read_sql(cmd, conn, params=(subreddit,))

In [20]:
# Preview the first rows of the posts loaded for the selected subreddit.
df.head()

Unnamed: 0,title,author,ext_link,created,body,score
0,You NEED to see this about GME 🚀🚀🚀🚀🚀🚀,TitsDownOnly,https://www.reddit.com/r/GME/comments/kqfajb/y...,1609786946,After watching this I took a position RIGHT AW...,6
1,THIS CONVINCED ME TO ALL IN 💰GME (EXTREME PUMP...,TitsDownOnly,https://www.reddit.com/r/GME/comments/kqvp7l/t...,1609841999,This guy explained exactly how to take a posit...,2
2,ICR conference (11th Jan),nicky94,https://www.reddit.com/r/GME/comments/krnthg/i...,1609939549,Any speculation or ideas on what Gamestop migh...,14
3,"GME is FINALLY going to the moon, this technic...",TitsDownOnly,https://www.reddit.com/r/GME/comments/kuo3w1/g...,1610315957,"After some downwards movement, I think everyb...",11
4,Ryan Cohen appointed to board!!!!?,nicky94,https://news.gamestop.com/news-releases/news-r...,1610368592,,18


In [21]:
# Shape of the subset of posts whose score exceeds 1000.
df.loc[df['score'] > 1000].shape

(3079, 6)

## Feature Engineering

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
# Work on a copy so the frame loaded from the database stays untouched.
df_model = df.copy()

# Binary target: 1 when the post scored above 1000, otherwise 0.
df_model['Popular'] = (df_model['score'] > 1000).astype(int)

df_model.head()

Unnamed: 0,title,author,ext_link,created,body,score,Popular
0,You NEED to see this about GME 🚀🚀🚀🚀🚀🚀,TitsDownOnly,https://www.reddit.com/r/GME/comments/kqfajb/y...,1609786946,After watching this I took a position RIGHT AW...,6,0
1,THIS CONVINCED ME TO ALL IN 💰GME (EXTREME PUMP...,TitsDownOnly,https://www.reddit.com/r/GME/comments/kqvp7l/t...,1609841999,This guy explained exactly how to take a posit...,2,0
2,ICR conference (11th Jan),nicky94,https://www.reddit.com/r/GME/comments/krnthg/i...,1609939549,Any speculation or ideas on what Gamestop migh...,14,0
3,"GME is FINALLY going to the moon, this technic...",TitsDownOnly,https://www.reddit.com/r/GME/comments/kuo3w1/g...,1610315957,"After some downwards movement, I think everyb...",11,0
4,Ryan Cohen appointed to board!!!!?,nicky94,https://news.gamestop.com/news-releases/news-r...,1610368592,,18,0


In [6]:
def emoji_count(s):
    """Count the emoji-like symbols in a piece of text.

    Every whitespace-separated word has its leading and trailing ASCII
    punctuation stripped, the words are joined back together, and each
    remaining character that is neither a word character, whitespace, nor one
    of ``, ( ) ' / -`` is counted.

    Args:
        s: The text to scan (a ``str``).

    Returns:
        int: The number of counted characters.
    """
    stripped = ''.join(word.strip(string.punctuation) for word in s.split())
    # Raw string: the previous non-raw literal relied on invalid escape sequences
    # (\w, \s, \(, \/), which newer Python versions flag with a warning. Inside a
    # character class the parentheses and slash need no escaping, so this pattern
    # matches exactly the same characters.
    return len(re.findall(r"[^\w\s,()'/-]", stripped))

def hourOfDay(utc):
    """Return the UTC hour of day (0-23) of a Unix timestamp.

    Args:
        utc: Seconds since the Unix epoch.

    Returns:
        int: Hour component of the timestamp interpreted in UTC.
    """
    # datetime.utcfromtimestamp() is deprecated since Python 3.12; the
    # timezone-aware conversion below yields the same hour.
    return dt.datetime.fromtimestamp(utc, tz=dt.timezone.utc).hour

def dayOfWeek(utc):
    """Return the UTC weekday of a Unix timestamp (Monday is 0, Sunday is 6).

    Args:
        utc: Seconds since the Unix epoch.

    Returns:
        int: ``datetime.weekday()`` of the timestamp interpreted in UTC.
    """
    # datetime.utcfromtimestamp() is deprecated since Python 3.12; the
    # timezone-aware conversion below yields the same weekday.
    return dt.datetime.fromtimestamp(utc, tz=dt.timezone.utc).weekday()

def textLength(text):
    """Return the number of whitespace-separated tokens in ``text``."""
    tokens = text.split()
    return len(tokens)

def postType(link):
    """Classify a post by its external link.

    Args:
        link: The link string; may be empty or ``None``.

    Returns:
        str: ``'figures'`` when the link contains ``png`` or ``jpg``;
        ``'others'`` when the link is empty/``None`` or contains ``/r/``;
        ``'outside_link'`` for everything else.
    """
    if not link:
        return 'others'
    # The image check runs first, so a link containing both an image extension
    # and '/r/' is classified as a figure.
    if any(marker in link for marker in ('png', 'jpg')):
        return 'figures'
    return 'others' if '/r/' in link else 'outside_link'

def featureProcessing(df_model):
    """Add the engineered feature columns to ``df_model`` in place.

    Each new column is produced by applying one helper element-wise to an
    existing column. The frame is modified in place and nothing is returned.

    Args:
        df_model: DataFrame providing the ``title``, ``body``, ``created`` and
            ``ext_link`` columns.
    """
    # (new column, source column, element-wise transform), applied in this order.
    derived_columns = (
        ('title_emoji', 'title', emoji_count),
        ('body_emoji', 'body', emoji_count),
        ('hour', 'created', hourOfDay),
        ('day', 'created', dayOfWeek),
        ('title_length', 'title', textLength),
        ('body_length', 'body', textLength),
        ('post_type', 'ext_link', postType),
    )
    for target, source, transform in derived_columns:
        df_model[target] = df_model[source].apply(transform)

In [7]:
# Adds the engineered feature columns to df_model in place (the call returns nothing).
featureProcessing(df_model)

In [8]:
# One-hot encode the post type, keeping 'figures' and 'outside_link'; 'others' is
# the implicit baseline.
# - dtype=int keeps 0/1 indicator columns regardless of the pandas version
#   (pandas >= 2.0 returns booleans by default).
# - reindex guarantees both columns exist (filled with 0) even when the selected
#   subreddit has no post of one of the types, instead of raising a KeyError.
post_type_cols = ['figures', 'outside_link']
df_model[post_type_cols] = (
    pd.get_dummies(df_model['post_type'], dtype=int)
    .reindex(columns=post_type_cols, fill_value=0)
)

In [9]:
# Preview the frame with the engineered feature columns added.
df_model.head()

Unnamed: 0,title,author,ext_link,created,body,score,Popular,title_emoji,body_emoji,hour,day,title_length,body_length,post_type,figures,outside_link
0,You NEED to see this about GME 🚀🚀🚀🚀🚀🚀,TitsDownOnly,https://www.reddit.com/r/GME/comments/kqfajb/y...,1609786946,After watching this I took a position RIGHT AW...,6,0,6,3,19,0,8,11,others,0,0
1,THIS CONVINCED ME TO ALL IN 💰GME (EXTREME PUMP...,TitsDownOnly,https://www.reddit.com/r/GME/comments/kqvp7l/t...,1609841999,This guy explained exactly how to take a posit...,2,0,1,11,10,1,10,17,others,0,0
2,ICR conference (11th Jan),nicky94,https://www.reddit.com/r/GME/comments/krnthg/i...,1609939549,Any speculation or ideas on what Gamestop migh...,14,0,0,1,13,2,4,86,others,0,0
3,"GME is FINALLY going to the moon, this technic...",TitsDownOnly,https://www.reddit.com/r/GME/comments/kuo3w1/g...,1610315957,"After some downwards movement, I think everyb...",11,0,3,5,21,6,14,43,others,0,0
4,Ryan Cohen appointed to board!!!!?,nicky94,https://news.gamestop.com/news-releases/news-r...,1610368592,,18,0,0,0,12,0,5,0,outside_link,0,1


In [10]:
# Treat hour-of-day and day-of-week as categorical rather than numeric values.
for time_col in ('hour', 'day'):
    df_model[time_col] = df_model[time_col].astype('category')


In [11]:
# One-hot encode the hour and weekday columns and append the indicator columns.
hour_dummies = pd.get_dummies(df_model['hour'], prefix='hour_')
day_dummies = pd.get_dummies(df_model['day'], prefix='day')
df_model = pd.concat([df_model, hour_dummies, day_dummies], axis=1)
df_model.head()

Unnamed: 0,title,author,ext_link,created,body,score,Popular,title_emoji,body_emoji,hour,...,hour__21,hour__22,hour__23,day_0,day_1,day_2,day_3,day_4,day_5,day_6
0,You NEED to see this about GME 🚀🚀🚀🚀🚀🚀,TitsDownOnly,https://www.reddit.com/r/GME/comments/kqfajb/y...,1609786946,After watching this I took a position RIGHT AW...,6,0,6,3,19,...,0,0,0,1,0,0,0,0,0,0
1,THIS CONVINCED ME TO ALL IN 💰GME (EXTREME PUMP...,TitsDownOnly,https://www.reddit.com/r/GME/comments/kqvp7l/t...,1609841999,This guy explained exactly how to take a posit...,2,0,1,11,10,...,0,0,0,0,1,0,0,0,0,0
2,ICR conference (11th Jan),nicky94,https://www.reddit.com/r/GME/comments/krnthg/i...,1609939549,Any speculation or ideas on what Gamestop migh...,14,0,0,1,13,...,0,0,0,0,0,1,0,0,0,0
3,"GME is FINALLY going to the moon, this technic...",TitsDownOnly,https://www.reddit.com/r/GME/comments/kuo3w1/g...,1610315957,"After some downwards movement, I think everyb...",11,0,3,5,21,...,1,0,0,0,0,0,0,0,0,1
4,Ryan Cohen appointed to board!!!!?,nicky94,https://news.gamestop.com/news-releases/news-r...,1610368592,,18,0,0,0,12,...,0,0,0,1,0,0,0,0,0,0


## Training the Model

Since the dataset is quite imbalanced (~3% popular posts), I decided to use oversampling to make a balanced dataset. One thing I realized is that I should not oversample before splitting the dataset into training/testing sets; otherwise, copies of the testing data points would leak into the training set.

By extension, cross-validation should not be performed on oversampled data either, because duplicated data points can end up in both the training and the validation folds. I did not think of that when writing this block, so you can see that the cross-validation gives increasingly good results as the model complexity increases.

In [12]:
# Keep only the target and the numeric/indicator features: the raw text and metadata
# columns, the categorical hour/day columns (already one-hot encoded) and the raw
# score (the target is derived from it) are dropped.
non_feature_cols = ['title', 'author', 'ext_link', 'created', 'body',
                    'post_type', 'hour', 'day', 'score']
df_features = df_model.drop(columns=non_feature_cols)

X = df_features.drop(columns=['Popular'])
y = df_model['Popular']

# Stratify so both splits keep the same share of popular posts, and fix the seed so
# that re-running the notebook reproduces the same split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

### Modularizing the training process.

I will write a function to automate the training process, including cross validation.

In [18]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier

In [20]:
def trainingImbalanced(gridModel, X_train, y_train):
    """Fit ``gridModel`` with sample weights that up-weight the positive class.

    Samples labelled 1 receive the weight ``1 / mean(y_train)``; every other
    sample receives weight 1. The model is fitted in place and nothing is
    returned.

    Args:
        gridModel: Object whose ``fit`` accepts a ``sample_weight`` keyword.
        X_train: Training features, passed to ``fit`` unchanged.
        y_train: Training labels; must expose ``.shape``.
    """
    positive_weight = 1 / np.mean(y_train)
    sample_weights = np.where(y_train == 1, positive_weight, np.ones(y_train.shape))
    gridModel.fit(X_train, y_train, sample_weight=sample_weights)

    
def trainingImbalancedAug(gridModel, X_train, y_train, random_state=None):
    """Fit ``gridModel`` on a class-balanced set built by oversampling positives.

    All samples labelled 0 are kept, and samples labelled 1 are drawn with
    replacement until there are as many of them as there are negatives. The
    model is fitted in place and nothing is returned.

    Args:
        gridModel: Object with a ``fit(X, y)`` method.
        X_train: 2-D training features (NumPy array or pandas DataFrame).
        y_train: Training labels containing 0s and 1s (array or pandas Series).
        random_state: Optional integer seed for the resampling. When ``None``
            (the default) NumPy's global random state is used, as before.

    Raises:
        ValueError: If there are negatives to balance but no positive sample.
    """
    # np.asarray lets DataFrame / Series inputs work; positional indexing of the
    # form ``X_train[mask, :]`` is only valid on NumPy arrays.
    X_arr = np.asarray(X_train)
    y_arr = np.asarray(y_train)

    X_neg = X_arr[y_arr == 0]
    X_pos = X_arr[y_arr == 1]
    n_neg = X_neg.shape[0]

    if n_neg and not X_pos.shape[0]:
        raise ValueError('y_train contains no positive (label 1) samples to oversample')

    rng = np.random if random_state is None else np.random.RandomState(random_state)
    augInd = rng.choice(X_pos.shape[0], size=n_neg, replace=True)

    # Negatives first, then the resampled positives; labels are built directly
    # rather than being carried through an extra feature column.
    X_comb = np.concatenate([X_neg, X_pos[augInd]], axis=0)
    y_comb = np.concatenate([np.zeros(n_neg), np.ones(n_neg)])

    gridModel.fit(X_comb, y_comb)

def trainingSingle(model, X_train, y_train):
    """Fit ``model`` with the positive class up-weighted by ``1 / mean(y_train)``.

    All samples start with weight 1; those labelled 1 are re-weighted. The
    model is fitted in place and nothing is returned.

    Args:
        model: Object whose ``fit`` accepts a ``sample_weight`` keyword.
        X_train: Training features, passed to ``fit`` unchanged.
        y_train: Training labels; must expose ``.shape``.
    """
    sample_weights = np.ones(y_train.shape)
    positive_mask = np.asarray(y_train == 1)
    np.putmask(sample_weights, positive_mask, 1 / np.mean(y_train))

    model.fit(X_train, y_train, sample_weight=sample_weights)
    
    
    
def scaleAndTrainTest(gridModel, X_train, y_train, X_test):
    """Standardize the features, fit ``gridModel`` and predict on the test set.

    The scaler is fitted on the training features only and then applied to
    both the training and the test features. Fitting goes through
    ``trainingImbalanced``.

    Args:
        gridModel: Model or search object to fit.
        X_train: Training features.
        y_train: Training labels.
        X_test: Test features.

    Returns:
        The output of ``gridModel.predict`` on the scaled test features.
    """
    scaler = StandardScaler().fit(X_train)

    trainingImbalanced(gridModel, scaler.transform(X_train), y_train)

    return gridModel.predict(scaler.transform(X_test))


def trainAndShowConfusionMatrix(gridModel, X_train, y_train, X_test, y_test):
    """Train ``gridModel``, then print its test-set confusion matrix.

    When the fitted object exposes ``best_estimator_`` (as a grid search does
    after refitting), that estimator is printed as well.

    Args:
        gridModel: Model or search object to fit.
        X_train: Training features.
        y_train: Training labels.
        X_test: Test features.
        y_test: True test labels, compared against the predictions.
    """
    yPred = scaleAndTrainTest(gridModel, X_train, y_train, X_test)

    print(confusion_matrix(y_test, yPred))

    # Plain estimators have no ``best_estimator_``. Check for the attribute
    # explicitly instead of using a bare ``except``, which would also hide
    # unrelated errors.
    best_estimator = getattr(gridModel, 'best_estimator_', None)
    if best_estimator is not None:
        print(best_estimator)


In [140]:
# Grid-search the depth of a single decision tree over the candidates 4, 5 and 6.
tree_param_grid = {'max_depth': range(4, 7)}
tree_clf = GridSearchCV(DecisionTreeClassifier(), param_grid=tree_param_grid)

trainAndShowConfusionMatrix(tree_clf, X_train, y_train, X_test, y_test)


[[14368  6098]
 [  236   380]]
          0
0  0.571224
1  0.630374
2  0.607464


In [132]:
# Grid-search the tree depth of a random forest over the candidates 3 to 6.
rf_param_grid = {'max_depth': range(3, 7)}
rf_clf = GridSearchCV(RandomForestClassifier(), param_grid=rf_param_grid)

trainAndShowConfusionMatrix(rf_clf, X_train, y_train, X_test, y_test)

[[13634  6832]
 [  206   410]]


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,2.069444,0.0314,0.085088,0.007415,3,{'max_depth': 3},0.677813,0.637771,0.623955,0.63279,0.624904,0.639446,0.019853,4
1,2.45274,0.04207,0.087253,0.007179,4,{'max_depth': 4},0.635598,0.670323,0.674414,0.65799,0.664097,0.660485,0.013634,2
2,2.889295,0.090615,0.103515,0.006342,5,{'max_depth': 5},0.660441,0.632315,0.657753,0.647851,0.669493,0.653571,0.012671,3
3,3.254424,0.037295,0.107142,0.003429,6,{'max_depth': 6},0.638978,0.679514,0.674533,0.648977,0.679454,0.664291,0.016981,1


In [143]:
# Grid-search logistic regression over the penalty type and the regularization
# strength C.
# NOTE(review): newer scikit-learn releases expect penalty=None rather than the
# string 'none' - confirm against the installed version before re-running.
lr_param_grid = {
    'penalty': ('l2', 'none'),
    'C': (0.1, 0.5, 1, 2),
}
lr_clf = GridSearchCV(LogisticRegression(), param_grid=lr_param_grid)

trainAndShowConfusionMatrix(lr_clf, X_train, y_train, X_test, y_test)




[[14194  6272]
 [  223   393]]
          0
0  0.694211
1  0.694175
2  0.694187
3  0.694175
4  0.694175
5  0.694175
6  0.694175
7  0.694175


SVM is simply too slow for >100K entries, so we should not use it here.

```Python

svm_clf = GridSearchCV(SVC(), param_grid = {'kernel' : ('linear', 'poly', 'rbf'), 'degree' : (2,3,4)})

trainAndShowConfusionMatrix(svm_clf, X_train, y_train, X_test, y_test)

```

In [17]:
# Grid-search the tree depth of a gradient-boosted classifier over 1, 5 and 10.
param_grid = dict(max_depth=[1, 5, 10])
xgb = XGBClassifier(verbosity=0)
gs = GridSearchCV(estimator=xgb, param_grid=param_grid)

trainAndShowConfusionMatrix(gs, X_train, y_train, X_test, y_test)



[[18061  2405]
 [  491   125]]
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=10,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=12, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=0)


In [23]:
# AdaBoost with 500 estimators, trained directly without a grid search.
ada_clf = AdaBoostClassifier(n_estimators=500)
trainAndShowConfusionMatrix(ada_clf, X_train, y_train, X_test, y_test)

[[14214  6252]
 [  222   394]]
