In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords

In [3]:
# Load the joined dataset from the working directory.
# NOTE(review): the file name suggests posts from r/wallstreetbets and r/investing — confirm.
df = pd.read_csv('./joined_wallstreetbets_investing.csv')

In [4]:
# English stop-word list from NLTK's stopwords corpus; used as a vectorizer
# stop_words option further down.
# NOTE(review): requires the corpus to be available locally (nltk.download('stopwords')).
nltk_stopwords = stopwords.words('english')

In [5]:
# Features are the 'combined' column; the label is the 'sub_target' column.
X, y = df['combined'], df['sub_target']

In [6]:
# Hold out a validation split (library-default size) with a fixed seed so the
# split is reproducible across runs.
X_train, X_validation, y_train, y_validation = train_test_split(
    X,
    y,
    random_state=42,
)

In [7]:
# Baseline accuracy: class proportions in the validation split. The larger
# share is the accuracy obtained by always predicting the majority class.
y_validation.value_counts(normalize=True)

1    0.506667
0    0.493333
Name: sub_target, dtype: float64

## First Pipeline / CountVectorizer and Multinomial NB

In [16]:
# Pipeline: bag-of-words counts feeding a multinomial Naive Bayes classifier.
# TODO: also try TfidfVectorizer with MultinomialNB.
#
# stop_words must be the NLTK list itself. A one-element list such as [None]
# is accepted by CountVectorizer but removes no tokens at all, which would not
# match the grid-search results recorded in this notebook (their estimator
# repr shows the NLTK stop-word list).
pipe = Pipeline([
    ('cvec', CountVectorizer(stop_words=nltk_stopwords)),
    ('nb', MultinomialNB())
])

In [8]:
# Coarse first-pass search space for the 'cvec' (CountVectorizer) step.
pipe_params = dict(
    cvec__max_features=[*range(2_000, 20_000, 4_000)],  # 2000, 6000, ..., 18000
    cvec__min_df=[2, 5, 10],
    cvec__max_df=[0.90, 0.95],
    cvec__ngram_range=[(1, upper) for upper in (1, 2, 3)],
)

In [9]:
# Exhaustive search over pipe_params, scoring each candidate with 5-fold
# cross-validation and running up to 4 fits in parallel.
gs = GridSearchCV(
    estimator=pipe,          # object being optimized
    param_grid=pipe_params,  # parameter values being searched
    cv=5,                    # 5-fold cross-validation
    n_jobs=4,
    verbose=1,
)

In [11]:
# Run the grid search. Cross-validation uses only the training split, so the
# validation split stays untouched for later evaluation.
gs.fit(X_train, y_train)

Fitting 5 folds for each of 90 candidates, totalling 450 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   36.9s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  2.8min
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:  6.4min
[Parallel(n_jobs=4)]: Done 450 out of 450 | elapsed:  6.6min finished


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cvec',
                                        CountVectorizer(stop_words=['i', 'me',
                                                                    'my',
                                                                    'myself',
                                                                    'we', 'our',
                                                                    'ours',
                                                                    'ourselves',
                                                                    'you',
                                                                    "you're",
                                                                    "you've",
                                                                    "you'll",
                                                                    "you'd",
                                                                    'your',
  

In [12]:
# Parameter combination with the best mean cross-validated score.
gs.best_params_

{'cvec__max_df': 0.9,
 'cvec__max_features': 18000,
 'cvec__min_df': 2,
 'cvec__ngram_range': (1, 2)}

In [13]:
# Mean cross-validated score of the best candidate (no custom scoring is
# passed to GridSearchCV, so the estimator's default score is used).
gs.best_score_

0.8675786763855143

## Second Pipeline / CountVectorizer and Multinomial NB

In [16]:
# Second-round search space for the 'cvec' step: larger vocabulary sizes,
# tighter min_df and lower max_df than the first grid.
# 7 * 2 * 3 * 2 = 84 candidate combinations.
pipe_params2 = {'cvec__max_features': list(range(16_000, 42_000, 4000)),
                'cvec__min_df': [2, 3],
                'cvec__max_df': [.80, .85, .90],
                'cvec__ngram_range': [(1, 1), (1, 2)]}

In [17]:
# Exhaustive search over pipe_params2, scoring each candidate with 5-fold
# cross-validation and running up to 4 fits in parallel.
gs2 = GridSearchCV(
    estimator=pipe,           # object being optimized
    param_grid=pipe_params2,  # parameter values being searched
    cv=5,                     # 5-fold cross-validation
    n_jobs=4,
    verbose=1,
)

In [18]:
# Run the second grid search on the training split only.
gs2.fit(X_train, y_train)

Fitting 5 folds for each of 84 candidates, totalling 420 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   27.8s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  2.0min
[Parallel(n_jobs=4)]: Done 420 out of 420 | elapsed:  4.4min finished


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cvec',
                                        CountVectorizer(stop_words=['i', 'me',
                                                                    'my',
                                                                    'myself',
                                                                    'we', 'our',
                                                                    'ours',
                                                                    'ourselves',
                                                                    'you',
                                                                    "you're",
                                                                    "you've",
                                                                    "you'll",
                                                                    "you'd",
                                                                    'your',
  

In [19]:
# Parameter combination with the best mean cross-validated score in the second search.
gs2.best_params_

{'cvec__max_df': 0.8,
 'cvec__max_features': 24000,
 'cvec__min_df': 3,
 'cvec__ngram_range': (1, 2)}

In [20]:
# Mean cross-validated score of the best candidate from the second search.
gs2.best_score_

0.8680645946623473

## Third Pipeline / CountVectorizer and Multinomial NB

In [21]:
# Third-round search space for the 'cvec' step: vocabulary sizes around
# 24k and a lower max_df band.
pipe_params3 = dict(
    cvec__max_features=[22_000, 24_000, 26_000],
    cvec__min_df=[2, 3, 4],
    cvec__max_df=[0.70, 0.75, 0.80],
    cvec__ngram_range=[(1, 1), (1, 2)],
)

In [22]:
# Exhaustive search over pipe_params3, scoring each candidate with 5-fold
# cross-validation and running up to 4 fits in parallel.
gs3 = GridSearchCV(
    estimator=pipe,           # object being optimized
    param_grid=pipe_params3,  # parameter values being searched
    cv=5,                     # 5-fold cross-validation
    n_jobs=4,
    verbose=1,
)

In [23]:
# Run the third grid search on the training split only.
gs3.fit(X_train, y_train)

Fitting 5 folds for each of 54 candidates, totalling 270 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   26.1s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  2.0min
[Parallel(n_jobs=4)]: Done 270 out of 270 | elapsed:  2.8min finished


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cvec',
                                        CountVectorizer(stop_words=['i', 'me',
                                                                    'my',
                                                                    'myself',
                                                                    'we', 'our',
                                                                    'ours',
                                                                    'ourselves',
                                                                    'you',
                                                                    "you're",
                                                                    "you've",
                                                                    "you'll",
                                                                    "you'd",
                                                                    'your',
  

In [24]:
# Parameter combination with the best mean cross-validated score in the third search.
gs3.best_params_

{'cvec__max_df': 0.7,
 'cvec__max_features': 26000,
 'cvec__min_df': 2,
 'cvec__ngram_range': (1, 2)}

In [25]:
# Mean cross-validated score of the best candidate from the third search.
gs3.best_score_

0.8682729279956807

## Fourth Pipeline / CountVectorizer and Multinomial NB

In [26]:
# Fourth-round search space for the 'cvec' step: vocabulary sizes in 1k
# steps around 26k and a wider, lower max_df range.
pipe_params4 = dict(
    cvec__max_features=[25_000, 26_000, 27_000, 28_000],
    cvec__min_df=[2, 3],
    cvec__max_df=[0.40, 0.50, 0.60, 0.70],
    cvec__ngram_range=[(1, 1), (1, 2)],
)

In [27]:
# Exhaustive search over pipe_params4, scoring each candidate with 5-fold
# cross-validation and running up to 4 fits in parallel.
gs4 = GridSearchCV(
    estimator=pipe,           # object being optimized
    param_grid=pipe_params4,  # parameter values being searched
    cv=5,                     # 5-fold cross-validation
    n_jobs=4,
    verbose=1,
)

In [28]:
# Run the fourth grid search on the training split only.
gs4.fit(X_train, y_train)

Fitting 5 folds for each of 64 candidates, totalling 320 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   26.2s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  2.0min
[Parallel(n_jobs=4)]: Done 320 out of 320 | elapsed:  3.4min finished


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cvec',
                                        CountVectorizer(stop_words=['i', 'me',
                                                                    'my',
                                                                    'myself',
                                                                    'we', 'our',
                                                                    'ours',
                                                                    'ourselves',
                                                                    'you',
                                                                    "you're",
                                                                    "you've",
                                                                    "you'll",
                                                                    "you'd",
                                                                    'your',
  

In [29]:
# Parameter combination with the best mean cross-validated score in the fourth search.
gs4.best_params_

{'cvec__max_df': 0.4,
 'cvec__max_features': 26000,
 'cvec__min_df': 2,
 'cvec__ngram_range': (1, 2)}

In [30]:
# Mean cross-validated score of the best candidate from the fourth search.
gs4.best_score_

0.8682729279956807

## Fifth Pipeline / TfidfVectorizer and Multinomial NB

In [17]:
# TF-IDF features feeding a multinomial Naive Bayes classifier. The vectorizer
# is left at its defaults here; its options are supplied by the grid search.
pipe_tvec = Pipeline(
    steps=[
        ('tvec', TfidfVectorizer()),
        ('nb', MultinomialNB()),
    ]
)

In [32]:
# Search space for the 'tvec' (TfidfVectorizer) step: vocabulary size, three
# stop-word options (none, scikit-learn's built-in 'english' list, NLTK's list)
# and n-gram range.
pipe_tvec_params = dict(
    tvec__max_features=[*range(4_000, 42_000, 4_000)],  # 4000, 8000, ..., 40000
    tvec__stop_words=[None, 'english', nltk_stopwords],
    tvec__ngram_range=[(1, upper) for upper in (1, 2, 3)],
)

In [33]:
# Exhaustive search over pipe_tvec_params, scoring each candidate with 5-fold
# cross-validation and running up to 4 fits in parallel.
gs_tvec = GridSearchCV(
    estimator=pipe_tvec,
    param_grid=pipe_tvec_params,
    cv=5,
    n_jobs=4,
    verbose=1,
)

In [34]:
# Run the TF-IDF grid search on the training split only.
gs_tvec.fit(X_train, y_train)

Fitting 5 folds for each of 90 candidates, totalling 450 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   39.2s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  3.0min
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:  7.2min
[Parallel(n_jobs=4)]: Done 450 out of 450 | elapsed:  7.4min finished


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tvec', TfidfVectorizer()),
                                       ('nb', MultinomialNB())]),
             n_jobs=4,
             param_grid={'tvec__max_features': [4000, 8000, 12000, 16000, 20000,
                                                24000, 28000, 32000, 36000,
                                                40000],
                         'tvec__ngram_range': [(1, 1), (1, 2), (1, 3)],
                         'tvec__stop_words': [None, 'english',
                                              ['i', 'me', 'my', 'myself', 'we',
                                               'our', 'ours', 'ourselves',
                                               'you', "you're", "you've",
                                               "you'll", "you'd", 'your',
                                               'yours', 'yourself',
                                               'yourselves', 'he', 'him', 'his',
                        

In [35]:
# Parameter combination with the best mean cross-validated score in the TF-IDF search.
# Note: when a stop-word list wins, the whole list is echoed in the output.
gs_tvec.best_params_

{'tvec__max_features': 20000,
 'tvec__ngram_range': (1, 2),
 'tvec__stop_words': ['i',
  'me',
  'my',
  'myself',
  'we',
  'our',
  'ours',
  'ourselves',
  'you',
  "you're",
  "you've",
  "you'll",
  "you'd",
  'your',
  'yours',
  'yourself',
  'yourselves',
  'he',
  'him',
  'his',
  'himself',
  'she',
  "she's",
  'her',
  'hers',
  'herself',
  'it',
  "it's",
  'its',
  'itself',
  'they',
  'them',
  'their',
  'theirs',
  'themselves',
  'what',
  'which',
  'who',
  'whom',
  'this',
  'that',
  "that'll",
  'these',
  'those',
  'am',
  'is',
  'are',
  'was',
  'were',
  'be',
  'been',
  'being',
  'have',
  'has',
  'had',
  'having',
  'do',
  'does',
  'did',
  'doing',
  'a',
  'an',
  'the',
  'and',
  'but',
  'if',
  'or',
  'because',
  'as',
  'until',
  'while',
  'of',
  'at',
  'by',
  'for',
  'with',
  'about',
  'against',
  'between',
  'into',
  'through',
  'during',
  'before',
  'after',
  'above',
  'below',
  'to',
  'from',
  'up',
  'down',
  'i

In [36]:
# Mean cross-validated score of the best candidate from the TF-IDF search.
gs_tvec.best_score_

0.8692452948436113