## Modeling

---

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, VotingClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
pd.options.display.max_colwidth = 400

In [2]:
# Load the combined subreddit posts from the shared datasets folder.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which looks like a
# saved index from an earlier to_csv call — confirm whether it should be dropped.
subreddit = pd.read_csv('../datasets/subreddit_combined.csv')

In [3]:
# Preview the first two rows to sanity-check the columns that were loaded.
subreddit.head(2)

Unnamed: 0,link_flair_text,over_18,selftext,subreddit,title
0,Other,False,So in the venom 2 post credit venom says he recognizes peter Parker even though that venom has never met him because almost the moment venom arrived on earth he has been with Eddie or in a secret facility so could they have met in between movies?,Marvel,plot hole
1,Film/Television,False,"Alright, so I avoided this movie until today, what’s that like 12 years, because if the reviews and I gotta say, it was actually pretty good. Way better than the second Venom, let there be Carnage. \n\nForget other peoples opinions. Watch what you want.",Marvel,"Green lantern, not that bad."


In [4]:
# Features are the two free-text columns; the target is the subreddit label.
X, y = subreddit[['selftext', 'title']], subreddit['subreddit']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,       # keep the class proportions the same in both splits
    random_state=55,  # fixed seed so the split is reproducible
)

### Generating a Baseline Model

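#### Since r/harrypotter is the majority class, it is the positive class (the one we're predicting). Always predicting it gives the baseline accuracy to beat: the majority-class share shown below (about 63.6%).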

In [5]:
# Share of posts per subreddit; the larger share is the majority-class proportion.
y.value_counts(normalize=True)

harrypotter    0.636354
Marvel         0.363646
Name: subreddit, dtype: float64

### Making First Logistic Regression Model with Count Vectorizer

In [7]:
# Each text column gets its own CountVectorizer so the two vocabularies stay separate.
# The make_* factories are used on purpose: their auto-generated step names
# ('countvectorizer-1', 'logisticregression', ...) are what the grid-search keys refer to.
ct1 = make_column_transformer(
    (CountVectorizer(), 'selftext'),
    (CountVectorizer(), 'title'),
)

pipe1 = make_pipeline(
    ct1,
    LogisticRegression(max_iter=10_000),
)

pipe1.fit(X_train, y_train)

# Displayed as (train score, test score).
pipe1.score(X_train, y_train), pipe1.score(X_test, y_test)

KeyboardInterrupt: 

In [None]:
# Second pipeline, built from its own column transformer.
# NOTE(review): ct2/pipe2 are currently configured exactly like ct1/pipe1; presumably
# this cell was meant to vary the vectorizer or its settings — confirm the intent.
ct2 = make_column_transformer(
    (CountVectorizer(), 'selftext'),
    (CountVectorizer(), 'title'),
)

# Use ct2 here (previously ct1 was passed, leaving ct2 unused).
pipe2 = make_pipeline(ct2, LogisticRegression(max_iter=10_000))

pipe2.fit(X_train, y_train)

# Both scores come from pipe2 (previously the test score was taken from pipe1,
# so the displayed pair mixed two different models).
pipe2.score(X_train, y_train), pipe2.score(X_test, y_test)

#### Grid Searching with Logistic Regression and Count Vectorizer

In [142]:
# List every parameter name exposed by the pipeline; the double-underscore keys
# (e.g. 'logisticregression__C') are the names used in the grid-search param grid.
pipe1.get_params()

{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(transformers=[('countvectorizer-1', CountVectorizer(),
                                    'selftext'),
                                   ('countvectorizer-2', CountVectorizer(),
                                    'title')])),
  ('logisticregression', LogisticRegression(max_iter=10000))],
 'verbose': False,
 'columntransformer': ColumnTransformer(transformers=[('countvectorizer-1', CountVectorizer(),
                                  'selftext'),
                                 ('countvectorizer-2', CountVectorizer(),
                                  'title')]),
 'logisticregression': LogisticRegression(max_iter=10000),
 'columntransformer__n_jobs': None,
 'columntransformer__remainder': 'drop',
 'columntransformer__sparse_threshold': 0.3,
 'columntransformer__transformer_weights': None,
 'columntransformer__transformers': [('countvectorizer-1',
   CountVectorizer(),
   'selftext'),
  ('countvectorizer-2', Count

In [None]:
# Optional vectorizer settings shared by every candidate; uncomment to widen the search.
# Each value must be a list of candidates, so ngram_range is a list of tuples.
vectorizer_grid = {
    # 'columntransformer__countvectorizer-1__lowercase': [True, False],
    # 'columntransformer__countvectorizer-2__lowercase': [True, False],
    # 'columntransformer__countvectorizer-1__stop_words': [None, 'english'],
    # 'columntransformer__countvectorizer-2__stop_words': [None, 'english'],
    # 'columntransformer__countvectorizer-1__min_df': [1, 2],
    # 'columntransformer__countvectorizer-2__min_df': [1, 2],
    # 'columntransformer__countvectorizer-1__ngram_range': [(1, 1), (1, 2)],
    # 'columntransformer__countvectorizer-2__ngram_range': [(1, 1), (1, 2)],
}

c_values = [0.001, 0.1, 1, 5, 50]

# LogisticRegression only supports certain penalty/solver pairs, so the search space is
# split into sub-grids of valid combinations instead of a full cross product:
#   - 'lbfgs' handles 'l2' only
#   - 'liblinear' handles 'l1' and 'l2'
#   - 'saga' handles 'l1', 'l2' and 'elasticnet' (elasticnet also needs l1_ratio)
logreg_grids = [
    {
        'logisticregression__penalty': ['l2'],
        'logisticregression__solver': ['lbfgs', 'liblinear', 'saga'],
        'logisticregression__C': c_values,
    },
    {
        'logisticregression__penalty': ['l1'],
        'logisticregression__solver': ['liblinear', 'saga'],
        'logisticregression__C': c_values,
    },
    {
        'logisticregression__penalty': ['elasticnet'],
        'logisticregression__solver': ['saga'],
        # Required for elasticnet; 0.5 is an even L1/L2 mix — add more values to tune it.
        'logisticregression__l1_ratio': [0.5],
        'logisticregression__C': c_values,
    },
]

# GridSearchCV accepts a list of dicts and explores each sub-grid independently.
grid1 = [{**vectorizer_grid, **logreg_grid} for logreg_grid in logreg_grids]

gs1 = GridSearchCV(pipe1, param_grid=grid1, n_jobs=-1)
gs1.fit(X_train, y_train)