In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, cross_val_predict, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, AdaBoostClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import text
import dataframe_image as dfi


In [33]:
df = pd.read_csv('../data/ufc_mma_submissions.csv')

In [34]:
X = df['title']
y = df['subreddit']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state=42)

In [6]:
pipe1 = Pipeline([
    ('cvec', CountVectorizer()),
    ('nb', MultinomialNB())
])

pipe1_params = {
    'cvec__max_features' : [5000, 6000, 7000, 8000, 9000, 10000, 11000, 12000],
    'cvec__min_df': [2, 3],
    'cvec__stop_words': [None, 'english'],
    'cvec__max_df' : [0.9, 0.95, 0.98],
    'cvec__ngram_range': [(1,1), (1,2)]
}

gs_pipe1 = GridSearchCV(pipe1,
                        param_grid=pipe1_params,
                        cv = 5,
                        n_jobs = -1)

gs_pipe1.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('nb', MultinomialNB())]),
             n_jobs=-1,
             param_grid={'cvec__max_df': [0.9, 0.95, 0.98],
                         'cvec__max_features': [5000, 6000, 7000, 8000, 9000,
                                                10000, 11000, 12000],
                         'cvec__min_df': [2, 3],
                         'cvec__ngram_range': [(1, 1), (1, 2)],
                         'cvec__stop_words': [None, 'english']})

In [7]:
print(f'Best Score: {gs_pipe1.best_score_}')
print(f'Best Params: {gs_pipe1.best_params_}')

Best Score: 0.7417417216561439
Best Params: {'cvec__max_df': 0.9, 'cvec__max_features': 10000, 'cvec__min_df': 2, 'cvec__ngram_range': (1, 2), 'cvec__stop_words': None}


In [8]:
print(f'Training Score: {gs_pipe1.score(X_train, y_train)}')
print(f'Testing Score: {gs_pipe1.score(X_test, y_test)}')

Training Score: 0.8344608038201353
Testing Score: 0.749005568814638


---

In [35]:
X = df['title']
y = df['subreddit']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state=42)

In [36]:
pipe2 = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('nb', MultinomialNB())
])

pipe2_params = {
    'tvec__max_features' : [5000, 6000, 7000, 8000, 9000, 10000, 11000, 12000],
    'tvec__stop_words': [None, 'english'],
    'tvec__ngram_range': [(1,1), (1,2)],
    'tvec__max_df': [0.9, 0.95, 0.98],
    'tvec__min_df': [2, 3]
}

gs_pipe2 = GridSearchCV(pipe2,
                        param_grid=pipe2_params,
                        cv = 5,
                        n_jobs = -1)

gs_pipe2.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tvec', TfidfVectorizer()),
                                       ('nb', MultinomialNB())]),
             n_jobs=-1,
             param_grid={'tvec__max_df': [0.9, 0.95, 0.98],
                         'tvec__max_features': [5000, 6000, 7000, 8000, 9000,
                                                10000, 11000, 12000],
                         'tvec__min_df': [2, 3],
                         'tvec__ngram_range': [(1, 1), (1, 2)],
                         'tvec__stop_words': [None, 'english']})

In [11]:
print(f'Best Score: {gs_pipe2.best_score_}')
print(f'Best Params: {gs_pipe2.best_params_}')

Best Score: 0.7482417154956797
Best Params: {'tvec__max_df': 0.9, 'tvec__max_features': 6000, 'tvec__min_df': 2, 'tvec__ngram_range': (1, 2), 'tvec__stop_words': None}


In [12]:
print(f'Training Score: {gs_pipe2.score(X_train, y_train)}')
print(f'Testing Score: {gs_pipe2.score(X_test, y_test)}')

Training Score: 0.8194720785250034
Testing Score: 0.7474144789180589


---

## Top Words Before Adding Stop Words

In [39]:
cvec = CountVectorizer(max_features=10000,
                             max_df = 0.9,
                             min_df = 2,
                             ngram_range=(1,2),
)
vectorized = cvec.fit_transform(X)
vectorized = pd.DataFrame(vectorized.todense(), columns = cvec.get_feature_names_out())
top_ufc = vectorized[df['subreddit'] == 1].sum().sort_values(ascending = False)[:20]
top_mma = vectorized[df['subreddit'] == 0].sum().sort_values(ascending = False)[:20]

In [40]:
count_index = []
ufc_count = []
mma_count = []
for i in top_ufc.index:
    if i in top_mma.index:
        count_index.append(i)
        ufc_count.append(top_ufc[i])
        mma_count.append(top_mma[i])

In [41]:
top_words = pd.DataFrame(index = count_index)
top_words['ufc'] = ufc_count
top_words['mma'] = mma_count

In [42]:
dfi.export(top_words, "../images/topwords_no_stop_words.png")
top_words

Unnamed: 0,ufc,mma
the,2111,1681
to,1104,1148
ufc,877,1130
is,867,549
in,809,939
and,744,707
of,734,759
this,725,260
fight,607,725
for,579,662


---

### Custom Stop Words

In [6]:
my_words_list= (['ufc', 'dana', 'white', 'ultimate', 'u.f.c.', 'islam', 'makhachev', 'moreno', 'edwards', 'usman', 'ngannou', 'adesanya',
                  'pantoja', 'kara', 'kai', 'oliveira', 'pereira', 'sterling', 'royval', 'nicolau', 'perez', 'albazi', 'schnell', 'omalley', 'yan',
                  'dvalishvili', 'vera', 'sandhagen', 'font', 'cruz', 'holloway', 'volkanovski', 'figueiredo', 'deiveson', 'aljamain', 'rodriguez',
                  'ortega', 'allen', 'emmett', 'chan', 'sung', 'jung', 'kattar', 'giga', 'chikadze', 'poirier', 'jones', 'elliott', 'dvorak', 'molina', 'mokaev',
                  'ulanbekov', 'yanez', 'gutierrez', 'nurmagomedov', 'simon', 'munhoz', 'shore', 'topuria', 'evloev', 'mitchell', 'yusuff', 'iga', 'barboza',
                  'caceres', 'burns', 'neal', 'luque', 'fiziev', 'gamrot', 'anjos', 'tsarukyan', 'turner', 'hooker', 'ismagulov', 'gaethje', 'magny', 'whittaker',
                  'vettori', 'strickland', 'costa', 'hermansson', 'covington', 'muniz', 'imavov', 'bachowicz', 'rakic', 'cannonier', 'dolidze', 'brunson', 'oezdemir',
                  'spann', 'walker', 'nunes', 'weili', 'shevchenko', 'pena', 'blaydes', 'tuivasa', 'aspinall', 'andrade', 'santos', 'daukaus', 'tybura', 'lewis', 'holm',
                  'vieira', 'jandiroba', 'maia', 'grasso', 'chookagian', 'murphy', 'fiorot', 'lemos', 'namajunas', 'esparza', 'jandiroba', 'blanchfield', 'barber',
                  'calvillo', 'ribas', 'viana', 'ducote', 'pinheiro', 'xiaonan', 'yan', 'abdurakhimov', 'spivac', 'shamil', 'ketlen', 'pennington', 'miesha', 'kunitskaya',
                  'rosa', 'avila', 'lansberg', 'paddy', 'silva', 'cormier', 'diaz', 'miocic', 'lesnar', 'penn', 'liddell', 'pierre', 'rousey', 'khabib', 'conor', 'mcgregor',
                  'frevola', 'dillashaw', 'pimblett', 'helwani', 'blachowicz','arlovski', 'donatello', 'dec', 'december', 'jan', 'feb', 'selftext', 'says', 'did', 'does',
                  'guy', 'guys', 'know', 'fc', 'vs', 'https', 'khamzat', '2022', '2023', '219', '281', '282', '283', '284', '285', 'going', 'man', 'got', 'anne', 'didnt', 
                  'ufc281', 'ankalaev', 'zhang', 'israel', 'johnson', 'dustin', 'krause', 'chandler', 'jiri', 'cejudo', 'march', 'februrary', 'gordon', 'ilia', 'florian',
                  'makachov', 'beneil', 'dariush', 'jared'])
stop_words_list = text.ENGLISH_STOP_WORDS.union(my_words_list)

# Got help with this combination from https://stackoverflow.com/questions/26826002/adding-words-to-stop-words-list-in-tfidfvectorizer-in-sklearn
# UFC fighters taken from https://www.ufc.com/rankings

In [7]:
X = df['title']
y = df['subreddit']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state=42)

In [25]:
pipe4 = Pipeline([
    ('cvec', CountVectorizer(stop_words=stop_words_list)),
    ('nb', MultinomialNB())
])

pipe4_params = {
    'cvec__max_features': [3000, 4000, 5000],
    'cvec__ngram_range': [(1,1), (1,2)],
    'cvec__max_df': [0.01, 0.05, 0.1],
    'cvec__min_df': [2, 3],
}

gs_pipe4 = GridSearchCV(pipe4,
                        param_grid=pipe4_params,
                        cv = 5)

gs_pipe4.fit(X_train, y_train)

print(f'Best Score: {gs_pipe4.best_score_}')
print(f'Best Params: {gs_pipe4.best_params_}')
print(f'Training Score: {gs_pipe4.score(X_train, y_train)}')
print(f'Testing Score: {gs_pipe4.score(X_test, y_test)}')

Best Score: 0.7080514627582335
Best Params: {'cvec__max_df': 0.1, 'cvec__max_features': 4000, 'cvec__min_df': 2, 'cvec__ngram_range': (1, 1)}
Training Score: 0.8009019763894416
Testing Score: 0.7163882259347654


In [18]:
pipe3 = Pipeline([
    ('tvec', TfidfVectorizer(stop_words=stop_words_list)),
    ('nb', MultinomialNB())
])

pipe3_params = {
    'tvec__max_features': [2000, 4000, 6000, 8000, 10000],
    'tvec__ngram_range': [(1,1), (1,2)],
    'tvec__max_df': [0.8, 0.9, 0.95, 0.98, 1.0],
    'tvec__min_df': [2, 3, 4],
}

gs_pipe3 = GridSearchCV(pipe3,
                        param_grid=pipe3_params,
                        cv = 5)

gs_pipe3.fit(X_train, y_train)

print(f'Best Score: {gs_pipe3.best_score_}')
print(f'Best Params: {gs_pipe3.best_params_}')

Best Score: 0.7107037186322361
Best Params: {'tvec__max_df': 0.8, 'tvec__max_features': 4000, 'tvec__min_df': 2, 'tvec__ngram_range': (1, 1)}


In [19]:
print(f'Training Score: {gs_pipe3.score(X_train, y_train)}')
print(f'Testing Score: {gs_pipe3.score(X_test, y_test)}')

Training Score: 0.8180129990714949
Testing Score: 0.7183770883054893


---

## Stacked Models

In [9]:
lvl1_est_1 = [
    ('nb', MultinomialNB()),
    ('rf', RandomForestClassifier()),
    ('ada', AdaBoostClassifier())
]

lvl1_est_2 = [
    ('nb', MultinomialNB()),
    ('rf', RandomForestClassifier()),
    ('logr', LogisticRegression(max_iter=1000))
]

lvl1_est_3 = [
    ('nb', MultinomialNB()),
    ('logr', LogisticRegression(max_iter=1000)),
    ('ada', AdaBoostClassifier())
]

lvl1_est_4 = [
    ('logr', LogisticRegression(max_iter=1000)),
    ('rf', RandomForestClassifier()),
    ('ada', AdaBoostClassifier())
]

In [10]:
stacked_1 = StackingClassifier(estimators=lvl1_est_1, 
                               final_estimator=LogisticRegression(),
                               n_jobs = -1)

pipe_tvec_1 = Pipeline([
    ('tvec', TfidfVectorizer(stop_words=stop_words_list)),
    ('s1', stacked_1)
])

print(cross_val_score(pipe_tvec_1, X_train, y_train).mean())
pipe_tvec_1.fit(X_train, y_train)
print(f'Training Score: {pipe_tvec_1.score(X_train, y_train)}')
print(f'Testing Score: {pipe_tvec_1.score(X_test, y_test)}')

0.7266216542078612
Training Score: 0.9506565857540787
Testing Score: 0.737867939538584


In [11]:
stacked_2 = StackingClassifier(estimators=lvl1_est_2, 
                               final_estimator=LogisticRegression(),
                               n_jobs = -1)
pipe_tvec_2 = Pipeline([
    ('tvec', TfidfVectorizer(stop_words=stop_words_list)),
    ('s2', stacked_2)
])

print(cross_val_score(pipe_tvec_2, X_train, y_train).mean())
pipe_tvec_2.fit(X_train, y_train)
print(f'Training Score: {pipe_tvec_2.score(X_train, y_train)}')
print(f'Testing Score: {pipe_tvec_2.score(X_test, y_test)}')

0.7276829261853173
Training Score: 0.9416368218596631
Testing Score: 0.733890214797136


In [12]:
stacked_3 = StackingClassifier(estimators=lvl1_est_3, 
                               final_estimator=LogisticRegression(),
                               n_jobs = -1)
pipe_tvec_3 = Pipeline([
    ('tvec', TfidfVectorizer(stop_words=stop_words_list)),
    ('s3', stacked_3)
])

print(cross_val_score(pipe_tvec_3, X_train, y_train).mean())
pipe_tvec_3.fit(X_train, y_train)
print(f'Training Score: {pipe_tvec_3.score(X_train, y_train)}')
print(f'Testing Score: {pipe_tvec_3.score(X_test, y_test)}')

0.7282121980712466
Training Score: 0.8670911261440509
Testing Score: 0.7390612569610183


In [13]:
stacked_4 = StackingClassifier(estimators=lvl1_est_4, 
                               final_estimator=LogisticRegression(),
                               n_jobs = -1)
pipe_tvec_4 = Pipeline([
    ('tvec', TfidfVectorizer(stop_words=stop_words_list)),
    ('s4', stacked_4)
])

print(cross_val_score(pipe_tvec_4, X_train, y_train).mean())
pipe_tvec_4.fit(X_train, y_train)
print(f'Training Score: {pipe_tvec_4.score(X_train, y_train)}')
print(f'Testing Score: {pipe_tvec_4.score(X_test, y_test)}')

0.7260909742158169
Training Score: 0.9429632577264889
Testing Score: 0.7311058074781225


In [14]:
stacked_1 = StackingClassifier(estimators=lvl1_est_1, 
                               final_estimator=LogisticRegression(),
                               n_jobs = -1)
pipe_cvec_1 = Pipeline([
    ('cvec', CountVectorizer(stop_words=stop_words_list)),
    ('s1', stacked_1)
])

print(cross_val_score(pipe_cvec_1, X_train, y_train).mean())
pipe_cvec_1.fit(X_train, y_train)
print(f'Training Score: {pipe_cvec_1.score(X_train, y_train)}')
print(f'Testing Score: {pipe_cvec_1.score(X_test, y_test)}')

0.7302029960977858
Training Score: 0.94455498076668
Testing Score: 0.7374701670644391


In [15]:
# BEST MODEL THUS FAR
stacked_2 = StackingClassifier(estimators=lvl1_est_2, 
                               final_estimator=LogisticRegression(),
                               n_jobs = -1)
pipe_cvec_2 = Pipeline([
    ('cvec', CountVectorizer(stop_words=stop_words_list)),
    ('s2', stacked_2)
])

print(cross_val_score(pipe_cvec_2, X_train, y_train).mean())
pipe_cvec_2.fit(X_train, y_train)
print(f'Training Score: {pipe_cvec_2.score(X_train, y_train)}')
print(f'Testing Score: {pipe_cvec_2.score(X_test, y_test)}')

0.7287435821163483
Training Score: 0.943891762833267
Testing Score: 0.7410501193317423


In [16]:
stacked_3 = StackingClassifier(estimators=lvl1_est_3, 
                               final_estimator=LogisticRegression(),
                               n_jobs = -1)
pipe_cvec_3 = Pipeline([
    ('cvec', CountVectorizer(stop_words=stop_words_list)),
    ('s3', stacked_3)
])

print(cross_val_score(pipe_cvec_3, X_train, y_train).mean())
pipe_cvec_3.fit(X_train, y_train)
print(f'Training Score: {pipe_cvec_3.score(X_train, y_train)}')
print(f'Testing Score: {pipe_cvec_3.score(X_test, y_test)}')

0.7248967242171369
Training Score: 0.882345138612548
Testing Score: 0.7410501193317423


In [17]:
stacked_4 = StackingClassifier(estimators=lvl1_est_4, 
                               final_estimator=LogisticRegression(),
                               n_jobs = -1)
pipe_cvec_4 = Pipeline([
    ('cvec', CountVectorizer(stop_words=stop_words_list)),
    ('s4', stacked_4)
])

print(cross_val_score(pipe_cvec_4, X_train, y_train).mean())
pipe_cvec_4.fit(X_train, y_train)
print(f'Training Score: {pipe_cvec_4.score(X_train, y_train)}')
print(f'Testing Score: {pipe_cvec_4.score(X_test, y_test)}')

0.7195904523364881
Training Score: 0.9663085289826236
Testing Score: 0.7307080350039777


#### The best model so far:
> the Stacked Model with CountVectorizer as the transformer, the first level estimators based on a Multinomial NB, a Random Forest Classifier, and an Logistic Classifier, and the final estimator being another Logistic Regression Model.

---
## Gridsearching Through the First Level Estimators for Best Parameters

### Starting With RandomSearch

Going through each individual estimator in the first level of my stacking model to find the best hyperparameters. This will hopefully mean my final stacking model will be even better.

In [104]:
#Starting with MultinomialNB
pipe1 = Pipeline([
    ('cvec', CountVectorizer(stop_words=stop_words_list)),
    ('nb', MultinomialNB())
])

pipe1_params = {
    'cvec__max_features' : range(500, 10001, 100),
    'cvec__min_df': range(1, 16),
    'cvec__max_df' : [0.9, 0.95, 0.98, 1],
    'cvec__ngram_range': [(1,1), (1,2), (2,2), (1,3)],
    'nb__alpha': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
}

rs_pipe1 = RandomizedSearchCV(pipe1,
                        param_distributions=pipe1_params,
                        cv = 5)

rs_pipe1.fit(X_train, y_train)

print(f'Best Score: {rs_pipe1.best_score_}')
print(f'Best Params: {rs_pipe1.best_params_}')
print(f'Training Score: {rs_pipe1.score(X_train, y_train)}')
print(f'Testing Score: {rs_pipe1.score(X_test, y_test)}')

Best Score: 0.7034080568311628
Best Params: {'nb__alpha': 1.0, 'cvec__ngram_range': (1, 1), 'cvec__min_df': 1, 'cvec__max_features': 2100, 'cvec__max_df': 0.9}
Training Score: 0.7672105053720653
Testing Score: 0.7048528241845664


15 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\jeffr\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\jeffr\anaconda3\lib\site-packages\sklearn\pipeline.py", line 390, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "C:\Users\jeffr\anaconda3\lib\site-packages\sklearn\pipeline.py", line 348, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "C:\Users\jeffr\anaconda3\lib\site-packages\joblib\memory.py", line 349, in __call__
    return self.func(*args, **kwargs)
  File

In [27]:
pipe1 = Pipeline([
    ('cvec', CountVectorizer(stop_words=stop_words_list)),
    ('nb', MultinomialNB())
])

pipe1_params = {
    'cvec__max_features' : [3000, 4000, 5000],
    'cvec__min_df': [2],
    'cvec__max_df' : [0.01, 0.05, 0.1, 0.15],
    'cvec__ngram_range': [(1,1)],
    'nb__alpha': [0.5, 0.6, 0.7]
}

gs_pipe1 = GridSearchCV(pipe1,
                        param_grid=pipe1_params,
                        cv = 5,
                        n_jobs= -1)

gs_pipe1.fit(X_train, y_train)

print(f'Best Score: {gs_pipe1.best_score_}')
print(f'Best Params: {gs_pipe1.best_params_}')
print(f'Training Score: {gs_pipe1.score(X_train, y_train)}')
print(f'Testing Score: {gs_pipe1.score(X_test, y_test)}')

Best Score: 0.709112382709161
Best Params: {'cvec__max_df': 0.1, 'cvec__max_features': 4000, 'cvec__min_df': 2, 'cvec__ngram_range': (1, 1), 'nb__alpha': 0.7}
Training Score: 0.8047486404032365
Testing Score: 0.7143993635640413


In [107]:
pipe2 = Pipeline([
    ('cvec', CountVectorizer(stop_words=stop_words_list)),
    ('rf', RandomForestClassifier())
])

pipe2_params = {
    'cvec__max_features' : [8000, 8200, 8400],
    'cvec__min_df': [3, 4, 5],
    'cvec__max_df' : [0.9, 0.95, 0.98],
    'cvec__ngram_range': [(1,1), (1,2)],
    'rf__n_estimators': range(50, 10001, 50),
    'rf__max_depth': range(2, 101, 2),
    'rf__min_samples_leaf': range(1, 21, 1),
    'rf__min_samples_split': range(2, 101, 2)
}

rs_pipe2 = RandomizedSearchCV(pipe2,
                              param_distributions=pipe2_params,
                              cv = 5,
                              n_jobs = -1)

rs_pipe2.fit(X_train, y_train)

print(f'Best Score: {rs_pipe2.best_score_}')
print(f'Best Params: {rs_pipe2.best_params_}')
print(f'Training Score: {rs_pipe2.score(X_train, y_train)}')
print(f'Testing Score: {rs_pipe2.score(X_test, y_test)}')

Best Score: 0.7072550907436383
Best Params: {'rf__n_estimators': 5700, 'rf__min_samples_split': 22, 'rf__min_samples_leaf': 2, 'rf__max_depth': 74, 'cvec__ngram_range': (1, 1), 'cvec__min_df': 4, 'cvec__max_features': 8200, 'cvec__max_df': 0.98}
Training Score: 0.7445284520493434
Testing Score: 0.6984884645982498


In [None]:
pipe2 = Pipeline([
    ('cvec', CountVectorizer(stop_words=stop_words_list)),
    ('rf', RandomForestClassifier(n_jobs = -1))
])

pipe2_params = {
    'cvec__max_features' : [8000, 8200, 8400],
    'cvec__min_df': [3, 4, 5],
    'cvec__max_df' : [0.9, 0.95, 0.98],
    'cvec__ngram_range': [(1,1)],
    'rf__n_estimators': [5600, 5700, 5800],
    'rf__max_depth': [72,74,76],
    'rf__min_samples_leaf': [1, 2, 3],
    'rf__min_samples_split': [3, 4, 5],
    'rf__ccp_alpha': [0.01, 0.1, 1.0]
}

gs_pipe2 = GridSearchCV(pipe2,
                        param_grid=pipe2_params,
                        cv = 5,
                        n_jobs = -1)

gs_pipe2.fit(X_train, y_train)

print(f'Best Score: {gs_pipe2.best_score_}')
print(f'Best Params: {gs_pipe2.best_params_}')
print(f'Training Score: {gs_pipe2.score(X_train, y_train)}')
print(f'Testing Score: {gs_pipe2.score(X_test, y_test)}')

In [16]:
pipe3 = Pipeline([
    ('cvec', CountVectorizer(stop_words=stop_words_list)),
    ('log', LogisticRegression())
])

pipe3_params = {
    'log__penalty': ['l2'],
    'log__solver': ['newton-cg', 'sag', 'lbfgs'],
    'log__C': [0.07, 0.08, 0.09]
}

gs_pipe3 = GridSearchCV(pipe3,
                        param_grid=pipe3_params,
                        cv = 5,
                        n_jobs = -1)

gs_pipe3.fit(X_train, y_train)

print(f'Best Score: {gs_pipe3.best_score_}')
print(f'Best Params: {gs_pipe3.best_params_}')
print(f'Training Score: {gs_pipe3.score(X_train, y_train)}')
print(f'Testing Score: {gs_pipe3.score(X_test, y_test)}')

Best Score: 0.7259577321746967
Best Params: {'log__C': 0.08, 'log__penalty': 'l2', 'log__solver': 'sag'}
Training Score: 0.8076667993102533
Testing Score: 0.7303102625298329


In [20]:
pipe3 = Pipeline([
    ('cvec', CountVectorizer(stop_words=stop_words_list)),
    ('log', LogisticRegression(max_iter = 10000))
])

pipe3_params = {
    'log__penalty': ['l1'],
    'log__solver': ['liblinear', 'saga'],
    'log__C': [0.9, 1, 1.1]
}

gs_pipe3 = GridSearchCV(pipe3,
                        param_grid=pipe3_params,
                        cv = 5,
                        n_jobs = -1)

gs_pipe3.fit(X_train, y_train)

print(f'Best Score: {gs_pipe3.best_score_}')
print(f'Best Params: {gs_pipe3.best_params_}')
print(f'Training Score: {gs_pipe3.score(X_train, y_train)}')
print(f'Testing Score: {gs_pipe3.score(X_test, y_test)}')

Best Score: 0.7114993865937737
Best Params: {'log__C': 1, 'log__penalty': 'l1', 'log__solver': 'saga'}
Training Score: 0.8188088605915904
Testing Score: 0.7143993635640413


In [28]:
lvl1_est = [
    ('nb', MultinomialNB(alpha = 0.7)),
    ('rf', RandomForestClassifier(max_depth = 92, 
                                  min_samples_leaf = 1,
                                  min_samples_split = 5, 
                                  n_estimators = 50,
                                  ccp_alpha = 0.01)),
    ('log', LogisticRegression(penalty='l2', 
                               solver='sag', 
                               C = 0.08))
]

stacked_1 = StackingClassifier(estimators=lvl1_est, 
                               final_estimator=LogisticRegression(),
                               n_jobs = -1)

pipe_cvec_1 = Pipeline([
    ('cvec', CountVectorizer(stop_words=stop_words_list,
                            max_df=0.1,
                            min_df = 2,
                            max_features=4000,
                            ngram_range= (1, 1))),
    ('s1', stacked_1)
])

print(cross_val_score(pipe_cvec_1, X_train, y_train).mean())
pipe_cvec_1.fit(X_train, y_train)
print(f'Training Score: {pipe_cvec_1.score(X_train, y_train)}')
print(f'Testing Score: {pipe_cvec_1.score(X_test, y_test)}')

0.7271520701800087
Training Score: 0.8182782862448601
Testing Score: 0.7374701670644391


In [31]:
lvl1_est = [
    ('nb', MultinomialNB(alpha = 0.7)),
    ('rf', RandomForestClassifier(max_depth = 92, 
                                  min_samples_leaf = 1,
                                  min_samples_split = 5, 
                                  n_estimators = 50,
                                  ccp_alpha = 0.01)),
    ('log', LogisticRegression(penalty='l2', 
                               solver='sag', 
                               C = 0.08))
]

stacked_1 = StackingClassifier(estimators=lvl1_est, 
                               final_estimator=LogisticRegression(),
                               n_jobs = -1)

pipe_final = Pipeline([
    ('cvec', CountVectorizer(stop_words=stop_words_list,
                            max_df=0.1,
                            min_df = 2,
                            max_features=4000,
                            ngram_range= (1, 1))),
    ('s1', stacked_1)
])


pipe_final_params = {
    's1__final_estimator__penalty': ['l1', 'l2'],
    's1__final_estimator__solver': ['lbfgs', 'saga', 'sag', 'liblinear'],
    's1__final_estimator__C': [0.1, 1, 2]
}

gs_final = GridSearchCV(pipe_final,
                        param_grid= pipe_final_params,
                        cv = 5,
                        n_jobs = -1)
    
    
gs_final.fit(X_train, y_train)
print(f'Best Score: {gs_final.best_score_}')
print(f'Best Params: {gs_final.best_params_}')
print(f'Training Score: {gs_final.score(X_train, y_train)}')
print(f'Testing Score: {gs_final.score(X_test, y_test)}')

30 fits failed out of a total of 120.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\jeffr\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\jeffr\anaconda3\lib\site-packages\sklearn\pipeline.py", line 394, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\jeffr\anaconda3\lib\site-packages\sklearn\ensemble\_stacking.py", line 488, in fit
    return super().fit(X, self._le.transform(y), sample_weight)
  File "C:\Users\jeffr\anaconda3\lib\site-packages\sklearn\ensemble\_stacking.py", line 217, in fit


Best Score: 0.7286109561216533
Best Params: {'s1__final_estimator__C': 1, 's1__final_estimator__penalty': 'l1', 's1__final_estimator__solver': 'saga'}
Training Score: 0.8177477118981297
Testing Score: 0.7390612569610183
