## Binary Classification Modeling - r/biology & r/biochemistry Predicting with Logistic Regression

In [1]:
#imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import recall_score, f1_score, precision_score, balanced_accuracy_score

#lemmatizer
import nltk
#fetch the NLTK resources this notebook needs; download() does nothing more
#than report "already up-to-date" when a package is present locally
nltk.download('stopwords')
#word_tokenize (used by LemmaTokenizer / StemTokenizer) needs the Punkt models;
#newer NLTK releases look them up as 'punkt_tab', older ones as 'punkt'
nltk.download('punkt')
nltk.download('punkt_tab')
#WordNetLemmatizer needs the WordNet corpus
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
# Import stemmer.
from nltk.stem.porter import PorterStemmer
# Import RegEx Tokenizer
from nltk.tokenize import RegexpTokenizer, word_tokenize

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
#load the cleaned submissions csv
#the first column of the file is used as the index so that it does not
#come through as an extra "Unnamed: 0" column
submissions = pd.read_csv(
    'datasets/cleaned-submission.csv',
    index_col=0,
)

In [3]:
#silence every UserWarning from here on
#NOTE(review): this filter is broad -- it hides UserWarnings from any library,
#including any raised while the vectorizers below are fit; narrow it if those matter
warnings.filterwarnings("ignore", category=UserWarning)

In [4]:
#observing the last two rows of the data, before the target words are removed
submissions.tail(2)

Unnamed: 0,subreddit,selftext,title
6227,Biochemistry,so far ive mostly looked for stuff in my homet...,wanting to take a year off between undergrad a...
6228,Biochemistry,im currently taking a biochemistry class at un...,biochemistry help


In [5]:
#remove every substring that names the target classes -- biology, biochemistry,
#chemistry, biochem, bio, chem -- so the label cannot be read off the post body
#the cleaned text is assigned back to the column instead of calling
#replace(..., inplace=True) on submissions['selftext']: an in-place edit of a
#selected column is chained assignment, which pandas warns about and which does
#not update the parent frame when copy-on-write is enabled
#longer terms come first in the pattern so that e.g. 'biochemistry' is consumed
#whole before the shorter 'biochem' / 'bio' / 'chem' alternatives are tried
#matching is plain substring matching, so longer words containing these
#fragments (e.g. 'biological') are trimmed as well
target_terms = ['biochemistry','biology','chemistry','biochem','bio','chem']
submissions['selftext'] = submissions['selftext'].replace('|'.join(target_terms),'',regex=True)

In [6]:
#drop every row that has a missing value in any column
submissions = submissions.dropna(axis=0, how='any')

In [7]:
#count the null values left in each column
submissions.isna().sum()

subreddit    0
selftext     0
title        0
dtype: int64

In [8]:
#re-inspect the last two rows to check that the target words were removed from selftext
submissions.tail(2)

Unnamed: 0,subreddit,selftext,title
6227,Biochemistry,so far ive mostly looked for stuff in my homet...,wanting to take a year off between undergrad a...
6228,Biochemistry,im currently taking a class at university im ...,biochemistry help


## Functions and Classes

In [9]:
#step over collection of tokens and try to lemmatize each of them
#to use in countvectorizer we pass the new class as the tokenizer
class LemmaTokenizer:
    """Callable tokenizer that lemmatizes every token of a document.

    An instance is meant to be passed to a vectorizer as its ``tokenizer``.
    """

    def __init__(self):
        #one lemmatizer per tokenizer instance, reused for every document
        self.lemmatizer = WordNetLemmatizer()

    def __call__(self,doc):
        """Split ``doc`` with ``word_tokenize`` and return the lemmatized tokens as a list."""
        lemmatize = self.lemmatizer.lemmatize
        return list(map(lemmatize, word_tokenize(doc)))

In [10]:
#step over collection of word tokens and stemmatize
#create a class to pass into Countvectorizer in a pipeline
class StemTokenizer:
    """Callable tokenizer that stems every token of a document.

    An instance is meant to be passed to a vectorizer as its ``tokenizer``.
    """

    def __init__(self):
        #one Porter stemmer per tokenizer instance, reused for every document
        self.stemmatizer = PorterStemmer()

    def __call__(self,doc):
        """Split ``doc`` with ``word_tokenize`` and return the stemmed tokens as a list."""
        stemmed = []
        for token in word_tokenize(doc):
            stemmed.append(self.stemmatizer.stem(token))
        return stemmed

## Stopwords

In [11]:
#extended stop-word list
#taken from the open source work found here: https://gist.github.com/sebleier/554280
#txt found here: https://gist.githubusercontent.com/ZohebAbai/513218c3468130eacff6481f424e4e64/raw/b70776f341a148293ff277afa0d0302c8c38f7e2/gist_stopwords.txt

#read the 'stopwords' column and strip the apostrophes from each entry,
#because punctuation was already removed when the text was cleaned
stop_word = [
    raw_word.replace("'",'')
    for raw_word in pd.read_csv('datasets/stopwords.csv',index_col=0)['stopwords']
]

## Logistic Regression Modeling With Selftext

In [12]:
#features: the post body text
X = submissions['selftext']
#target: 1 where the subreddit is 'Biochemistry', 0 for every other row
y = np.where(submissions['subreddit'].eq('Biochemistry'), 1, 0)

In [13]:
#split into train and test sets (no test_size given, so the library default is used)
#stratify on y and fix the seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=18,
    stratify=y,
)

## Null Model Accuracy

In [14]:
#baseline ("null model") accuracy: always guess the most frequent class
#in the training data
#biochemistry = 1, biology = 0

biochem_num = y_train.sum()
biology_num = len(y_train)-biochem_num

#pick the majority class; a tie falls to r/biology, as it did before
if biology_num < biochem_num:
    majority_class = 'r/biochemistry'
else:
    majority_class = 'r/biology'

#the result is stored under one name whichever class wins -- the two branches
#used to store it under different names, leaving baseline_accuracy undefined
#whenever r/biochemistry was the majority
baseline_accuracy = round(max(biochem_num,biology_num)/len(y_train),4)
print(f'The most frequent class is {majority_class}. The accuracy of the null model is {baseline_accuracy}.')

The most frequent class is r/biochemistry. The accuracy of the null model is 0.5686.


#### Baseline/Null Model Explained:

The baseline model gives us a 'starting point' against which to compare the performance of later models. In binary classification, a customary baseline/null model is one that always guesses the most frequently occurring class in the training set.

## Logistic Regression with CountVectorization

### Model One. Basic Logistic Regression

In [14]:
#Model One pipeline: word counts -> scaling -> logistic regression
cvect = CountVectorizer(stop_words=stop_word)
pipe = make_pipeline(
    cvect,
    StandardScaler(with_mean=False),  #scale only, no centering
    LogisticRegression(max_iter=10_000),
)

In [15]:
#fit every step of the logistic regression pipe (vectorizer, scaler, classifier) on the training data
pipe.fit(X_train,y_train)

Pipeline(steps=[('countvectorizer',
                 CountVectorizer(stop_words=['0o', '0s', '3a', '3b', '3d', '6b',
                                             '6o', 'a', 'a1', 'a2', 'a3', 'a4',
                                             'ab', 'able', 'about', 'above',
                                             'abst', 'ac', 'accordance',
                                             'according', 'accordingly',
                                             'across', 'act', 'actually', 'ad',
                                             'added', 'adj', 'ae', 'af',
                                             'affected', ...])),
                ('standardscaler', StandardScaler(with_mean=False)),
                ('logisticregression', LogisticRegression(max_iter=10000))])

In [16]:
#score on training data, to compare against the test score reported below
pipe.score(X_train,y_train)

0.9953714981729598

In [17]:
#scores on the different classification metrics, all on the test set
#predict once and reuse the result: every predict() call re-runs the whole
#pipeline on X_test, so calling it separately for each metric repeated that work
pred_pipe = pipe.predict(X_test)
print(f'The accuracy of the Logistic Regression model is {round(pipe.score(X_test,y_test),4)}.')
print(f'The recall of the Logistic Regression model is {round(recall_score(y_test,pred_pipe),4)}.')
print(f'The f1 score of the Logistic Regression model is {round(f1_score(y_test,pred_pipe),4)}.')
print(f'The precision score of the Logistic Regression model is {round(precision_score(y_test,pred_pipe),4)}.')

The accuracy of the Logistic Regression model is 0.6558.
The recall of the Logistic Regression model is 0.6842.
The f1 score of the Logistic Regression model is 0.6932.
The precision score of the Logistic Regression model is 0.7023.


### Model Interpretation

Model One is a 'default' model, without hyperparameter tuning. Comparing the accuracy on the training data (99.5%) with the accuracy on the testing data (65.6%) shows that it is highly overfit. This high variance makes the model poor at generalizing, which corresponds to its low test accuracy.

### Model Two. GridSearchCV Basic Logistic Regression

In [18]:
#hyperparameter grid for the Model Two search:
#vocabulary size, n-gram range and regularization strength C
param_log = dict(
    countvectorizer__max_features=[2_950,3_000,3_050],
    countvectorizer__ngram_range=[(1,1),(1,2)],
    logisticregression__C=[.0001,.001,.1],
)

In [19]:
#grid search over the Model One pipeline with the param_log grid
#(default cross-validation and scoring, n_jobs=-1)
grid_log = GridSearchCV(
    estimator=pipe,
    param_grid=param_log,
    n_jobs=-1,
)

#run the search on the training data
grid_log.fit(X_train,y_train)

GridSearchCV(estimator=Pipeline(steps=[('countvectorizer',
                                        CountVectorizer(stop_words=['0o', '0s',
                                                                    '3a', '3b',
                                                                    '3d', '6b',
                                                                    '6o', 'a',
                                                                    'a1', 'a2',
                                                                    'a3', 'a4',
                                                                    'ab',
                                                                    'able',
                                                                    'about',
                                                                    'above',
                                                                    'abst',
                                                                    'ac',
        

In [20]:
#score of the fitted grid search on the training data, to compare against the test score
grid_log.score(X_train,y_train)

0.8490864799025578

In [21]:
#score on the classification metrics, all on the test set
#predict once and reuse the result: every predict() call re-runs the whole
#pipeline on X_test, so calling it separately for each metric repeated that work
pred_grid_log = grid_log.predict(X_test)
print(f'The accuracy of the Logistic Regression model is {round(grid_log.score(X_test,y_test),4)}.')
print(f'The recall of the Logistic Regression model is {round(recall_score(y_test,pred_grid_log),4)}.')
print(f'The f1 score of the Logistic Regression model is {round(f1_score(y_test,pred_grid_log),4)}.')
print(f'The precision score of the Logistic Regression model is {round(precision_score(y_test,pred_grid_log),4)}.')

The accuracy of the Logistic Regression model is 0.7231.
The recall of the Logistic Regression model is 0.8367.
The f1 score of the Logistic Regression model is 0.7744.
The precision score of the Logistic Regression model is 0.7208.


In [22]:
#the parameter combination the grid search selected as best
grid_log.best_params_

{'countvectorizer__max_features': 2950,
 'countvectorizer__ngram_range': (1, 2),
 'logisticregression__C': 0.001}

In [23]:
#make the results of the gridsearch into a dataframe
#sorted by rank so the best-scoring parameter combinations come first; head() keeps the top rows
pd.DataFrame(grid_log.cv_results_).sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_countvectorizer__max_features,param_countvectorizer__ngram_range,param_logisticregression__C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
4,4.19117,0.42292,0.401769,0.042739,2950,"(1, 2)",0.001,"{'countvectorizer__max_features': 2950, 'count...",0.738733,0.732643,0.719854,0.73447,0.721681,0.729476,0.007403,1
10,3.855334,0.047171,0.398162,0.008849,3000,"(1, 2)",0.001,"{'countvectorizer__max_features': 3000, 'count...",0.738733,0.727162,0.717418,0.735079,0.724117,0.728502,0.00764,2
16,3.745006,0.076847,0.355723,0.051313,3050,"(1, 2)",0.001,"{'countvectorizer__max_features': 3050, 'count...",0.736906,0.72838,0.719245,0.732643,0.722899,0.728015,0.006383,3
1,1.532263,0.230686,0.273455,0.0463,2950,"(1, 1)",0.001,"{'countvectorizer__max_features': 2950, 'count...",0.733252,0.724726,0.718027,0.732643,0.726553,0.72704,0.0056,4
7,1.371288,0.01291,0.276636,0.028729,3000,"(1, 1)",0.001,"{'countvectorizer__max_features': 3000, 'count...",0.730816,0.718636,0.718636,0.732643,0.725944,0.725335,0.005892,5


### Model Interpretation

Model Two, a hyperparameter tuning of Model One, improved the default to 72.3% accuracy and 84% recall. This model still shows overfitting, with a training score of 85% accuracy and testing score of 72.3%. This overfitting corresponds to low generalization ability.

### Model Three. GridsearchCV, LemmaTokenizer

In [24]:
#Model Three pipeline: lemmatized word counts -> scaling -> logistic regression
#NOTE(review): stop_word holds un-lemmatized words while the tokens produced by
#LemmaTokenizer are lemmatized, so some stop words may not be filtered -- confirm
cvect_lemma = CountVectorizer(
    stop_words=stop_word,
    tokenizer=LemmaTokenizer(),
)
pipe_lemma = make_pipeline(
    cvect_lemma,
    StandardScaler(with_mean=False),  #scale only, no centering
    LogisticRegression(max_iter=10_000),
)

In [25]:
#hyperparameter grid for the lemma search; this rebinds param_log, and the
#stem search further down reads the same name
param_log = dict(
    countvectorizer__max_features=[3_500,4_000,5_500],
    countvectorizer__ngram_range=[(1,1),(1,2)],
    logisticregression__C=[.0001,.001,.1,1],
)

In [26]:
#grid search over the lemmatizing pipeline with the param_log grid
#(default cross-validation and scoring, n_jobs=-1)
grid_lemma = GridSearchCV(
    estimator=pipe_lemma,
    param_grid=param_log,
    n_jobs=-1,
)

#run the search on the training data
grid_lemma.fit(X_train,y_train)

GridSearchCV(estimator=Pipeline(steps=[('countvectorizer',
                                        CountVectorizer(stop_words=['0o', '0s',
                                                                    '3a', '3b',
                                                                    '3d', '6b',
                                                                    '6o', 'a',
                                                                    'a1', 'a2',
                                                                    'a3', 'a4',
                                                                    'ab',
                                                                    'able',
                                                                    'about',
                                                                    'above',
                                                                    'abst',
                                                                    'ac',
        

In [27]:
#score of the fitted lemma grid search on the training data, to compare against the test score
grid_lemma.score(X_train,y_train)

0.861510353227771

In [28]:
#score on the classification metrics, all on the test set
#predict once and reuse the result: every predict() call re-runs the whole
#pipeline (including the lemmatizing tokenizer) on X_test, so calling it
#separately for each metric repeated that work
pred_grid_lemma = grid_lemma.predict(X_test)
print(f'The accuracy of the Logistic Regression model is {round(grid_lemma.score(X_test,y_test),4)}.')
print(f'The recall of the Logistic Regression model is {round(recall_score(y_test,pred_grid_lemma),4)}.')
print(f'The f1 score of the Logistic Regression model is {round(f1_score(y_test,pred_grid_lemma),4)}.')
print(f'The precision score of the Logistic Regression model is {round(precision_score(y_test,pred_grid_lemma),4)}.')

The accuracy of the Logistic Regression model is 0.7282.
The recall of the Logistic Regression model is 0.8386.
The f1 score of the Logistic Regression model is 0.778.
The precision score of the Logistic Regression model is 0.7257.


In [29]:
#cast the lemma grid search results to a dataframe
#sorted by rank so the best-scoring parameter combinations come first; head() keeps the top rows
pd.DataFrame(grid_lemma.cv_results_).sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_countvectorizer__max_features,param_countvectorizer__ngram_range,param_logisticregression__C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
1,18.696713,0.973825,3.762837,0.101393,3500,"(1, 1)",0.001,"{'countvectorizer__max_features': 3500, 'count...",0.739951,0.744214,0.733252,0.735688,0.72838,0.736297,0.005447,1
9,15.409699,1.760176,2.860427,0.895036,4000,"(1, 1)",0.001,"{'countvectorizer__max_features': 4000, 'count...",0.739342,0.741169,0.737515,0.728989,0.729598,0.735323,0.00506,2
5,17.045594,1.990009,3.63074,0.413232,3500,"(1, 2)",0.001,"{'countvectorizer__max_features': 3500, 'count...",0.733861,0.737515,0.719854,0.742996,0.725335,0.731912,0.008327,3
13,7.492648,0.762517,1.578648,0.197762,4000,"(1, 2)",0.001,"{'countvectorizer__max_features': 4000, 'count...",0.735079,0.733861,0.721072,0.742996,0.726553,0.731912,0.007522,4
21,7.09269,0.131392,1.514986,0.070137,5500,"(1, 2)",0.001,"{'countvectorizer__max_features': 5500, 'count...",0.735688,0.730816,0.725335,0.74056,0.719854,0.730451,0.007324,5


### Model Interpretation

Model Three scored very similarly to Model Two. Lemmatizing the words did not improve the accuracy of the model, but it did make Model Three more computationally expensive than Model Two: it took longer to run. This trend was similar to that seen in the MultinomialNB models.

### Model Four. GridSearchCV, Stemmatize

In [30]:
#Model Four pipeline: stemmed word counts -> scaling -> logistic regression
#NOTE(review): stop_word holds un-stemmed words while the tokens produced by
#StemTokenizer are stemmed, so some stop words may not be filtered -- confirm
cvect_stem = CountVectorizer(
    stop_words=stop_word,
    tokenizer=StemTokenizer(),
)
pipe_stem = make_pipeline(
    cvect_stem,
    StandardScaler(with_mean=False),  #scale only, no centering
    LogisticRegression(max_iter=10_000),
)

In [31]:
#grid search over the stemming pipeline; param_log here is whatever grid the
#name was last bound to (default cross-validation and scoring, n_jobs=-1)
grid_stem = GridSearchCV(
    estimator=pipe_stem,
    param_grid=param_log,
    n_jobs=-1,
)

#run the search on the training data
grid_stem.fit(X_train,y_train)

GridSearchCV(estimator=Pipeline(steps=[('countvectorizer',
                                        CountVectorizer(stop_words=['0o', '0s',
                                                                    '3a', '3b',
                                                                    '3d', '6b',
                                                                    '6o', 'a',
                                                                    'a1', 'a2',
                                                                    'a3', 'a4',
                                                                    'ab',
                                                                    'able',
                                                                    'about',
                                                                    'above',
                                                                    'abst',
                                                                    'ac',
        

In [32]:
#score of the fitted stem grid search on the training data, to compare against the test score
grid_stem.score(X_train,y_train)

0.8700365408038977

In [33]:
#score on the classification metrics, all on the test set
#predict once and reuse the result: every predict() call re-runs the whole
#pipeline (including the stemming tokenizer) on X_test, so calling it
#separately for each metric repeated that work
pred_grid_stem = grid_stem.predict(X_test)
print(f'The accuracy of the Logistic Regression model is {round(grid_stem.score(X_test,y_test),4)}.')
print(f'The recall of the Logistic Regression model is {round(recall_score(y_test,pred_grid_stem),4)}.')
print(f'The f1 score of the Logistic Regression model is {round(f1_score(y_test,pred_grid_stem),4)}.')
print(f'The precision score of the Logistic Regression model is {round(precision_score(y_test,pred_grid_stem),4)}.')

The accuracy of the Logistic Regression model is 0.7293.
The recall of the Logistic Regression model is 0.8219.
The f1 score of the Logistic Regression model is 0.7753.
The precision score of the Logistic Regression model is 0.7336.


In [34]:
#cast the stem grid search results to a dataframe
#sorted by rank so the best-scoring parameter combinations come first; head() keeps the top rows
pd.DataFrame(grid_stem.cv_results_).sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_countvectorizer__max_features,param_countvectorizer__ngram_range,param_logisticregression__C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
13,25.912526,12.765136,6.075865,3.077493,4000,"(1, 2)",0.001,"{'countvectorizer__max_features': 4000, 'count...",0.727162,0.723508,0.733252,0.745432,0.733861,0.732643,0.007469,1
5,23.516339,9.557748,5.525057,2.220415,3500,"(1, 2)",0.001,"{'countvectorizer__max_features': 3500, 'count...",0.727162,0.722899,0.730816,0.74056,0.730816,0.730451,0.005836,2
1,15.380083,0.597279,3.692986,0.159305,3500,"(1, 1)",0.001,"{'countvectorizer__max_features': 3500, 'count...",0.727162,0.724726,0.726553,0.737515,0.720463,0.727284,0.005627,3
9,30.351191,10.547568,7.262039,2.30521,4000,"(1, 1)",0.001,"{'countvectorizer__max_features': 4000, 'count...",0.733861,0.721072,0.727771,0.738124,0.714982,0.727162,0.008377,4
17,26.460594,10.074485,5.831737,2.772926,5500,"(1, 1)",0.001,"{'countvectorizer__max_features': 5500, 'count...",0.735079,0.724117,0.724726,0.727162,0.714982,0.725213,0.006438,5


### Model Interpretation

Model Four performed similarly to Model Three. There are no notable differences, except for runtime: Model Four took longer to fit than Model Three. Overfitting is still present.

## Logistic Regression with TfidfVectorization 

### Model One. GridSearchCV, no Lemmatizing or Stemmatizing

In [35]:
#tf-idf pipeline with the default tokenizer: tf-idf -> scaling -> logistic regression
#NOTE(review): StandardScaler rescales every column after the tf-idf weighting
#has been applied -- confirm that rescaling the tf-idf values is intended
tfidf = TfidfVectorizer(stop_words=stop_word)
pipe_tfdf = make_pipeline(
    tfidf,
    StandardScaler(with_mean=False),  #scale only, no centering
    LogisticRegression(max_iter=10_000),
)

In [36]:
#hyperparameter grid for the plain tf-idf search:
#vocabulary size, n-gram range and regularization strength C
param_tfdf = dict(
    tfidfvectorizer__max_features=[2_400,2_500,2_600],
    tfidfvectorizer__ngram_range=[(1,1),(1,2)],
    logisticregression__C=[.0005,.0006,.0004],
)

In [37]:
#grid search over the tf-idf pipeline that uses the default tokenizer
#(default cross-validation and scoring, n_jobs=-1)
grid_tfdf = GridSearchCV(
    estimator=pipe_tfdf,
    param_grid=param_tfdf,
    n_jobs=-1,
)

#run the search on the training data
grid_tfdf.fit(X_train,y_train)

GridSearchCV(estimator=Pipeline(steps=[('tfidfvectorizer',
                                        TfidfVectorizer(stop_words=['0o', '0s',
                                                                    '3a', '3b',
                                                                    '3d', '6b',
                                                                    '6o', 'a',
                                                                    'a1', 'a2',
                                                                    'a3', 'a4',
                                                                    'ab',
                                                                    'able',
                                                                    'about',
                                                                    'above',
                                                                    'abst',
                                                                    'ac',
        

In [38]:
#score of the fitted tf-idf grid search on the training data, to compare against the test score
grid_tfdf.score(X_train,y_train)

0.8515225334957369

In [39]:
#score on the classification metrics, all on the test set
#predict once and reuse the result: every predict() call re-runs the whole
#pipeline on X_test, so calling it separately for each metric repeated that work
pred_grid_tfdf = grid_tfdf.predict(X_test)
print(f'The accuracy of the Logistic Regression model is {round(grid_tfdf.score(X_test,y_test),4)}.')
print(f'The recall of the Logistic Regression model is {round(recall_score(y_test,pred_grid_tfdf),4)}.')
print(f'The f1 score of the Logistic Regression model is {round(f1_score(y_test,pred_grid_tfdf),4)}.')
print(f'The precision score of the Logistic Regression model is {round(precision_score(y_test,pred_grid_tfdf),4)}.')

The accuracy of the Logistic Regression model is 0.7435.
The recall of the Logistic Regression model is 0.8296.
The f1 score of the Logistic Regression model is 0.7861.
The precision score of the Logistic Regression model is 0.747.


In [40]:
#the parameter combination the tf-idf grid search selected as best
grid_tfdf.best_params_

{'logisticregression__C': 0.0005,
 'tfidfvectorizer__max_features': 2600,
 'tfidfvectorizer__ngram_range': (1, 2)}

### Model Interpretation

Model One with TfidfVectorization scored similarly to the models with CountVectorization. TfidfVectorization measures the importance of a word based on how many times the word appears in a document versus the number of documents that word appears in. This way of numerically representing words did not improve the accuracy or recall.

### Model Two. TfidfVectorization with Lemmatization

In [41]:
#tf-idf pipeline with lemmatization: lemmatized tf-idf -> scaling -> logistic regression
#NOTE(review): this rebinds pipe_lemma, the name used earlier for the
#CountVectorizer lemma pipeline -- a distinct name would be safer on re-runs
tfidf_lemma = TfidfVectorizer(
    stop_words=stop_word,
    tokenizer=LemmaTokenizer(),
)
pipe_lemma = make_pipeline(
    tfidf_lemma,
    StandardScaler(with_mean=False),  #scale only, no centering
    LogisticRegression(max_iter=10_000),
)
#hyperparameter grid for the tf-idf + lemmatization search
param_tfdf_lemma = dict(
    tfidfvectorizer__max_features=[1_000,2_500,3_000],
    tfidfvectorizer__ngram_range=[(1,1),(1,2)],
    logisticregression__C=[.0005,.0001,.1,10],
)

In [42]:
#grid search over the tf-idf pipeline with lemmatization
#(default cross-validation and scoring, n_jobs=-1)
grid_tfdf_lemma = GridSearchCV(
    estimator=pipe_lemma,
    param_grid=param_tfdf_lemma,
    n_jobs=-1,
)

#run the search on the training data
grid_tfdf_lemma.fit(X_train,y_train)

GridSearchCV(estimator=Pipeline(steps=[('tfidfvectorizer',
                                        TfidfVectorizer(stop_words=['0o', '0s',
                                                                    '3a', '3b',
                                                                    '3d', '6b',
                                                                    '6o', 'a',
                                                                    'a1', 'a2',
                                                                    'a3', 'a4',
                                                                    'ab',
                                                                    'able',
                                                                    'about',
                                                                    'above',
                                                                    'abst',
                                                                    'ac',
        

In [43]:
#score of the fitted tf-idf lemma grid search on the training data, to compare against the test score
grid_tfdf_lemma.score(X_train,y_train)

0.86394640682095

In [44]:
#score on the classification metrics, all on the test set
#predict once and reuse the result: every predict() call re-runs the whole
#pipeline (including the lemmatizing tokenizer) on X_test, so calling it
#separately for each metric repeated that work
pred_grid_tfdf_lemma = grid_tfdf_lemma.predict(X_test)
print(f'The accuracy of the Logistic Regression model is {round(grid_tfdf_lemma.score(X_test,y_test),4)}.')
print(f'The recall of the Logistic Regression model is {round(recall_score(y_test,pred_grid_tfdf_lemma),4)}.')
print(f'The f1 score of the Logistic Regression model is {round(f1_score(y_test,pred_grid_tfdf_lemma),4)}.')
print(f'The precision score of the Logistic Regression model is {round(precision_score(y_test,pred_grid_tfdf_lemma),4)}.')

The accuracy of the Logistic Regression model is 0.7479.
The recall of the Logistic Regression model is 0.8296.
The f1 score of the Logistic Regression model is 0.789.
The precision score of the Logistic Regression model is 0.7522.


In [45]:
#the parameter combination the tf-idf lemma grid search selected as best
grid_tfdf_lemma.best_params_

{'logisticregression__C': 0.0005,
 'tfidfvectorizer__max_features': 3000,
 'tfidfvectorizer__ngram_range': (1, 2)}

### Model Interpretation

Adding lemmatization to the TfidfVectorization model did not significantly improve the score of the model. This model also took longer to run, making it more computationally costly.

### Model Three. TfidfVectorization with Stemmatization

In [46]:
#tf-idf pipeline with stemming: stemmed tf-idf -> scaling -> logistic regression
#NOTE(review): this rebinds pipe_stem, the name used earlier for the
#CountVectorizer stem pipeline -- a distinct name would be safer on re-runs
tfidf_stem = TfidfVectorizer(
    stop_words=stop_word,
    tokenizer=StemTokenizer(),
)
pipe_stem = make_pipeline(
    tfidf_stem,
    StandardScaler(with_mean=False),  #scale only, no centering
    LogisticRegression(max_iter=10_000),
)
#hyperparameter grid for the tf-idf + stemming search
param_stem = dict(
    tfidfvectorizer__max_features=[750,1_000,1_500],
    tfidfvectorizer__ngram_range=[(1,1),(1,2)],
    logisticregression__C=[.0001,.001,.1],
)

In [47]:
#grid search over the tf-idf pipeline with stemming
#(default cross-validation and scoring, n_jobs=-1)
grid_tfdf_stem = GridSearchCV(
    estimator=pipe_stem,
    param_grid=param_stem,
    n_jobs=-1,
)

#run the search on the training data
grid_tfdf_stem.fit(X_train,y_train)

GridSearchCV(estimator=Pipeline(steps=[('tfidfvectorizer',
                                        TfidfVectorizer(stop_words=['0o', '0s',
                                                                    '3a', '3b',
                                                                    '3d', '6b',
                                                                    '6o', 'a',
                                                                    'a1', 'a2',
                                                                    'a3', 'a4',
                                                                    'ab',
                                                                    'able',
                                                                    'about',
                                                                    'above',
                                                                    'abst',
                                                                    'ac',
        

In [48]:
#score of the fitted tf-idf stem grid search on the training data, to compare against the test score
grid_tfdf_stem.score(X_train,y_train)

0.8320341047503045

In [49]:
#score on the classification metrics, all on the test set
#predict once and reuse the result: every predict() call re-runs the whole
#pipeline (including the stemming tokenizer) on X_test, so calling it
#separately for each metric repeated that work
pred_grid_tfdf_stem = grid_tfdf_stem.predict(X_test)
print(f'The accuracy of the Logistic Regression model is {round(grid_tfdf_stem.score(X_test,y_test),4)}.')
print(f'The recall of the Logistic Regression model is {round(recall_score(y_test,pred_grid_tfdf_stem),4)}.')
print(f'The f1 score of the Logistic Regression model is {round(f1_score(y_test,pred_grid_tfdf_stem),4)}.')
print(f'The precision score of the Logistic Regression model is {round(precision_score(y_test,pred_grid_tfdf_stem),4)}.')

The accuracy of the Logistic Regression model is 0.7402.
The recall of the Logistic Regression model is 0.8193.
The f1 score of the Logistic Regression model is 0.7818.
The precision score of the Logistic Regression model is 0.7477.


In [50]:
#the parameter combination the tf-idf stem grid search selected as best
grid_tfdf_stem.best_params_

{'logisticregression__C': 0.001,
 'tfidfvectorizer__max_features': 1500,
 'tfidfvectorizer__ngram_range': (1, 2)}

### Model Interpretation

Stemmatizing the words, as seen in this model, did not improve the TfidfVectorization metrics. Stemmatizing is more computationally heavy and took a long time to run. Therefore, this model was not taken any further.