In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression

In [2]:
# Load the pre-processed Reddit posts (titles, bodies and their token lists).
# NOTE(review): the preview below shows an "Unnamed: 0" column, which looks like
# an index that was written out with the CSV -- confirm whether it can be dropped.
df = pd.read_csv("./datasets/combined_df.csv")

In [3]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,names,titles,t_tokenized,t_stopped,t_final,posts,p_tokenized,p_stopped,p_final,subreddit,final_combined
0,t3_dczhrz,URL not found,"['url', 'not', 'found']","['url', 'found']","['url', 'found']",Had a customer raise a ticket today which said...,"['had', 'a', 'customer', 'raise', 'a', 'ticket...","['customer', 'raise', 'ticket', 'today', 'said...","['customer', 'raise', 'ticket', 'today', 'said...",talesfromtechsupport,"['url', 'found', 'customer', 'raise', 'ticket'..."
1,t3_db4quc,Screen Protector Did Its Job!,"['screen', 'protector', 'did', 'its', 'job']","['screen', 'protector', 'job']","['screen', 'protector', 'job']",I work at a cell phone retail store. Someone c...,"['i', 'work', 'at', 'a', 'cell', 'phone', 'ret...","['work', 'cell', 'phone', 'retail', 'store', '...","['work', 'cell', 'phone', 'retail', 'store', '...",talesfromretail,"['screen', 'protector', 'job', 'work', 'cell',..."
2,t3_cm7nr1,My dear coworker...,"['my', 'dear', 'coworker']","['dear', 'coworker']","['dear', 'coworker']",I do (or coordinate) all the tech where I work...,"['i', 'do', 'or', 'coordinate', 'all', 'the', ...","['coordinate', 'tech', 'work', 'small', 'part'...","['coordinate', 'tech', 'work', 'small', 'part'...",talesfromtechsupport,"['dear', 'coworker', 'coordinate', 'tech', 'wo..."
3,t3_cyh93q,"""No that's not turquoise""","['no', 'thats', 'not', 'turquoise']","['thats', 'turquoise']","['thats', 'turquoise']","So this happened a few weeks ago, and I was so...","['so', 'this', 'happened', 'a', 'few', 'weeks'...","['happened', 'weeks', 'ago', 'dumbfounded', 'e...","['happened', 'weeks', 'ago', 'dumbfounded', 'e...",talesfromretail,"['thats', 'turquoise', 'happened', 'weeks', 'a..."
4,t3_dah9xr,We broker her Laptop,"['we', 'broker', 'her', 'laptop']","['broker', 'laptop']","['broker', 'laptop']",This is a old one but as it is some time ago i...,"['this', 'is', 'a', 'old', 'one', 'but', 'as',...","['old', 'one', 'time', 'ago', 'recall', 'highl...","['old', 'one', 'time', 'ago', 'recall', 'highl...",talesfromtechsupport,"['broker', 'laptop', 'old', 'one', 'time', 'ag..."


#### Naive-Bayes Classifier (Multinomial Model) with Count-Vectorizer

First, we turn `subreddit` into a 1/0 column, where 1 indicates `talesfromtechsupport` and 0 indicates `talesfromretail`.

In [4]:
# Binary target: 1 for rows from r/talesfromtechsupport, 0 for every other row.
# The comparison is vectorised, so it runs in one pass and does not rely on the
# frame having a default 0..n-1 index the way a per-row df.loc[i, ...] lookup does.
df['talesfromtechsupport'] = (df['subreddit'] == 'talesfromtechsupport').astype(int)

In [5]:
# Class balance of the binary target (1 = talesfromtechsupport, 0 = everything else).
df['talesfromtechsupport'].value_counts()

1    976
0    442
Name: talesfromtechsupport, dtype: int64

Here, we split our data into `X` and `y`. We predict the subreddit from the combined title and post tokens (`final_combined`) and see how well our model performs with this feature.

In [6]:
# Features: the combined title + post tokens. Target: the binary subreddit flag.
X, y = df['final_combined'], df['talesfromtechsupport']

Then we do a train-test split to divide our data into training and testing sets.

In [7]:
# Hold out 25% of the rows for testing. Stratifying on y keeps the class ratio
# the same in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)

We then can instantiate CountVectorizer and instantiate our NB pipeline.

In [8]:
# Baseline model: bag-of-words counts feeding a multinomial Naive Bayes
# classifier, with both steps left at their library defaults.
naive_bayes = Pipeline(
    steps=[
        ('cvec', CountVectorizer()),
        ('multi_nb', MultinomialNB()),
    ]
)

As seen above, we will use default values for our baseline NB model first.

We will fit our training data first to our NB pipeline.

In [9]:
# Fit the vectorizer and the classifier together, using the training split only.
naive_bayes.fit(X_train,y_train) 

Pipeline(memory=None,
     steps=[('cvec', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('multi_nb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

Then we go on to predict y using X_test.

In [10]:
# Predicted labels for the held-out test split.
# NOTE(review): naive_ypred is not used again in the cells visible here.
naive_ypred = naive_bayes.predict(X_test) 

We then can check our scores against our training and testing sets above. 

In [15]:
# Accuracy of the baseline pipeline on the data it was trained on.
naive_bayes.score(X_train, y_train)

0.9952963311382879

In [16]:
# Accuracy of the baseline pipeline on the held-out test split.
naive_bayes.score(X_test, y_test)

0.9774647887323944

Our training and testing scores are similar and do not show any sign of over-fitting for now. However, we can use GridSearchCV to search for the best parameters for our NB model in the next portion.

#### Naive-Bayes Classifier (Multinomial Model) GridSearchCV, Count-Vectorizer

First, we initialize the pipe parameters to be fed into our GridSearchCV as shown below.

In [20]:
# Search space for the CountVectorizer step (the "cvec__" prefix routes each
# setting to the pipeline step named 'cvec'): vocabulary size, document-frequency
# cut-offs at both ends, and unigrams vs. unigrams + bigrams.
pipe_params = dict(
    cvec__max_features=[2500, 3000, 3500],
    cvec__min_df=[2, 3],
    cvec__max_df=[0.9, 0.95],
    cvec__ngram_range=[(1, 1), (1, 2)],
)

In [21]:
# Exhaustive search over pipe_params, scored by accuracy.
# cv is pinned to 3 folds: the recorded run used scikit-learn's old implicit
# default (displayed as cv='warn'), which was 3-fold, while newer releases
# default to 5. Leaving it implicit would make the reported scores depend on
# the installed scikit-learn version.
nb_grid = GridSearchCV(naive_bayes,
                       param_grid=pipe_params,
                       scoring='accuracy',
                       cv=3
                      )

Fitting our training data to GridSearchCV model....

In [22]:
# Run the cross-validated search using the training split only, so the test
# split stays untouched for the final evaluation.
nb_grid.fit(X_train, y_train)



GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('cvec', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor...=None, vocabulary=None)), ('multi_nb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'cvec__max_features': [2500, 3000, 3500], 'cvec__min_df': [2, 3], 'cvec__max_df': [0.9, 0.95], 'cvec__ngram_range': [(1, 1), (1, 2)]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [23]:
# Best mean cross-validated accuracy found by the search. Shown as the cell's
# last expression (rather than print) to match how the TF-IDF search reports it.
nb_grid.best_score_

0.9783631232361242


In [24]:
# Parameter combination that achieved the best cross-validated accuracy.
nb_grid.best_params_

{'cvec__max_df': 0.9,
 'cvec__max_features': 3000,
 'cvec__min_df': 2,
 'cvec__ngram_range': (1, 2)}

We can now instantiate NB model with best parameters for CountVectorizer.

In [57]:
# Rebuild the pipeline with the CountVectorizer settings selected by the grid
# search; the classifier itself stays at its defaults.
naive_bayes = Pipeline(
    steps=[
        ('cvec', CountVectorizer(max_df=0.9,
                                 min_df=2,
                                 max_features=3000,
                                 ngram_range=(1, 2))),
        ('multi_nb', MultinomialNB()),
    ]
)

In [58]:
# Fit the tuned pipeline on the whole training split.
naive_bayes.fit(X_train,y_train)

Pipeline(memory=None,
     steps=[('cvec', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.9, max_features=3000, min_df=2,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('multi_nb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [59]:
# Report accuracy on both splits side by side to gauge over-fitting.
print(
    f'Score on training set: {naive_bayes.score(X_train, y_train)}',
    f'Score on testing set: {naive_bayes.score(X_test, y_test)}',
    sep='\n',
)

Score on training set: 0.9887111947318908
Score on testing set: 0.9690140845070423


#### Naive-Bayes Classifier (Multinomial Model) GridSearchCV, Tfidf-Vectorizer


Here we use Tfidf-Vectorizer instead of Count-Vectorizer and see what impact it has on our data set.

### Why Use TF-IDF? 
1. Common words are penalized. <br>
2. Rare words have more influence.

In [29]:
# Same classifier as before, but fed TF-IDF weights instead of raw counts.
naive_bayes = Pipeline(
    steps=[
        ('vector', TfidfVectorizer()),
        ('multi_nb', MultinomialNB()),
    ]
)

In [30]:
# Search space for the TF-IDF step (the "vector__" prefix routes each setting
# to the pipeline step named 'vector').
# NOTE(review): float min_df values are proportions of documents; with roughly
# 1.4k rows in total, 0.0001 and 0.001 both appear to fall below one document
# per CV fold, so those two settings probably behave identically -- confirm.
pipe_params = dict(
    vector__max_df=[0.9, 0.95],
    vector__min_df=[0.0001, 0.001, 0.01],
    vector__ngram_range=[(1, 1), (1, 2), (1, 3), (1, 4), (1, 5)],
)

In [31]:
# Exhaustive search over the TF-IDF grid, scored by accuracy.
# cv is pinned to 3 folds: the recorded run used scikit-learn's old implicit
# default (displayed as cv='warn'), which was 3-fold, while newer releases
# default to 5. Leaving it implicit would make the reported scores depend on
# the installed scikit-learn version.
nb_grid = GridSearchCV(naive_bayes,
                       param_grid=pipe_params,
                       scoring='accuracy',
                       cv=3
                      )

In [33]:
# Run the cross-validated TF-IDF search using the training split only.
nb_grid.fit(X_train, y_train)



GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vector', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True...        vocabulary=None)), ('multi_nb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'vector__max_df': [0.9, 0.95], 'vector__min_df': [0.0001, 0.001, 0.01], 'vector__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4), (1, 5)]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [34]:
# Parameter combination that achieved the best cross-validated accuracy.
nb_grid.best_params_

{'vector__max_df': 0.9, 'vector__min_df': 0.01, 'vector__ngram_range': (1, 2)}

In [35]:
# Best mean cross-validated accuracy found by the TF-IDF search.
nb_grid.best_score_

0.9482596425211665

Similar to above, we instantiate a new NB model with our best parameters.

In [41]:
# Rebuild the TF-IDF pipeline with the settings selected by the grid search;
# the classifier itself stays at its defaults.
naive_bayes = Pipeline(
    steps=[
        ('vector', TfidfVectorizer(max_df=0.9,
                                   min_df=0.01,
                                   ngram_range=(1, 2))),
        ('multi_nb', MultinomialNB()),
    ]
)

In [None]:
# Fit the tuned TF-IDF pipeline on the whole training split, as was done for
# the tuned Count-Vectorizer pipeline.
naive_bayes.fit(X_train, y_train)