In [1]:
#%% Import packages
import os, sys

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, \
                            f1_score
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Put the directory two levels above the working directory on the import
# search path (once), so the project-local packages imported in the next cell
# can be found.
module_path = os.path.abspath(os.path.join(os.pardir, os.pardir))
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [3]:
#%% Import custom packages
from my_defs.model import *
from my_defs.text import *

In [4]:
#%% Import Datasets
# Read the three CSV inputs; the paths are relative to the working directory.
train_df, test_df, submission_df = (
    pd.read_csv('../input_data/train.csv'),
    pd.read_csv('../input_data/test.csv'),
    pd.read_csv('../input_data/sample_submission.csv'),
)

In [5]:
#%%
# Run the project's text_cleanup helper over the raw 'text' column and store
# the result alongside it as 'text_processed'.
raw_text = train_df['text']
train_df['text_processed'] = text_cleanup(raw_text)

In [6]:
# Hold out 25% of the labelled rows for evaluation; the fixed random_state
# makes the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    train_df['text_processed'],
    train_df['target'],
    test_size=0.25,
    random_state=33,
)

In [26]:
#%%
# Compare raw term counts against TF-IDF features feeding a multinomial Naive
# Bayes classifier, each with the default (data-estimated) class prior and with
# a hand-set one.
# (0.57, 0.43) matches the target balance reported by
# train_df['target'].value_counts(normalize=True); (0.55, 0.45) is a slightly
# flatter alternative.
config1 = [('vectorizer', CountVectorizer()), ('classifier', MultinomialNB())]
config2 = [('vectorizer', TfidfVectorizer()), ('classifier', MultinomialNB())]
config3 = [('vectorizer', CountVectorizer()), ('classifier', MultinomialNB(class_prior=(0.57, 0.43)))]
config4 = [('vectorizer', TfidfVectorizer()), ('classifier', MultinomialNB(class_prior=(0.55, 0.45)))]

# Keep whatever train() returns for every configuration, keyed by name, so a
# later step can pick a model deliberately instead of silently inheriting the
# last one trained.
trained_models = {}

named_configs = (
    ('config1', config1),
    ('config2', config2),
    ('config3', config3),
    ('config4', config4),
)

for config_name, config in named_configs:
    print('-------')
    trial = Pipeline(config)
    # train() is a project helper; presumably it fits `trial`, prints the score
    # table and returns the fitted estimator -- confirm against my_defs.model.
    clf = train(trial, X_train, y_train, X_test, y_test, score_training_data=True)
    trained_models[config_name] = clf
# %%

# NOTE: after the loop `clf` is the result for config4 (the last one run), not
# necessarily the best scorer. Choose an entry from `trained_models` before
# enabling the submission code below.
#test_df['target'] = clf.predict(text_cleanup(test_df['text']))
#test_df[['id', 'target']].to_csv('../submissions/submission_naivebayes.csv', \
#                                 index=False)

-------
   Accuracy  Precision     Recall         F1
--------------------------------------------
    0.93449    0.95050    0.89264    0.92066 <-- train
    0.78099    0.78088    0.70000    0.73823  <-- test
-------
   Accuracy  Precision     Recall         F1
--------------------------------------------
    0.91049    0.96467    0.81983    0.88637 <-- train
    0.77941    0.83546    0.62262    0.71351  <-- test
-------
   Accuracy  Precision     Recall         F1
--------------------------------------------
    0.93449    0.95011    0.89305    0.92070 <-- train
    0.77941    0.77778    0.70000    0.73684  <-- test
-------
   Accuracy  Precision     Recall         F1
--------------------------------------------
    0.91680    0.96002    0.83957    0.89576 <-- train
    0.77836    0.81287    0.64643    0.72016  <-- test


In [19]:
# Relative frequency of each target class in the labelled data.
target_share = train_df['target'].value_counts(normalize=True)
target_share

0    0.57034
1    0.42966
Name: target, dtype: float64