In [1]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import cross_val_score

### Import Data

In [2]:
file_train = '../input/bt5153-applied-machine-learning-2021-spring/train.csv'
file_test = '../input/bt5153-applied-machine-learning-2021-spring/test.csv'

feature = 'Text'
target = 'Outcome'
seed = 2021
valid_ratio = 0.2

cv = 5
num_eval = 5 
score = 'accuracy'

DEBUG = False

In [3]:
df_train = pd.read_csv(file_train)
df_test = pd.read_csv(file_test)

In [4]:
if DEBUG:
    df_train = df_train[:100]
    df_test = df_test[:100]
df_train['Outcome'].value_counts()

14    483982
13     83464
12     64990
9      49192
2      47268
7      42518
8      38590
5      32702
1      31270
0      24428
3      20200
10     16062
11      7168
4       6108
15      5184
6       2328
Name: Outcome, dtype: int64

In [5]:
X_train = df_train.Text
X_test = df_test.Text

y_train = df_train.Outcome
y_test = df_train.Outcome

### Vectorization

In [6]:
# import and instantiate CountVectorizer (with default parameters)
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()
vect

CountVectorizer()

### Model

In [7]:
# import and instantiate Multinomial Naive Bayes (with the default parameters)
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

### Pipeline

In [8]:
# create a pipeline of vectorization and Naive Bayes
from sklearn.pipeline import make_pipeline
pipe = make_pipeline(vect, nb)
pipe.steps

[('countvectorizer', CountVectorizer()), ('multinomialnb', MultinomialNB())]

### Tune Hyperparameters (Hyperopt)

In [9]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from time import time
param_hyperopt= {
    'countvectorizer__token_pattern': hp.choice('countvectorizer__token_pattern', [r"\b\w\w+\b", r"'([a-z ]+)'"]),
    'countvectorizer__min_df':        hp.choice('countvectorizer__min_df', np.arange(1, 5, 1, dtype=int)), 
    'multinomialnb__alpha':           hp.uniform('multinomialnb__alpha', 0.0, 1.0)                                   
    }

In [10]:
def hyperopt(param_space, X_train, y_train, num_eval):
    
    start = time()
    
    # defin the object function
    def objective_function(params):
        clf = pipe.set_params(**params) 
        score = cross_val_score(clf, X_train, y_train, cv=cv).mean()
        return {'loss': -score, 'status': STATUS_OK}

    trials = Trials()

    best_param = fmin(objective_function, 
                      param_space, 
                      algo=tpe.suggest, 
                      max_evals=num_eval, 
                      trials=trials,
                      rstate= np.random.RandomState(1))
    
    loss = [x['result']['loss'] for x in trials.trials]
    
    best_param_values = [x for x in best_param.values()]
    print(best_param_values)
    
    if best_param_values[1] == 0:
        token_type = r"\b\w\w+\b"
    else:
        token_type = r"'([a-z ]+)'"
    
    clf_best = pipe.set_params(countvectorizer__token_pattern=token_type,
                    countvectorizer__min_df=int(best_param_values[0]),
                    multinomialnb__alpha=float(best_param_values[2]))
                                  
    clf_best.fit(X_train, y_train)
    
    print("")
    print("##### Results")
    print("Score best parameters: ", min(loss)*-1)
    print("Best parameters: ", best_param)
    print("Time elapsed: ", time() - start)
    print("Parameter combinations evaluated: ", num_eval)
    
    return trials, clf_best

In [11]:
%%time
results_hyperopt, clf = hyperopt(param_hyperopt, X_train, y_train, num_eval)

100%|██████████| 5/5 [13:40<00:00, 164.16s/trial, best loss: -0.916028402221271]
[2, 0, 0.23955062740611444]

##### Results
Score best parameters:  0.916028402221271
Best parameters:  {'countvectorizer__min_df': 2, 'countvectorizer__token_pattern': 0, 'multinomialnb__alpha': 0.23955062740611444}
Time elapsed:  893.864649772644
Parameter combinations evaluated:  5
CPU times: user 14min 41s, sys: 11.9 s, total: 14min 53s
Wall time: 14min 53s


In [12]:
clf

Pipeline(steps=[('countvectorizer',
                 CountVectorizer(min_df=2, token_pattern='\\b\\w\\w+\\b')),
                ('multinomialnb', MultinomialNB(alpha=0.23955062740611444))])

### Prediction

In [13]:
df_test['Outcome'] = clf.predict(X_test)
df_test[['Id', 'Outcome']].to_csv('output.csv', index=False)

In [14]:
df_test[['Id', 'Outcome']]

Unnamed: 0,Id,Outcome
0,955455,15
1,955456,1
2,955457,10
3,955458,15
4,955459,9
...,...,...
552730,1508185,9
552731,1508186,15
552732,1508187,10
552733,1508188,11


In [15]:
df_test['Outcome'].value_counts()

15    263079
10     63246
9      50285
7      47505
11     24714
1      22153
3      21076
4      16442
6      16319
2      12070
12      5533
14      4725
5       1939
13      1311
8       1262
0       1076
Name: Outcome, dtype: int64