## Read me
This notebook uses the outputs of several other notebooks (which are also submitted) as inputs:  
1) Training and test data with manual features;  
2) Training and test data with LDA-based features.

In [1]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import re
import string
import nltk
import warnings
warnings.filterwarnings('ignore')
import textblob

## Import Data

In [2]:
%%time
#training and test data with manual features, imported from other notebooks
file_feat = '../input/train-all-data-feat/df_train_all_feat.csv' 
new_15_feat = '../input/feateng-output95/df_outp95_feat.csv' #training data based on ‘Pseudo-Labelling’ method
test_feat = '../input/df-test-feat/df_test_feat.csv'

#training and test data with LDA-based features, imported from other notebooks
file_lda_feat = '../input/feateng-list-0404/df_lda_feat.csv'
new_15_lda_feat = '../input/feateng-list-0404/df_15_new_lda_feat.csv'
test_lda_feat = '../input/feateng-list-0404/df_test_lda_feat.csv'

df_feat = pd.read_csv(file_feat)
df_new_15_feat = pd.read_csv(new_15_feat)
df_test_feat = pd.read_csv(test_feat)

df_lda = pd.read_csv(file_lda_feat)
df_new_15_lda = pd.read_csv(new_15_lda_feat)
df_test_lda = pd.read_csv(test_lda_feat)

CPU times: user 32.5 s, sys: 3.5 s, total: 36 s
Wall time: 1min 1s


In [3]:
print(df_feat.shape, df_new_15_feat.shape, df_test_feat.shape)  
print(df_lda.shape, df_new_15_lda.shape, df_test_lda.shape)     

(955454, 40) (422080, 14) (552735, 41)
(955454, 21) (422080, 21) (552735, 20)


## Organize Data Files
Manual features are created for the training and test data at different stages of the project, so different files contains differnt number of columns. The codes below is to select common columns among the files and concatenate the dataframes where needed.

In [4]:
common_cols1 = list(set(df_feat.columns & df_new_15_feat.columns & df_test_feat.columns))
common_cols2 = list(set(df_lda.columns & df_new_15_lda.columns & df_test_lda.columns))

In [5]:
df_1 = df_feat[common_cols1+['Outcome']]
df_new_15_1 = df_new_15_feat[common_cols1+['Outcome']]
df_test_1 = df_test_feat[common_cols1]

df_2 = df_lda[common_cols2]
df_new_15_2 = df_new_15_lda[common_cols2]
df_test_2 = df_test_lda[common_cols2]

print(df_1.shape, df_new_15_1.shape, df_test_1.shape)  
print(df_2.shape, df_new_15_2.shape, df_test_2.shape)  

(955454, 14) (422080, 14) (552735, 13)
(955454, 20) (422080, 20) (552735, 20)


In [6]:
df = pd.merge(df_1, df_2, on='Id')
df_new_15 = pd.merge(df_new_15_1, df_new_15_2, on='Id')
df_test = pd.merge(df_test_1, df_test_2, on='Id')
print(df.shape, df_new_15.shape, df_test.shape)  

(955454, 33) (422080, 33) (552735, 32)


In [7]:
drop_columns = ['Text_y', 'Id', 'Split_x', 'Split_y', 'Unnamed: 0']
df = df.drop(columns = drop_columns, axis=1)
df_new_15 = df_new_15.drop(columns = drop_columns, axis=1)
df_test = df_test.drop(columns = drop_columns, axis=1)
print(df.shape, df_new_15.shape, df_test.shape)  

(955454, 28) (422080, 28) (552735, 27)


In [8]:
df = pd.concat([df,df_new_15], ignore_index=True)
df = df.reset_index().drop(columns=['index'])
print(df.shape)
df.head()

(1377534, 28)


Unnamed: 0,adj_density,num_word_count,verb_count,word_count,pron_count,pron_density,num_char_count,Text_x,adj_count,num_char_density,...,lda_1,lda_13,lda_0,lda_3,lda_7,lda_10,lda_5,lda_6,lda_4,lda_15
0,0.22,0,6,49,2,0.04,5,I am having a problem with the first example o...,11,0.102041,...,0,0,0,0,1,0,0,0,0,0
1,0.236842,0,9,37,2,0.052632,2,"everyone, I met a tough definite integral as f...",9,0.054054,...,0,0,0,0,0,0,0,0,0,0
2,0.22449,3,11,48,2,0.040816,3,"Please dont lynch me, but i've never sat throu...",11,0.0625,...,0,0,0,0,0,4,0,0,0,0
3,0.125,0,1,7,0,0.0,2,How to calculate $ \mathbb{Z}[x] /\langle2x-1\...,1,0.285714,...,0,0,0,0,0,0,0,0,0,0
4,0.0,0,8,36,5,0.135135,0,When somebody rings or texts my iPhone it is n...,0,0.0,...,0,0,0,0,0,0,0,0,0,0


## Inputs

In [9]:
seed = 2021
valid_ratio = 0.3
n_sample_tuning = 100000

cv = 5
num_eval = 5 

DEBUG = False
if DEBUG:
    df = df[:1000]
    df_test = df_test[:1000]
    cv = 2
    num_eval = 2

In [10]:
target = 'Outcome'

features = [
            'adj_density',    
            'lda_0', 
            'lda_1', 
            'lda_10', 
            'lda_11', 
            'lda_12', 
            'lda_13', 
            'lda_14', 
            'lda_15', 
            'lda_2', 
            'lda_3', 
            'lda_4', 
            'lda_5', 
            'lda_6', 
            'lda_7', 
            'lda_8', 
            'lda_9', 
            'num_char_density', 
            'num_word_count',   
            'pron_density', 
            'verb_density', 
           ]

## Get Balanced Data for Training

In [11]:
def get_samples(df,n, replace):
    df = df.sample(n=n,replace=True,random_state=2021,axis=0)
    return df

In [12]:
df_temp = pd.DataFrame()
for i in range(df.Outcome.nunique()):
    df_i = df[df.Outcome == i]
    if df_i.shape[0] > n_sample_tuning:
        df_i = get_samples(df_i, n_sample_tuning, replace=False)
    else:
        df_i = get_samples(df_i, n_sample_tuning, replace=True)
    #print(i, df_i.shape)
    df_temp = pd.concat([df_temp, df_i], ignore_index=True)
df = df_temp
print(df.shape)

(1600000, 28)


## Model

In [13]:
from sklearn.utils import shuffle
df['Outcome'] = df['Outcome'].astype('int64')
df = shuffle(df).reset_index().drop(columns=['index'])

In [14]:
for col in [x for x in df.columns if x!='Text_x']:
    df[col] = df[col].astype('float64') #transform to avoid error

In [15]:
#for optimization use
#split train and valid data
"""
n_split = int((1-valid_ratio)*df.shape[0])
print(df.shape[0], n_split)
df_train = df[:n_split]
df_valid = df[n_split:]
"""

'\nn_split = int((1-valid_ratio)*df.shape[0])\nprint(df.shape[0], n_split)\ndf_train = df[:n_split]\ndf_valid = df[n_split:]\n'

In [16]:
X_train_vec = df['Text_x']
X_train_feat = df[features]
X_train = df[features + ['Text_x']]
y_train = df[target]

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer(min_df = 2, 
                       token_pattern = r"\b\w\w+\b") #parameter obtained from optimization
vect

CountVectorizer(min_df=2, token_pattern='\\b\\w\\w+\\b')

In [18]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB(alpha = 0.23955062740611444) #parameter obtained from optimization

In [19]:
# define a function that accepts a DataFrame returns the ingredients string
def get_text(df):
    return df.Text_x

In [20]:
# define a function that accepts a DataFrame returns the manually created features
def get_manual(df):
    return df[features]

In [21]:
# create a pipeline of vectorization and Naive Bayes
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import make_union
from sklearn.preprocessing import FunctionTransformer
get_text_ft = FunctionTransformer(get_text, validate=False)
get_manual_ft = FunctionTransformer(get_manual, validate=False)

In [22]:
union = make_union(make_pipeline(get_text_ft, vect), get_manual_ft) 

In [23]:
#for optimization use
"""
X_dtm_manual = union.fit_transform(df_train)
nb.fit(X_dtm_manual, df_train.Outcome)

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from time import time
param_hyperopt= {
    'countvectorizer__token_pattern': hp.choice('countvectorizer__token_pattern', [r"\b\w\w+\b", r"'([a-z ]+)'"]),
    'countvectorizer__min_df':        hp.choice('countvectorizer__min_df', np.arange(1, 5, 1, dtype=int)), 
    'multinomialnb__alpha':           hp.uniform('multinomialnb__alpha', 0.0, 1.0)                                   
    }
"""

'\nX_dtm_manual = union.fit_transform(df_train)\nnb.fit(X_dtm_manual, df_train.Outcome)\n\nfrom hyperopt import fmin, tpe, hp, STATUS_OK, Trials\nfrom time import time\nparam_hyperopt= {\n    \'countvectorizer__token_pattern\': hp.choice(\'countvectorizer__token_pattern\', [r"\x08\\w\\w+\x08", r"\'([a-z ]+)\'"]),\n    \'countvectorizer__min_df\':        hp.choice(\'countvectorizer__min_df\', np.arange(1, 5, 1, dtype=int)), \n    \'multinomialnb__alpha\':           hp.uniform(\'multinomialnb__alpha\', 0.0, 1.0)                                   \n    }\n'

In [24]:
#for optimization use
"""
def hyperopt(param_space, X_train, y_train, num_eval):
    
    start = time()
    
    # defin the object function
    def objective_function(params):
        clf = nb.set_params(**params) ### since pipelien object is not callable
        score = cross_val_score(clf, X_train, y_train, cv=cv).mean()
        return {'loss': -score, 'status': STATUS_OK}

    trials = Trials()

    best_param = fmin(objective_function, 
                      param_space, 
                      algo=tpe.suggest, # you can change other algorithms such as GP,
                      max_evals=num_eval, 
                      trials=trials,
                      rstate= np.random.RandomState(1))
    
    loss = [x['result']['loss'] for x in trials.trials]
    
    best_param_values = [x for x in best_param.values()]
    print(best_param_values)
    
    if best_param_values[1] == 0:
        token_type = r"\b\w\w+\b"
    else:
        token_type = r"'([a-z ]+)'"
    
    clf_best = pipe.set_params(countvectorizer__token_pattern=token_type,
                    countvectorizer__min_df=int(best_param_values[0]),
                    multinomialnb__alpha=float(best_param_values[2]))
                                  
    clf_best.fit(X_train, y_train)
    
    print("")
    print("##### Results")
    print("Score best parameters: ", min(loss)*-1)
    print("Best parameters: ", best_param)
    print("Time elapsed: ", time() - start)
    print("Parameter combinations evaluated: ", num_eval)
    
    return trials, clf_best
"""

'\ndef hyperopt(param_space, X_train, y_train, num_eval):\n    \n    start = time()\n    \n    # defin the object function\n    def objective_function(params):\n        clf = nb.set_params(**params) ### since pipelien object is not callable\n        score = cross_val_score(clf, X_train, y_train, cv=cv).mean()\n        return {\'loss\': -score, \'status\': STATUS_OK}\n\n    trials = Trials()\n\n    best_param = fmin(objective_function, \n                      param_space, \n                      algo=tpe.suggest, # you can change other algorithms such as GP,\n                      max_evals=num_eval, \n                      trials=trials,\n                      rstate= np.random.RandomState(1))\n    \n    loss = [x[\'result\'][\'loss\'] for x in trials.trials]\n    \n    best_param_values = [x for x in best_param.values()]\n    print(best_param_values)\n    \n    if best_param_values[1] == 0:\n        token_type = r"\x08\\w\\w+\x08"\n    else:\n        token_type = r"\'([a-z ]+)\'"\n   

In [25]:
#for optimization use
"""
%%time
results_hyperopt, clf = hyperopt(param_hyperopt, X_dtm_manual, df_train.Outcome, num_eval)"""

'\n%%time\nresults_hyperopt, clf = hyperopt(param_hyperopt, X_dtm_manual, df_train.Outcome, num_eval)'

# Validate

In [26]:
#for optimization use
"""
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.metrics import classification_report

X_valid_dtm_manual = union.transform(df_valid)
valid_pred = nb.predict(X_valid_dtm_manual)
valid_proba = nb.predict_proba(X_valid_dtm_manual)
y_valid = df_valid.Outcome

valid_precision = precision_score(y_valid, valid_pred, average='micro')
valid_recall = recall_score(y_valid, valid_pred, average='micro')
valid_accuracy = accuracy_score(y_valid, valid_pred)
valid_auc = roc_auc_score(y_valid, valid_proba, multi_class='ovr')

print('valid precision: ', valid_precision)
print('valid recall: ', valid_recall)
print('valid accuracy: ', valid_accuracy)
print('valid auc: ', valid_auc)
print(classification_report(y_valid, valid_pred))
"""

"\nfrom sklearn.metrics import precision_score, recall_score\nfrom sklearn.metrics import roc_auc_score, accuracy_score\nfrom sklearn.metrics import classification_report\n\nX_valid_dtm_manual = union.transform(df_valid)\nvalid_pred = nb.predict(X_valid_dtm_manual)\nvalid_proba = nb.predict_proba(X_valid_dtm_manual)\ny_valid = df_valid.Outcome\n\nvalid_precision = precision_score(y_valid, valid_pred, average='micro')\nvalid_recall = recall_score(y_valid, valid_pred, average='micro')\nvalid_accuracy = accuracy_score(y_valid, valid_pred)\nvalid_auc = roc_auc_score(y_valid, valid_proba, multi_class='ovr')\n\nprint('valid precision: ', valid_precision)\nprint('valid recall: ', valid_recall)\nprint('valid accuracy: ', valid_accuracy)\nprint('valid auc: ', valid_auc)\nprint(classification_report(y_valid, valid_pred))\n"

# Prediction

In [27]:
X_dtm_manual = union.fit_transform(df)
nb.fit(X_dtm_manual, df.Outcome)

MultinomialNB(alpha=0.23955062740611444)

In [28]:
file_test = '../input/bt5153-applied-machine-learning-2021-spring/test.csv'
df_test_pred = pd.read_csv(file_test)
df_test['Text_x'] = df_test_pred['Text']

In [29]:
for col in [x for x in df_test.columns if x!='Text_x']:
    df_test[col] = df_test[col].astype('float64') #transform to avoid error
X_test_dtm_manual = union.transform(df_test)

In [30]:
df_test_pred['Outcome'] = nb.predict(X_test_dtm_manual)
df_test_pred['Outcome'] = df_test_pred['Outcome'].astype('int64')
df_test_pred[['Id', 'Outcome']].to_csv('output.csv', index=False)

In [31]:
df_test_pred[['Id', 'Outcome']].head()

Unnamed: 0,Id,Outcome
0,955455,15
1,955456,15
2,955457,10
3,955458,15
4,955459,9


In [32]:
df_count = df_test_pred['Outcome'].value_counts().reset_index().set_index('index')
df_count

Unnamed: 0_level_0,Outcome
index,Unnamed: 1_level_1
15,316839
10,50418
9,41482
11,31993
6,25943
3,24601
4,21734
7,13006
2,9725
1,8020
