In [21]:
from typing import Dict, List, Optional, Tuple
import os
import numpy as np
import matplotlib.pyplot as plt
import utils
from scipy import stats

seed = 0
np.random.seed(seed)

from utils import TrainAndTestData, empirical_err
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier


In [44]:
Xr, yr = utils.read_sms_spam_data(os.path.join(os.getcwd(), 'data/SMSSpamCollection'))
spamdata = TrainAndTestData(*utils.create_split(Xr, yr, 0.7))

In [37]:
from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.model_selection import GridSearchCV

Vectorizer = utils.SMS_Vectorizer(spamdata.X_train, d=360)
X = Vectorizer.vectorize(Xr)
y = yr
# vectorized_data = TrainAndTestData(Vectorizer.vectorize(spamdata.X_train), spamdata.y_train, Vectorizer.vectorize(spamdata.X_test), spamdata.y_test)
param_grid = [
    {'max_depth': range(5, 30, 2), 
     'max_leaf_nodes': range(600, 1501, 100),
     'criterion': ["gini", "entropy", "log_loss"],
     "max_features": ["sqrt", "log2", None]
    }
]
base_estimator = DecisionTreeClassifier()
sh = GridSearchCV(base_estimator, param_grid, cv = 5, verbose = 3 ,error_score="raise").fit(X, y)

Fitting 5 folds for each of 1170 candidates, totalling 5850 fits
[CV 1/5] END criterion=gini, max_depth=5, max_features=sqrt, max_leaf_nodes=600;, score=0.923 total time=   0.0s
[CV 2/5] END criterion=gini, max_depth=5, max_features=sqrt, max_leaf_nodes=600;, score=0.929 total time=   0.0s
[CV 3/5] END criterion=gini, max_depth=5, max_features=sqrt, max_leaf_nodes=600;, score=0.908 total time=   0.0s
[CV 4/5] END criterion=gini, max_depth=5, max_features=sqrt, max_leaf_nodes=600;, score=0.903 total time=   0.0s
[CV 5/5] END criterion=gini, max_depth=5, max_features=sqrt, max_leaf_nodes=600;, score=0.903 total time=   0.0s
[CV 1/5] END criterion=gini, max_depth=5, max_features=sqrt, max_leaf_nodes=700;, score=0.901 total time=   0.0s
[CV 2/5] END criterion=gini, max_depth=5, max_features=sqrt, max_leaf_nodes=700;, score=0.896 total time=   0.0s
[CV 3/5] END criterion=gini, max_depth=5, max_features=sqrt, max_leaf_nodes=700;, score=0.879 total time=   0.0s
[CV 4/5] END criterion=gini, ma

In [38]:
import pandas as pd
df = pd.DataFrame(sh.cv_results_)
df.sort_values(by="rank_test_score")
df.head()

In [43]:


clf = sh.best_estimator_


vectorized_public_X = Vectorizer.vectorize(spamdata.X_test)
public_y = spamdata.y_test
private_X, private_y = utils.read_sms_spam_data(os.path.join(os.getcwd(), 'data/SMSSpamCollection_test_text'))
vectorized_private_X = Vectorizer.vectorize(private_X)
X = np.concatenate([vectorized_public_X, vectorized_private_X])
pred_y = clf.predict(X)

with open('prediction.csv', 'w') as f:
    f.write('ID,LABEL\n')
    for i, y in enumerate(pred_y):
        f.write(f"{i},{str(y)}\n")


(1833, 360)


In [51]:
vectorized_public_X = Vectorizer.vectorize(spamdata.X_test)
public_y = spamdata.y_test
private_X, private_y = utils.read_sms_spam_data(os.path.join(os.getcwd(), 'data/SMSSpamCollection_test_text'))
vectorized_private_X = Vectorizer.vectorize(private_X)
X = np.concatenate([vectorized_public_X, vectorized_private_X])

est = DecisionTreeClassifier(max_depth=13, max_leaf_nodes=1000)
est.fit(Vectorizer.vectorize(spamdata.X_train), spamdata.y_train)
pred_y = est.predict(X)


In [53]:
with open('prediction.csv', 'w') as f:
    f.write('ID,LABEL\n')
    for i, y in enumerate(pred_y):
        f.write(f"{i},{str(y)}\n")