In [1]:
from libs.best_performance import *
import pandas as pd

In [2]:
train_df = pd.read_csv("../data/train.csv")
test_df = pd.read_csv("../data/test.csv")

# Find the best model and preprocessing method

In [14]:
params = {
        # ["ridge", "logistic", "random_forest", 'gradient_boosting', 'NN', 'gaussian_process', 'Knn', 'ada_boost']
        'model_name': ["ridge", "logistic"],
        'mix_texts': [False],     # [True, False]
        'clean_texts': [False],   # [True, False]
        'vectorization': ["simple"],     # ["simple", "tfidf"]
        'use_LSA': [False]        # [True, False]
}


best_performance(train_df,test_df,params)

Training model:  ridge
Training model:  logistic


Unnamed: 0,f1_mean,f1_std,model_name,mix_texts,clean_texts,vectorization,use_LSA
0,0.645029,0.028668,logistic,False,False,simple,False
1,0.600088,0.031682,ridge,False,False,simple,False


# Find the best params for the model

In [5]:
train_vectors, test_vectors = preprocess(train_df,
                                         test_df,
                                         params['mix_texts'],
                                         params['clean_texts'],
                                         params['vectorization'],
                                         params['use_LSA'])

In [6]:
X = train_vectors
y = train_df["target"]

In [24]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.model_selection import GridSearchCV
import numpy as np


pipe = Pipeline([('classifier' , RidgeClassifier())])

# Create param grid.

param_grid = [
    {'classifier' : [LogisticRegression()],
     'classifier__penalty' : ['l1', 'l2'],
    'classifier__C' : np.logspace(-4, 4, 20),
    'classifier__solver' : ['liblinear']},
    {'classifier' : [RidgeClassifier()],
    'classifier__alpha' : [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]}
]

# Create grid search object

clf = GridSearchCV(pipe, param_grid = param_grid, cv = 5, verbose=True, n_jobs=-1)

# Fit on data

best_clf = clf.fit(X, y)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


In [25]:
# summarize results
print("Best: %f using %s" % (best_clf.best_score_, best_clf.best_params_))
# means = best_clf.cv_results_['mean_test_score']
# stds = best_clf.cv_results_['std_test_score']
# params = best_clf.cv_results_['params']
# for mean, stdev, param in zip(means, stds, params):
#     print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.645347 using {'classifier': LogisticRegression(C=0.03359818286283781, solver='liblinear'), 'classifier__C': 0.03359818286283781, 'classifier__penalty': 'l2', 'classifier__solver': 'liblinear'}


# Apply them

In [29]:
clf = LogisticRegression(C=0.03359818286283781, solver='liblinear', penalty = 'l2')
clf.fit(X, y)
sample_submission = submission(clf, test_vectors)

   id  target
0   0       0
1   2       0
2   3       0
3   9       0
4  11       0


In [32]:
visual_check(test_df.text, sample_submission["target"].astype(bool))

                                                   text  is_disaster
1585  Tractor-trailers collide on NJ Turnpike å_ 42 ...         True
39    On the #M42 northbound between junctions J3 an...         True
2077  The same UN who stood&amp;watched the Serbs ma...         True
834   @Loraccee @JoeNBC @msnbc Still gets crushed by...         True
350   .@WestmdCountyPA land bank targets first #Latr...         True
                                                   text  is_disaster
2004  It doesn't get any closer. Heavy rain just bar...        False
1365  #IWouldntGetElectedBecause Oh I certainly woul...        False
1066  Madhya Pradesh Train Derailment: Village Youth...        False
724   That @PPact hasn't already collapsed is a test...        False
1230  Not an electric debut for Severino but not a d...        False
