# Logistic regression

In [1]:
from sklearn.linear_model import LogisticRegression

import numpy as np
import pandas as pd
from helper_methods import load_data
from sklearn.metrics import log_loss
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import StratifiedKFold

In [2]:
from sklearn.preprocessing import StandardScaler
# One scaler instance for the whole notebook: it is handed to load_data as
# `standardizer` for the training load and again for the test load.
stdsc = StandardScaler()

In [3]:
# Load the training split. load_data returns four values: the feature frame,
# the labels, the row ids, and a collection of candidate feature sets
# (each one is later used to select columns of df_train).
(df_train, y_train, ids_train, features_set) = load_data(
    train=True,
    engineering=True,
    standardizer=stdsc
)

In [4]:
# 10-fold stratified splitter over the training labels; the same object is
# passed as `cv` to every grid search, so all feature sets are compared on
# one set of folds.
# shuffle=True without a seed draws different folds after every kernel
# restart, which makes the scores (and potentially the selected feature set)
# irreproducible. Pinning random_state keeps Restart & Run All deterministic.
# NOTE: the variable name is spelled `StatifiedCV` because the grid-search
# cell refers to it under that name.
StatifiedCV = StratifiedKFold(y_train, n_folds=10, shuffle=True, random_state=0)

In [5]:
scores = []
lg = LogisticRegression()
param_grid = dict(C=np.logspace(-3, 3, base=10, num=20), penalty=['l1', 'l2'])
for i, features in enumerate(features_set):
    grid = GridSearchCV(lg, param_grid, cv=StatifiedCV, scoring='log_loss')
    grid.fit(df_train[features].values, y_train)
    scores.append((-grid.best_score_, i, grid.best_params_))
    print i, grid.best_score_, grid.best_params_, len(features)

0 -0.489659942129 {'penalty': 'l2', 'C': 0.33598182862837811, 'fit_intercept': True} 3
1 -0.495913433924 {'penalty': 'l2', 'C': 0.69519279617756058, 'fit_intercept': True} 3
2 -0.491591196651 {'penalty': 'l2', 'C': 0.33598182862837811, 'fit_intercept': True} 3
3 -0.495551413688 {'penalty': 'l2', 'C': 0.69519279617756058, 'fit_intercept': True} 3
4 -0.502874494743 {'penalty': 'l2', 'C': 0.33598182862837811, 'fit_intercept': True} 3
5 -0.497922355845 {'penalty': 'l2', 'C': 0.33598182862837811, 'fit_intercept': True} 3
6 -0.503032121377 {'penalty': 'l1', 'C': 0.33598182862837811, 'fit_intercept': True} 3
7 -0.489099528039 {'penalty': 'l2', 'C': 0.33598182862837811, 'fit_intercept': True} 3
8 -0.502304219267 {'penalty': 'l2', 'C': 0.69519279617756058, 'fit_intercept': True} 3
9 -0.485204546694 {'penalty': 'l2', 'C': 0.33598182862837811, 'fit_intercept': True} 3
10 -0.486984799959 {'penalty': 'l2', 'C': 0.69519279617756058, 'fit_intercept': True} 3
11 -0.505314170156 {'penalty': 'l2', 'C': 

### Pick the best feature set and hyperparameters, then refit on the training set

In [7]:
# Tuples sort on their first element, so after sorting the entry with the
# lowest log loss comes first.
scores.sort()
best_entry = scores[0]

# Entry layout: (log loss, index into features_set, best grid-search params).
optimal_features = features_set[best_entry[1]]
optimal_params = best_entry[2]

# Apply the winning hyperparameters to the shared estimator and refit it on
# the selected feature columns of the training frame.
X_train_optimal = df_train[optimal_features].values
clf = lg.set_params(**optimal_params).fit(X_train_optimal, y_train)

LogisticRegression(C=0.33598182862837811, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [9]:
# Column names of the feature set chosen for the final model.
print optimal_features
# Every (log loss, feature-set index, best params) entry, ascending by loss.
print sorted(scores)

['months_since_last', 'number_of_donations', 'months_since_first', 'last_to_first']
[(0.48350128503398371, 20, {'penalty': 'l2', 'C': 0.33598182862837811, 'fit_intercept': True}), (0.48388542034873766, 36, {'penalty': 'l2', 'C': 0.33598182862837811, 'fit_intercept': True}), (0.48473058282673864, 22, {'penalty': 'l2', 'C': 0.33598182862837811, 'fit_intercept': True}), (0.48478468730900887, 41, {'penalty': 'l2', 'C': 0.16237767391887209, 'fit_intercept': True}), (0.48497206286307182, 37, {'penalty': 'l2', 'C': 0.16237767391887209, 'fit_intercept': True}), (0.48520454669431451, 9, {'penalty': 'l2', 'C': 0.33598182862837811, 'fit_intercept': True}), (0.48532984544812341, 35, {'penalty': 'l2', 'C': 0.16237767391887209, 'fit_intercept': True}), (0.48649037529807376, 29, {'penalty': 'l2', 'C': 0.33598182862837811, 'fit_intercept': True}), (0.48665982984639888, 25, {'penalty': 'l2', 'C': 0.33598182862837811, 'fit_intercept': True}), (0.48697193822051549, 28, {'penalty': 'l1', 'C': 0.3359818286

### Load the test data, predict the donation probability, and write the submission CSV

In [10]:
# Load the test split with the same engineering flag and the same scaler
# instance that was passed for the training load. The label and feature-set
# slots of the returned tuple are bound to *_dummy names and not used below.
# NOTE(review): confirm load_data only applies (does not refit) the scaler
# when train=False, otherwise test statistics leak into the scaling.
(df_test, y_dummy, ids_test, features_set_dummy) = load_data(
    train=False,
    engineering=True,
    standardizer=stdsc
)

In [11]:
# Test feature matrix restricted to the columns the final model was fit on.
X_test = df_test[optimal_features].values

# Keep column 1 of the predicted probabilities (presumably the positive
# class -- confirm against clf.classes_).
y_pred_prob = clf.predict_proba(X_test)[:, 1]

# Output layout: an unnamed id column followed by the predicted probability.
# The column order is spelled out instead of relying on dict key ordering.
df_out = pd.DataFrame(
    {'': ids_test, 'Made Donation in March 2007': y_pred_prob},
    columns=['', 'Made Donation in March 2007']
)
df_out.to_csv('logistic_regression.csv', header=True, index=False)