# Deciding to mail or not to mail

In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from sklearn.metrics import fbeta_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the training and test splits from CSV into DataFrames.
# NOTE(review): both paths are absolute and user-specific, so this cell only
# runs on a machine with the same directory layout.
datafile_train = r'/Users/Namdeo/Downloads/carvan_train.csv'
datafile_test = r'/Users/Namdeo/Downloads/carvan_test.csv'

cd_train, cd_test = (pd.read_csv(csv_path) for csv_path in (datafile_train, datafile_test))

In [3]:
# Overview of columns: display the first five rows of the training frame.
cd_train.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V77,V78,V79,V80,V81,V82,V83,V84,V85,V86
0,33,1,3,2,8,0,5,1,3,7,...,0,0,0,1,0,0,0,0,0,0
1,37,1,2,2,8,1,4,1,4,6,...,0,0,0,1,0,0,0,0,0,0
2,37,1,2,2,8,0,4,2,4,3,...,0,0,0,1,0,0,0,0,0,0
3,9,1,3,3,3,2,3,2,4,5,...,0,0,0,1,0,0,0,0,0,0
4,40,1,4,2,10,1,4,1,4,7,...,0,0,0,1,0,0,0,0,0,0


In [4]:
# Inspect the dtype of every column in the training frame.
cd_train.dtypes

V1     int64
V2     int64
V3     int64
V4     int64
V5     int64
       ...  
V82    int64
V83    int64
V84    int64
V85    int64
V86    int64
Length: 86, dtype: object

In [5]:
# According to the column descriptions, V1 and V5 are categorical codes
# rather than quantities, so convert them to pandas categoricals.
# The test columns reuse the categories observed in training so that both
# frames later expand into the same set of dummy columns (and drop the same
# baseline level); a test value never seen in training becomes NaN, i.e. no
# dummy is set for it.
for col in ['V1', 'V5']:
    cd_train[col] = pd.Categorical(cd_train[col])
    cd_test[col] = pd.Categorical(cd_test[col],
                                  categories=cd_train[col].cat.categories)

In [6]:
# One-hot encode the categorical columns, dropping the first level of each
# as the baseline. The dummies are placed in front of the remaining columns
# and the original column is removed.
# `axis` / `columns` are passed by keyword: the positional form used before
# was deprecated and is rejected by pandas 2.x.
for col in ['V1', 'V5']:

    temp = pd.get_dummies(cd_train[col], prefix=col, drop_first=True)
    cd_train = pd.concat([temp, cd_train], axis=1).drop(columns=[col])

    temp = pd.get_dummies(cd_test[col], prefix=col, drop_first=True)
    cd_test = pd.concat([temp, cd_test], axis=1).drop(columns=[col])

In [7]:
# Checking null values: total count of missing cells as a (train, test) pair.
tuple(frame.isnull().sum().sum() for frame in (cd_train, cd_test))

(0, 0)

# Preparing data to fit the model

In [8]:
# Name of the column used as the model's target (y).
target='V86'

In [9]:
# Class balance of the target in the training data.
cd_train[target].value_counts()

0    5474
1     348
Name: V86, dtype: int64

In [10]:
# Split the training frame into the feature matrix and the target vector.
# `columns=` replaces the positional axis argument, which pandas 2.x rejects.
x_train = cd_train.drop(columns=target)
y_train = cd_train[target]

# Fitting the model

In [11]:
# Hyper-parameter grid for the search: penalty type, regularisation
# parameter C (10 evenly spaced values on [0.01, 100]) and whether class
# weights are balanced or left at the default.
params = dict(
    penalty=['l1', 'l2'],
    C=np.linspace(0.01, 100, 10),
    class_weight=['balanced', None],
)

In [13]:
# Base estimator for the grid search.
# The solver is pinned to liblinear: the grid contains both 'l1' and 'l2'
# penalties, and scikit-learn's current default solver (lbfgs) cannot fit
# 'l1'. liblinear is also what the old implicit default resolved to, so the
# search behaves the same on older versions.
model = LogisticRegression(fit_intercept=True, solver='liblinear')

In [14]:
# 10-fold cross-validated grid search over `params`, scored by ROC AUC,
# with n_jobs=-1 for parallel fitting and verbose progress output.
gs = GridSearchCV(
    estimator=model,
    param_grid=params,
    scoring='roc_auc',
    cv=10,
    n_jobs=-1,
    verbose=5,
)

In [15]:
# Run the grid search on the training data (400 fits; about 25 minutes in
# the recorded run below).
gs.fit(x_train,y_train)

Fitting 10 folds for each of 40 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    8.6s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  8.2min
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed: 16.8min
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed: 25.2min finished


GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'C': array([1.000e-02, 1.112e+01, 2.223e+01, 3.334e+01, 4.445e+01, 5.556e+01,
       6.667e+01, 7.778e+01, 8.889e+01, 1.000e+02]),
                         'class_weight': ['balanced', None],
                         'penalty': ['l1', 'l2']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,


In [16]:
# Show the estimator refit with the best-scoring parameter combination.
gs.best_estimator_

LogisticRegression(C=0.01, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [17]:
def report(results, n_top=3):
    """Print the top-ranked candidates from a grid-search results mapping.

    Parameters
    ----------
    results : mapping
        Must provide 'rank_test_score', 'mean_test_score', 'std_test_score'
        and 'params', indexed by candidate position.
    n_top : int, default 3
        Highest rank to print; ranks 1..n_top are reported, and every
        candidate sharing a rank is printed.
    """
    ranks = results['rank_test_score']
    for rank in range(1, n_top + 1):
        for idx in np.flatnonzero(ranks == rank):
            mean_score = results['mean_test_score'][idx]
            std_score = results['std_test_score'][idx]
            print(f"Model with rank: {rank}")
            print(f"Mean validation score: {mean_score:.5f} (std: {std_score:.5f})")
            print(f"Parameters: {results['params'][idx]}")
            print()

In [18]:
# Print the parameter combinations ranked 1 to 5 by the search.
report(gs.cv_results_,5)

Model with rank: 1
Mean validation score: 0.74388 (std: 0.03516)
Parameters: {'C': 0.01, 'class_weight': 'balanced', 'penalty': 'l2'}

Model with rank: 2
Mean validation score: 0.73808 (std: 0.03749)
Parameters: {'C': 0.01, 'class_weight': None, 'penalty': 'l2'}

Model with rank: 3
Mean validation score: 0.73052 (std: 0.04303)
Parameters: {'C': 0.01, 'class_weight': 'balanced', 'penalty': 'l1'}

Model with rank: 4
Mean validation score: 0.72505 (std: 0.02819)
Parameters: {'C': 11.12, 'class_weight': None, 'penalty': 'l2'}

Model with rank: 5
Mean validation score: 0.72412 (std: 0.02872)
Parameters: {'C': 11.12, 'class_weight': None, 'penalty': 'l1'}



In [19]:
# Predicted probability of class 1 (second predict_proba column) for every
# training row, from the best estimator.
train_score=gs.best_estimator_.predict_proba(x_train)[:,1]

In [20]:
# Alias for the true training labels.
# NOTE(review): `real` is not referenced by any later cell visible here.
real=y_train

In [21]:
# Candidate probability thresholds: 999 evenly spaced values from 0.001 to 0.999.
cutoffs = np.linspace(start=0.001, stop=0.999, num=999)

In [22]:
# Accumulator for the F-beta score obtained at each cutoff.
fbetas=[]

In [23]:
# Score every candidate cutoff on the training predictions with F-beta at
# beta=2: a row is predicted positive when its score exceeds the cutoff.
# The list is rebuilt from scratch so re-running this cell cannot append a
# second set of scores, and `beta` is passed by keyword because current
# scikit-learn no longer accepts it positionally.
fbetas = [
    fbeta_score(y_train, (train_score > cutoff).astype(int), beta=2)
    for cutoff in cutoffs
]

In [24]:
# Pick the cutoff with the highest F-beta score as a single scalar.
# argmax returns the first maximum, so ties cannot produce several cutoffs,
# and it avoids comparing a Python list to a scalar, which only yields a
# boolean mask when the list elements happen to be NumPy scalars.
my_cutoff = cutoffs[np.argmax(fbetas)]

In [25]:
# Score the test set and apply the chosen cutoff to get 0/1 predictions.
# The test frame is indexed by the training feature names so the model sees
# the same columns in the same order it was fitted on; a feature missing
# from the test data raises a KeyError instead of being silently misaligned.
predictions = (gs.predict_proba(cd_test[x_train.columns])[:, 1] > my_cutoff).astype(int)

In [26]:
# Wrap the test predictions in a one-column frame named 'V86'.
score=pd.DataFrame({'V86':predictions})