In [2]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np

In [3]:
datafile_train=r'carvan_train.csv'
datafile_test=r'carvan_test.csv'
cd_train=pd.read_csv(datafile_train)
cd_test=pd.read_csv(datafile_test)

In [4]:
# Here all the columns in the data are stored as integers but that 
# does not mean they represent numeric information 
# look at the data dictionary and decide what columns should be considered 
# as categorical and treated accordingly 

# if we read carefully, variables referring to L0 and L2 seem categorical
# If we think some other vars also seem categorical in nature , feel free to create dummies for them 
# in this script however we are treating all the columns as numeric . You can improve on it.

# make sure that when you create dummies , 
# they get created in same count for both train and test set

# many of the columns already binary 0/1, most probably created as dummies 
# from some other original column. you can let them be as is

In [5]:
cd_train.isnull().sum().sum(),cd_test.isnull().sum().sum() # there are no missing values in the data 

(0, 0)

In [6]:
target='V86'

In [7]:
cd_train[target].value_counts()

V86
0    5474
1     348
Name: count, dtype: int64

In [9]:
x_train=cd_train.drop(target,axis=1)
y_train=cd_train[target]

In [10]:
from sklearn.linear_model import LogisticRegression
model=LogisticRegression(fit_intercept=True)


In [11]:
params={'penalty':['l1','l2'],
       'C':np.linspace(0.01,100,10),
       'class_weight':['balanced',None]}

In [12]:
from sklearn.model_selection import GridSearchCV


In [13]:
gs=GridSearchCV(model,cv=10,param_grid=params,n_jobs=-1,verbose=5,scoring='roc_auc')

In [14]:
gs.fit(x_train,y_train)

Fitting 10 folds for each of 40 candidates, totalling 400 fits


In [15]:
gs.best_estimator_

In [16]:
train_score=gs.best_estimator_.predict_proba(x_train)[:,1]

In [17]:
real=y_train

In [18]:
cutoffs=np.linspace(0.001,0.999,999)

In [19]:
from sklearn.metrics import fbeta_score

In [20]:
fbetas=[]

In [22]:
for cutoff in cutoffs:
    
    predicted=(train_score>cutoff).astype(int)
    
    fbetas.append(fbeta_score(y_train,predicted,beta=2))
    

TypeError: fbeta_score() got an unexpected keyword argument 'axis'

In [21]:
my_cutoff=cutoffs[fbetas==max(fbetas)]

In [22]:
predictions=(gs.predict_proba(cd_test)[:,1]>my_cutoff).astype(int)

In [23]:
pd.Series(predictions).value_counts()

0    3012
1     988
dtype: int64

In [24]:
submissions=pd.DataFrame({'V86':predictions})

In [26]:
submissions.to_csv('sample_submission.csv',index=False)