In [1]:
import pandas as pd
import numpy as np
import re
import datetime
from dateutil.relativedelta import relativedelta
from datetime import date
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
%matplotlib inline

# Importing Data set

In [2]:
data_train=pd.read_csv('Data\Consumer_Complaints_train.csv')

In [3]:
data_test=pd.read_csv('Data\Consumer_Complaints_test_share.csv')

In [4]:
data_test['Consumer disputed?']=np.nan

In [5]:
data_train['data'] = 'train'
data_test['data'] = 'test'
data_test=data_test[data_train.columns] # the columns in the two data frames should be in the same order to enable concatenation
data_all=pd.concat([data_train,data_test],axis=0) 

# Removing Garbage columns

In [6]:
garbage_cols=data_all.isnull().sum()[data_all.isnull().sum()>(0.5*data_all.shape[0])].keys() 

In [7]:
data_all.drop(columns=garbage_cols,axis=1,inplace=True)

In [8]:
data_all.drop(columns=["Complaint ID"],inplace=True)

# Imputing Missing Values

In [9]:
data_all["Sub-product"]=np.where(data_all["Sub-product"].isnull(),"not_available",data_all["Sub-product"])

In [10]:
data_all["State"]=np.where(data_all["State"].isnull(),"not_available",data_all["State"])

In [11]:
data_all["ZIP code"]=np.where(data_all["ZIP code"].isnull(),"not_available",data_all["ZIP code"])

In [12]:
data_all["Submitted via"]=np.where(data_all["Submitted via"].isnull(),data_all["Submitted via"].mode()[0],data_all["Submitted via"])

# Formatting date

In [13]:
d1=pd.to_datetime(data_all["Date received"])
d2=pd.to_datetime(data_all["Date sent to company"])
data_all["resolution_gap_days"]=(d2-d1)/np.timedelta64(1,'D')
data_all.drop(columns =["Date received","Date sent to company"], inplace = True)

# Creating Dummies

In [4]:
def dummies(data,var,freq_cutoff=0):
    t=data[var].value_counts(normalize=True)  #get value counts of all the unique data in specified feature variable 
    t=t[t.values>freq_cutoff] #filter the values having count less than specified cutoff
    t=t.sort_values() #sort according to the value counts
    t_min=t.idxmin() # get the data having minimum count 
    t=t.drop([t_min]) # drop that data as we make n-1 dummies 
    categories=t.index # making dummies for rest unique data values

    for cat in categories :
        name=var+'_'+cat
        name=re.sub(" ","",name) 
        name=re.sub("-","_",name)
        name=re.sub("\\?","Q",name) 
        name=re.sub("<","LT_",name) 
        name=re.sub("\\+","",name) 
        name=re.sub("\\/","_",name) 
        name=re.sub(">","GT_",name) 
        name=re.sub("=","EQ_",name)
        name=re.sub(",","",name)
        data[name]=(data[var]==cat)+0 
               
    data=data.drop(columns=[var]) #dropping original feature variable after making its dummies
    return data

In [15]:
data_all=dummies(data_all,"Product",0.03)
data_all=dummies(data_all,"Sub-product",0.05)
data_all=dummies(data_all,"Issue",0.02)
data_all=dummies(data_all,"Company",0.02)
data_all=dummies(data_all,"State",0.04)
data_all=dummies(data_all,"ZIP code",0.002)
data_all=dummies(data_all,"Submitted via",0.05)
data_all=dummies(data_all,"Company response to consumer",0.05)
data_all=dummies(data_all,"Timely response?")

In [16]:
data_train=data_all[data_all['data']=='train'] 
del data_train['data'] 
data_test=data_all[data_all['data']=='test']
data_test.drop(['Consumer disputed?','data'],axis=1,inplace=True) 

In [17]:
train1, train2 = train_test_split(data_train, test_size = 0.2,random_state=2)

x_train1=train1.drop(["Consumer disputed?"],1)
y_train1=train1["Consumer disputed?"]

x_train2=train2.drop(["Consumer disputed?"],1)
y_train2=train2["Consumer disputed?"]

x_train1.reset_index(drop=True,inplace=True)
y_train1.reset_index(drop=True,inplace=True)

In [18]:
x_train1params={'n_estimators':[100,200,500,700],
       'criterion':['gini','entropy'],
       'min_samples_split':[5,7,9],
       'bootstrap':[True],
        'n_jobs':[-1],
       'max_depth':[5,10,20],
       'max_features':[5,10,20,30,40], 
       'min_samples_leaf':[6,8,10]}

In [19]:
5*2*5*2*5*7*6

21000

In [20]:
clf = RandomForestClassifier(class_weight='balanced')

In [21]:
random_search = RandomizedSearchCV(clf, param_distributions=x_train1params,
                                   n_iter=150)

In [22]:
random_search.fit(x_train1, y_train1)



RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    class_weight='balanced',
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    n_estimators='warn',
                                                    

In [30]:
rf=random_search.best_estimator_
rf

RandomForestClassifier(bootstrap=True, class_weight='balanced',
                       criterion='gini', max_depth=5, max_features=40,
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=10,
                       min_samples_split=5, min_weight_fraction_leaf=0.0,
                       n_estimators=700, n_jobs=-1, oob_score=False,
                       random_state=None, verbose=0, warm_start=False)

In [31]:
rf.fit(x_train1,y_train1)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
                       criterion='gini', max_depth=5, max_features=40,
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=10,
                       min_samples_split=5, min_weight_fraction_leaf=0.0,
                       n_estimators=700, n_jobs=-1, oob_score=False,
                       random_state=None, verbose=0, warm_start=False)

In [33]:
rf.classes_

array(['No', 'Yes'], dtype=object)

In [34]:
train1_score=rf.predict_proba(x_train1)[:,1]
train1_score

array([0.57690196, 0.40671795, 0.52087031, ..., 0.48828955, 0.38531605,
       0.48716749])

In [36]:
train2_score=rf.predict_proba(x_train2)[:,1]
train2_score

array([0.33398213, 0.39484201, 0.57146849, ..., 0.58341717, 0.4964964 ,
       0.57703189])

In [37]:
train1_classes=(train1_score>0.5).astype(int)
y_train1_classes=np.where(y_train1=="Yes",1,0)
train1_score[train1_score>0.5].size*100/train1_score.size #percent of Yes in train 1

49.54145938714937

In [38]:
train2_classes=(train2_score>0.5).astype(int)
y_train2_classes=np.where(y_train2=="Yes",1,0)
train2_score[train2_score>0.5].size*100/train2_score.size #percent of Yes in train 2

49.28045148142342

In [40]:

from sklearn.metrics import roc_auc_score

In [41]:
roc_auc_score(y_train1, train1_classes) #auc score of train 1

0.5777269278863529

In [42]:
roc_auc_score(y_train2, train2_classes) #auc score of train 2

0.578932276110413

In [43]:
test_score=rf.predict_proba(data_test)[:,1]

In [44]:
test_score
test_classes=np.where(test_score>0.5,"Yes","No")

In [52]:
test_classes[test_classes=="No"].size

60521

In [32]:
train1['Consumer disputed?'].value_counts()

No     301424
Yes     81312
Name: Consumer disputed?, dtype: int64

In [57]:
pd.DataFrame(test_classes).to_csv("laveena_valecha_project_RF.csv",header="Consumer disputed?",index=False)