In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier

In [2]:
train = 'G:\\Data Science\\Python\\Project\\Project 1\\Consumer_Complaints_train.csv'
test = 'G:\\Data Science\\Python\\Project\\Project 1\\Consumer_Complaints_test_share.csv'

In [4]:
d_train = pd.read_csv(train)
d_test = pd.read_csv(test)

KeyError: 'Complaint ID'

In [6]:
d_train.dtypes

Date received                   object
Product                         object
Sub-product                     object
Issue                           object
Sub-issue                       object
Consumer complaint narrative    object
Company public response         object
Company                         object
State                           object
ZIP code                        object
Tags                            object
Consumer consent provided?      object
Submitted via                   object
Date sent to company            object
Company response to consumer    object
Timely response?                object
Consumer disputed?              object
Complaint ID                     int64
dtype: object

In [7]:
d_train.drop(['Complaint ID'],axis=1,inplace=True)
d_test.drop(['Complaint ID'],axis=1,inplace=True)

In [8]:
for col in ['Date received','Date sent to company']:
    d_train[col] = pd.to_datetime(d_train[col],infer_datetime_format=True)
    d_test[col] = pd.to_datetime(d_test[col],infer_datetime_format=True)

In [9]:
d_train['day_diff'] = pd.to_numeric(d_train['Date sent to company'])-pd.to_numeric(d_train['Date received'])
d_test['day_diff'] = pd.to_numeric(d_test['Date sent to company'])-pd.to_numeric(d_test['Date received'])

In [10]:
d_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 478421 entries, 0 to 478420
Data columns (total 18 columns):
 #   Column                        Non-Null Count   Dtype         
---  ------                        --------------   -----         
 0   Date received                 478421 non-null  datetime64[ns]
 1   Product                       478421 non-null  object        
 2   Sub-product                   339948 non-null  object        
 3   Issue                         478421 non-null  object        
 4   Sub-issue                     185796 non-null  object        
 5   Consumer complaint narrative  75094 non-null   object        
 6   Company public response       90392 non-null   object        
 7   Company                       478421 non-null  object        
 8   State                         474582 non-null  object        
 9   ZIP code                      474573 non-null  object        
 10  Tags                          67206 non-null   object        
 11  Consumer cons

In [11]:
d_train['day_diff']

0           86400000000000
1          518400000000000
2         1814400000000000
3                        0
4          691200000000000
                ...       
478416      86400000000000
478417      86400000000000
478418                   0
478419     259200000000000
478420                   0
Name: day_diff, Length: 478421, dtype: int64

In [12]:
for col in ['Date received','Date sent to company']:
    d_train.drop([col],1,inplace=True)
    d_test.drop([col],1,inplace=True)

In [13]:
d_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 478421 entries, 0 to 478420
Data columns (total 16 columns):
 #   Column                        Non-Null Count   Dtype 
---  ------                        --------------   ----- 
 0   Product                       478421 non-null  object
 1   Sub-product                   339948 non-null  object
 2   Issue                         478421 non-null  object
 3   Sub-issue                     185796 non-null  object
 4   Consumer complaint narrative  75094 non-null   object
 5   Company public response       90392 non-null   object
 6   Company                       478421 non-null  object
 7   State                         474582 non-null  object
 8   ZIP code                      474573 non-null  object
 9   Tags                          67206 non-null   object
 10  Consumer consent provided?    135487 non-null  object
 11  Submitted via                 478421 non-null  object
 12  Company response to consumer  478421 non-null  object
 13 

In [14]:
for col in d_train.select_dtypes(['object']).columns:
    print(col,':',d_train[col].nunique())

Product : 12
Sub-product : 47
Issue : 95
Sub-issue : 68
Consumer complaint narrative : 74019
Company public response : 10
Company : 3276
State : 62
ZIP code : 25962
Tags : 3
Consumer consent provided? : 4
Submitted via : 6
Company response to consumer : 7
Timely response? : 2
Consumer disputed? : 2


In [15]:
d_train.isnull().sum()

Product                              0
Sub-product                     138473
Issue                                0
Sub-issue                       292625
Consumer complaint narrative    403327
Company public response         388029
Company                              0
State                             3839
ZIP code                          3848
Tags                            411215
Consumer consent provided?      342934
Submitted via                        0
Company response to consumer         0
Timely response?                     0
Consumer disputed?                   0
day_diff                             0
dtype: int64

In [16]:
len(pd.isnull(d_train['Tags']))

478421

In [17]:
for col in ['Sub-product','Sub-issue','Consumer complaint narrative',
            'Company public response','Tags','Consumer consent provided?']:
    varname=col.replace('-','_').replace('?','').replace(" ",'_')+'_isNan'
    d_train[varname]=np.where(pd.isnull(d_train[col]),1,0)
    d_train.drop([col],1,inplace=True)
    d_test[varname]=np.where(pd.isnull(d_test[col]),1,0)
    d_test.drop([col],1,inplace=True)

In [18]:
d_train.head()

Unnamed: 0,Product,Issue,Company,State,ZIP code,Submitted via,Company response to consumer,Timely response?,Consumer disputed?,day_diff,Sub_product_isNan,Sub_issue_isNan,Consumer_complaint_narrative_isNan,Company_public_response_isNan,Tags_isNan,Consumer_consent_provided_isNan
0,Credit card,Billing statement,Wells Fargo & Company,MI,48342,Web,Closed with explanation,Yes,No,86400000000000,1,1,1,1,0,1
1,Bank account or service,"Making/receiving payments, sending money",Santander Bank US,PA,18042,Referral,Closed,Yes,No,518400000000000,0,1,1,1,1,1
2,Credit reporting,Incorrect information on credit report,Equifax,CA,92427,Referral,Closed with non-monetary relief,Yes,No,1814400000000000,1,0,1,1,1,1
3,Credit card,Billing statement,U.S. Bancorp,GA,305XX,Web,Closed with monetary relief,Yes,No,0,1,1,0,0,0,0
4,Credit card,Transaction issue,Bank of America,MA,02127,Web,Closed with explanation,Yes,No,691200000000000,1,1,1,1,1,1


In [19]:
d_test.head()

Unnamed: 0,Product,Issue,Company,State,ZIP code,Submitted via,Company response to consumer,Timely response?,day_diff,Sub_product_isNan,Sub_issue_isNan,Consumer_complaint_narrative_isNan,Company_public_response_isNan,Tags_isNan,Consumer_consent_provided_isNan
0,Bank account or service,Deposits and withdrawals,Bank of America,CA,95691,Web,Closed with explanation,Yes,-86400000000000,0,1,1,1,1,1
1,Debt collection,Cont'd attempts collect debt not owed,"National Credit Adjusters, LLC",FL,32086,Web,Closed with explanation,Yes,0,0,0,1,1,1,0
2,Mortgage,"Loan servicing, payments, escrow account",Wells Fargo & Company,CA,94618,Web,Closed without relief,Yes,86400000000000,0,1,1,1,1,1
3,Credit reporting,Unable to get credit report/credit score,"TransUnion Intermediate Holdings, Inc.",FL,33584,Postal mail,Closed with non-monetary relief,Yes,432000000000000,1,0,1,0,0,1
4,Mortgage,"Loan modification,collection,foreclosure",Bank of America,FL,33543,Web,Closed with explanation,Yes,0,0,1,1,1,1,1


In [20]:
for col in d_train.select_dtypes(['object']).columns:
    print(col,':',d_train[col].nunique())

Product : 12
Issue : 95
Company : 3276
State : 62
ZIP code : 25962
Submitted via : 6
Company response to consumer : 7
Timely response? : 2
Consumer disputed? : 2


In [21]:
for col in ['ZIP code','Company']:
    d_train.drop([col],1,inplace=True)
    d_test.drop([col],1,inplace=True)

In [22]:
d_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 478421 entries, 0 to 478420
Data columns (total 14 columns):
 #   Column                              Non-Null Count   Dtype 
---  ------                              --------------   ----- 
 0   Product                             478421 non-null  object
 1   Issue                               478421 non-null  object
 2   State                               474582 non-null  object
 3   Submitted via                       478421 non-null  object
 4   Company response to consumer        478421 non-null  object
 5   Timely response?                    478421 non-null  object
 6   Consumer disputed?                  478421 non-null  object
 7   day_diff                            478421 non-null  int64 
 8   Sub_product_isNan                   478421 non-null  int32 
 9   Sub_issue_isNan                     478421 non-null  int32 
 10  Consumer_complaint_narrative_isNan  478421 non-null  int32 
 11  Company_public_response_isNan       478

In [23]:
d_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119606 entries, 0 to 119605
Data columns (total 13 columns):
 #   Column                              Non-Null Count   Dtype 
---  ------                              --------------   ----- 
 0   Product                             119606 non-null  object
 1   Issue                               119606 non-null  object
 2   State                               118681 non-null  object
 3   Submitted via                       119605 non-null  object
 4   Company response to consumer        119606 non-null  object
 5   Timely response?                    119606 non-null  object
 6   day_diff                            119606 non-null  int64 
 7   Sub_product_isNan                   119606 non-null  int32 
 8   Sub_issue_isNan                     119606 non-null  int32 
 9   Consumer_complaint_narrative_isNan  119606 non-null  int32 
 10  Company_public_response_isNan       119606 non-null  int32 
 11  Tags_isNan                          119

In [24]:
d_train['Consumer disputed?']=np.where(d_train['Consumer disputed?']=="Yes",1,0)

In [25]:
k=d_train['Issue'].value_counts()
for val in k.axes[0][0:10]:
    varname='Issue_'+val.replace(',','_').replace(' ','_')
    d_train[varname]=np.where(d_train['Issue']==val,1,0)
    d_test[varname]=np.where(d_test['Issue']==val,1,0)
del d_train['Issue']
del d_test['Issue']

In [26]:
d_train.head()

Unnamed: 0,Product,State,Submitted via,Company response to consumer,Timely response?,Consumer disputed?,day_diff,Sub_product_isNan,Sub_issue_isNan,Consumer_complaint_narrative_isNan,...,Issue_Loan_modification_collection_foreclosure,Issue_Incorrect_information_on_credit_report,Issue_Loan_servicing__payments__escrow_account,Issue_Cont'd_attempts_collect_debt_not_owed,Issue_Account_opening__closing__or_management,Issue_Disclosure_verification_of_debt,Issue_Communication_tactics,Issue_Deposits_and_withdrawals,Issue_Application__originator__mortgage_broker,Issue_Billing_disputes
0,Credit card,MI,Web,Closed with explanation,Yes,0,86400000000000,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,Bank account or service,PA,Referral,Closed,Yes,0,518400000000000,0,1,1,...,0,0,0,0,0,0,0,0,0,0
2,Credit reporting,CA,Referral,Closed with non-monetary relief,Yes,0,1814400000000000,1,0,1,...,0,1,0,0,0,0,0,0,0,0
3,Credit card,GA,Web,Closed with monetary relief,Yes,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,Credit card,MA,Web,Closed with explanation,Yes,0,691200000000000,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [27]:
for col in d_train.select_dtypes(['object']).columns:
    print(col,':',d_train[col].nunique())

Product : 12
State : 62
Submitted via : 6
Company response to consumer : 7
Timely response? : 2


In [28]:
k=d_train['State'].value_counts()
for val in k.axes[0][0:15]:
    varname='State_'+val.replace(',','_').replace(' ','_')
    d_train[varname]=np.where(d_train['State']==val,1,0)
    d_test[varname]=np.where(d_test['State']==val,1,0)
del d_train['State']
del d_test['State']

In [29]:
for col in d_train.select_dtypes(['object']).columns:
    print(col,':',d_train[col].nunique())

Product : 12
Submitted via : 6
Company response to consumer : 7
Timely response? : 2


In [30]:
for col in ['Product','Submitted via','Company response to consumer','Timely response?']:
    
    temp=pd.get_dummies(d_train[col],prefix=col,drop_first=True)
    d_train=pd.concat([temp,d_train],1)
    d_train.drop([col],1,inplace=True)
    
    temp=pd.get_dummies(d_test[col],prefix=col,drop_first=True)
    d_test=pd.concat([temp,d_test],1)
    d_test.drop([col],1,inplace=True)

In [31]:
for col in d_train.columns:
    print(col,":", d_train[col].nunique())

Timely response?_Yes : 2
Company response to consumer_Closed with explanation : 2
Company response to consumer_Closed with monetary relief : 2
Company response to consumer_Closed with non-monetary relief : 2
Company response to consumer_Closed with relief : 2
Company response to consumer_Closed without relief : 2
Company response to consumer_Untimely response : 2
Submitted via_Fax : 2
Submitted via_Phone : 2
Submitted via_Postal mail : 2
Submitted via_Referral : 2
Submitted via_Web : 2
Product_Consumer Loan : 2
Product_Credit card : 2
Product_Credit reporting : 2
Product_Debt collection : 2
Product_Money transfers : 2
Product_Mortgage : 2
Product_Other financial service : 2
Product_Payday loan : 2
Product_Prepaid card : 2
Product_Student loan : 2
Product_Virtual currency : 2
Consumer disputed? : 2
day_diff : 398
Sub_product_isNan : 2
Sub_issue_isNan : 2
Consumer_complaint_narrative_isNan : 2
Company_public_response_isNan : 2
Tags_isNan : 2
Consumer_consent_provided_isNan : 2
Issue_Loan

In [32]:
d_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 478421 entries, 0 to 478420
Data columns (total 56 columns):
 #   Column                                                        Non-Null Count   Dtype
---  ------                                                        --------------   -----
 0   Timely response?_Yes                                          478421 non-null  uint8
 1   Company response to consumer_Closed with explanation          478421 non-null  uint8
 2   Company response to consumer_Closed with monetary relief      478421 non-null  uint8
 3   Company response to consumer_Closed with non-monetary relief  478421 non-null  uint8
 4   Company response to consumer_Closed with relief               478421 non-null  uint8
 5   Company response to consumer_Closed without relief            478421 non-null  uint8
 6   Company response to consumer_Untimely response                478421 non-null  uint8
 7   Submitted via_Fax                                             478421 non-n

In [33]:
ld_train, ld_test = train_test_split(d_train, test_size = 0.2,random_state=2)
x_train=ld_train.drop(['Consumer disputed?'],1)
y_train=ld_train["Consumer disputed?"]
x_test=ld_test.drop(["Consumer disputed?"],1)
y_test=ld_test["Consumer disputed?"]

In [34]:
from sklearn.linear_model import LogisticRegression

In [35]:
lrm = LogisticRegression(penalty='l2', max_iter=500,class_weight='balanced')

In [36]:
lrm.fit(x_train,y_train)

LogisticRegression(class_weight='balanced', max_iter=500)

In [37]:
AUC_test = lrm.predict(x_test)

In [38]:
from matplotlib import pyplot

In [39]:
score = roc_auc_score(y_test,AUC_test)

In [40]:
score

0.5290058854576669

In [41]:
clf = RandomForestClassifier(verbose=1,n_jobs=-1)

In [42]:
from time import time
from operator import itemgetter
from scipy.stats import randint as sp_randint

In [57]:

def report(results, n_top=3):
    for i in range(1, n_top + 1):
            candidate = np.flatnonzero(results['rank_test_score'] == i)[0]
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.5f} (std: {1:.5f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

In [44]:
param_dist = {"n_estimators":[10,100,500,700],
              "max_depth": [3,5, None],
              "max_features": sp_randint(5, 11),
              "min_samples_split": sp_randint(5, 11),
              "min_samples_leaf": sp_randint(5, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

In [45]:
n_iter_search = 20

In [46]:
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search)

In [47]:
random_search.fit(x_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    7.3s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   27.6s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  1.2min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.5s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.9s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    1.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   26.4s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   59.2s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  1.1min finished
[

[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    1.4s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    3.2s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    3.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   16.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  3.3min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    1.5s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    3.3s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    3.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   18.3s
[Par

[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    5.8s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    5.8s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    5.9s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.6s
[Parallel(n_jobs=8)]: Done 700 out of 700 | elapsed:    1.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   17.6s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   40.5s
[Parallel(n_jobs=-1)]: Done 700 out of 700 | elapsed:  1.1min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.6s
[Parallel(n_jobs=8)]: Done 700 out of 700 | elapsed:    1.1s finished
[

[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.6s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    0.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   11.1s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   26.0s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:   29.8s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.6s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    0.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.4s
[Par

[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    5.6s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    2.2s remaining:    1.5s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    2.7s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   6 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=8)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    2.2s remaining:    1.4s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    2.7s finished
[Parallel(n_jobs=8)]: Using back

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.8s
[Parallel(n_jobs=8)]: Done 700 out of 700 | elapsed:    1.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   22.7s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   52.7s
[Parallel(n_jobs=-1)]: Done 700 out of 700 | elapsed:  1.4min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.7s
[Parallel(n_jobs=8)]: Done 700 out of 700 | elapsed:    1.1s finished
[

RandomizedSearchCV(estimator=RandomForestClassifier(n_jobs=-1, verbose=1),
                   n_iter=20,
                   param_distributions={'bootstrap': [True, False],
                                        'criterion': ['gini', 'entropy'],
                                        'max_depth': [3, 5, None],
                                        'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000018A40452EE0>,
                                        'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000018A402D8070>,
                                        'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000018A40452A60>,
                                        'n_estimators': [10, 100, 500, 700]})

In [58]:
grid_search.best_estimator_

NameError: name 'grid_search' is not defined

In [60]:
report(random_search.cv_results_,3)

Model with rank: 1
Mean validation score: 0.78771 (std: 0.00008)
Parameters: {'bootstrap': True, 'criterion': 'gini', 'max_depth': None, 'max_features': 7, 'min_samples_leaf': 7, 'min_samples_split': 9, 'n_estimators': 500}

Model with rank: 2
Mean validation score: 0.78767 (std: 0.00006)
Parameters: {'bootstrap': True, 'criterion': 'entropy', 'max_depth': None, 'max_features': 5, 'min_samples_leaf': 8, 'min_samples_split': 9, 'n_estimators': 500}

Model with rank: 3
Mean validation score: 0.78755 (std: 0.00001)
Parameters: {'bootstrap': False, 'criterion': 'gini', 'max_depth': 5, 'max_features': 10, 'min_samples_leaf': 10, 'min_samples_split': 8, 'n_estimators': 500}



In [61]:
rf=RandomForestClassifier(n_estimators=500,criterion='gini',min_samples_split=9,
                         bootstrap=True,max_depth=None,max_features=7,min_samples_leaf=7,
                          class_weight='balanced')

In [62]:
rf.fit(x_train,y_train)

RandomForestClassifier(class_weight='balanced', max_features=7,
                       min_samples_leaf=7, min_samples_split=9,
                       n_estimators=500)

In [64]:
predicted = rf.predict(x_test)

In [65]:
score = roc_auc_score(y_test,predicted)

In [66]:
score

0.5812728277020894

In [67]:
final_pred = rf.predict(d_test)

In [68]:
len(final_pred)

119606

In [69]:
Submission = pd.DataFrame(list(zip(d_test['Complaint ID'],final_pred)),columns=['Complaint ID','Consumer disputed?'])

KeyError: 'Complaint ID'

In [None]:
Submission.isnull().sum()

In [72]:
prediction1=np.where(final_pred == 1,"Yes","No")
pd.DataFrame(prediction1).to_csv('Kamlesh11_B_Project1_Part2.csv',index=False)


In [None]:
df_test=pd.DataFrame(list(zip(y_test,predicted)),columns=["real","predicted"])

In [None]:
k=pd.crosstab(df_test['real'],df_test["predicted"])
print(k)