In [2]:
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from boruta import BorutaPy
from imblearn.over_sampling import SMOTE, BorderlineSMOTE
from sklearn.metrics import *
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import train_test_split, KFold

from knnmv_master.impute import KNNMVImputer

In [6]:
# Read the intermediate train and test tables from the Interm/ folder.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('Interm/train.csv', 'Interm/test.csv')
)

In [8]:
# Number of training rows; used later to split the combined frame back apart.
train_index = train.shape[0]

# Stack train on top of test so both receive identical encoding.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
combine = pd.concat([train, test], ignore_index=True, sort=False)

In [9]:
# Categorical columns that get expanded into indicator columns.
list_one_hot_features = ['City_Type', 'Employer_Category', 'Category2', 'Category3']
# The two label columns (one per modelling task below).
target_cols = ['Target_1', 'Target_2']

# One hot encoding: append "<column>_<value>" indicator columns for each
# categorical column; the original categorical columns stay in `combine`.
for categorical_col in list_one_hot_features:
    indicator_cols = pd.get_dummies(combine[categorical_col], prefix=categorical_col)
    combine = combine.join(indicator_cols)


In [62]:
# Undo the earlier stacking: the first `train_index` rows are the training
# rows, everything after them is the test set. Both get a fresh 0..n-1 index.
train, test = (
    combine.iloc[row_slice].reset_index(drop=True)
    for row_slice in (slice(0, train_index), slice(train_index, None))
)

## Training for camp types 1 and 2 (Favorable Outcome: Getting a health score)
**HS**: Health Score

In [76]:
# Keep only rows whose Category1 is 'First' or 'Second' (the health-score camps).
hs_categories = ['First', 'Second']

trainHS = train.loc[train['Category1'].isin(hs_categories)].reset_index(drop=True)

testHS = test.loc[test['Category1'].isin(hs_categories)].reset_index(drop=True)

In [77]:
# Columns that are identifiers or raw categoricals (already one-hot encoded)
# and therefore must not be fed to the model.
hs_id_cols = ['Patient_ID', 'Health_Camp_ID']
hs_non_feature_cols = set(hs_id_cols + ['Category1'] + list_one_hot_features)

# Feature matrix: every remaining column except the two targets.
# A list (not a set) is used as the indexer: pandas 2.0 rejects set indexers,
# and a list keeps the column order deterministic (frame order).
hs_train_excluded = hs_non_feature_cols | set(target_cols)
X = trainHS[[col for col in trainHS.columns if col not in hs_train_excluded]]

y = trainHS['Target_1']

# Identifier columns for the submission; copied so that adding the 'Outcome'
# column later works on an independent frame.
tempHS = testHS[hs_id_cols].copy()

# NOTE(review): as in the original, target columns are NOT excluded from the
# test frame here; the later fs.transform call is relied on to pick features.
testHS = testHS[[col for col in testHS.columns if col not in hs_non_feature_cols]]


#### Train test split

In [78]:
# Hold out 20% of the rows for evaluation. The split is shuffled, so a fixed
# random_state is required for the reported accuracies to be reproducible
# (42 matches the seed used for the resamplers in this notebook).
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42)

# Give every part a clean 0..n-1 index.
train_X = train_X.reset_index(drop=True)
train_y = train_y.reset_index(drop=True)
test_X = test_X.reset_index(drop=True)
test_y = test_y.reset_index(drop=True)

#### Feature Selection

In [79]:
import os
import sys

# Kept for backward compatibility; the working directory is no longer changed.
cwd = os.getcwd()

# Folder that contains FeatureSelectionPipe.py. Set the FEATURE_SELECTOR_DIR
# environment variable to use the notebook on another machine; the default is
# the original author's location.
FEATURE_SELECTOR_DIR = os.environ.get(
    'FEATURE_SELECTOR_DIR', 'C:/Users/Lakhan/Desktop/Wyng/Demand Forecasting')

# Put the folder on the import path instead of chdir-ing into it: a failed
# import can then no longer leave the kernel in the wrong working directory.
if FEATURE_SELECTOR_DIR not in sys.path:
    sys.path.insert(0, FEATURE_SELECTOR_DIR)

from FeatureSelectionPipe import FeatureSelector

C:\Users\Lakhan\Desktop\Wyng\Demand Forecasting


In [80]:
# Feature-selection steps, applied in list order. Each entry maps a step name
# understood by FeatureSelector to that step's settings.
steps = [
    {'Constant Features': {'frac_constant_values': 0.98}},
    {'Correlated Features': {'correlation_threshold': 0.85}},
    {'Multicorrelated Features': {'acceptable_vif_threshold': 15}},
]
step1, step2, step3 = steps

# Optional, currently disabled RFECV step (a DecisionTreeRegressor can also be
# used as the estimator):
# estimator = RandomForestClassifier(random_state = 42, n_estimators = 20)
# step5 = {'RFECV Features': {'estimator': estimator,
#                     'cv': TimeSeriesSplit(n_splits=3),
#                     'step': 1,
#                     'scoring': 'accuracy',
#                     'verbose': 50}}

# Fit the selector on the training split only.
fs = FeatureSelector()
fs.fit(train_X, train_y, steps)

fs.selected_features

# De-duplicated union of the selected and the critical features.
hs_selected_plus_critical = fs.selected_features + fs.critical_features
selected_features = list(set(hs_selected_plus_critical))

Removing Constant Features
['Employer_Category_Food', 'Employer_Category_Retail', 'Var3', 'Category3_2', 'Category2_G', 'Employer_Category_Broadcasting', 'Employer_Category_Telecom', 'Category3_1', 'Employer_Category_Real Estate', 'Var4', 'Employer_Category_Education', 'Employer_Category_Transport', 'Employer_Category_BFSI', 'Employer_Category_Manufacturing', 'Employer_Category_Health']

Removing Correlated Features
[]

Removing Multicorrelated Features
['Category2_F', 'Education_Score']

Done selecting features


['City_Type_C',
 'Age',
 'Var2',
 'Employer_Category_Technology',
 'City_Type_E',
 'Var1',
 'City_Type_A',
 'City_Type_F',
 'Registration_to_Camp_End_Date',
 'City_Type_D',
 'Employer_Category_Software Industry',
 'Category2_C',
 'Twitter_Shared',
 'Employer_Category_Consulting',
 'Online_Follower',
 'Income',
 'Facebook_Shared',
 'LinkedIn_Shared',
 'City_Type_H',
 'Interaction_to_Registration',
 'Registration_to_Camp_Start_Date',
 'Category2_B',
 'Category2_E',
 'City_Type_B',
 'Employer_Category_Others',
 'Category2_D',
 'City_Type_I',
 'Category2_A',
 'Var5',
 'City_Type_G']

In [81]:
# Run the fitted selector over the training split, the hold-out split and the
# competition test rows (in that order).
train_X_selected, test_X_selected, testHS = (
    fs.transform(frame) for frame in (train_X, test_X, testHS)
)

In [82]:
# Balance the training classes by oversampling (training split only).
# Alternative that was tried: SMOTE(sampling_strategy='auto',random_state=42)
sm = BorderlineSMOTE(random_state=42, sampling_strategy='auto')

hs_resampled = sm.fit_resample(train_X_selected, train_y)
X_res, y_res = hs_resampled

# Class counts before and after resampling.
hs_counts_before = Counter(train_y)
hs_counts_after = Counter(y_res)
print('Original dataset shape %s' % hs_counts_before)
print('Resampled dataset shape %s' % hs_counts_after)

# Un-resampled data: train_X_selected, train_y
# Resampled data:    X_res, y_res

Original dataset shape Counter({0.0: 40635, 1.0: 11103})
Resampled dataset shape Counter({0.0: 40635, 1.0: 40635})


In [83]:
import xgboost as xgb

# Hyper-parameter search is disabled for this model; the classifier is fitted
# with the library defaults. The grid that was tried (GridSearchCV, cv = 5,
# n_jobs = -1, scoring = 'accuracy', error_score = 0, fitted on X_res, y_res):
# param_grid = { 'objective': ['binary:logistic'],
#               'max_depth': [3,4,5,6], 'min_child_weight': [5,6,7,8],
#               'subsample': [0.7,0.8,0.9,1], 'colsample_bytree': [0.7,0.8,0.9,1],
# # eta parameter: https://www.kaggle.com/c/santander-customer-satisfaction/discussion/20208
#               'eta': [0.1,0.05], 'n_estimators': [25, 50, 100]
#          }

# Create the classifier and fit it on the resampled training data.
estimator = xgb.XGBClassifier(objective='binary:logistic')
estimator = estimator.fit(X_res, y_res)

# Class predictions for the (resampled) training data and the hold-out split.
y_res_pred, test_y_pred = (
    estimator.predict(features) for features in (X_res, test_X_selected)
)

# Accuracy on each of the two sets.
accuracy_train = accuracy_score(y_res, y_res_pred)
accuracy_test = accuracy_score(test_y, test_y_pred)

# Report both figures as percentages.
print(f'The accuracy of the classifier on the train set was: {accuracy_train*100}')
print(f'The accuracy of the classifier on the test set was: {accuracy_test*100}')

The accuracy of the classifier on the train set was: 80.16980435585087
The accuracy of the classifier on the test set was: 79.62891379976807


In [84]:
# Second column of predict_proba, rounded to two decimals, stored as the
# submission's 'Outcome' for the health-score camps.
hs_positive_proba = estimator.predict_proba(testHS)[:, 1]
tempHS['Outcome'] = np.round(hs_positive_proba, 2)

## Training for camp type 3 (Favorable Outcome: Visiting a stall)
**VS**: Visiting a stall

In [93]:
# Keep only rows whose Category1 is 'Third' (the stall-visit camps).
vs_train_mask = train['Category1'] == 'Third'
vs_test_mask = test['Category1'] == 'Third'

trainVS = train.loc[vs_train_mask].reset_index(drop=True)
testVS = test.loc[vs_test_mask].reset_index(drop=True)

In [94]:
# Columns that are identifiers or raw categoricals (already one-hot encoded)
# and therefore must not be fed to the model.
vs_id_cols = ['Patient_ID', 'Health_Camp_ID']
vs_non_feature_cols = set(vs_id_cols + ['Category1'] + list_one_hot_features)

# Feature matrix: every remaining column except the two targets.
# A list (not a set) is used as the indexer: pandas 2.0 rejects set indexers,
# and a list keeps the column order deterministic (frame order).
vs_train_excluded = vs_non_feature_cols | set(target_cols)
X = trainVS[[col for col in trainVS.columns if col not in vs_train_excluded]]

y = trainVS['Target_2']

# Identifier columns for the submission; copied so that adding the 'Outcome'
# column later works on an independent frame.
tempVS = testVS[vs_id_cols].copy()

# NOTE(review): as in the original, target columns are NOT excluded from the
# test frame here; the later fs.transform call is relied on to pick features.
testVS = testVS[[col for col in testVS.columns if col not in vs_non_feature_cols]]


#### Train test split

In [95]:
# Hold out 20% of the rows for evaluation. The split is shuffled, so a fixed
# random_state is required for the reported accuracies to be reproducible
# (42 matches the seed used for the resamplers in this notebook).
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42)

# Give every part a clean 0..n-1 index.
train_X = train_X.reset_index(drop=True)
train_y = train_y.reset_index(drop=True)
test_X = test_X.reset_index(drop=True)
test_y = test_y.reset_index(drop=True)

#### Feature Selection

In [96]:
import os
import sys

# Kept for backward compatibility; the working directory is no longer changed.
cwd = os.getcwd()

# Folder that contains FeatureSelectionPipe.py. Set the FEATURE_SELECTOR_DIR
# environment variable to use the notebook on another machine; the default is
# the original author's location.
FEATURE_SELECTOR_DIR = os.environ.get(
    'FEATURE_SELECTOR_DIR', 'C:/Users/Lakhan/Desktop/Wyng/Demand Forecasting')

# Put the folder on the import path instead of chdir-ing into it: a failed
# import can then no longer leave the kernel in the wrong working directory.
if FEATURE_SELECTOR_DIR not in sys.path:
    sys.path.insert(0, FEATURE_SELECTOR_DIR)

from FeatureSelectionPipe import FeatureSelector

C:\Users\Lakhan\Desktop\Wyng\Demand Forecasting


In [97]:
# Feature-selection steps, applied in list order. Each entry maps a step name
# understood by FeatureSelector to that step's settings.
steps = [
    {'Constant Features': {'frac_constant_values': 0.98}},
    {'Correlated Features': {'correlation_threshold': 0.85}},
    {'Multicorrelated Features': {'acceptable_vif_threshold': 15}},
]
step1, step2, step3 = steps

# Optional, currently disabled RFECV step (a DecisionTreeRegressor can also be
# used as the estimator):
# estimator = RandomForestClassifier(random_state = 42, n_estimators = 20)
# step5 = {'RFECV Features': {'estimator': estimator,
#                     'cv': TimeSeriesSplit(n_splits=3),
#                     'step': 1,
#                     'scoring': 'accuracy',
#                     'verbose': 50}}

# Fit the selector on the training split only.
fs = FeatureSelector()
fs.fit(train_X, train_y, steps)

fs.selected_features

# De-duplicated union of the selected and the critical features.
vs_selected_plus_critical = fs.selected_features + fs.critical_features
selected_features = list(set(vs_selected_plus_critical))

Removing Constant Features
['Employer_Category_Food', 'Employer_Category_Retail', 'Var3', 'Category2_F', 'Category3_2', 'Category2_G', 'Employer_Category_Broadcasting', 'Category2_C', 'Employer_Category_Telecom', 'Category3_1', 'Employer_Category_Real Estate', 'Var4', 'Employer_Category_Education', 'Category2_B', 'Employer_Category_Transport', 'Category2_E', 'Employer_Category_BFSI', 'Employer_Category_Manufacturing', 'Category2_D', 'Category2_A', 'Employer_Category_Health']

Removing Correlated Features
['Registration_to_Camp_Start_Date']

Removing Multicorrelated Features
['Education_Score']

Done selecting features


['City_Type_C',
 'Age',
 'Var2',
 'Employer_Category_Technology',
 'City_Type_E',
 'Var1',
 'City_Type_A',
 'City_Type_F',
 'Registration_to_Camp_End_Date',
 'City_Type_D',
 'Employer_Category_Software Industry',
 'Twitter_Shared',
 'Employer_Category_Consulting',
 'Online_Follower',
 'Income',
 'Facebook_Shared',
 'LinkedIn_Shared',
 'City_Type_H',
 'Interaction_to_Registration',
 'City_Type_B',
 'Employer_Category_Others',
 'City_Type_I',
 'Var5',
 'City_Type_G']

In [98]:
# Run the fitted selector over the training split, the hold-out split and the
# competition test rows (in that order).
train_X_selected, test_X_selected, testVS = (
    fs.transform(frame) for frame in (train_X, test_X, testVS)
)

In [107]:
# Balance the training classes by oversampling (training split only).
# Alternative that was tried: BorderlineSMOTE(sampling_strategy='auto',random_state=42)
sm = SMOTE(random_state=42, sampling_strategy='auto')

vs_resampled = sm.fit_resample(train_X_selected, train_y)
X_res, y_res = vs_resampled

# Class counts before and after resampling.
vs_counts_before = Counter(train_y)
vs_counts_after = Counter(y_res)
print('Original dataset shape %s' % vs_counts_before)
print('Resampled dataset shape %s' % vs_counts_after)

# Un-resampled data: train_X_selected, train_y
# Resampled data:    X_res, y_res

Original dataset shape Counter({1.0: 5233, 0.0: 2983})
Resampled dataset shape Counter({1.0: 5233, 0.0: 5233})


In [None]:
import xgboost as xgb

# Parameter sets recorded from earlier runs, kept for reference:
# params = {'colsample_bytree': 0.9, 'eta': 0.1, 'max_depth': 6, 'min_child_weight': 8,
#           'objective': 'binary:logistic', 'subsample': 1}
# params = {'colsample_bytree': 0.7, 'eta': 0.1, 'max_depth': 5, 'min_child_weight': 5,
#            'n_estimators': 50, 'objective': 'binary:logistic', 'subsample': 0.9}

# Base classifier whose hyper-parameters are searched below.
estimator = xgb.XGBClassifier()

# Search space: 4 * 4 * 4 * 4 * 2 * 3 = 1536 candidates, i.e. 7680 fits with
# 5-fold cross validation. This is a long-running cell.
param_grid = {
    'objective': ['binary:logistic'],
    'max_depth': [3, 4, 5, 6],
    'min_child_weight': [5, 6, 7, 8],
    'subsample': [0.7, 0.8, 0.9, 1],
    'colsample_bytree': [0.7, 0.8, 0.9, 1],
    # eta parameter: https://www.kaggle.com/c/santander-customer-satisfaction/discussion/20208
    'eta': [0.1, 0.05],
    'n_estimators': [25, 50, 100],
}

# 5-fold grid search scored on accuracy.
# error_score = 0 silences any exceptions for incorrect param combinations
# (a failing combination is scored 0 instead of aborting the search).
# NOTE(review): X_res is the output of the oversampler, so the CV validation
# folds contain synthetic rows; an imblearn Pipeline that resamples inside
# each fold would give a less optimistic score -- confirm whether intended.
gscv = GridSearchCV(estimator, param_grid, cv=5, n_jobs=-1, verbose=1,
                    scoring='accuracy', error_score=0)
gscv = gscv.fit(X_res, y_res)

# Best parameter combination and its mean cross-validated accuracy.
best_params = gscv.best_params_
best_score = gscv.best_score_

# GridSearchCV refits the best configuration on all of X_res / y_res by
# default (refit=True), so reuse that model instead of setting the parameters
# and fitting the same data a second time.
estimator = gscv.best_estimator_

# Class predictions for the (resampled) training data and the hold-out split.
y_res_pred = estimator.predict(X_res)
test_y_pred = estimator.predict(test_X_selected)

# Accuracy on each of the two sets.
accuracy_train = accuracy_score(y_res, y_res_pred)
accuracy_test = accuracy_score(test_y, test_y_pred)

# Report both figures as percentages.
print(f'The accuracy of the classifier on the train set was: {accuracy_train*100}')
print(f'The accuracy of the classifier on the test set was: {accuracy_test*100}')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 1536 candidates, totalling 7680 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   10.2s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   53.8s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  4.9min


In [101]:
# Second column of predict_proba, rounded to two decimals, stored as the
# submission's 'Outcome' for the stall-visit camps.
vs_positive_proba = estimator.predict_proba(testVS)[:, 1]
tempVS['Outcome'] = np.round(vs_positive_proba, 2)

#### Submission

In [105]:
# Stack the health-score and stall-visit predictions into one submission table.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
result = pd.concat([tempHS, tempVS], ignore_index=True, sort=False)

# Write the submission without the row index (index=False is the explicit
# spelling of the original index=0).
result.to_csv('V1.csv', index=False)