In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

In [2]:
lst = os.listdir()

In [3]:
train_Files = [x for x in lst if x.endswith('_train.csv')]
test_Files = [x for x in lst if x.endswith('_test.csv')]

In [4]:
files = [x[:-10] for x in train_Files]

In [5]:
files

['Feat1_LGBM_Gruped_Cats_FE',
 'CGB_Feat5_All_Possible_Feats_5_Fold',
 'Feat_5_LGB_ItemPast_Removed',
 'CGB_Feat5_Extra_Past_Item_5_Fold',
 'XGB_0.02_0.8_17_Feat_5_label_encoding_Coup_Removed',
 'Feat_5_LGB_Coup_Removed_1337']

In [6]:
train_Files

['Feat1_LGBM_Gruped_Cats_FE_train.csv',
 'CGB_Feat5_All_Possible_Feats_5_Fold_train.csv',
 'Feat_5_LGB_ItemPast_Removed_train.csv',
 'CGB_Feat5_Extra_Past_Item_5_Fold_train.csv',
 'XGB_0.02_0.8_17_Feat_5_label_encoding_Coup_Removed_train.csv',
 'Feat_5_LGB_Coup_Removed_1337_train.csv']

In [8]:
Group1 = ['CGB_Feat5_Extra_Past_Item_5_Fold', 
         'Feat1_LGBM_Gruped_Cats_FE', 
         'Feat_5_LGB_ItemPast_Removed',
         'Feat_5_LGB_Coup_Removed_1337', 
         'CGB_Feat5_All_Possible_Feats_5_Fold']

In [9]:
group1Tup = [(x + '_train.csv', x + '_test.csv') for x in Group1]

In [10]:
group1Tup

[('CGB_Feat5_Extra_Past_Item_5_Fold_train.csv',
  'CGB_Feat5_Extra_Past_Item_5_Fold_test.csv'),
 ('Feat1_LGBM_Gruped_Cats_FE_train.csv', 'Feat1_LGBM_Gruped_Cats_FE_test.csv'),
 ('Feat_5_LGB_ItemPast_Removed_train.csv',
  'Feat_5_LGB_ItemPast_Removed_test.csv'),
 ('Feat_5_LGB_Coup_Removed_1337_train.csv',
  'Feat_5_LGB_Coup_Removed_1337_test.csv'),
 ('CGB_Feat5_All_Possible_Feats_5_Fold_train.csv',
  'CGB_Feat5_All_Possible_Feats_5_Fold_test.csv')]

In [11]:
for x,y in group1Tup:
    print(x,y)
    assert x in train_Files
    assert y in test_Files

CGB_Feat5_Extra_Past_Item_5_Fold_train.csv CGB_Feat5_Extra_Past_Item_5_Fold_test.csv
Feat1_LGBM_Gruped_Cats_FE_train.csv Feat1_LGBM_Gruped_Cats_FE_test.csv
Feat_5_LGB_ItemPast_Removed_train.csv Feat_5_LGB_ItemPast_Removed_test.csv
Feat_5_LGB_Coup_Removed_1337_train.csv Feat_5_LGB_Coup_Removed_1337_test.csv
CGB_Feat5_All_Possible_Feats_5_Fold_train.csv CGB_Feat5_All_Possible_Feats_5_Fold_test.csv


In [12]:
trainGroup1DF = pd.DataFrame()
testGroup1DF = pd.DataFrame()
for trainCol, testCol in group1Tup:
    temp = pd.read_csv(trainCol)
    trainGroup1DF = pd.concat([trainGroup1DF, temp],axis = 1)
    temp = pd.read_csv(testCol)
    testGroup1DF = pd.concat([testGroup1DF, temp],axis = 1)

In [13]:
trainGroup1DF.shape, testGroup1DF.shape

((78369, 5), (50226, 5))

In [14]:
trainCols = set(trainGroup1DF.columns).intersection(testGroup1DF.columns)

In [16]:
## Loading Orig Labels 

train = pd.read_csv('train.csv')
trainGroup1DF['redemption_status'] = train.redemption_status

In [17]:
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.model_selection import KFold
gkf = KFold(n_splits=5)
from sklearn.metrics import roc_auc_score
import xgboost
from sklearn.feature_selection import RFE, RFECV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [18]:
oof = np.zeros_like(trainGroup1DF.redemption_status).astype('float')
oot = np.zeros_like(testGroup1DF.index).astype('float')

def getResultSummary(res):
    trainMean = np.mean(list(map(lambda x : x[0], res)))
    validMean = np.mean(list(map(lambda x : x[1], res)))
    trainStd = np.std(list(map(lambda x : x[0], res)))
    validStd = np.std(list(map(lambda x : x[1], res)))
    return {
        
        'Train Mean' : trainMean, 
        'Valid Mean' : validMean, 
        'Train Std'  : trainStd,
        'Valid Std'  : validStd
    }

results = []
featureImp = []
i = 0
for train_index, test_index in gkf.split(trainGroup1DF[trainCols], trainGroup1DF['redemption_status']):
    train_X = trainGroup1DF.iloc[train_index][trainCols]
    valid_X = trainGroup1DF.iloc[test_index][trainCols]
    train_y = trainGroup1DF.iloc[train_index]['redemption_status']
    valid_y = trainGroup1DF.iloc[test_index]['redemption_status']
    model = LogisticRegression(C = 0.1)
    model.fit(train_X, train_y)
    valid_pred = model.predict_proba(valid_X)[:,1]
    train_pred = model.predict_proba(train_X)[:,1]
    trainRes = roc_auc_score(train_y, train_pred)
    validRes = roc_auc_score(valid_y, valid_pred)
    oof[test_index] = valid_pred
    ootPred = model.predict_proba(testGroup1DF[trainCols])[:,1]
    oot += ootPred / 5
    results.append((trainRes, validRes))
    i+=1



In [19]:
getResultSummary(results)

{'Train Mean': 0.9636807663510514,
 'Valid Mean': 0.963866198902261,
 'Train Std': 0.002116053668242637,
 'Valid Std': 0.009112195176333472}

In [20]:
def saveStackingFile(prefix):
    oof_ = pd.DataFrame(oof,columns = [prefix])
    oot_ = pd.DataFrame(oot,columns = [prefix])
    print(oof_.shape, oot_.shape)
    oof_.to_csv(prefix + '_train.csv', index = False)
    oot_.to_csv(prefix + '_test.csv', index = False)

saveStackingFile('LOGITLEVEL1')

(78369, 1) (50226, 1)
