In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Data can be down loaded here (need sign in): 
## https://datahack.analyticsvidhya.com/contest/practice-problem-big-mart-sales-iii/

In [206]:
train_data = pd.read_csv("Big_Mart_Train.csv")
test_data = pd.read_csv("Big_Mart_Test.csv")

train_data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [207]:
train_data.Item_Weight = train_data.Item_Weight.fillna(np.nanmedian(train_data.Item_Weight))
test_data.Item_Weight = test_data.Item_Weight.fillna(np.nanmedian(test_data.Item_Weight))

train_data.Outlet_Size = train_data.Outlet_Size.fillna(train_data.Outlet_Size.mode().iloc[0])
test_data.Outlet_Size = test_data.Outlet_Size.fillna(test_data.Outlet_Size.mode().iloc[0])

train_data.Item_Fat_ContentItem_Fat  = train_data.Item_Fat_Content.replace(['low fat', 'LF'], ['Low Fat', 'Low Fat'])
test_data.Item_Fat_Content = test_data.Item_Fat_Content.replace(['low fat', 'LF'], ['Low Fat', 'Low Fat'])
train_data.Item_Fat_Content = train_data.Item_Fat_Content.replace(['reg'], ['Regular'])
test_data.Item_Fat_Content = test_data.Item_Fat_Content.replace(['reg'], ['Regular'])

train_data.Outlet_Establishment_Year = 2017 - train_data.Outlet_Establishment_Year
test_data.Outlet_Establishment_Year = 2017 - test_data.Outlet_Establishment_Year

test_data['Item_Outlet_Sales'] = 0
combi = train_data.append(test_data)
number = LabelEncoder()

for i in combi.columns:
    if (combi[i].dtype == 'object'):
        combi[i] = number.fit_transform(combi[i].astype('str'))
        combi[i] = combi[i].astype('float')
        
train_data = combi[:train_data.shape[0]]
test_data = combi[train_data.shape[0]:]

train_data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,156.0,9.3,1.0,0.016047,4.0,249.8092,9.0,18,1.0,0.0,1.0,3735.138
1,8.0,5.92,2.0,0.019278,14.0,48.2692,3.0,8,1.0,2.0,2.0,443.4228
2,662.0,17.5,1.0,0.01676,10.0,141.618,9.0,18,1.0,0.0,1.0,2097.27
3,1121.0,19.2,2.0,0.0,6.0,182.095,0.0,19,1.0,2.0,0.0,732.38
4,1297.0,8.93,1.0,0.0,9.0,53.8614,1.0,30,0.0,2.0,1.0,994.7052


In [208]:
train_data = pd.DataFrame(train_data)
test_data = pd.DataFrame(test_data)

In [209]:
# Leave one out
from sklearn.model_selection import LeaveOneOut

loo = LeaveOneOut()
loo.get_n_splits(train_data)

sample_ct = 0
for train_index, validation_index in loo.split(train_data):
    print("train:", train_index, "validation:", validation_index)
    
    loo_train = train_data.iloc[train_index]
    loo_validation = train_data.iloc[validation_index]
    
    sample_ct += 1
    if sample_ct == 5:
        break

('train:', array([   1,    2,    3, ..., 8520, 8521, 8522]), 'validation:', array([0]))
('train:', array([   0,    2,    3, ..., 8520, 8521, 8522]), 'validation:', array([1]))
('train:', array([   0,    1,    3, ..., 8520, 8521, 8522]), 'validation:', array([2]))
('train:', array([   0,    1,    2, ..., 8520, 8521, 8522]), 'validation:', array([3]))
('train:', array([   0,    1,    2, ..., 8520, 8521, 8522]), 'validation:', array([4]))


In [210]:
# repeated k-fold
# # repeat k-fold n times with different randomization in each repetition
from sklearn.model_selection import RepeatedKFold

kf = RepeatedKFold(n_splits=5, n_repeats=10, random_state=None) 

sample_ct = 0
for train_index, validation_index in kf.split(train_data):
    print("train length:", len(train_index), "validation length:", len(validation_index))
    
    kf_train = train_data.iloc[train_index]
    kf_validation = train_data.iloc[validation_index]
    
    sample_ct += 1
    if sample_ct == 5:
        break

('train length:', 6818, 'validation length:', 1705)
('train length:', 6818, 'validation length:', 1705)
('train length:', 6818, 'validation length:', 1705)
('train length:', 6819, 'validation length:', 1704)
('train length:', 6819, 'validation length:', 1704)


In [211]:
# stratified k-fold, it tries to make sure each fold has similar distribution with other folds
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=5, random_state=410)
# sklearn stratified k-fold only supports binary/multi-class labels, not regression labels, dtype has to be 'category'
train_data['Outlet_Type'] = train_data['Outlet_Type'].astype('category')  
X = train_data.iloc[:,0:-2]
y = train_data.loc[:,'Outlet_Type']

sample_ct = 0
for train_index, validation_index in skf.split(X, y):
    print("train length:", len(train_index), "validation length:", len(validation_index))
    
    skf_train_X = X.iloc[train_index]
    skf_validation_X = X.iloc[validation_index]
    
    skf_train_y = y.iloc[train_index]
    skf_validation_y = y.iloc[validation_index]
    
    sample_ct += 1
    if sample_ct == 5:
        break

('train length:', 6817, 'validation length:', 1706)
('train length:', 6817, 'validation length:', 1706)
('train length:', 6818, 'validation length:', 1705)
('train length:', 6820, 'validation length:', 1703)
('train length:', 6820, 'validation length:', 1703)


In [212]:
# Adversarial Validation
# # It checks the degree of similarity between training and tests in terms of feature distribution
import xgboost as xgb

train_data.drop(['Item_Outlet_Sales'], axis = 1, inplace = True)  # drop label
test_data.drop(['Item_Outlet_Sales'], axis = 1, inplace = True)

train_data['is_train'] = 1
test_data['is_train'] = 0

df = pd.concat([train_data, test_data], axis = 0)  # combine training, testing data
y = df['is_train']
df.drop('is_train', axis = 1, inplace = True) 

In [226]:
xgb_params = {'learning_rate': 0.05, 
              'max_depth': 4,
              'subsample': 0.9,        
              'colsample_bytree': 0.9,
              'objective': 'binary:logistic',
              'silent': 1, 
              'n_estimators':100, 
              'gamma':1,         
              'min_child_weight':4,
              'seed': 410}   
clf = xgb.XGBClassifier(**xgb_params)  # with "**" here, we can resolve the bug in xgboost
clf.fit(df, y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.9, gamma=1, learning_rate=0.05, max_delta_step=0,
       max_depth=4, min_child_weight=4, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=410, silent=1,
       subsample=0.9)

In [230]:
probs = clf.predict_proba(df)[:,1]
probs[4:10]

array([ 0.56907153,  0.57169026,  0.60950691,  0.96956617,  0.61280704,
        0.60482651], dtype=float32)

In [233]:
new_df = pd.DataFrame({'id':train_data['Item_Identifier'], 'probs':probs[0:len(train_data)]})
new_df = new_df.sort_values(by = 'probs', ascending=False)

In [238]:
val_set_ids = new_df.iloc[1:np.int(new_df.shape[0]*0.3),1]
train_set_ids = list(set(train_data['Item_Identifier']) - set(val_set_ids))
print len(train_set_ids)
print len(val_set_ids)

1559
2555


In [2]:
# cross validation for time series
# # use all the previous data as training data for the new testing data
from sklearn.model_selection import TimeSeriesSplit

X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([1, 2, 3, 4])
tscv = TimeSeriesSplit(n_splits=3)

In [4]:
for train_index, val_index in tscv.split(X):
    print("Train:", train_index, "Validation:", val_index)
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]

('Train:', array([0]), 'Validation:', array([1]))
[[1 2]] [[3 4]]
('Train:', array([0, 1]), 'Validation:', array([2]))
[[1 2]
 [3 4]] [[1 2]]
('Train:', array([0, 1, 2]), 'Validation:', array([3]))
[[1 2]
 [3 4]
 [1 2]] [[3 4]]
