<a href="https://colab.research.google.com/github/krishnamohan-seelam/ml_colab/blob/master/BlackFriday_UsingKeras.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Detect whether the notebook is running on Google Colab.
# Only a failed import means "not on Colab"; any other exception is a real
# problem and is allowed to surface instead of being silently swallowed.
try:
    import google.colab
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

In [2]:
#Execute them for first to install
if IN_COLAB:
    !pip install category_encoders
    !pip install feature-engine





In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
from pathlib import Path
import numpy as np
import pandas as pd
import pandas_profiling
import seaborn as sns
from matplotlib import pyplot as plt

import scipy.stats as stats
from scipy.stats import chi2_contingency
import category_encoders as ce

In [5]:
from sklearn.model_selection import cross_val_score, KFold
from sklearn.preprocessing import PolynomialFeatures

In [6]:
%matplotlib inline

In [7]:
def extended_describe(dataframe):
    """Return ``describe(include='all')`` transposed, with two extra columns.

    Each row of the result corresponds to one column of ``dataframe``.
    ``null_count`` is the number of missing values in that column and
    ``unique_count`` is the number of distinct values (as reported by
    ``Series.unique``).
    """
    nulls_per_column = dataframe.isnull().sum()
    uniques_per_column = dataframe.apply(lambda series: len(series.unique()))
    summary = dataframe.describe(include='all').T
    return summary.assign(
        null_count=nulls_per_column,
        unique_count=uniques_per_column,
    )

In [8]:
from collections import Counter
def detect_outliers(dataset , noutliers , columns):
    """Count, per row label, in how many of ``columns`` the row is an IQR outlier.

    A value is an outlier for a column when it lies below ``q1 - 1.5 * IQR``
    or above ``q3 + 1.5 * IQR`` of that column.

    Parameters
    ----------
    dataset : DataFrame holding the columns to inspect.
    noutliers : accepted for backward compatibility but NOT applied here.
        Callers that only want rows flagged in more than ``noutliers``
        columns must filter the returned counts themselves.
    columns : iterable of column names to inspect.

    Returns
    -------
    collections.Counter mapping each flagged index label to the number of
    columns in which it was flagged.
    """
    outlier_indices = []
    for column in columns:
        values = dataset[column]

        # 1st quartile (25%) and 3rd quartile (75%)
        q1, q3 = np.percentile(values, [25, 75])

        # Tukey fences: 1.5 * interquartile range on either side
        outlier_step = 1.5 * (q3 - q1)
        lower_bound = q1 - outlier_step
        upper_bound = q3 + outlier_step

        is_outlier = (values < lower_bound) | (values > upper_bound)
        outlier_indices.extend(dataset[is_outlier].index)

    return Counter(outlier_indices)

In [9]:
def merge_df(left_df,right_df,how,on,suffixes):
    """Merge two DataFrames with defaults for the join type and suffixes.

    Parameters
    ----------
    left_df, right_df : DataFrames to join.
    how : join type; a falsy value falls back to ``'left'``.
    on : join column(s); must be provided.
    suffixes : suffixes for overlapping column names; a falsy value falls
        back to ``('_left', '_right')``.

    Raises
    ------
    ValueError
        If ``on`` is empty or None.
    """
    if not on:
        raise ValueError("Unable to join dataframes as join cols not specified")
    how = how or 'left'
    suffixes = suffixes or ('_left','_right')
    return left_df.merge(right=right_df, how=how, on=on, suffixes=suffixes)

def get_mapper(df,feature,target):
    """Map each ``feature`` value to the descending rank of its summed ``target``.

    The group with the largest total receives rank 1.0.
    """
    totals_by_feature = df.groupby([feature])[target].sum()
    ranks = totals_by_feature.rank(ascending=False)
    return ranks.to_dict()

def get_mean_mapper(df,feature,target):
    """Map each ``feature`` value to the mean of ``target`` within that group."""
    mean_by_feature = df.groupby([feature])[target].mean()
    return mean_by_feature.to_dict()

In [10]:
import collections 

from sklearn.base import TransformerMixin


class FeatureTransformer(TransformerMixin):
    """Apply dictionary lookups to DataFrame columns (FunctionTransformer, but for pandas).

    ``transformer_config`` maps a source column name to a list of config
    objects. Each config exposes:

    * ``function`` - a dict-like lookup (must provide ``.get``),
    * ``default``  - value used when a key is missing from the lookup,
    * ``astype``   - optional dtype the resulting column is cast to,
    * ``column``   - optional output column name; when falsy the source
      column is overwritten.
    """

    def __init__(self, transformer_config):
        self.transformer_config = transformer_config

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned, ``self`` is returned."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with every configured lookup applied."""
        X = X.copy()
        for column, tr_configs in self.transformer_config.items():
            for tr_config in tr_configs:
                new_col_name = tr_config.column if tr_config.column else column
                lookup = tr_config.function
                # Use the configured default verbatim: a falsy default such as 0
                # must not be replaced by None, or the astype() cast below
                # fails for keys missing from the lookup.
                default_value = tr_config.default

                X[new_col_name] = X[column].map(lambda x: lookup.get(x, default_value))
                if tr_config.astype:
                    X[new_col_name] = X[new_col_name].astype(tr_config.astype)

        return X

class ZeroFillTransformer(TransformerMixin):
    """Replace missing values with 0 in a fixed set of DataFrame columns."""

    def __init__(self, cols):
        # Names of the columns whose missing values get filled.
        self.cols = cols

    def fit(self, X, y=None):
        """Nothing is learned from the data; the transformer itself is returned."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` (assumed to be a DataFrame) with NaNs in ``self.cols`` set to 0."""
        filled = X.copy()
        for name in self.cols:
            filled[name] = X[name].fillna(0)
        return filled

In [11]:
# Decide where the CSV files live.
# Locally they are read from ./datasource; on Colab, Google Drive is mounted
# and the files are addressed relative to the working directory.
if not IN_COLAB:
    data_path = '{file}'
    source_path = Path.cwd() / 'datasource'
else:
    from google.colab import drive
    drive.mount('/content/drive')
    data_path = 'drive/My Drive/datasource/blackfriday/{file}'
    source_path = Path.cwd()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [12]:
# Full paths of the two input CSVs, built from the location template above.
train_file, test_file = (
    source_path / data_path.format(file=csv_name)
    for csv_name in ('train.csv', 'test.csv')
)

In [13]:
# Read both CSVs, then keep an untouched deep copy of the raw test frame.
train_df, test_df = (pd.read_csv(csv_file) for csv_file in (train_file, test_file))
test_df_copy = test_df.copy(deep=True)

In [14]:
# Flag training rows whose Purchase lies outside the 1.5 * IQR fences.
outliers = detect_outliers(
    dataset=train_df,
    noutliers=2,
    columns=["Purchase"],
)

In [15]:
# Drop every row flagged as a Purchase outlier.
# Rebinding (instead of inplace=True) together with errors='ignore' lets this
# cell be re-run without a KeyError for labels that were already removed.
remove_idx = list(outliers.keys())
train_df = train_df.drop(index=remove_idx, errors='ignore')

In [16]:
# Separate the feature columns (X) from the Purchase target (y).
X = train_df.drop(columns=['Purchase'])
y = train_df['Purchase']

In [17]:
# Categorical columns that are one-hot encoded in the next cell.
cols = [
    'Gender',
    'City_Category',
]

In [18]:
# Fit the one-hot encoder on the training features only, then apply the same
# fitted encoder to both the training and the test frame.
cb_enc = ce.one_hot.OneHotEncoder(
    cols=cols,
    use_cat_names=True,
    verbose=1,
)
cb_enc.fit(X, y)
X, test_df = cb_enc.transform(X), cb_enc.transform(test_df)

In [19]:
# Rank-of-total-Purchase lookups for the low-cardinality categorical features.
occupation_mapper, age_mapper, stay_in_city_mapper = (
    get_mapper(train_df, feature, 'Purchase')
    for feature in ('Occupation', 'Age', 'Stay_In_Current_City_Years')
)

# Mean-Purchase lookups per product and per user.
product_id_mapper, user_id_avg_mapper = (
    get_mean_mapper(train_df, feature, 'Purchase')
    for feature in ('Product_ID', 'User_ID')
)

# Number of training rows per user.
user_id_mapper = train_df['User_ID'].value_counts().to_dict()

In [20]:
# Quantile edges: 0.0, then 0.10 .. 1.00 in steps of 0.05 (20 edges, 19 bins).
# NOTE(review): there is no 0.05 edge, so the first bin spans the bottom 10%
# of users - confirm this is intended.
userid_quantiles = [0.0] + [pct / 100 for pct in range(10, 101, 5)]

# Bucket users by their total Purchase; any value left without a bin code is
# mapped to -1 before the integer cast (a single fillna is sufficient).
user_total_purchase = train_df.groupby(['User_ID'])['Purchase'].sum()
userid_category = (
    pd.qcut(user_total_purchase, userid_quantiles, labels=False)
    .fillna(-1)
    .astype(int)
)
userid_category_mapper = userid_category.to_dict()


In [21]:
# Mean and count of Purchase for every combination of the three product categories.
all_prd_cats = (
    train_df
    .groupby(['Product_Category_1', 'Product_Category_2', 'Product_Category_3'])
    .agg(
        Prd_Cat_Avg=pd.NamedAgg(column='Purchase', aggfunc='mean'),
        Prd_Cat_Count=pd.NamedAgg(column='Purchase', aggfunc='count'),
    )
    .reset_index()
)

In [22]:
# Same statistics, restricted to rows where categories 2 and 3 are both missing.
only_category_1 = (
    train_df['Product_Category_2'].isna()
    & train_df['Product_Category_3'].isna()
)
prd_cats_1_only = (
    train_df[only_category_1]
    .groupby(['Product_Category_1'])
    .agg(
        Prd_Cat_Avg=pd.NamedAgg(column='Purchase', aggfunc='mean'),
        Prd_Cat_Count=pd.NamedAgg(column='Purchase', aggfunc='count'),
    )
    .reset_index()
)
                                                                                 

In [23]:
# The occupation / age / stay-in-city / product / user mappers were already
# built from the same train_df in the earlier mapper cell, so they are not
# recomputed here; only the per-category row count is new.
product_cat1_mapper = train_df['Product_Category_1'].value_counts().to_dict()

In [24]:
# One TransformerConfig describes a single lookup: the mapping to apply, the
# fallback for unseen keys, the dtype to cast to and the output column name
# (None overwrites the source column).
TransformerConfig = collections.namedtuple(
    'TransformerConfig',
    ['function', 'default', 'astype', 'column'],
)

transformer_config = {
    'Occupation': [
        TransformerConfig(function=occupation_mapper, default=-1, astype='int', column=None),
    ],
    'Age': [
        TransformerConfig(function=age_mapper, default=-1, astype='int', column='Age_tr'),
    ],
    'Stay_In_Current_City_Years': [
        TransformerConfig(function=stay_in_city_mapper, default=-1, astype='int', column=None),
    ],
    'Product_ID': [
        TransformerConfig(function=product_id_mapper, default=-1, astype='int',
                          column='Purchase_Avg_By_Product_ID'),
    ],
    'User_ID': [
        TransformerConfig(function=user_id_avg_mapper, default=-1, astype='int',
                          column='Purchase_Avg_By_User_ID'),
        TransformerConfig(function=user_id_mapper, default=0, astype='int',
                          column='User_ID_Count'),
        TransformerConfig(function=userid_category_mapper, default=-1, astype='int',
                          column='User_ID_Category'),
    ],
    'Product_Category_1': [
        TransformerConfig(function=product_cat1_mapper, default=-1, astype='int',
                          column='Count_By_Product_Category_1'),
    ],
}

In [25]:
# Instantiate the lookup transformer and the zero-filler for the sparse category columns.
zero_fill_columns = ['Product_Category_2', 'Product_Category_3']
ft = FeatureTransformer(transformer_config=transformer_config)
zfill_tr = ZeroFillTransformer(cols=zero_fill_columns)

In [26]:
# Apply the configured lookups to the training features, then to the test frame.
X, test_df = ft.fit_transform(X), ft.transform(test_df)

In [27]:
# Fill missing Product_Category_2/3 values with 0 in both frames.
X, test_df = zfill_tr.fit_transform(X), zfill_tr.transform(test_df)

In [28]:
# With the gaps filled, cast the two sparse category columns to int in both frames.
for frame in (X, test_df):
    for category_column in ('Product_Category_2', 'Product_Category_3'):
        frame[category_column] = frame[category_column].astype(int)

In [29]:
# Subtract the constant 1000000 from User_ID in both frames.
for frame in (X, test_df):
    frame['User_ID'] = frame['User_ID'] - 1000000

In [30]:
# Remove the 'P00' text from Product_ID and cast what is left to an integer.
for frame in (X, test_df):
    frame['Product_ID'] = frame['Product_ID'].str.replace('P00', '').astype('int')

In [31]:
# Drop the raw Age column, the one-hot city columns and Gender_M from both frames.
dropped_columns = ['Age', 'City_Category_A', 'City_Category_B', 'City_Category_C', 'Gender_M']
for frame in (X, test_df):
    frame.drop(columns=dropped_columns, inplace=True)

In [32]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_validation, y_train, y_validation = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=42,
)

In [33]:
# Preview the first rows of the training split to sanity-check the engineered columns.
X_train.head()

Unnamed: 0,User_ID,Product_ID,Gender_F,Occupation,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Age_tr,Purchase_Avg_By_Product_ID,Purchase_Avg_By_User_ID,User_ID_Count,User_ID_Category,Count_By_Product_Category_1
538859,4964,318742,0,2,1,1,1,8,0,2,14344,8747,244,17,140378
219651,3848,195842,0,13,1,0,3,4,8,1,10325,9821,41,7,20213
524297,2810,154042,1,14,2,0,1,2,15,7,12167,8370,232,16,140378
54967,2419,236342,0,2,3,1,5,14,0,1,4808,8032,306,17,150933
189098,5193,303042,0,6,2,1,5,0,0,2,6330,8804,115,13,150933


In [35]:
# Preview the first rows of the transformed test frame; its columns should match X_train's.
test_df.head()

Unnamed: 0,User_ID,Product_ID,Gender_F,Occupation,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Age_tr,Purchase_Avg_By_Product_ID,Purchase_Avg_By_User_ID,User_ID_Count,User_ID_Category,Count_By_Product_Category_1
0,4,128942,0,3,2,1,1,11,0,4,15781,14747,14,3,140378
1,9,113442,0,5,5,0,3,5,0,1,11746,10243,58,9,20213
2,10,288442,1,4,4,1,5,14,0,2,5731,9728,223,17,150933
3,10,145342,1,4,4,1,4,9,0,2,1943,9728,223,17,11753
4,11,53842,1,4,1,0,4,5,12,1,2585,7957,70,9,11753
