In [1]:
# Enable IPython's autoreload extension so edits to imported project modules
# are picked up live without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pathlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_profiling

from mlsettings.settings import load_app_config, get_datafolder_path
 
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline 
np.set_printoptions(precision=4)

pd.set_option('display.width', 200)
pd.set_option('precision', 4)
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
sns.set_style("whitegrid")
pd.options.display.float_format = '{:,.4f}'.format
sns.set()
import logging
logger = logging.getLogger()
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s', datefmt='%d-%b-%y %H:%M:%S')
logger.setLevel(logging.DEBUG)


In [3]:
# Dataset folder name (relative to the data root) and the CSV file names.
DATA_DIRECTORY = 'BigMartSales'
TRAIN_FILE, TEST_FILE = 'train.csv', 'test.csv'

# Load the application configuration, then ask it for the data root folder.
# NOTE(review): presumably get_datafolder_path() relies on load_app_config()
# having run first - keep this order.
load_app_config()
input_path = get_datafolder_path()
print(f'Input Path:{input_path}')

Input Path:D:\DataSource


In [4]:
# Build <data root>/<dataset folder> and the full path to the training CSV.
file_path = pathlib.Path(input_path) / DATA_DIRECTORY
input_file = file_path / TRAIN_FILE

In [5]:
# Read the training CSV into a DataFrame.
train_df = pd.read_csv(
    input_file,
    encoding='utf-8',
)

In [6]:
# Preview the first rows of the training data.
train_df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.0193,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.0168,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [7]:
def extended_describe(dataframe):
    """Return a per-column summary of ``dataframe``.

    The result is ``dataframe.describe(include='all')`` transposed so that each
    row describes one input column, with three extra columns appended:

    - ``null_count``: number of missing values in the column.
    - ``unique_count``: number of distinct values as reported by
      ``Series.unique()`` (a missing value counts as one distinct value).
    - ``data_type``: the column's dtype.
    """
    def _distinct_values(column):
        # Length of the array of distinct values for one column.
        return len(column.unique())

    missing_per_column = dataframe.isnull().sum()
    distinct_per_column = dataframe.apply(_distinct_values)

    # One row per input column; the extra Series align on the column names.
    summary = dataframe.describe(include='all').T
    summary['null_count'] = missing_per_column
    summary['unique_count'] = distinct_per_column
    summary['data_type'] = dataframe.dtypes
    return summary

In [8]:
# Summarise every column, then show only the columns that contain missing values.
desc_df = extended_describe(train_df)
desc_df.loc[desc_df['null_count'].ne(0)]

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max,null_count,unique_count,data_type
Item_Weight,7060.0,,,,12.8576,4.6435,4.555,8.7737,12.6,16.85,21.35,1463,416,float64
Outlet_Size,6113.0,3.0,Medium,2793.0,,,,,,,,2410,4,object


In [9]:
from feature_engine.missing_data_imputers import MeanMedianImputer

In [10]:
# Imputer that fills missing Item_Weight values with the column median.
median_imputer = MeanMedianImputer(
    imputation_method='median',
    variables=['Item_Weight'],
)

In [11]:
# Learn the imputation value (median of Item_Weight) from the training frame.
median_imputer.fit(train_df)

MeanMedianImputer(imputation_method='median', variables=['Item_Weight'])

In [12]:
# Show the fill value learned for each configured variable.
median_imputer.imputer_dict_

{'Item_Weight': 12.6}

In [13]:
# Fill missing Item_Weight values with the learned median.
# Note: train_df is rebound to the imputed frame, so the un-imputed data is
# only recoverable by re-reading the CSV.
train_df = median_imputer.transform(train_df)

In [14]:
# Re-check which columns still contain missing values after the median imputation.
desc_df = extended_describe(train_df)
desc_df.loc[desc_df['null_count'].ne(0)]

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max,null_count,unique_count,data_type
Outlet_Size,6113,3,Medium,2793,,,,,,,,2410,4,object


In [15]:
from feature_engine.missing_data_imputers import FrequentCategoryImputer

In [16]:
# Imputer that fills missing Outlet_Size values with the most frequent category.
mode_imputer = FrequentCategoryImputer(
    variables=['Outlet_Size'],
)

In [17]:
# Learn the most frequent Outlet_Size category from the training frame.
mode_imputer.fit(train_df)

FrequentCategoryImputer(variables=['Outlet_Size'])

In [18]:
# Fill missing Outlet_Size values with the category learned during fit.
# Note: train_df is rebound to the imputed frame.
train_df = mode_imputer.transform(train_df)

In [21]:
# Full per-column summary after both imputations; null_count should now be 0 everywhere.
desc_df = extended_describe(train_df)
desc_df 

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max,null_count,unique_count,data_type
Item_Identifier,8523.0,1559.0,FDW13,10.0,,,,,,,,0,1559,object
Item_Weight,8523.0,,,,12.8134,4.2272,4.555,9.31,12.6,16.0,21.35,0,415,float64
Item_Fat_Content,8523.0,5.0,Low Fat,5089.0,,,,,,,,0,5,object
Item_Visibility,8523.0,,,,0.0661,0.0516,0.0,0.027,0.0539,0.0946,0.3284,0,7880,float64
Item_Type,8523.0,16.0,Fruits and Vegetables,1232.0,,,,,,,,0,16,object
Item_MRP,8523.0,,,,140.9928,62.2751,31.29,93.8265,143.0128,185.6437,266.8884,0,5938,float64
Outlet_Identifier,8523.0,10.0,OUT027,935.0,,,,,,,,0,10,object
Outlet_Establishment_Year,8523.0,,,,1997.8319,8.3718,1985.0,1987.0,1999.0,2004.0,2009.0,0,9,int64
Outlet_Size,8523.0,3.0,Medium,5203.0,,,,,,,,0,3,object
Outlet_Location_Type,8523.0,3.0,Tier 3,3350.0,,,,,,,,0,3,object


In [20]:
from feature_engine.categorical_encoders import OneHotCategoricalEncoder

In [25]:
# Object-dtype columns to encode, excluding the two identifier columns.
cat_columns = [
    column_name
    for column_name in train_df.select_dtypes(include='object').columns
    if column_name not in ('Item_Identifier', 'Outlet_Identifier')
]

In [26]:
# Display the selected categorical column names.
cat_columns

['Item_Fat_Content',
 'Item_Type',
 'Outlet_Size',
 'Outlet_Location_Type',
 'Outlet_Type']

In [29]:
# One-hot encoder for the categorical columns, limited to 10 categories per column.
# Item_Type has 16 distinct values, so only 10 of them receive an indicator column.
# NOTE(review): drop_last presumably has no effect when top_categories is set -
# confirm against the feature_engine documentation.
ohe_enc = OneHotCategoricalEncoder(
    variables=cat_columns,
    top_categories=10,
    drop_last=False,
)

In [30]:
# Learn which categories of each configured column get an indicator column.
ohe_enc.fit(train_df)

OneHotCategoricalEncoder(drop_last=False, top_categories=10,
                         variables=['Item_Fat_Content', 'Item_Type',
                                    'Outlet_Size', 'Outlet_Location_Type',
                                    'Outlet_Type'])

In [31]:
# Apply the fitted encoder; the result is stored under a new name, so
# train_df itself is not rebound here.
train_df_ohc = ohe_enc.transform(train_df)

In [32]:
# Preview the encoded frame: the categorical columns are replaced by 0/1 indicator columns.
# NOTE(review): the preview shows separate Item_Fat_Content indicators for
# 'Low Fat', 'LF', 'low fat' and for 'Regular', 'reg'. These look like spelling
# variants of two categories that were probably meant to be merged before
# encoding - confirm and normalise upstream if so.
train_df_ohc.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Visibility,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Item_Outlet_Sales,Item_Fat_Content_Low Fat,Item_Fat_Content_Regular,Item_Fat_Content_LF,Item_Fat_Content_reg,Item_Fat_Content_low fat,Item_Type_Fruits and Vegetables,Item_Type_Snack Foods,Item_Type_Household,Item_Type_Frozen Foods,Item_Type_Dairy,Item_Type_Canned,Item_Type_Baking Goods,Item_Type_Health and Hygiene,Item_Type_Soft Drinks,Item_Type_Meat,Outlet_Size_Medium,Outlet_Size_Small,Outlet_Size_High,Outlet_Location_Type_Tier 3,Outlet_Location_Type_Tier 2,Outlet_Location_Type_Tier 1,Outlet_Type_Supermarket Type1,Outlet_Type_Grocery Store,Outlet_Type_Supermarket Type3,Outlet_Type_Supermarket Type2
0,FDA15,9.3,0.016,249.8092,OUT049,1999,3735.138,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0
1,DRC01,5.92,0.0193,48.2692,OUT018,2009,443.4228,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,1
2,FDN15,17.5,0.0168,141.618,OUT049,1999,2097.27,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,0
3,FDX07,19.2,0.0,182.095,OUT010,1998,732.38,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0
4,NCD19,8.93,0.0,53.8614,OUT013,1987,994.7052,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0
