# ML Data Preprocessing

It is important to process the data into a format where it can be fed to a machine learning model. This typically means converting all non-numeric columns to numeric, handling NaN values and generating new features from already existing features.

In [1]:
#importing the libraries
import numpy as np
import pandas as pd
from scipy import stats
import datetime 
import plotly.express as px
import seaborn as sns
sns.set()
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import preprocessing
import plotly.graph_objects as go
import warnings
warnings.filterwarnings("ignore")

In [2]:
#Adding scripts path
import sys
sys.path.insert(0,'../scripts/')
from data_preProcessing import data_preProcessing_script
from data_cleaner import DataCleaner

In [3]:
# Load the store metadata plus the train and test tables from CSV.
# map() is consumed left to right, so the files are read in the order listed.
store_data, train_data, test_data = map(
    pd.read_csv,
    ('../data/store.csv', '../data/train.csv', '../data/test.csv'),
)

In [4]:
# Attach the per-store attributes to every training row (inner join on Store)
train_store = train_data.merge(store_data, how='inner', on='Store')

In [5]:
# Attach the per-store attributes to every test row (inner join on Store)
test_store = test_data.merge(store_data, how='inner', on='Store')

In [6]:
# Preview the first rows of the merged training frame (head() defaults to 5 rows)
train_store.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,5,2015-07-31,5263,555,1,1,0,1,c,a,1270.0,9.0,2008.0,0,,,
1,1,4,2015-07-30,5020,546,1,1,0,1,c,a,1270.0,9.0,2008.0,0,,,
2,1,3,2015-07-29,4782,523,1,1,0,1,c,a,1270.0,9.0,2008.0,0,,,
3,1,2,2015-07-28,5011,560,1,1,0,1,c,a,1270.0,9.0,2008.0,0,,,
4,1,1,2015-07-27,6102,612,1,1,0,1,c,a,1270.0,9.0,2008.0,0,,,


In [7]:
# Build the project's preprocessing helper around the merged training frame and
# print its summary (per-column non-null counts and dtypes, as shown below).
preprocess = data_preProcessing_script(train_store)
preprocess.show_data_information()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1017209 entries, 0 to 1017208
Data columns (total 18 columns):
 #   Column                     Non-Null Count    Dtype  
---  ------                     --------------    -----  
 0   Store                      1017209 non-null  int64  
 1   DayOfWeek                  1017209 non-null  int64  
 2   Date                       1017209 non-null  object 
 3   Sales                      1017209 non-null  int64  
 4   Customers                  1017209 non-null  int64  
 5   Open                       1017209 non-null  int64  
 6   Promo                      1017209 non-null  int64  
 7   StateHoliday               1017209 non-null  object 
 8   SchoolHoliday              1017209 non-null  int64  
 9   StoreType                  1017209 non-null  object 
 10  Assortment                 1017209 non-null  object 
 11  CompetitionDistance        1014567 non-null  float64
 12  CompetitionOpenSinceMonth  693861 non-null   float64
 13  CompetitionO

In [9]:
# Parse the Date column (read from CSV as object/str) into datetime64 values
train_store["Date"] = pd.to_datetime(train_store["Date"])

In [10]:
# Instantiate the project's DataCleaner helper on the merged training frame.
# NOTE(review): train_store_cleaner is not used in the cells visible here —
# confirm it is needed later in the notebook.
train_store_cleaner = DataCleaner(train_store)

In [12]:
# Re-display the frame summary to confirm that Date is now datetime64[ns]
preprocess.show_data_information()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1017209 entries, 0 to 1017208
Data columns (total 18 columns):
 #   Column                     Non-Null Count    Dtype         
---  ------                     --------------    -----         
 0   Store                      1017209 non-null  int64         
 1   DayOfWeek                  1017209 non-null  int64         
 2   Date                       1017209 non-null  datetime64[ns]
 3   Sales                      1017209 non-null  int64         
 4   Customers                  1017209 non-null  int64         
 5   Open                       1017209 non-null  int64         
 6   Promo                      1017209 non-null  int64         
 7   StateHoliday               1017209 non-null  object        
 8   SchoolHoliday              1017209 non-null  int64         
 9   StoreType                  1017209 non-null  object        
 10  Assortment                 1017209 non-null  object        
 11  CompetitionDistance        1014567 no

# Missing Value Manipulation

In [13]:
# List the columns that currently contain missing values (before any filling)
preprocess.colums_WithMissingValue()

['CompetitionDistance',
 'CompetitionOpenSinceMonth',
 'CompetitionOpenSinceYear',
 'Promo2SinceWeek',
 'Promo2SinceYear',
 'PromoInterval']

In [14]:
# Tabulate, per column, the number and percentage of missing values
preprocess.get_column_based_missing_percentage()

Unnamed: 0,total_missing_values,missing_percentage
Store,0,0.0 %
DayOfWeek,0,0.0 %
Date,0,0.0 %
Sales,0,0.0 %
Customers,0,0.0 %
Open,0,0.0 %
Promo,0,0.0 %
StateHoliday,0,0.0 %
SchoolHoliday,0,0.0 %
StoreType,0,0.0 %


In [15]:
# Filling Missing Values
#
# Strategy (unchanged from the original cell):
#   * CompetitionDistance                   -> column median
#   * CompetitionOpenSinceMonth / ...Year   -> most frequent value (mode)
#   * Promo2SinceWeek / ...Year / PromoInterval -> most frequent value (mode)
#
# Each filled column is assigned back to train_store instead of calling
# `train_store[col].fillna(..., inplace=True)`. That chained in-place form acts
# on a temporary Series under pandas Copy-on-Write and leaves train_store
# untouched; the FutureWarning that announces this is hidden by the global
# warnings filter set in the import cell. Column assignment works in every
# pandas version and mutates the same DataFrame object, so helpers built from
# train_store earlier (e.g. `preprocess`) keep observing the filled values —
# assumes they hold a reference to the frame rather than a copy.
#
# NOTE(review): the Promo2* columns are filled with the mode; whether a mode is
# meaningful for rows with Promo2 == 0 should be confirmed with the modelling
# requirements.
try:
    # Compute every replacement value up front so a failure happens before any
    # column has been modified.
    fill_values = {
        'CompetitionDistance': train_store['CompetitionDistance'].median(),
        'CompetitionOpenSinceMonth': train_store['CompetitionOpenSinceMonth'].mode().iloc[0],
        'CompetitionOpenSinceYear': train_store['CompetitionOpenSinceYear'].mode().iloc[0],
        'Promo2SinceWeek': train_store['Promo2SinceWeek'].mode().iloc[0],
        'Promo2SinceYear': train_store['Promo2SinceYear'].mode().iloc[0],
        'PromoInterval': train_store['PromoInterval'].mode().iloc[0],
    }
    for column, value in fill_values.items():
        train_store[column] = train_store[column].fillna(value)
except Exception as e:
    # Report what went wrong and stop: continuing with NaNs still present would
    # silently corrupt every later step of the notebook.
    print(f"Failed to fill missing values: {e}")
    raise

In [16]:
# Verify the fill step: the list of columns with missing values should now be empty
preprocess.colums_WithMissingValue()

[]

In [17]:
# Report the dimensions (rows, columns) of the merged training frame
print(
    'Shapes of our datasets',
    '-----------------------',
    f'Training dataset:{train_store.shape}',
    sep='\n',
)

Shapes of our datasets
-----------------------
Training dataset:(1017209, 18)
