# Feature Engineering

## Definition and a summary

Feature engineering is the process of transforming raw data into meaningful input features that better represent the underlying problem, improving the performance and accuracy of machine learning models. This critical data science technique involves selecting, creating, and transforming variables to enhance the data's predictive power and make it more suitable for algorithms to learn from.

In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import multiprocessing as mp
import gc
import datetime
from sklearn.preprocessing import LabelEncoder
import calendar
from scipy.sparse import csr_matrix,hstack
import tensorflow as tf
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor
from tqdm import tqdm
import pickle

First of all, let's read in the dataframes.

In [6]:
# Load the three CSV inputs, in order, into train / test / final_test.
train, test, final_test = [
    pd.read_csv(csv_path)
    for csv_path in (
        'final_dataframe.csv',
        'final_dataframe_test.csv',
        'final_future_data.csv',
    )
]

  train=pd.read_csv('final_dataframe.csv')
  test=pd.read_csv('final_dataframe_test.csv')


It took more than 10 minutes to read all the dataframes. To make them easier to work with, I reduce their memory footprint by converting all categorical variables to integers. We also save each fitted label encoder here so that we can use it to encode future, unseen data.

In [7]:
# Integer-encode 'item_id': the mapping is learned from train and then
# re-used unchanged for test and final_test.
lbl = LabelEncoder()
train['item_id'] = lbl.fit_transform(train['item_id'])
for frame in (test, final_test):
    frame['item_id'] = lbl.transform(frame['item_id'])

# Persist the fitted encoder. The context manager guarantees the file is
# flushed and closed instead of leaving an open handle behind.
with open('label_encoder_item_id.sav', 'wb') as encoder_file:
    pickle.dump(lbl, encoder_file)

In [8]:
# Integer-encode 'dept_id': the mapping is learned from train and then
# re-used unchanged for test and final_test.
lbl = LabelEncoder()
train['dept_id'] = lbl.fit_transform(train['dept_id'])
for frame in (test, final_test):
    frame['dept_id'] = lbl.transform(frame['dept_id'])

# Persist the fitted encoder. The context manager guarantees the file is
# flushed and closed instead of leaving an open handle behind.
with open('label_encoder_dept_id.sav', 'wb') as encoder_file:
    pickle.dump(lbl, encoder_file)

In [9]:
# Integer-encode 'cat_id': the mapping is learned from train and then
# re-used unchanged for test and final_test.
lbl = LabelEncoder()
train['cat_id'] = lbl.fit_transform(train['cat_id'])
for frame in (test, final_test):
    frame['cat_id'] = lbl.transform(frame['cat_id'])

# Persist the fitted encoder. The context manager guarantees the file is
# flushed and closed instead of leaving an open handle behind.
with open('label_encoder_cat_id.sav', 'wb') as encoder_file:
    pickle.dump(lbl, encoder_file)

In [10]:
# Integer-encode 'store_id': the mapping is learned from train and then
# re-used unchanged for test and final_test.
lbl = LabelEncoder()
train['store_id'] = lbl.fit_transform(train['store_id'])
for frame in (test, final_test):
    frame['store_id'] = lbl.transform(frame['store_id'])

# Persist the fitted encoder. The context manager guarantees the file is
# flushed and closed instead of leaving an open handle behind.
with open('label_encoder_store_id.sav', 'wb') as encoder_file:
    pickle.dump(lbl, encoder_file)

In [11]:
# Integer-encode 'state_id': the mapping is learned from train and then
# re-used unchanged for test and final_test.
lbl = LabelEncoder()
train['state_id'] = lbl.fit_transform(train['state_id'])
for frame in (test, final_test):
    frame['state_id'] = lbl.transform(frame['state_id'])

# Persist the fitted encoder. The context manager guarantees the file is
# flushed and closed instead of leaving an open handle behind.
with open('label_encoder_state_id.sav', 'wb') as encoder_file:
    pickle.dump(lbl, encoder_file)

In [16]:
# Handle event_name_1 encoding
frames = (train, test, final_test)

# Missing values become the explicit 'no_event' label, and everything is cast
# to str so the encoder sees a single, sortable type.
for frame in frames:
    frame['event_name_1'] = frame['event_name_1'].fillna('no_event').astype(str)

# Fit on the union of all three frames so that every label occurring in any
# of them receives a code.
all_values = np.concatenate([frame['event_name_1'].values for frame in frames])
lbl = LabelEncoder()
lbl.fit(all_values)
for frame in frames:
    frame['event_name_1'] = lbl.transform(frame['event_name_1'])

# Persist the fitted encoder; the context manager closes the file handle.
with open('label_encoder_event_name_1.sav', 'wb') as encoder_file:
    pickle.dump(lbl, encoder_file)

In [17]:
# Handle event_name_2 encoding
frames = (train, test, final_test)

# Missing values become the explicit 'no_event' label, and everything is cast
# to str so the encoder sees a single, sortable type.
for frame in frames:
    frame['event_name_2'] = frame['event_name_2'].fillna('no_event').astype(str)

# Fit on the union of all three frames so that every label occurring in any
# of them receives a code.
all_values = np.concatenate([frame['event_name_2'].values for frame in frames])
lbl = LabelEncoder()
lbl.fit(all_values)
for frame in frames:
    frame['event_name_2'] = lbl.transform(frame['event_name_2'])

# Persist the fitted encoder; the context manager closes the file handle.
with open('label_encoder_event_name_2.sav', 'wb') as encoder_file:
    pickle.dump(lbl, encoder_file)

In [18]:
# Handle event_type_1 encoding
frames = (train, test, final_test)

# Missing values become the explicit 'no_event' label, and everything is cast
# to str so the encoder sees a single, sortable type.
for frame in frames:
    frame['event_type_1'] = frame['event_type_1'].fillna('no_event').astype(str)

# Fit on the union of all three frames so that every label occurring in any
# of them receives a code.
all_values = np.concatenate([frame['event_type_1'].values for frame in frames])
lbl = LabelEncoder()
lbl.fit(all_values)
for frame in frames:
    frame['event_type_1'] = lbl.transform(frame['event_type_1'])

# Persist the fitted encoder; the context manager closes the file handle.
with open('label_encoder_event_type_1.sav', 'wb') as encoder_file:
    pickle.dump(lbl, encoder_file)

In [19]:
# Handle event_type_2 encoding
frames = (train, test, final_test)

# Missing values become the explicit 'no_event' label, and everything is cast
# to str so the encoder sees a single, sortable type.
for frame in frames:
    frame['event_type_2'] = frame['event_type_2'].fillna('no_event').astype(str)

# Fit on the union of all three frames so that every label occurring in any
# of them receives a code.
all_values = np.concatenate([frame['event_type_2'].values for frame in frames])
lbl = LabelEncoder()
lbl.fit(all_values)
for frame in frames:
    frame['event_type_2'] = lbl.transform(frame['event_type_2'])

# Persist the fitted encoder; the context manager closes the file handle.
with open('label_encoder_event_type_2.sav', 'wb') as encoder_file:
    pickle.dump(lbl, encoder_file)

In [20]:
# NOTE: 'event_type_1' may already be integer-encoded at this point (an
# earlier cell fits an encoder on train + test + final_test and saves it under
# this same file name). Fitting a second encoder on those integer codes would
# replace the saved string -> code mapping with a code -> code one, and could
# fail on codes that never occur in train. The work below therefore only runs
# when the column has not been encoded yet.
if not pd.api.types.is_numeric_dtype(train['event_type_1']):
    for frame in (train, test, final_test):
        frame['event_type_1'] = frame['event_type_1'].fillna('no_event')

    lbl = LabelEncoder()
    train['event_type_1'] = lbl.fit_transform(train['event_type_1'])
    for frame in (test, final_test):
        frame['event_type_1'] = lbl.transform(frame['event_type_1'])

    # Persist the fitted encoder; the context manager closes the file handle.
    with open('label_encoder_event_type_1.sav', 'wb') as encoder_file:
        pickle.dump(lbl, encoder_file)

In [21]:
# NOTE: 'event_type_2' may already be integer-encoded at this point (an
# earlier cell fits an encoder on train + test + final_test and saves it under
# this same file name). Fitting a second encoder on those integer codes would
# replace the saved string -> code mapping with a code -> code one, and could
# fail on codes that never occur in train. The work below therefore only runs
# when the column has not been encoded yet.
if not pd.api.types.is_numeric_dtype(train['event_type_2']):
    for frame in (train, test, final_test):
        frame['event_type_2'] = frame['event_type_2'].fillna('no_event')

    lbl = LabelEncoder()
    train['event_type_2'] = lbl.fit_transform(train['event_type_2'])
    for frame in (test, final_test):
        frame['event_type_2'] = lbl.transform(frame['event_type_2'])

    # Persist the fitted encoder; the context manager closes the file handle.
    with open('label_encoder_event_type_2.sav', 'wb') as encoder_file:
        pickle.dump(lbl, encoder_file)

In [22]:
# Integer-encode 'year': the mapping is learned from train and then re-used
# unchanged for test and final_test.
# NOTE(review): transform() raises ValueError for a year that does not occur
# in train - confirm test / final_test never contain one.
lbl = LabelEncoder()
train['year'] = lbl.fit_transform(train['year'])
for frame in (test, final_test):
    frame['year'] = lbl.transform(frame['year'])

# Persist the fitted encoder. The context manager guarantees the file is
# flushed and closed instead of leaving an open handle behind.
with open('label_encoder_year.sav', 'wb') as encoder_file:
    pickle.dump(lbl, encoder_file)

Now that the data has been reduced, we can remove unnecessary columns. First, let's merge the three state-specific SNAP columns into a single feature named `snap`.

In [23]:
%%time
# Collapse the per-state SNAP flags (snap_CA / snap_TX / snap_WI) into a
# single 'snap' column holding the flag of the row's own state.
#
# 'state_id' has been label-encoded to integers earlier in the notebook, so
# comparing it with the state names directly never matches and would leave
# 'snap' entirely NaN. When the column is numeric, the state names are
# translated to their integer codes with the encoder saved during that step.
snap_states = ('CA', 'TX', 'WI')
state_keys = {state: state for state in snap_states}
if pd.api.types.is_numeric_dtype(train['state_id']):
    # This pickle is written by this notebook itself; do not point it at
    # untrusted files.
    with open('label_encoder_state_id.sav', 'rb') as encoder_file:
        state_encoder = pickle.load(encoder_file)
    state_keys = dict(zip(snap_states, state_encoder.transform(list(snap_states))))

for frame in (train, test, final_test):
    for state in snap_states:
        rows = frame['state_id'] == state_keys[state]
        frame.loc[rows, 'snap'] = frame.loc[rows, 'snap_' + state]
    # The per-state columns are no longer needed once merged.
    frame.drop(['snap_CA', 'snap_TX', 'snap_WI'], axis=1, inplace=True)

CPU times: total: 1min 14s
Wall time: 1min 53s


The `weekday` and `wday` features carry the same information, so there is no need to keep both; `weekday` is dropped. The `wm_yr_wk` feature is dropped for the same reason (it is redundant).

In [24]:
%%time
# Remove 'weekday' and then 'wm_yr_wk' from each frame, in place.
for frame in (train, test, final_test):
    for redundant_col in ('weekday', 'wm_yr_wk'):
        frame.drop(redundant_col, axis=1, inplace=True)

CPU times: total: 1min 13s
Wall time: 1min 28s


FEATURES THAT INCLUDE TIME INTERVALS

a) Number of the week - A function that returns the week number of a given date

In [25]:
def get_week_number(x):
    """Return the ISO week number of the ISO-formatted date string ``x``.

    ``x`` must be accepted by ``date.fromisoformat`` (e.g. ``'2016-01-04'``).
    The value is the week component of ``date.isocalendar()``, so the first
    days of January can belong to week 52/53 of the previous ISO year.
    """
    _iso_year, iso_week, _iso_weekday = datetime.date.fromisoformat(x).isocalendar()
    return iso_week

In [26]:
# Add the week number of each row's 'date' to every frame.
for frame in (train, test, final_test):
    frame['week_number'] = frame['date'].apply(get_week_number)

b) Season of the year - A function that is used to get season according to the month

In [27]:
def get_season(x):
    """Map a month number to a season code.

    Returns 0 (winter) for 12/1/2, 1 (spring) for 3/4/5, 2 (summer) for
    6/7/8 and 3 (autumn) for every other value.
    """
    coded_seasons = (
        (12, 1, 2),   # 0: winter
        (3, 4, 5),    # 1: spring
        (6, 7, 8),    # 2: summer
    )
    for season_code, months in enumerate(coded_seasons):
        if x in months:
            return season_code
    return 3  # autumn, and the fallback for anything not listed above

In [28]:
# Add the season code of each row's 'month' to every frame.
for frame in (train, test, final_test):
    frame['season'] = frame['month'].apply(get_season)

c) Start of a quarter - A function used to check which day starts the quarter

In [30]:
def check_if_quarter_begin(x):
    """Return 1 if the ISO date string ``x`` is the first day of a quarter, else 0.

    Calendar quarters start on 1 January, 1 April, 1 July and 1 October.
    ``x`` must be accepted by ``date.fromisoformat``.
    """
    parsed = datetime.date.fromisoformat(x)
    # Q4 begins in October (month 10), not September.
    return 1 if (parsed.day == 1 and parsed.month in (1, 4, 7, 10)) else 0

In [32]:
# Flag, in every frame, the rows whose 'date' starts a quarter.
for frame in (train, test, final_test):
    frame['quarter_start'] = frame['date'].apply(check_if_quarter_begin)

d) End of a quarter - A function used to check which day ends the quarter

In [33]:
def check_if_quarter_end(x):
    """Return 1 if the ISO date string ``x`` is the last day of a quarter, else 0.

    The quarter-closing dates are 31 March, 30 June, 30 September and
    31 December. ``x`` must be accepted by ``date.fromisoformat``.
    """
    parsed = datetime.date.fromisoformat(x)
    closing_dates = ((3, 31), (6, 30), (9, 30), (12, 31))  # (month, day)
    return 1 if (parsed.month, parsed.day) in closing_dates else 0

In [34]:
# Flag, in every frame, the rows whose 'date' ends a quarter.
for frame in (train, test, final_test):
    frame['quarter_end'] = frame['date'].apply(check_if_quarter_end)

e) Start of a month - The function below checks if the day is beginning of the month

In [35]:

def month_start(x):
    """Return 1 if the ISO date string ``x`` falls on the 1st of a month, else 0."""
    if datetime.date.fromisoformat(x).day == 1:
        return 1
    return 0

In [36]:
# Flag, in every frame, the rows whose 'date' is the first day of a month.
for frame in (train, test, final_test):
    frame['month_start'] = frame['date'].apply(month_start)

f) End of a month - The function below checks if the day is end of the month

In [38]:
def month_end(x):
    """Return 1 if the ISO date string ``x`` is the last day of its month, else 0.

    ``x`` must be accepted by ``date.fromisoformat``. The length of the month
    comes from ``calendar.monthrange``, which applies the full Gregorian
    leap-year rule, so February is handled for every year and the 28th/29th
    of other months are no longer reported as a month end.
    """
    parsed = datetime.date.fromisoformat(x)
    # monthrange() returns (weekday of the 1st, number of days in the month).
    days_in_month = calendar.monthrange(parsed.year, parsed.month)[1]
    return 1 if parsed.day == days_in_month else 0

In [39]:

# Flag, in every frame, the rows whose 'date' is the last day of a month.
for frame in (train, test, final_test):
    frame['month_end'] = frame['date'].apply(month_end)

g) Start of a year - The function checking if a given day is the beginning of a year

In [40]:
def year_start(x):
    """Return 1 if the ISO date string ``x`` is 1 January, else 0."""
    parsed = datetime.date.fromisoformat(x)
    if (parsed.month, parsed.day) == (1, 1):
        return 1
    return 0

In [41]:
# Flag, in every frame, the rows whose 'date' is the first day of a year.
for frame in (train, test, final_test):
    frame['year_start'] = frame['date'].apply(year_start)

h) End of a year - The function checking if a given day is the end of a year

In [42]:
def year_end(x):
    """Return 1 if the ISO date string ``x`` is 31 December, else 0."""
    parsed = datetime.date.fromisoformat(x)
    if (parsed.month, parsed.day) == (12, 31):
        return 1
    return 0

In [43]:
# Flag, in every frame, the rows whose 'date' is the last day of a year.
for frame in (train, test, final_test):
    frame['year_end'] = frame['date'].apply(year_end)