In [51]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

In [52]:
train_df = pd.read_csv("data/store-sales-time-series-forecasting/train.csv")
test_df = pd.read_csv("data/store-sales-time-series-forecasting/test.csv")
stores_df = pd.read_csv("data//store-sales-time-series-forecasting/stores.csv")
oil_df = pd.read_csv("data/store-sales-time-series-forecasting/oil.csv")
holidays_events_df = pd.read_csv("data/store-sales-time-series-forecasting/holidays_events.csv")
transactions_df = pd.read_csv("data/store-sales-time-series-forecasting/transactions.csv")

In [53]:
# Merging the common columns ('store_nbr' and 'date') in the datasets using the inner merge() function
# Merge train_df with stores_df based on 'store_nbr' column
merged_df1 = test_df.merge(stores_df, on='store_nbr', how='inner')

# # Merge merged_df1 with transactions_df based on 'date' and 'store_nbr' columns
# merged_df2 = merged_df1.merge(transactions_df, on=['date', 'store_nbr'], how='inner')

# # Merge merged_df2 with holidays_events_df based on 'date' column
# merged_df3 = merged_df1.merge(holidays_events_df, on='date', how='inner')

# # Merge merged_df3 with oil_df based on 'date' column
test_df2 = merged_df1.merge(oil_df, on='date', how='inner')

# # View the first five rows of the merged dataset
# test_df2.head()

In [56]:
oil_df.columns

Index(['date', 'dcoilwtico'], dtype='object')

In [55]:
test_df2.shape

(21384, 10)

In [48]:
test_df2 = test_df2.rename(columns={"type_x": "store_type", "type_y": "holiday_type"})

In [31]:
test_df2['date'] = pd.to_datetime(test_df2['date'])
test_df2['year'] = test_df2['date'].dt.year

In [32]:
test_df2['month'] = test_df2['date'].dt.month
test_df2['day'] = test_df2['date'].dt.day
test_df2['date'] = pd.to_datetime(test_df2['date'])
test_df2['year'] = test_df2['date'].dt.year

test_df2['day_of_week'] = test_df2['date'].dt.dayofweek

In [50]:
test_df2.shape

(21384, 10)

In [36]:
columns_to_drop = ['date','id', 'state']

In [37]:
test_df2 = test_df2.drop(columns=columns_to_drop)

In [38]:
# Define the category lists for each product category
food_families = ['BEVERAGES', 'BREAD/BAKERY', 'FROZEN FOODS', 'MEATS', 'PREPARED FOODS', 'DELI','PRODUCE', 'DAIRY','POULTRY','EGGS','SEAFOOD']
home_families = ['HOME AND KITCHEN I', 'HOME AND KITCHEN II', 'HOME APPLIANCES']
clothing_families = ['LINGERIE', 'LADYSWARE']
grocery_families = ['GROCERY I', 'GROCERY II']
stationery_families = ['BOOKS', 'MAGAZINES','SCHOOL AND OFFICE SUPPLIES']
cleaning_families = ['HOME CARE', 'BABY CARE','PERSONAL CARE']
hardware_families = ['PLAYERS AND ELECTRONICS','HARDWARE']

In [39]:
test_df2['family'] = np.where(test_df2['family'].isin(food_families), 'FOODS', test_df2['family'])
test_df2['family'] = np.where(test_df2['family'].isin(home_families), 'HOME', test_df2['family'])
test_df2['family'] = np.where(test_df2['family'].isin(clothing_families), 'CLOTHING', test_df2['family'])
test_df2['family'] = np.where(test_df2['family'].isin(grocery_families), 'GROCERY', test_df2['family'])
test_df2['family'] = np.where(test_df2['family'].isin(stationery_families), 'STATIONERY', test_df2['family'])
test_df2['family'] = np.where(test_df2['family'].isin(cleaning_families), 'CLEANING', test_df2['family'])
test_df2['family'] = np.where(test_df2['family'].isin(hardware_families), 'HARDWARE', test_df2['family'])

In [41]:
encoder = OneHotEncoder()
categorical_columns = ["family", "city"]
one_hot_encoded_test = encoder.fit_transform(test_df2[categorical_columns])

# Create column names for the one-hot encoded data
column_names = encoder.get_feature_names_out(categorical_columns)

# Convert the one-hot encoded data to a DataFrame
merged_df_encoded_test = pd.DataFrame(one_hot_encoded_test.toarray(), columns=column_names)

# Concatenate the original dataframe with the one-hot encoded data
merged_df_encoded_test = pd.concat([test_df2, merged_df_encoded_test], axis=1)

# Drop the original categorical columns
merged_df_encoded_test.drop(categorical_columns, axis=1, inplace=True)

# Print the head of the encoded DataFrame
merged_df_encoded_test.head()

Unnamed: 0,store_nbr,onpromotion,type,cluster,dcoilwtico,year,month,day,day_of_week,family_AUTOMOTIVE,...,city_Loja,city_Machala,city_Manta,city_Playas,city_Puyo,city_Quevedo,city_Quito,city_Riobamba,city_Salinas,city_Santo Domingo
0,1,0,D,13,46.8,2017,8,16,2,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,1,0,D,13,46.8,2017,8,16,2,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,1,2,D,13,46.8,2017,8,16,2,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,1,20,D,13,46.8,2017,8,16,2,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,1,0,D,13,46.8,2017,8,16,2,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [42]:
merged_df_encoded_test.shape

(21384, 45)