In [28]:
import boto3
import os

# Ensure the local target directory is present before any download is attempted.
os.makedirs("favorita", exist_ok=True)

# S3 location of the raw Favorita CSVs.
bucket = "nexttrendco"
prefix = "favorita/"
s3 = boto3.client('s3')

files = [
    "train.csv",
    "transactions.csv",
    "stores.csv",
    "items.csv",
    "oil.csv",
    "holidays_events.csv",
]

# Fetch every object <prefix><name> from the bucket into favorita/<name>.
for file in files:
    local_path = f"favorita/{file}"
    s3.download_file(bucket, prefix + file, local_path)
    print(f"Downloaded {file} to {local_path}")

Downloaded train.csv to favorita/train.csv
Downloaded transactions.csv to favorita/transactions.csv
Downloaded stores.csv to favorita/stores.csv
Downloaded items.csv to favorita/items.csv
Downloaded oil.csv to favorita/oil.csv
Downloaded holidays_events.csv to favorita/holidays_events.csv


In [29]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Sales table: only the first 200000 rows are loaded. low_memory=False turns
# off chunked parsing so each column's dtype is inferred from the whole column.
df_sales = pd.read_csv("favorita/train.csv", low_memory=False, nrows=200000)

# The remaining tables are read in full with default parser settings.
df_transactions, df_stores, df_items, df_oil, df_holidays = (
    pd.read_csv(csv_path)
    for csv_path in (
        "favorita/transactions.csv",
        "favorita/stores.csv",
        "favorita/items.csv",
        "favorita/oil.csv",
        "favorita/holidays_events.csv",
    )
)


In [30]:
# Build the modelling frame by left-joining every lookup table onto the sales
# rows, so no sales row is dropped when a lookup has no match.
# The first three joins run while `date` is still the raw CSV value in both
# the sales and transactions frames.
df = (
    df_sales
    .merge(df_stores, how='left', on='store_nbr')
    .merge(df_items, how='left', on='item_nbr')
    .merge(df_transactions, how='left', on=['date', 'store_nbr'])
)

# Parse dates on every frame involved in the date-keyed joins below so the
# join keys share one dtype. df_oil and df_holidays are modified in place.
for frame in (df, df_oil, df_holidays):
    frame['date'] = pd.to_datetime(frame['date'])

# NOTE(review): if df_holidays holds more than one row for a date, this left
# join repeats the matching sales rows once per holiday row - confirm.
# NOTE(review): a later cell encodes a `type_y` column, which suggests an
# overlapping `type` column received pandas' default `_x`/`_y` suffixes in one
# of these merges - confirm which tables it comes from.
df = (
    df
    .merge(df_oil, how='left', on='date')
    .merge(df_holidays, how='left', on='date')
)

In [31]:
# perform data scrub
import warnings

# Install the filter before the fills run; placed after them it cannot
# suppress any FutureWarning they emit.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Drop exact duplicate rows, then rows with negative unit_sales.
# .copy() detaches the filtered result from the frame it was selected from, so
# the column assignments below write to an independent frame instead of
# triggering SettingWithCopyWarning on a slice.
df = df.drop_duplicates()
df = df[df['unit_sales'] >= 0].copy()  # Remove negatives
df['date'] = pd.to_datetime(df['date'])

# fill missing numerical with median
# (a column that is entirely NaN has a NaN median and is left unchanged here)
num_cols = df.select_dtypes(include='number').columns
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

# fill missing categorical with mode
# NOTE(review): object columns that came from left joins are filled too, so
# rows without a match get the most frequent value of that column - confirm
# this is intended (e.g. for the holiday columns).
cat_cols = df.select_dtypes(include='object').columns
cat_modes = df[cat_cols].mode()
# mode() returns an empty frame when there are no object columns (or no rows);
# indexing row 0 of it would raise IndexError, so skip the fill in that case.
if not cat_modes.empty:
    df[cat_cols] = df[cat_cols].fillna(cat_modes.iloc[0])

In [34]:
# 3. feature engineering
# Calendar features: pandas numbers weekdays Monday=0 .. Sunday=6, so values
# of 5 and above are Saturday/Sunday.
df['dayofweek'] = df['date'].dt.dayofweek
df['is_weekend'] = (df['dayofweek'] >= 5).astype(int)

# The frame carries both store_nbr and item_nbr. Grouping by item_nbr alone
# would put rows from different stores into one sequence, so a "previous"
# value could come from another store. Each (store, item) pair is therefore
# treated as its own series.
series_keys = ['store_nbr', 'item_nbr']
df = df.sort_values(series_keys + ['date'])
sales_by_series = df.groupby(series_keys)['unit_sales']

# shift(7) steps back seven rows within a series; that equals seven days only
# if every series has exactly one row per consecutive day - TODO confirm.
df['lag_7'] = sales_by_series.shift(7)

# Mean over the current row and up to six preceding rows of the same series.
# NOTE(review): the window includes the current row's unit_sales; if
# unit_sales is the prediction target this feature leaks it - confirm.
df['rolling_7'] = sales_by_series.transform(lambda x: x.rolling(7, min_periods=1).mean())


In [39]:
# Impute gaps in the engineered columns:
#   lag_7       -> 0 where no lagged value exists
#   rolling_7   -> falls back to the (already filled) lag_7 value
#   onpromotion -> 0 where missing
lag_filled = df['lag_7'].fillna(0)
df['lag_7'] = lag_filled
df['rolling_7'] = df['rolling_7'].fillna(lag_filled)

promo_filled = df['onpromotion'].fillna(0)
df['onpromotion'] = promo_filled

# Interaction term: promotion flag multiplied by the lagged sales value.
df['promo_lag_interaction'] = promo_filled * lag_filled

# Sanity check: frame shape and remaining nulls in the columns to be scaled.
print("df shape before scaling:", df.shape)
print("Nulls in scaled features:")
null_counts = df[['lag_7', 'rolling_7', 'promo_lag_interaction']].isnull().sum()
print(null_counts)

df shape before scaling: (199989, 25)
Nulls in scaled features:
lag_7                    0
rolling_7                0
promo_lag_interaction    0
dtype: int64


In [40]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale the engineered numeric features to the [0, 1] range.
# The scaler learns each column's min and max from every row currently in df.
scaled_features = ['lag_7', 'rolling_7', 'promo_lag_interaction']
scaler = MinMaxScaler()
scaler.fit(df[scaled_features])
df[scaled_features] = scaler.transform(df[scaled_features])

In [41]:
# feature Transformation
# One-hot encode the categorical columns. drop_first=True omits the first
# level of each variable, so a column with k distinct values yields k-1
# indicator columns.
# NOTE(review): `type_y` only exists if an upstream merge produced that
# suffix; this cell raises KeyError when re-run because get_dummies consumes
# the listed columns.
cat_vars = ['family', 'city', 'state', 'type_y']
df = pd.get_dummies(df, columns=cat_vars, drop_first=True)

# The numeric features were already min-max scaled by the preceding cell.
# Fitting a fresh MinMaxScaler on them again would leave the values
# essentially unchanged but overwrite `scaler` with one whose min/max describe
# already-scaled data, so it could no longer map raw values. The duplicate fit
# is therefore dropped; only the column list is kept for later cells.
scaled_features = ['lag_7', 'rolling_7', 'promo_lag_interaction']

In [44]:
# feature Transformation
# ensure scaled features are clean
scaled_features = ['lag_7', 'rolling_7', 'promo_lag_interaction']
df[scaled_features] = df[scaled_features].fillna(0)

# Import retained so the name stays bound for any cell that relies on it.
from sklearn.preprocessing import MinMaxScaler

# These columns were already min-max scaled by an earlier cell. Fitting a new
# scaler on them here would replace `scaler` with one fitted to already-scaled
# values (unusable for transforming raw data) while leaving the columns
# essentially unchanged, so no further fit is performed.

print("feature transformation complete. Shape:", df.shape)

print("feature transformation complete. Shape:", df.shape)

feature transformation complete. Shape: (199989, 74)


In [47]:
# sort by date to maintain time order
# A stable sort keeps the existing relative order of rows that share a date,
# so which rows land on either side of a split boundary is well defined.
df = df.sort_values('date', kind='stable')

# create splits: 70% train, 15% val, remainder (~15%) test
# NOTE(review): boundaries are row positions, so one calendar date can be
# divided between two adjacent splits - confirm this is acceptable.
train_size = int(len(df) * 0.7)
val_size = int(len(df) * 0.15)
val_end = train_size + val_size

# .copy() detaches each split from df so the assignments below (and any later
# edits) write to independent frames rather than slices of df.
df_train = df.iloc[:train_size].copy()
df_val = df.iloc[train_size:val_end].copy()
df_test = df.iloc[val_end:].copy()

# The earlier scaling cells fit MinMaxScaler on every row of df, so min/max
# statistics from the validation and test periods leak into the training
# features. Re-fitting on the training rows only and applying that transform
# to the other splits removes the leak. Min-max scaling is affine, so scaling
# the already-scaled values by the train range gives the same result as
# scaling the raw values by the train min/max.
# Validation/test values may now fall outside [0, 1]; that is expected.
# NOTE(review): `train_scaler` is fitted on already-scaled values. To reuse it
# on raw data, the earlier full-frame scaling has to be removed or composed.
scaled_features = ['lag_7', 'rolling_7', 'promo_lag_interaction']
train_scaler = MinMaxScaler()
if len(df_train):  # fitting on zero rows raises ValueError
    df_train[scaled_features] = train_scaler.fit_transform(df_train[scaled_features])
    for holdout in (df_val, df_test):
        if len(holdout):
            holdout[scaled_features] = train_scaler.transform(holdout[scaled_features])

print("Train size:", len(df_train))
print("Val size:", len(df_val))
print("Test size:", len(df_test))


Train size: 139992
Val size: 29998
Test size: 29999


In [48]:
# Write each split to a CSV in the working directory, without the index column.
for split_frame, out_path in (
    (df_train, "train.csv"),
    (df_val, "val.csv"),
    (df_test, "test.csv"),
):
    split_frame.to_csv(out_path, index=False)

In [49]:
# upload to S3
import boto3

# Destination bucket and client for the processed splits.
s3_bucket = "nexttrendco"
s3 = boto3.client('s3')

# Maps each local file name to the object key it is stored under.
datasets = {
    "train.csv": "favorita/processed/train.csv",
    "val.csv": "favorita/processed/val.csv",
    "test.csv": "favorita/processed/test.csv",
}

# Upload in dictionary order and log the resulting S3 URI for each file.
for local_file in datasets:
    s3_key = datasets[local_file]
    s3.upload_file(local_file, s3_bucket, s3_key)
    print(f"uploaded {local_file} to s3://{s3_bucket}/{s3_key}")


uploaded train.csv to s3://nexttrendco/favorita/processed/train.csv
uploaded val.csv to s3://nexttrendco/favorita/processed/val.csv
uploaded test.csv to s3://nexttrendco/favorita/processed/test.csv


In [50]:
# Re-load the processed splits from the working directory
# (adjust the paths if the files are in a subfolder).
train_df = pd.read_csv("train.csv")
val_df = pd.read_csv("val.csv")
test_df = pd.read_csv("test.csv")

# Rows to draw per split (adjust if needed).
SAMPLE_ROWS = 1000

# Write a fixed-seed random sample of each split next to the full files.
for split_df, sample_path in (
    (train_df, "train_sample.csv"),
    (val_df, "val_sample.csv"),
    (test_df, "test_sample.csv"),
):
    # DataFrame.sample without replacement raises ValueError when n exceeds
    # the number of rows, so cap n at the split's size. Splits with at least
    # SAMPLE_ROWS rows are sampled exactly as before.
    n_rows = min(SAMPLE_ROWS, len(split_df))
    split_df.sample(n_rows, random_state=42).to_csv(sample_path, index=False)

print("Sample CSVs created successfully ")


Sample CSVs created successfully 
