In [2]:
import pandas as pd
import boto3

# Source bucket and S3 client for the Walmart dataset
bucket_name = 'nexttrendco'
s3 = boto3.client('s3')

# Object keys inside the bucket, and the bare file name each one is saved
# under locally (everything after the last '/')
walmart_files = ['walmart/train.csv', 'walmart/test.csv', 'walmart/features.csv']
file_paths = [key.rpartition('/')[2] for key in walmart_files]

# Fetch every object into the current working directory, reporting each one
for position, s3_key in enumerate(walmart_files):
    local_file = file_paths[position]
    s3.download_file(bucket_name, s3_key, local_file)
    print(f"Downloaded {s3_key} to {local_file}")

# Read the three downloaded CSVs into their own dataframes
train_walmart, test_walmart, features_walmart = [
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv', 'features.csv')
]

# Preview the top of the training data
train_walmart.head()

Downloaded walmart/train.csv to train.csv
Downloaded walmart/test.csv to test.csv
Downloaded walmart/features.csv to features.csv


Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday
0,1,1,2010-02-05,24924.5,False
1,1,1,2010-02-12,46039.49,True
2,1,1,2010-02-19,41595.55,False
3,1,1,2010-02-26,19403.54,False
4,1,1,2010-03-05,21827.9,False


In [4]:
# Count null entries per column of the training data
missing_data = train_walmart.isna().sum()
# Report only the columns that contain at least one null
print(missing_data.loc[lambda null_counts: null_counts > 0])

Series([], dtype: int64)


In [6]:
# Feature engineering on the training data.
#
# The frame stacks one sales series per (Store, Dept) pair, so lag and rolling
# features are computed within each pair. Shifting the whole column at once
# would fill the first rows of every series with values from the tail of the
# previous store/department.

# Convert Date column to datetime format first so rows can be ordered in time
train_walmart['Date'] = pd.to_datetime(train_walmart['Date'])

# Per-series view of sales in chronological order. The sorted copy keeps the
# original index labels, so the results below align back onto train_walmart
# without changing its row order (assumes the index is unique, as it is for a
# freshly loaded CSV).
series_keys = ['Store', 'Dept']
sales_by_series = (
    train_walmart
    .sort_values(series_keys + ['Date'])
    .groupby(series_keys, sort=False)['Weekly_Sales']
)

# Lag features: sales 1 and 7 rows earlier within the same series
# (NaN for the first 1 / 7 rows of each series)
train_walmart['lag_1'] = sales_by_series.shift(1)
train_walmart['lag_7'] = sales_by_series.shift(7)

# Rolling mean over the current row and the 6 preceding rows of the same series
# NOTE(review): the window includes the current row's Weekly_Sales; if
# Weekly_Sales is the prediction target, consider shifting before rolling.
train_walmart['rolling_mean_7'] = sales_by_series.transform(
    lambda sales: sales.rolling(window=7).mean()
)

# Interaction feature: Weekly_Sales on holiday rows, 0 otherwise
# NOTE(review): derived directly from Weekly_Sales; confirm this is not used
# as a model input when Weekly_Sales is the target.
train_walmart['holiday_sales_interaction'] = train_walmart['IsHoliday'] * train_walmart['Weekly_Sales']

# Extract additional date features (dayofweek: Monday=0 ... Sunday=6)
train_walmart['day_of_week'] = train_walmart['Date'].dt.dayofweek
train_walmart['month'] = train_walmart['Date'].dt.month

# Check the first few rows to verify
train_walmart.head()


Index(['Store', 'Dept', 'Date', 'Weekly_Sales', 'IsHoliday'], dtype='object')
