<a href="https://colab.research.google.com/github/helinatefera/10xWeek4/blob/task-2/notebooks/task_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
import datetime as datetime

In [18]:
clean_data = pd.read_csv('/content/clean_data.csv')
store = pd.read_csv('/content/store.csv')

In [12]:
clean_data.head()

Unnamed: 0,Date,Store,Store_Type,Store_Status,Promo,Promo2,School_Holiday,Customers,Sales,DayOfWeek,is_holiday
0,2021-01-01,1,supermarket,1,0,0,0,124,0,4,1
1,2021-01-02,1,pharmacy,1,0,0,0,87,0,5,0
2,2021-01-03,1,supermarket,1,1,0,0,74,778,6,0
3,2021-01-04,1,supermarket,0,1,0,0,0,0,0,0
4,2021-01-05,1,pharmacy,1,1,0,0,76,1002,1,0


In [19]:
store.head()

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,c,a,1270.0,9.0,2008.0,0,,,
1,2,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct"
2,3,a,a,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct"
3,4,c,c,620.0,9.0,2009.0,0,,,
4,5,a,a,29910.0,4.0,2015.0,0,,,


In [30]:
X = clean_data.drop("Sales", axis=1)
y = clean_data["Sales"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

numeric_features = X.select_dtypes(include=["int64", "float64"]).columns
categorical_features = X.select_dtypes(include=["object"]).columns

numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", RandomForestRegressor(n_estimators=50, random_state=42)),
    ]
)

pipeline.fit(X_train, y_train)

cross_val_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring="neg_mean_absolute_error")
print(f"Cross-validated MAE: {-cross_val_scores.mean()}")

Cross-validated MAE: 52.84986872146119


In [26]:
clean_data['Date'] = pd.to_datetime(clean_data['Date'])
clean_data['Year'] = clean_data['Date'].dt.year
clean_data['Month'] = clean_data['Date'].dt.month
clean_data['WeekOfYear'] = clean_data['Date'].dt.isocalendar().week
clean_data['DayOfMonth'] = clean_data['Date'].dt.day
clean_data['IsWeekend'] = clean_data['DayOfWeek'].isin([6, 7]).astype(int)  # 1 for Saturday and Sunday
clean_data['IsMonthStart'] = clean_data['Date'].dt.is_month_start.astype(int)
clean_data['IsMonthEnd'] = clean_data['Date'].dt.is_month_end.astype(int)
clean_data['PromoDuration'] = clean_data.groupby('Store')['Promo'].cumsum()  # Count consecutive promo days
clean_data['PromoOverlap'] = ((clean_data['Promo'] == 1) & (clean_data['Promo2'] == 1)).astype(int)  # Promo overlap

In [27]:
clean_data.head()

Unnamed: 0,Date,Store,Store_Type,Store_Status,Promo,Promo2,School_Holiday,Customers,Sales,DayOfWeek,...,DayOfMonth,WeekOfYear,Quarter,IsMonthStart,IsMonthEnd,Year,Month,IsWeekend,PromoDuration,PromoOverlap
0,2021-01-01,1,supermarket,1,0,0,0,124,0,4,...,1,53,1,1,0,2021,1,0,0,0
1,2021-01-02,1,pharmacy,1,0,0,0,87,0,5,...,2,53,1,0,0,2021,1,0,0,0
2,2021-01-03,1,supermarket,1,1,0,0,74,778,6,...,3,53,1,0,0,2021,1,1,1,0
3,2021-01-04,1,supermarket,0,1,0,0,0,0,0,...,4,1,1,0,0,2021,1,0,2,0
4,2021-01-05,1,pharmacy,1,1,0,0,76,1002,1,...,5,1,1,0,0,2021,1,0,3,0


In [32]:
avg_sales = clean_data['Sales'].mean()
percentage_error = (50.5 / avg_sales) * 100
print(f"Percentage Error: {percentage_error:.2f}%")

Percentage Error: 16.59%
