# **Feature Engineering**

In [None]:
# Copy the warehouse database file from the Drive folder into /content
# (presumably the notebook's working directory, so that the relative SQLite
# path used when creating the engine resolves -- confirm Drive is mounted).
!cp /content/drive/MyDrive/rossmann_dw.db /content/

In [None]:
import pandas as pd
from sqlalchemy import create_engine

# Connect to the database
engine = create_engine('sqlite:///rossmann_dw.db')

# Read the tables
fact_sales = pd.read_sql(sql='SELECT * FROM fact_sales', con=engine)
dim_store = pd.read_sql(sql='SELECT * FROM dim_store', con=engine)
dim_date = pd.read_sql(sql='SELECT * FROM dim_date', con=engine)

# Left-join the store and date dimensions onto the sales facts so every
# sales row is kept even when a dimension row is missing.
df = (
    fact_sales
    .merge(dim_store, how='left', on='store_id')
    .merge(dim_date, how='left', on='date_id')
)

In [None]:
# Inspect the column names of the joined frame (cell output follows).
df.columns

Index(['store_id', 'date_id', 'sales', 'customers', 'promo', 'open',
       'school_holiday', 'state_holiday', 'store_type', 'assortment_type',
       'competition_distance', 'competition_open_month',
       'competition_open_year', 'promo2', 'promo2_since_week',
       'promo2_since_year', 'promo_interval', 'day', 'month', 'year',
       'weekday', 'is_weekend'],
      dtype='object')

In [None]:
import pandas as pd
import numpy as np

def safe_promo2_since(row):
    """Return the Monday of the ISO week in which Promo2 started.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing the keys
            ``promo2_since_year`` and ``promo2_since_week``.

    Returns:
        A ``pd.Timestamp`` for day 1 of the given ISO year/week, or
        ``pd.NaT`` when either field is missing (NaN/None/NA), not
        convertible to an integer, out of range, or does not parse as a date.

    Raises:
        KeyError: If *row* lacks one of the two required keys.
    """
    try:
        year = int(row['promo2_since_year'])
        week = int(row['promo2_since_week'])
    except (TypeError, ValueError, OverflowError):
        # NaN, None, pd.NA, inf or non-numeric text: there is no usable
        # start date, so report "missing" instead of crashing the apply().
        return pd.NaT

    # Skip zero or invalid years/weeks
    if year < 1900 or week < 1 or week > 53:
        return pd.NaT

    try:
        return pd.to_datetime(f'{year}-W{week:02d}-1', format='%G-W%V-%u')
    except (ValueError, OverflowError):
        # Unparseable or out-of-bounds date for this year/week combination.
        return pd.NaT


def safe_competition_open(row):
    """Return the first day of the month in which the competitor opened.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing the keys
            ``competition_open_year`` and ``competition_open_month``.

    Returns:
        A ``pd.Timestamp`` for day 1 of the given year/month, or ``pd.NaT``
        when either field is missing (NaN/None/NA), not convertible to an
        integer, out of range, or not representable as a timestamp.

    Raises:
        KeyError: If *row* lacks one of the two required keys.
    """
    try:
        year = int(row['competition_open_year'])
        month = int(row['competition_open_month'])
    except (TypeError, ValueError, OverflowError):
        # NaN, None, pd.NA, inf or non-numeric text: treat as "unknown"
        # rather than aborting the whole row-wise apply().
        return pd.NaT

    # Skip zero or invalid years/months
    if year < 1900 or month < 1 or month > 12:
        return pd.NaT

    try:
        return pd.Timestamp(year=year, month=month, day=1)
    except (ValueError, OverflowError):
        # Year beyond the range pandas timestamps can represent.
        return pd.NaT


def _mean_sales_by_store(df, mask, name):
    """Return per-store mean ``sales`` over the rows selected by *mask*.

    The result has two columns: ``store_id`` and *name*.
    """
    return (
        df.loc[mask]
        .groupby('store_id')['sales']
        .mean()
        .rename(name)
        .reset_index()
    )


def feature_engineering(df, model):
    """Derive calendar, promotion, competition and per-store features.

    Args:
        df: Joined sales frame. The code reads the columns ``date_id``,
            ``store_id``, ``sales``, ``customers``, ``open``, ``promo``,
            ``school_holiday``, ``state_holiday``, ``store_type``,
            ``assortment_type``, ``promo_interval``,
            ``competition_open_year``, ``competition_open_month``,
            ``promo2_since_year`` and ``promo2_since_week``.
        model: ``'tree'`` replaces ``state_holiday``, ``store_type``,
            ``assortment_type`` and ``store_id`` with integer category
            codes; any other value one-hot encodes the first three
            (``drop_first=True``) and leaves ``store_id`` untouched.

    Returns:
        A new DataFrame (the input is not modified) with the added feature
        columns and without ``customers``, ``promo_interval`` and the
        intermediate helper columns.
    """
    df = df.copy()
    df['date_id'] = pd.to_datetime(df['date_id'])

    # Basic date-based features
    df['day'] = df['date_id'].dt.day
    df['month'] = df['date_id'].dt.month
    df['year'] = df['date_id'].dt.year
    df['weekday'] = df['date_id'].dt.weekday
    df['is_weekend'] = df['weekday'] >= 5
    df['weekofyear'] = df['date_id'].dt.isocalendar().week.astype(int)
    df['quarter'] = df['date_id'].dt.quarter
    df['is_month_start'] = df['date_id'].dt.is_month_start.astype(int)
    df['is_month_end'] = df['date_id'].dt.is_month_end.astype(int)

    # Promo interval feature: 1 when the row's month abbreviation appears in
    # the store's comma-separated promo_interval list.
    # NOTE(review): 'Sept' (not 'Sep') presumably mirrors the spelling used
    # in the promo_interval data -- confirm against the source table.
    month_str = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                 'Jul', 'Aug', 'Sept', 'Oct', 'Nov', 'Dec']
    df['promo_interval'] = df['promo_interval'].fillna('')
    df['promo2_month'] = df['date_id'].dt.month.map(lambda x: month_str[x - 1])
    df['is_promo2_month'] = [
        1 if month in interval.split(',') else 0
        for month, interval in zip(df['promo2_month'], df['promo_interval'])
    ]

    # Competition duration in (30-day) months; unknown or future opening
    # dates yield 0.
    df['competition_open_since'] = df.apply(safe_competition_open, axis=1)
    df['competition_duration_months'] = (
        (df['date_id'] - df['competition_open_since']).dt.days / 30
    ).clip(lower=0).fillna(0)

    # Promo2 duration in weeks; unknown or future start dates yield 0.
    df['promo2_since'] = df.apply(safe_promo2_since, axis=1)
    df['promo2_duration_weeks'] = (
        (df['date_id'] - df['promo2_since']).dt.days / 7
    ).clip(lower=0).fillna(0)

    # --- Aggregated shop-level features ---
    per_store = df.groupby('store_id')
    shop_agg = per_store.agg(
        shopavg_open=('open', 'mean'),
        shopavg_schoolholiday=('school_holiday', 'mean'),
    )
    # Ratio of summed sales to summed customers, computed from grouped sums
    # so it does not depend on the frame having a unique index; stores with
    # zero customers get 0 instead of a division by zero.
    totals = per_store[['sales', 'customers']].sum()
    sales_per_customer = (totals['sales'] / totals['customers']).where(
        totals['customers'] != 0, 0
    )
    # Insert between the two means to keep the established column order.
    shop_agg.insert(1, 'shopavg_salespercustomer', sales_per_customer)
    shop_agg = shop_agg.reset_index()

    # "No holiday" may be stored as the number 0 or the text '0'; comparing
    # against 0 alone would treat every row of a text column as a holiday.
    is_state_holiday = (df['state_holiday'] != 0) & (df['state_holiday'] != '0')
    shop_holiday_sales = _mean_sales_by_store(
        df, is_state_holiday, 'shopsales_holiday')
    shop_promo_sales = _mean_sales_by_store(
        df, df['promo'] == 1, 'shopsales_promo')
    # Average sales during school holidays per store
    school_holiday_sales = _mean_sales_by_store(
        df, df['school_holiday'] == 1, 'shopsales_schoolholiday')

    # Merge all shop aggregates into one dataframe; stores with no rows in a
    # subset (e.g. never on promo) get 0 for that average.
    for extra in (shop_holiday_sales, shop_promo_sales, school_holiday_sales):
        shop_agg = shop_agg.merge(extra, on='store_id', how='left')
    shop_agg = shop_agg.fillna(0)

    # Merge back to main dataframe
    df = df.merge(shop_agg, on='store_id', how='left')

    if model == 'tree':
        cat_cols = ['state_holiday', 'store_type', 'assortment_type', 'store_id']
        for col in cat_cols:
            df[col] = df[col].astype('category').cat.codes
    else:
        cat_cols = ['state_holiday', 'store_type', 'assortment_type']
        df = pd.get_dummies(df, columns=cat_cols, prefix=cat_cols, dtype=int, drop_first=True)

    # Drop temporary columns (and the raw customers / promo_interval inputs)
    df.drop(columns=['promo2_month', 'promo2_since', 'competition_open_since', 'customers', 'promo_interval'], inplace=True)

    return df


In [None]:
# Build the feature set with integer-coded categoricals ('tree' mode) and
# write it to a CSV file without the index column.
final_df = feature_engineering(df, 'tree')
final_df.to_csv("engineered_features_tree.csv", index=False)

In [None]:
# Build the feature set with one-hot encoded categoricals (any mode other
# than 'tree') and write it to a CSV file without the index column.
# Note: this rebinds final_df, replacing the frame from the 'tree' run.
final_df = feature_engineering(df, 'deep')
final_df.to_csv("engineered_features_deep.csv", index=False)

In [None]:
# Copy both engineered feature files from /content back to the Drive folder
# (assumes the CSVs were written to /content -- confirm the working directory).
!cp /content/engineered_features_tree.csv /content/drive/MyDrive/

!cp /content/engineered_features_deep.csv /content/drive/MyDrive/