## Feature Engineering

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
from dotenv import load_dotenv
import os

# Raise pandas' display limits (line width, rows, columns) for output shown in this notebook.
pd.set_option('display.width', 1000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [None]:
def read_data(path):
    """
    Load a CSV file into a DataFrame.

    `path` is forwarded unchanged to `pd.read_csv`; the parsed frame is returned.
    """
    frame = pd.read_csv(path)
    return frame
# Load variables from a .env file (python-dotenv) before reading the configuration.
load_dotenv()
# Location of the cleaned dataset, configured through the CLEANED_DATA variable.
data_path = os.getenv("CLEANED_DATA")
if not data_path:
    # Fail early with an actionable message instead of handing None to read_csv.
    raise RuntimeError(
        "Environment variable CLEANED_DATA is not set; "
        "define it (for example in .env) with the path to the cleaned CSV file."
    )
df = read_data(data_path)

In [None]:
def convert_to_datetime(dataframe, datetime_columns):
    """
    Parse the given columns of `dataframe` as datetimes, in place.

    Each column named in `datetime_columns` is replaced by the result of
    `pd.to_datetime(..., errors="coerce")`, so values that cannot be parsed
    become NaT. The same (mutated) DataFrame object is returned.
    """
    for column_name in datetime_columns:
        parsed = pd.to_datetime(dataframe[column_name], errors="coerce")
        dataframe[column_name] = parsed
    return dataframe

# Columns of `df` that are parsed into datetime values below.
datetime_columns = [
    "DURAKGIRISTARIHI",
    "DURAKCIKISTARIHI",
    "HATBASLANGICTARIHI",
    "HATBITISTARIHI",
    "INSERTDATE",
]

# Parse the listed columns in `df`; convert_to_datetime coerces unparseable values to NaT.
df = convert_to_datetime(df, datetime_columns)

In [None]:
def preprocess_data(df):
    """
    Derive calendar features from the 'HATBASLANGICTARIHI' datetime column.

    The following columns are written onto `df` (in place):
    - 'DAY_OF_WEEK': weekday number (Monday=0, Sunday=6)
    - 'MONTH': month number (1 to 12)
    - 'HOUR': hour of the day (0 to 23)

    Returns the same DataFrame object.
    """
    route_start = df['HATBASLANGICTARIHI'].dt
    derived = (
        ('DAY_OF_WEEK', route_start.weekday),
        ('MONTH', route_start.month),
        ('HOUR', route_start.hour),
    )
    for column_name, values in derived:
        df[column_name] = values
    return df

In [None]:
def add_holiday_info(df):
    """
    Add a column indicating whether each route date falls on a public holiday.

    The calendar date of every value in 'HATBASLANGICTARIHI' (a datetime
    column) is looked up in a fixed set of public-holiday dates covering
    2020–2024. The new 'HOLIDAY_CATEGORY' column is 'Holiday' for a match and
    'Normal' otherwise (missing dates are also labelled 'Normal').

    The DataFrame is modified in place and returned.
    """
    # The docstring must live outside the set literal: a string placed directly
    # before the first date is implicitly concatenated with it, which silently
    # drops '2020-01-01' from the set.
    # NOTE(review): the per-year lists are not uniform (e.g. '05-19' is present
    # for 2020 but not for 2021/2022) — confirm the calendar is complete.
    public_holidays = {
        # 2020
        '2020-01-01', '2020-04-23', '2020-05-01', '2020-05-19', '2020-05-23', '2020-05-24', '2020-05-25',
        '2020-07-15', '2020-07-30', '2020-07-31', '2020-08-01', '2020-08-02', '2020-08-30', '2020-10-29',

        # 2021
        '2021-01-01', '2021-04-23', '2021-05-01', '2021-05-13', '2021-05-14', '2021-05-15', '2021-07-15',
        '2021-07-19', '2021-07-20', '2021-07-21', '2021-07-22', '2021-08-30', '2021-10-29',

        # 2022
        '2022-01-01', '2022-04-23', '2022-05-01', '2022-05-02', '2022-05-03', '2022-05-04',
        '2022-07-08', '2022-07-09', '2022-07-10', '2022-07-11', '2022-07-15', '2022-08-30', '2022-10-29',

        # 2023
        '2023-01-01', '2023-04-20', '2023-04-21', '2023-04-22', '2023-04-23',
        '2023-06-27', '2023-06-28', '2023-06-29', '2023-06-30',
        '2023-05-19', '2023-08-30', '2023-10-29',

        # 2024
        '2024-01-01', '2024-04-09', '2024-04-10', '2024-04-11', '2024-04-12',
        '2024-06-05', '2024-06-06', '2024-06-07', '2024-06-08', '2024-06-09',
        '2024-04-23', '2024-05-19', '2024-08-30', '2024-10-29',
    }

    # Compare on the calendar date only, so the time of day does not matter.
    route_days = df['HATBASLANGICTARIHI'].dt.strftime('%Y-%m-%d')
    is_holiday = route_days.isin(public_holidays)
    df['HOLIDAY_CATEGORY'] = is_holiday.map({True: 'Holiday', False: 'Normal'})

    return df


In [None]:
def categorize_pandemic_condition(df, date_column):
    """
    Label every row as 'Pandemic' or 'No Pandemic' based on its date.

    Parameters:
    - df: DataFrame to annotate (modified in place).
    - date_column: name of the column holding the dates; it is converted with
      `pd.to_datetime(..., errors='coerce')`, so unparseable values become NaT.

    A row is 'Pandemic' when its calendar date lies inside any of the
    restriction periods listed below (both end points inclusive); all other
    rows, including rows with a missing date, are 'No Pandemic'.

    Returns the same DataFrame with the 'PANDEMIC_CONDITION' column added.
    """
    df[date_column] = pd.to_datetime(df[date_column], errors='coerce')

    # Compare on the calendar day: a bare date string is midnight, so comparing
    # full timestamps would exclude any time-of-day on a period's last day
    # (e.g. 2020-07-03 08:00 would fall between two adjacent periods).
    day = df[date_column].dt.normalize()

    # (first day, last day) of each period; periods may overlap, which is
    # harmless because they all map to the same label.
    pandemic_periods = [
        ('2020-03-16', '2020-07-03'),
        ('2020-03-20', '2020-06-01'),
        ('2021-04-29', '2021-05-17'),
        ('2020-05-23', '2020-05-25'),
        ('2021-05-13', '2021-05-15'),
        ('2020-07-04', '2020-08-31'),
        ('2021-07-19', '2021-07-23'),
    ]

    in_pandemic = pd.Series(False, index=df.index, dtype=bool)
    for first_day, last_day in pandemic_periods:
        in_pandemic = in_pandemic | ((day >= first_day) & (day <= last_day))

    df['PANDEMIC_CONDITION'] = 'No Pandemic'
    df.loc[in_pandemic, 'PANDEMIC_CONDITION'] = 'Pandemic'

    return df


In [None]:
def categorize_school_status(df, date_column):
    """
    Label every row as 'School Open', 'School Closed' or 'Unknown'.

    Parameters:
    - df: DataFrame to annotate (modified in place).
    - date_column: name of the column holding the dates; it is converted with
      `pd.to_datetime(..., errors='coerce')`. The time of day stored in the
      column is preserved — only the comparison uses the calendar day.

    Dates that fall inside one of the periods listed below (end points
    inclusive) receive that period's label; dates outside every period, and
    missing dates, are labelled 'Unknown'.

    Returns the same DataFrame with the 'SCHOOL_STATUS' column added.
    """
    df[date_column] = pd.to_datetime(df[date_column], errors='coerce')

    # Work on a normalized copy so the caller's timestamps keep their time part.
    day = df[date_column].dt.normalize()

    # (first day, last day, label)
    school_periods = [
        # 2019
        ('2019-01-01', '2019-01-01', 'School Closed'),
        ('2019-01-02', '2019-06-14', 'School Open'),
        ('2019-06-15', '2019-09-08', 'School Closed'),
        ('2019-09-09', '2019-12-31', 'School Open'),

        # 2020
        ('2020-01-01', '2020-01-01', 'School Closed'),
        ('2020-01-02', '2020-03-22', 'School Open'),
        ('2020-03-23', '2020-08-30', 'School Closed'),
        ('2020-08-31', '2020-12-31', 'School Open'),

        # 2021
        ('2021-01-01', '2021-01-01', 'School Closed'),
        ('2021-01-02', '2021-06-18', 'School Open'),
        ('2021-06-19', '2021-09-05', 'School Closed'),
        ('2021-09-06', '2021-12-31', 'School Open'),

        # 2022
        ('2022-01-01', '2022-01-01', 'School Closed'),
        ('2022-01-02', '2022-01-21', 'School Open'),
        ('2022-01-22', '2022-02-06', 'School Closed'),
        ('2022-02-07', '2022-06-17', 'School Open'),
        # Summer break: closed, in line with the summer periods of the other years.
        ('2022-06-18', '2022-09-11', 'School Closed'),
        ('2022-09-12', '2022-12-31', 'School Open'),

        # 2023
        ('2023-01-01', '2023-01-01', 'School Closed'),
        ('2023-01-02', '2023-06-16', 'School Open'),
        ('2023-06-17', '2023-09-10', 'School Closed'),
        ('2023-09-11', '2023-12-31', 'School Open'),

        # 2024
        ('2024-01-01', '2024-01-01', 'School Closed'),
        ('2024-01-02', '2024-06-14', 'School Open'),
    ]

    df['SCHOOL_STATUS'] = 'Unknown'

    for first_day, last_day, label in school_periods:
        in_period = (day >= first_day) & (day <= last_day)
        df.loc[in_period, 'SCHOOL_STATUS'] = label

    return df


In [None]:
def process_weather_data(weather_file):
    """
    Read the weather CSV and parse its 'dt_iso' column as datetimes.

    A trailing ' +0000 UTC' suffix is stripped from every 'dt_iso' value
    before the column is converted with `pd.to_datetime`. Returns the
    resulting DataFrame.
    """
    weather = pd.read_csv(weather_file)
    without_suffix = weather['dt_iso'].str.replace(r' \+0000 UTC$', '', regex=True)
    weather['dt_iso'] = pd.to_datetime(without_suffix)
    return weather


In [None]:
def find_nearest_weather(date, df_weather):
    """
    Return the weather observation closest in time to `date`.

    Parameters:
    - date: timestamp to look up.
    - df_weather: DataFrame with a datetime 'dt_iso' column plus 'temp' and
      'weather_description' columns.

    Returns:
    - A Series holding 'temp' and 'weather_description' of the row whose
      'dt_iso' has the smallest absolute difference to `date`.
    - NaN (a float) when there is nothing to compare against: `df_weather`
      is empty or every difference is missing (e.g. `date` is NaT).
    """
    diffs = (df_weather['dt_iso'] - date).abs()
    if diffs.empty or diffs.isna().all():
        # Plain float NaN (the same value np.nan denotes) so this path does
        # not depend on numpy being imported.
        return float("nan")
    # idxmin() returns an index *label*, so the row has to be fetched by label;
    # positional .iloc only gives the right row for a default 0..n-1 index.
    # NOTE(review): assumes the index labels of df_weather are unique.
    nearest_weather_row = df_weather.loc[diffs.idxmin()]
    return nearest_weather_row[['temp', 'weather_description']]

In [None]:
def simplify_weather_data(df):
    """
    Collapse detailed 'weather_description' values into a few broad categories.

    Known descriptions are replaced by one of 'Precipitation', 'Clear',
    'Cloudy', 'Low Visibility' or 'Storm'; values that are not listed are left
    untouched. The column is overwritten on `df`, which is then returned.
    """
    # Broad category -> detailed descriptions that belong to it
    descriptions_by_category = {
        'Precipitation': (
            'heavy intensity rain',
            'moderate rain',
            'light rain',
            'shower rain',
            'light shower sleet',
            'heavy intensity shower rain',
            'light intensity shower rain',
            'light shower snow',
            'light snow',
        ),
        'Clear': (
            'sky is clear',
            'clear sky',
        ),
        'Cloudy': (
            'few clouds',
            'scattered clouds',
            'broken clouds',
            'overcast clouds',
        ),
        'Low Visibility': (
            'fog',
            'mist',
            'haze',
            'smoke',
            'dust',
        ),
        'Storm': (
            'thunderstorm',
            'thunderstorm with heavy rain',
            'thunderstorm with light rain',
            'thunderstorm with rain',
            'tornado',
        ),
    }

    # Flatten into the description -> category lookup used for the replacement
    category_of = {
        description: category
        for category, descriptions in descriptions_by_category.items()
        for description in descriptions
    }

    simplified = df['weather_description'].replace(category_of)
    df['weather_description'] = simplified

    return df

In [None]:
def merge_weather_data(df, df_weather, date_column):
    """
    Attach the nearest weather observation to every row of `df`.

    `df_weather` is first passed through `simplify_weather_data`. Then, for
    each value in `df[date_column]`, `find_nearest_weather` is called and its
    result is stored in the 'weather_temp' and 'weather_description' columns
    of `df`. Returns `df`.
    """
    simplified_weather = simplify_weather_data(df_weather)

    def _nearest(timestamp):
        return find_nearest_weather(timestamp, simplified_weather)

    matches = df[date_column].apply(_nearest).apply(pd.Series)
    df[['weather_temp', 'weather_description']] = matches

    return df

In [None]:
def prepare_features(df):
    """
    One-hot encode the categorical feature columns.

    The columns listed below are expanded with `pd.get_dummies`; with
    `drop_first=True` the first level of each column is omitted so the
    resulting indicator columns are not perfectly collinear. Returns the
    encoded DataFrame.
    """
    columns_to_encode = [
        'DAY_OF_WEEK',
        'HOLIDAY_CATEGORY',
        'MONTH',
        'PANDEMIC_CONDITION',
        'SCHOOL_STATUS',
        'weather_description',
    ]
    encoded = pd.get_dummies(df, columns=columns_to_encode, drop_first=True)
    return encoded

In [None]:
def create_lag_features(df, lags):
    """
    Add lagged copies of 'HATSURESI' computed within each 'DURAKSIRANO' group.

    Parameters:
    - df: DataFrame containing the 'DURAKSIRANO' and 'HATSURESI' columns.
    - lags: iterable of integer offsets; for every `lag` a column named
      'HATSURESI_LAG_<lag>' is created holding the value `lag` rows earlier
      within the same 'DURAKSIRANO' group.

    After each lag column is added, missing values in the whole frame are
    back-filled (each gap takes the next available value further down the
    same column).

    Returns the DataFrame with the lag columns added.
    """
    for lag in lags:
        # groupby(...).shift() is the built-in form of transform(lambda x: x.shift(lag)).
        df[f'HATSURESI_LAG_{lag}'] = df.groupby("DURAKSIRANO")['HATSURESI'].shift(lag)
        # DataFrame.bfill() replaces the deprecated fillna(method='bfill').
        # NOTE(review): the back-fill runs over every column and ignores group
        # boundaries, exactly as before — confirm this is the intended imputation.
        df = df.bfill()
    return df

In [None]:
# Load and parse the weather observations.
# NOTE(review): absolute, machine-specific path — unlike the main dataset, whose
# location is read from the CLEANED_DATA environment variable.
df_weather = process_weather_data(r"C:\Users\Excalibur\Desktop\Bitirme Projesi\Datas\800-last\weather_condition.csv")

In [None]:
# Attach 'weather_temp' and 'weather_description' to each row, matched on the route start time.
df = merge_weather_data(df, df_weather, 'HATBASLANGICTARIHI')

In [None]:
# Build the calendar-based features, all derived from the route start time:
# day of week / month / hour, holiday flag, pandemic period and school status.
df = preprocess_data(df)
df = add_holiday_info(df)
df = categorize_pandemic_condition(df, "HATBASLANGICTARIHI")
df = categorize_school_status(df, "HATBASLANGICTARIHI")

#### Categorical and Numerical Variable Analysis

In [None]:
def grab_col_names(dataframe, cat_th=15, car_th=20):
    """
    Split the columns of a DataFrame into categorical, numerical and cardinal groups.

    Classification rules:
    - Object-dtype columns are categorical, unless they have more than
      `car_th` distinct values, in which case they are "categorical but
      cardinal" and reported separately.
    - Non-object columns with fewer than `cat_th` distinct values are treated
      as categorical ("numerical but categorical").
    - All remaining non-object columns are numerical.

    A short summary (row/column counts and the size of each group) is printed.

    Returns:
    - cat_cols: object-dtype categorical columns followed by the
      numerical-but-categorical ones
    - num_cols: numerical columns
    - cat_but_car: categorical but cardinal columns
    """
    object_cats = []
    num_but_cat = []
    cat_but_car = []
    num_cols = []

    # Single pass: decide the group of each column from its dtype and cardinality.
    for col in dataframe.columns:
        series = dataframe[col]
        distinct = series.nunique()
        if series.dtype == "O":
            if distinct > car_th:
                cat_but_car.append(col)
            else:
                object_cats.append(col)
        elif distinct < cat_th:
            num_but_cat.append(col)
        else:
            num_cols.append(col)

    cat_cols = object_cats + num_but_cat

    print(f"Observations: {dataframe.shape[0]}")
    print(f"Variables: {dataframe.shape[1]}")
    print(f"cat_cols: {len(cat_cols)}")
    print(f"num_cols: {len(num_cols)}")
    print(f"cat_but_car: {len(cat_but_car)}")
    print(f"num_but_cat: {len(num_but_cat)}")
    return cat_cols, num_cols, cat_but_car
    
# Classify the columns of `df` and print each resulting group.
cat_cols, num_cols, cat_but_car = grab_col_names(df)
print(cat_cols)
print(num_cols)
print(cat_but_car)

# Exclude these columns from the categorical list.
drop_cat_columns = ['HATNO', 'HATKODU', 'DURAKSIRANO', 'DURAKID']
cat_cols = [col for col in cat_cols if col not in drop_cat_columns]
# Bare expression: shown as the cell output in the notebook.
cat_cols

In [None]:
# One-hot encode the categorical feature columns.
df = prepare_features(df)

In [None]:
# Add HATSURESI lag columns for offsets 1 to 5.
df = create_lag_features(df, [1, 2, 3, 4, 5])

In [None]:
# Copy of `df` without the listed columns; 'HATBASLANGICTARIHI' is kept.
df_data = df.drop(
    columns=[
        'PLAKA',
        'HATNO',
        'HATKODU',
        'DURAKSIRANO',
        'DURAKID',
        'INSERTDATE',
        'DURAKGIRISTARIHI',
        'DURAKCIKISTARIHI',
        'HATBITISTARIHI',
    ]
)

In [None]:
# Copy of `df` without the listed columns, including 'HATBASLANGICTARIHI'.
training_data = df.drop(
    columns=[
        'PLAKA',
        'HATNO',
        'HATKODU',
        'DURAKSIRANO',
        'DURAKID',
        'INSERTDATE',
        'DURAKGIRISTARIHI',
        'DURAKCIKISTARIHI',
        'HATBASLANGICTARIHI',
        'HATBITISTARIHI',
    ]
)