## Data Cleaning

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
from sklearn.neighbors import LocalOutlierFactor
from dotenv import load_dotenv
import os

# Pull variables from a local .env file into the process environment
load_dotenv()

# Widen pandas' console output so wide/long frames are not truncated
for option_name, option_value in (
    ('display.width', 1000),
    ('display.max_rows', 500),
    ('display.max_columns', 500),
):
    pd.set_option(option_name, option_value)

In [None]:
def read_data(path):
    """
    Load a CSV file into a DataFrame.

    Parameters:
    ----------
    path : str or path-like
        Location of the CSV file to read.

    Returns:
    -------
    pandas.DataFrame
        The parsed contents of the file.

    Raises:
    ------
    ValueError
        If ``path`` is None, which is what ``os.getenv`` yields when the
        environment variable holding the path is not set.
    """
    # pandas also raises ValueError for None, but with a message that does not
    # hint at the likely cause; fail early with an actionable one instead.
    if path is None:
        raise ValueError(
            "read_data() received path=None; check that the environment "
            "variable holding the data path is set."
        )
    return pd.read_csv(path)

# The raw CSV location is taken from the RAW_DATA environment variable
data_path = os.environ.get("RAW_DATA")

# Load the raw dataset
df = read_data(data_path)

In [None]:
def check_df(dataframe):
    """
    Print a quick structural overview of a DataFrame.

    Prints, in order: the first rows, the last rows, the ``info()`` report,
    the shape, and the per-column count of null values.

    Parameters:
    ----------
    dataframe : pandas.DataFrame
        The DataFrame to inspect.

    Returns:
    -------
    None
    """
    print("######### Head #########")
    print(dataframe.head())
    print("######### Tail #########")
    print(dataframe.tail())
    print("######### Info #########")
    # DataFrame.info() writes its report itself and returns None, so it is
    # called directly; wrapping it in print() would emit a stray "None".
    dataframe.info()
    print("######### Shape #########")
    print(dataframe.shape)
    print("######### Null Values #########")
    print(dataframe.isnull().sum())

# Print the structural overview (head, tail, info, shape, null counts) of the raw data
check_df(df)

# Summary statistics of the raw data; as the last expression of the cell,
# a notebook renders the result as the cell output
df.describe()

In [None]:
# Drop every row of the raw data that has at least one missing value.
# NOTE: this runs before the datetime columns are parsed and before the duration
# column is computed, so NaT/NaN values introduced by those later steps are not
# removed here.
df = df.dropna()

In [None]:
def convert_to_datetime(dataframe, datetime_columns):
    """
    Parse the given columns of a DataFrame as datetimes, in place.

    Values that cannot be parsed become ``NaT`` instead of raising.

    Parameters:
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are overwritten with their parsed versions.
    datetime_columns : list of str
        Names of the columns in ``dataframe`` to parse.

    Returns:
    -------
    pandas.DataFrame
        The same DataFrame object that was passed in, with the listed
        columns replaced by their datetime versions.
    """
    for column_name in datetime_columns:
        # errors="coerce" turns unparseable entries into NaT rather than failing
        parsed_column = pd.to_datetime(dataframe[column_name], errors="coerce")
        dataframe[column_name] = parsed_column
    return dataframe

# Columns of `df` that are parsed as datetimes before any date arithmetic
datetime_columns = [
    "DURAKGIRISTARIHI",
    "DURAKCIKISTARIHI",
    "HATBASLANGICTARIHI",
    "HATBITISTARIHI",
    "INSERTDATE",
]

# Parse those columns (unparseable values become NaT)
df = convert_to_datetime(df, datetime_columns)

# Route duration in minutes: (route end - route start) expressed in seconds, divided by 60
df["HATSURESI"] = (
    df["HATBITISTARIHI"]
    .sub(df["HATBASLANGICTARIHI"])
    .dt.total_seconds()
    .div(60)
)


# Collect the rows whose duration is zero or negative
non_positive_mask = df["HATSURESI"].le(0)
negative_duration = df[non_positive_mask]

# Report how many such rows were found
print(f"Number of non-positive durations: {len(negative_duration)}")

# Show the offending rows when there are any
if len(negative_duration) > 0:
    print(negative_duration)

# Keep only strictly positive durations (rows with a missing duration fail
# this comparison and are dropped as well)
df = df[df["HATSURESI"].gt(0)]


#### Logical Outlier Setting

In [None]:
# Accepted range for the route duration, in minutes (both ends inclusive)
lower_bound = 35   #  anything under 35 minutes is treated as too short
upper_bound = 97   #  anything over 97 minutes is treated as too long

# Rows whose duration falls outside the accepted range
too_short = df["HATSURESI"].lt(lower_bound)
too_long = df["HATSURESI"].gt(upper_bound)
logical_outliers = df[too_short | too_long]
print(f"Number of logical outliers: {logical_outliers.shape[0]}")

# Keep only the rows inside the accepted range
df_cleaned = df[df["HATSURESI"].between(lower_bound, upper_bound)]

# Discard rows whose route start falls in the year 2019
start_year = df_cleaned['HATBASLANGICTARIHI'].dt.year
df_cleaned = df_cleaned[start_year.ne(2019)].reset_index(drop=True)

# Finally, restrict the data to rows where DURAKSIRANO equals 43
df_cleaned = df_cleaned[df_cleaned["DURAKSIRANO"].eq(43)]


In [None]:
# Local Outlier Factor (LOF) on the single feature 'HATSURESI'
lof = LocalOutlierFactor(n_neighbors=20)

# fit_predict labels each row: -1 for an outlier, 1 for an inlier
y_pred = lof.fit_predict(df_cleaned[['HATSURESI']])

# Build the outlier mask once and use it for both splits
is_lof_outlier = y_pred == -1

# Rows flagged as outliers
outliers_lof = df_cleaned[is_lof_outlier]

# Rows kept as inliers
df_cleaned = df_cleaned[~is_lof_outlier]

# Report how many rows survive the LOF step
print(f"LOF Cleaned Dataset: {df_cleaned.shape[0]}")


In [None]:
# Order the rows chronologically by route start and renumber the index from 0
df_cleaned = df_cleaned.sort_values(by='HATBASLANGICTARIHI', ignore_index=True)