# Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import missingno as msno
import scipy.stats as stats

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.neighbors import KNeighborsRegressor

import warnings

# Predictive Power Score (PPS)
import ppscore as pps


# Configs

In [None]:
# Silence library warnings so the notebook output stays readable.
warnings.filterwarnings("ignore")

# Location of the training data set.
PATH_DATA_TRAIN = "./data/train.csv"

# RSEED seeds the train/test split; DPI is the resolution of saved figures.
RSEED, DPI = 42, 600

In [None]:
# Load the training data from the path configured in PATH_DATA_TRAIN.
df = pd.read_csv(PATH_DATA_TRAIN)

In [None]:
# Quick overview: first rows, dimensions, missing values per column, dtypes.
for overview in (df.head(), df.shape, df.isnull().sum(), df.dtypes):
    print(overview)

In [None]:
print(df["STATUS"].unique())

col_entries = ["ATA", "DEP", "RTR", "SCH", "DEL"]

# Report row count plus mean/median of the target for each status code.
for status in col_entries:
    status_target = df.loc[df["STATUS"] == status, "target"]
    print(f"Number of entries of {status}: {len(status_target)}")
    print(f"Mean: {status_target.mean()}")
    print(f"Median: {status_target.median()}")

In [None]:
# STA uses "." between hours, minutes and seconds (e.g. "12.55.00") while STD
# uses ":". Replace the dots literally: regex=False keeps "." from ever being
# interpreted as the regex wildcard, regardless of the pandas version's default.
df["STA"] = df["STA"].str.replace(".", ":", regex=False)

In [None]:
# Preview the first rows of df.
df.head()

In [None]:
# Parse the date/time columns with their explicit formats.
datetime_formats = {
    "DATOP": "%Y-%m-%d",
    "STD": "%Y-%m-%d %H:%M:%S",
    "STA": "%Y-%m-%d %H:%M:%S",
}
for column, fmt in datetime_formats.items():
    df[column] = pd.to_datetime(df[column], format=fmt)

In [None]:
# Derive calendar features from the DATOP column: year, month and
# day of week (dayofweek shifted by +1).
datop_parts = df["DATOP"].dt
df["DATOP_year"] = datop_parts.year
df["DATOP_month"] = datop_parts.month
df["DATOP_day"] = datop_parts.dayofweek + 1

In [None]:
# Difference between STA and STD, expressed in minutes.
sta_minus_std = df["STA"] - df["STD"]
df["flight_time"] = sta_minus_std.dt.total_seconds() / 60

In [None]:
# List the distinct values of the STATUS column.
df["STATUS"].unique()

In [163]:
# Preview the first rows of df.
df.head()

# Exploratory calls, currently disabled:
# df["DEPSTN"].nunique()
# df["ARRSTN"].nunique()
# pps.matrix(df)

Unnamed: 0,ID,DATOP,FLTID,DEPSTN,ARRSTN,STD,STA,STATUS,AC,target
0,train_id_0,2016-01-03,TU 0712,CMN,TUN,2016-01-03 10:30:00,2016-01-03 12.55.00,ATA,TU 32AIMN,260.0
1,train_id_1,2016-01-13,TU 0757,MXP,TUN,2016-01-13 15:05:00,2016-01-13 16.55.00,ATA,TU 31BIMO,20.0
2,train_id_2,2016-01-16,TU 0214,TUN,IST,2016-01-16 04:10:00,2016-01-16 06.45.00,ATA,TU 32AIMN,0.0
3,train_id_3,2016-01-17,TU 0480,DJE,NTE,2016-01-17 14:10:00,2016-01-17 17.00.00,ATA,TU 736IOK,0.0
4,train_id_4,2016-01-17,TU 0338,TUN,ALG,2016-01-17 14:30:00,2016-01-17 15.50.00,ATA,TU 320IMU,22.0


In [None]:
# Compute the ppscore matrix for df and pivot the long-format result into a
# grid with the "x" column as columns and the "y" column as rows.
pp_scores = pps.matrix(df)[["x", "y", "ppscore"]].pivot(
    columns="x", index="y", values="ppscore"
)
pp_scores = pp_scores.round(2)

# Annotated heatmap with the colour scale fixed to [0, 1].
plt.figure(figsize=(12, 8))
sns.heatmap(pp_scores, vmin=0, vmax=1, cmap="Reds", linewidths=0.5, annot=True)

# Save before showing so the written file contains the rendered figure.
plt.savefig("./img/pp-scores.png", dpi=DPI, bbox_inches="tight")

# Display the figure (plt.plot() with no arguments draws nothing and does not
# show the figure).
plt.show()

In [None]:
# Draw seaborn's pairplot for df.
sns.pairplot(df)

# Write the current figure to disk before displaying it.
plt.savefig("./img/pairplot.png", dpi=DPI, bbox_inches="tight")

plt.show()

In [None]:
# One histogram of flights per month for every year found in DATOP_year.
DATOP_years = df["DATOP_year"].unique()

for year in DATOP_years:
    fig, ax = plt.subplots(figsize=(8, 4))
    df_year = df[df["DATOP_year"] == year]
    df_year["DATOP_month"].hist(bins=12, ax=ax)
    ax.set_title(f"Flight Distribution per Month – {year}")
    ax.set_xlabel("Month")
    ax.set_ylabel("Number of Flights")
    ax.set_xticks(range(1, 13))
    fig.tight_layout()
    plt.show()

In [None]:
# Line plot of the monthly sum of the target, one figure per year.
for year in DATOP_years:
    plt.figure(figsize=(8, 4))
    df_year = df[df["DATOP_year"] == year]
    df_year.groupby("DATOP_month")["target"].sum().plot(
        kind="line",
        title=f"Monthly Sum of Target for {year}",
        xlabel="Month",
        ylabel="Sum of Target",
    )
    # Finalise and display each figure inside the loop, matching the
    # per-year histogram loop; otherwise nothing is shown outside a notebook.
    plt.tight_layout()
    plt.show()

# Baseline Model (a.k.a. a Feeble Try)

In [None]:
# One-hot encode the AC column; every other column is kept as is.
df_encoded = pd.get_dummies(df, columns=["AC"], prefix="AC")

# Target vector and feature matrix (everything except the target).
y = df["target"]
X = df_encoded.drop(columns="target")

# Hold out 25 % of the rows for testing; RSEED makes the split reproducible.
split_parts = train_test_split(X, y, test_size=0.25, random_state=RSEED)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Use only the one-hot encoded "AC_" columns as features.
cols = [col for col in df_encoded.columns if col.startswith("AC_")]

X_0 = X_train[cols]
y_0 = y_train
X_1 = X_test[cols]
y_1 = y_test

# Hyperparameter grid
param_grid = {
    "n_neighbors": [
        3,
        # 5, 7, 9
    ],
    "weights": ["uniform", "distance"],  # Weighting scheme
    "metric": ["euclidean", "manhattan"],  # Distance metric
}

model = KNeighborsRegressor()

# 5-fold cross-validated grid search, scored by negative mean squared error.
grid_search = GridSearchCV(
    estimator=model, param_grid=param_grid, cv=5, scoring="neg_mean_squared_error"
)

# Fit the grid search to training data
grid_search.fit(X_0, y_0)

# GridSearchCV fits clones of the estimator it is given, so the original
# `model` instance is still unfitted here. Rebind `model` to the refitted
# best estimator before predicting.
model = grid_search.best_estimator_

y_pred = model.predict(X_1)

# Evaluate on the held-out split.
mse = mean_squared_error(y_1, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_1, y_pred)

# Display results
print("Root Mean Squared Error:", rmse)
# print("R-squared Score:", r2)

# Data cleaning and feature engineering

In [None]:
# Print a summary of coffee_features.
# NOTE(review): coffee_features is not defined in the visible cells — confirm where it is loaded.
coffee_features.info()

In [None]:
# For this exercise only the numeric columns are used as features.
X = coffee_features.select_dtypes(include=['number'])

## Splitting data for testing 

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20 % of the rows for testing.
# NOTE(review): Y and RANDOM_SEED are not defined in the visible cells
# (only RSEED is) — confirm they exist before running this cell.
split_parts = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=RANDOM_SEED,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Replace the mean altitude by its natural logarithm, then drop the original
# altitude column together with the Quakers and "Unnamed: 0" columns.
X_train["altitude_mean_log"] = np.log(X_train["altitude_mean_meters"])
X_train.drop(
    columns=["altitude_mean_meters", "Quakers", "Unnamed: 0"],
    inplace=True,
)

In [None]:
# Print a summary of X_train.
X_train.info()

In [None]:
# Column means of the training split; used below as fill values for missing data.
altitude_low_meters_mean, altitude_high_meters_mean, altitude_mean_log_mean = (
    X_train[column].mean()
    for column in ("altitude_low_meters", "altitude_high_meters", "altitude_mean_log")
)

In [None]:
# Impute missing altitude values with the corresponding training-set mean.
train_fill_values = {
    "altitude_low_meters": altitude_low_meters_mean,
    "altitude_high_meters": altitude_high_meters_mean,
    "altitude_mean_log": altitude_mean_log_mean,
}
for column, fill_value in train_fill_values.items():
    X_train[column] = X_train[column].fillna(fill_value)

In [None]:
# Show the fill values so they can be reused when predicting on new data.
mean_report = (
    ("altitude low meters mean", altitude_low_meters_mean),
    ("altitude_high_meters_mean", altitude_high_meters_mean),
    ("altitude_mean_log_mean", altitude_mean_log_mean),
)
for label, value in mean_report:
    print(f"{label} is {value}")

## Training the model

In [None]:
## To exemplify how prediction will work, persist the held-out test split
## (X_test and y_test). X_test is written here before the test-set
## transformations further down are applied to it.
X_test.to_csv("data/X_test.csv")
y_test.to_csv("data/y_test.csv")

In [None]:
# Fit an ordinary least-squares regression on the training split.
from sklearn.linear_model import LinearRegression

reg = LinearRegression()
reg.fit(X_train, y_train)

In [None]:
from sklearn.metrics import mean_squared_error

# Mean squared error on the training split.
y_train_pred = reg.predict(X_train)
mse = mean_squared_error(y_true=y_train, y_pred=y_train_pred)
print(mse)

In [None]:
# Apply to the test split the same steps used on the training split:
# log-transform the mean altitude, drop the unused columns, and fill
# missing values with the means computed on the training data.
X_test["altitude_mean_log"] = np.log(X_test["altitude_mean_meters"])
X_test.drop(
    columns=["altitude_mean_meters", "Quakers", "Unnamed: 0"],
    inplace=True,
)

test_fill_values = {
    "altitude_low_meters": altitude_low_meters_mean,
    "altitude_high_meters": altitude_high_meters_mean,
    "altitude_mean_log": altitude_mean_log_mean,
}
for column, fill_value in test_fill_values.items():
    X_test[column] = X_test[column].fillna(fill_value)

In [None]:
# Mean squared error on the held-out test split.
y_test_pred = reg.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_test_pred)
print(mse)