## 1. Understand the Domain and Problem Statement

In [1]:
import pandas as pd

# Load dataset
df = pd.read_csv("clean_dataset.csv")

# Shape of the dataset
print("Dataset Shape:", df.shape)

# Display the first few rows
print("Sample Records:")
print(df.head())

# Summary of data types and missing values
print("\nData Types and Missing Values:")
print(df.info())

# Basic statistics for numeric columns
print("\nStatistical Summary (Numerical Columns):")
print(df.describe())

# Unique values in each column (for categorical understanding)
print("\nUnique Values in Categorical Columns:")
for col in df.select_dtypes(include=['object', 'category']).columns:
    print(f"{col}: {df[col].nunique()} unique values")

# Preview unique values for date columns if present
print("\nDatetime Columns:")
for col in df.columns:
    if pd.api.types.is_datetime64_any_dtype(df[col]) or 'date' in col.lower() or 'time' in col.lower():
        print(f"{col}:")
        print(df[col].head())


Dataset Shape: (140909, 24)
Sample Records:
       data          trip_creation_time  \
0  training  2018-09-20 02:35:36.476840   
1  training  2018-09-20 02:35:36.476840   
2  training  2018-09-20 02:35:36.476840   
3  training  2018-09-20 02:35:36.476840   
4  training  2018-09-20 02:35:36.476840   

                                 route_schedule_uuid route_type  \
0  thanos::sroute:eb7bfc78-b351-4c0e-a951-fa3d5c3...    Carting   
1  thanos::sroute:eb7bfc78-b351-4c0e-a951-fa3d5c3...    Carting   
2  thanos::sroute:eb7bfc78-b351-4c0e-a951-fa3d5c3...    Carting   
3  thanos::sroute:eb7bfc78-b351-4c0e-a951-fa3d5c3...    Carting   
4  thanos::sroute:eb7bfc78-b351-4c0e-a951-fa3d5c3...    Carting   

                 trip_uuid source_center                    source_name  \
0  trip-153741093647649320  IND388121AAA     Anand_VUNagar_DC (Gujarat)   
1  trip-153741093647649320  IND388121AAA     Anand_VUNagar_DC (Gujarat)   
2  trip-153741093647649320  IND388121AAA     Anand_VUNagar_DC (Gujara

## 2. Feature Selection

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LassoCV
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# Load data
df = pd.read_csv("clean_dataset.csv")

# Target for classification
target = 'is_cutoff'

# Drop ID/time columns (to be engineered separately if needed)
drop_cols = [
    'trip_uuid', 'route_schedule_uuid', 'trip_creation_time',
    'od_start_time', 'od_end_time', 'cutoff_timestamp',
    'source_name', 'destination_name'
]
df = df.drop(columns=drop_cols)

# Encode categoricals
cat_cols = df.select_dtypes(include=['object', 'category']).columns
for col in cat_cols:
    df[col] = LabelEncoder().fit_transform(df[col])

# Prepare features
X = df.drop(columns=[target])
y = df[target]

# 1. Correlation matrix to detect multicollinearity
corr_matrix = X.corr().abs()
upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
high_corr_features = [column for column in upper_tri.columns if any(upper_tri[column] > 0.9)]
print("Highly correlated features to drop:", high_corr_features)
X = X.drop(columns=high_corr_features)

# 2. VIF check
scaler = StandardScaler ()
X_scaled = scaler.fit_transform(X)
vif_df = pd.DataFrame()
vif_df["feature"] = X.columns
vif_df["VIF"] = [variance_inflation_factor(X_scaled, i) for i in range(X_scaled.shape[1])]
print("\nVIF Scores:")
print(vif_df.sort_values(by="VIF", ascending=False))

# Optional: Drop features with VIF > 10 (multicollinearity)
X = X.loc[:, vif_df[vif_df["VIF"] < 10]["feature"]]

# 3. Feature Importance - Mutual Information (for classification)
mi = mutual_info_classif(X, y, discrete_features='auto')
mi_series = pd.Series(mi, index=X.columns).sort_values(ascending=False)
print("\nTop features by Mutual Information:")
print(mi_series)

# 4. Feature Importance - Random Forest
model_rf = RandomForestClassifier(n_estimators=100, random_state=42)
model_rf.fit(X, y)
rf_importances = pd.Series(model_rf.feature_importances_, index=X.columns).sort_values(ascending=False)
print("\nTop features by Random Forest:")
print(rf_importances)

# Optional 5. LassoCV for regression-based selection (if using regression target like 'actual_time')
# from sklearn.linear_model import LassoCV
# lasso_model = LassoCV(cv=5).fit(X, df['actual_time'])
# lasso_coef = pd.Series(lasso_model.coef_, index=X.columns)
# print("\nNon-zero Lasso Coefficients:")
# print(lasso_coef[lasso_coef != 0].sort_values(ascending=False))


Highly correlated features to drop: ['actual_distance_to_destination', 'actual_time', 'osrm_time', 'osrm_distance', 'segment_osrm_distance']

VIF Scores:
                  feature       VIF
4  start_scan_to_end_scan  2.914135
5           cutoff_factor  2.606873
7     segment_actual_time  2.200595
6                  factor  1.713115
8       segment_osrm_time  1.653010
9          segment_factor  1.638216
1              route_type  1.491649
2           source_center  1.204978
3      destination_center  1.172240
0                    data  1.003509

Top features by Mutual Information:
cutoff_factor             0.352854
segment_osrm_time         0.135933
segment_actual_time       0.091274
segment_factor            0.081164
start_scan_to_end_scan    0.060538
destination_center        0.059472
source_center             0.057026
route_type                0.036407
factor                    0.032138
data                      0.021833
dtype: float64

Top features by Random Forest:
cutoff_factor   

## 3. Create New Features

In [4]:
import pandas as pd
import numpy as np

# Load data
df = pd.read_csv("clean_dataset.csv")

# --- Step 1: Parse datetime fields ---
datetime_cols = ['trip_creation_time', 'od_start_time', 'od_end_time', 'cutoff_timestamp']
for col in datetime_cols:
    df[col] = pd.to_datetime(df[col])

# Extract time-based features
df['trip_creation_hour'] = df['trip_creation_time'].dt.hour
df['trip_creation_day'] = df['trip_creation_time'].dt.day
df['trip_creation_weekday'] = df['trip_creation_time'].dt.weekday
df['od_start_hour'] = df['od_start_time'].dt.hour
df['od_end_hour'] = df['od_end_time'].dt.hour
df['cutoff_hour'] = df['cutoff_timestamp'].dt.hour

# Trip duration features
df['planned_duration'] = (df['od_end_time'] - df['od_start_time']).dt.total_seconds() / 60
df['creation_to_start_mins'] = (df['od_start_time'] - df['trip_creation_time']).dt.total_seconds() / 60
df['start_to_cutoff_mins'] = (df['cutoff_timestamp'] - df['od_start_time']).dt.total_seconds() / 60

# --- Step 2: Mathematical feature engineering ---
# Ratios
df['actual_vs_osrm_time'] = df['actual_time'] / df['osrm_time']
df['segment_actual_vs_osrm_time'] = df['segment_actual_time'] / df['segment_osrm_time']
df['distance_per_min'] = df['actual_distance_to_destination'] / df['actual_time']

# Differences
df['time_difference'] = df['actual_time'] - df['osrm_time']
df['segment_time_diff'] = df['segment_actual_time'] - df['segment_osrm_time']

# --- Step 3: Aggregations (example) ---
# Trip counts per route (if route info is retained)
# df['route_trip_count'] = df.groupby('route_schedule_uuid')['trip_uuid'].transform('count')

# Trip counts per center pair (origin-destination)
df['center_pair'] = df['source_center'] + "_" + df['destination_center']
df['center_pair_count'] = df.groupby('center_pair')['trip_uuid'].transform('count')

# --- Step 4: Domain-specific features (logistics) ---
# High-delay indicator
df['is_heavy_delay'] = (df['actual_vs_osrm_time'] > 1.5).astype(int)

# Delay bucket (optional)
df['delay_category'] = pd.cut(df['actual_vs_osrm_time'], bins=[0, 1, 1.25, 1.5, np.inf],
                              labels=['on_time', 'slight_delay', 'delay', 'heavy_delay'])

# --- Drop helper columns if not needed ---
df = df.drop(columns=['center_pair'])

# Save to file if needed


# Save the final feature-engineered dataset
df.to_csv("feature_engineered_dataset.csv", index=False)



## 4. Feature Transformation

In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import (
    MinMaxScaler, StandardScaler, QuantileTransformer,
    FunctionTransformer, PolynomialFeatures
)
from sklearn.pipeline import Pipeline

# Load data
df = pd.read_csv("feature_engineered_dataset.csv")

# Drop non-numeric and ID/date columns
exclude_cols = ['trip_uuid', 'route_schedule_uuid', 'trip_creation_time',
                'od_start_time', 'od_end_time', 'cutoff_timestamp', 
                'source_center', 'destination_center', 'source_name', 
                'destination_name', 'delay_category']
features = df.drop(columns=[col for col in exclude_cols if col in df.columns])

# Separate target
if 'is_cutoff' in features.columns:
    target = features['is_cutoff']
    features = features.drop(columns=['is_cutoff'])
else:
    target = None

# Replace inf/-inf with NaN and drop them
features.replace([np.inf, -np.inf], np.nan, inplace=True)
features.dropna(inplace=True)

# Drop target rows to match
if target is not None:
    target = target.loc[features.index]

# Select numeric columns
num_cols = features.select_dtypes(include=[np.number]).columns.tolist()

# Filter log-safe columns (positive only and no zero)
log_cols = [col for col in num_cols if (features[col] > 0).all()]

# Apply transformations (individually to avoid conflicts)
transformed_dfs = {}

# 1. MinMax Scaling
minmax_scaled = MinMaxScaler().fit_transform(features[num_cols])
transformed_dfs["minmax"] = pd.DataFrame(minmax_scaled, columns=[f"{col}_minmax" for col in num_cols])

# 2. Standard Scaling
standard_scaled = StandardScaler().fit_transform(features[num_cols])
transformed_dfs["standard"] = pd.DataFrame(standard_scaled, columns=[f"{col}_std" for col in num_cols])

# 3. Log Transform (safe columns only)
log_transformed = np.log1p(features[log_cols])
transformed_dfs["log"] = log_transformed.rename(columns=lambda c: f"{c}_log")

# 4. Quantile Transformer
quantile_scaled = QuantileTransformer(output_distribution='normal').fit_transform(features[num_cols])
transformed_dfs["quantile"] = pd.DataFrame(quantile_scaled, columns=[f"{col}_qt" for col in num_cols])

# 5. Polynomial Features (up to degree 2)
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_features = poly.fit_transform(features[num_cols])
poly_feature_names = poly.get_feature_names_out(num_cols)
transformed_dfs["poly"] = pd.DataFrame(poly_features, columns=[f"{name}_poly" for name in poly_feature_names])

# Merge all
df_transformed = pd.concat(transformed_dfs.values(), axis=1)

# Add target back if available
if target is not None:
    df_transformed['is_cutoff'] = target.reset_index(drop=True)

# Save
df_transformed.to_csv("transformed_features_dataset.csv", index=False)


## 5. Feature Scaling

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import (
    MinMaxScaler, StandardScaler, RobustScaler, MaxAbsScaler
)

# Load the feature-engineered dataset
df = pd.read_csv("feature_engineered_dataset.csv")

# Drop non-numeric, ID, and datetime columns
exclude_cols = ['trip_uuid', 'route_schedule_uuid', 'trip_creation_time',
                'od_start_time', 'od_end_time', 'cutoff_timestamp', 
                'source_center', 'destination_center', 'source_name', 
                'destination_name', 'delay_category']

features = df.drop(columns=[col for col in exclude_cols if col in df.columns], errors='ignore')

# Separate target
if 'is_cutoff' in features.columns:
    target = features['is_cutoff']
    features = features.drop(columns=['is_cutoff'])
else:
    target = None

# Replace infinite values and drop NaNs
features.replace([np.inf, -np.inf], np.nan, inplace=True)
features.dropna(inplace=True)
if target is not None:
    target = target.loc[features.index]

# Select only numeric columns
num_cols = features.select_dtypes(include=[np.number]).columns.tolist()

# Initialize scalers
scalers = {
    "minmax": MinMaxScaler(),
    "standard": StandardScaler(),
    "robust": RobustScaler(),
    "maxabs": MaxAbsScaler()
}

# Apply each scaler and store transformed DataFrames
scaled_dfs = {}
for name, scaler in scalers.items():
    scaled = scaler.fit_transform(features[num_cols])
    scaled_dfs[name] = pd.DataFrame(scaled, columns=[f"{col}_{name}" for col in num_cols])

# Merge all scaled features
scaled_all = pd.concat(scaled_dfs.values(), axis=1)

# Add target back if present
if target is not None:
    scaled_all['is_cutoff'] = target.reset_index(drop=True)

# Save the fully scaled dataset
scaled_all.to_csv("scaled_features_dataset.csv", index=False)



## 6. Feature Reduction

In [4]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif, chi2, RFE
from sklearn.linear_model import LogisticRegression, LassoCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

# Config
DATA_PATH = "scaled_features_dataset.csv"
TARGET_COL = 'is_cutoff'
SAMPLE_ROWS = None  # Set to e.g., 10000 for quick testing

# Load data
df = pd.read_csv(DATA_PATH, usecols=lambda c: c == TARGET_COL or "_std" in c or "_minmax" in c or "_robust" in c or "_maxabs" in c)
if SAMPLE_ROWS:
    df = df.sample(n=SAMPLE_ROWS, random_state=42)

# Split features and target
y = df[TARGET_COL]
X = df.drop(columns=[TARGET_COL])

# Impute missing values
X = pd.DataFrame(SimpleImputer(strategy='mean').fit_transform(X), columns=X.columns)

# 1. PCA (retain 95% variance)
pca = PCA(n_components=0.95, random_state=42)
X_pca = pca.fit_transform(X)
print("✅ PCA reduced feature count:", X_pca.shape[1])

# 2. ANOVA F-test (top 30)
anova_features = X.columns[SelectKBest(score_func=f_classif, k=30).fit(X, y).get_support()]
print("✅ ANOVA top 30:", list(anova_features))

# 3. Chi-Square (requires non-negative input)
X_chi = MinMaxScaler().fit_transform(X)
chi_features = X.columns[SelectKBest(score_func=chi2, k=30).fit(X_chi, y).get_support()]
print("✅ Chi-Square top 30:", list(chi_features))

# 4. LassoCV (embedded method)
lasso = LassoCV(cv=3, max_iter=10000, n_jobs=-1, random_state=42).fit(X, y)
lasso_features = X.columns[lasso.coef_ != 0]
print("✅ Lasso-selected:", list(lasso_features))

# 5. Recursive Feature Elimination
rfe_model = LogisticRegression(max_iter=500, solver='liblinear')
rfe = RFE(rfe_model, n_features_to_select=30, step=10)
rfe_features = X.columns[rfe.fit(X, y).support_]
print("✅ RFE top 30:", list(rfe_features))

# Optional: Save PCA output
pca_df = pd.DataFrame(X_pca, columns=[f'pca_{i+1}' for i in range(X_pca.shape[1])])
pca_df[TARGET_COL] = y.reset_index(drop=True)
pca_df.to_csv("pca_reduced_dataset.csv", index=False)


✅ PCA reduced feature count: 7
✅ ANOVA top 30: ['start_scan_to_end_scan_minmax', 'cutoff_factor_minmax', 'actual_distance_to_destination_minmax', 'osrm_distance_minmax', 'segment_osrm_time_minmax', 'segment_osrm_distance_minmax', 'planned_duration_minmax', 'start_to_cutoff_mins_minmax', 'distance_per_min_minmax', 'center_pair_count_minmax', 'start_scan_to_end_scan_robust', 'cutoff_factor_robust', 'actual_distance_to_destination_robust', 'osrm_distance_robust', 'segment_osrm_time_robust', 'segment_osrm_distance_robust', 'planned_duration_robust', 'start_to_cutoff_mins_robust', 'distance_per_min_robust', 'center_pair_count_robust', 'start_scan_to_end_scan_maxabs', 'cutoff_factor_maxabs', 'actual_distance_to_destination_maxabs', 'osrm_distance_maxabs', 'segment_osrm_time_maxabs', 'segment_osrm_distance_maxabs', 'planned_duration_maxabs', 'start_to_cutoff_mins_maxabs', 'distance_per_min_maxabs', 'center_pair_count_maxabs']
✅ Chi-Square top 30: ['start_scan_to_end_scan_minmax', 'cutoff_fact