In [99]:
import os
import warnings
from math import sqrt

import numpy as np
import pandas as pd
import sklearn as sl
import matplotlib as mp
import scipy as cp
import seaborn as sb
from matplotlib import pyplot as plt

import statsmodels.api as sm
from statsmodels.tsa.stattools import acf
from statsmodels.sandbox.regression.gmm import IV2SLS
from statsmodels.sandbox.regression.gmm import GMM

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression, Ridge, Lasso
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.metrics import f1_score, recall_score, precision_score, mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, confusion_matrix, classification_report
from sklearn import linear_model as lm
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

from scipy.linalg import toeplitz

In [100]:
# Load the sample order data into `df`.
# The CSV location can be overridden with the SAMPLE_DATASET_CSV environment
# variable so the notebook can run on machines other than the author's; when
# the variable is unset (or empty) the original absolute path is used, so
# existing behavior is unchanged.
DATA_PATH = os.environ.get("SAMPLE_DATASET_CSV") or (
    r"C:\Users\johns\OneDrive\Desktop\MBAN Semester 2\Predictive Analytics\Hackathon\Hackathon\Sample_Dataset.csv"
)
df = pd.read_csv(DATA_PATH)

In [101]:
# Summarize the loaded frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95 entries, 0 to 94
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Order Date               95 non-null     object
 1   Requested Delivery Date  95 non-null     object
 2   Customer Country Code    95 non-null     object
 3   Product Code             95 non-null     object
 4   Description              95 non-null     object
 5   Order Type               95 non-null     object
 6   Customer Order Code      95 non-null     object
 7   Value                    95 non-null     int64 
 8   Currency                 95 non-null     object
 9   Items                    95 non-null     int64 
 10  Route                    95 non-null     object
dtypes: int64(2), object(9)
memory usage: 8.3+ KB


In [102]:
# Preview the first rows to sanity-check the parsed columns.
df.head()

Unnamed: 0,Order Date,Requested Delivery Date,Customer Country Code,Product Code,Description,Order Type,Customer Order Code,Value,Currency,Items,Route
0,01.01.2009,10.01.2009,US,PROD001,Winter Parka,VO,ORD001,500,USD,10,R1
1,02.02.2009,15.02.2009,CA,PROD002,Ski Jacket,VO,ORD002,800,CAD,5,R2
2,03.03.2009,12.03.2009,UK,PROD003,Snow Explorer,VO,ORD003,1200,GBP,7,R1
3,04.04.2009,18.04.2009,FR,PROD001,Winter Parka,VO,ORD004,600,EUR,9,R3
4,05.05.2009,20.05.2009,DE,PROD002,Ski Jacket,VO,ORD005,900,EUR,4,R2


In [103]:
# Parse both date columns from day.month.year strings (e.g. 01.01.2009),
# then derive the lead time as the whole number of days between the order
# date and the requested delivery date.
for date_column in ('Order Date', 'Requested Delivery Date'):
    df[date_column] = pd.to_datetime(df[date_column], format='%d.%m.%Y')

delivery_gap = df['Requested Delivery Date'].sub(df['Order Date'])
df['Lead Time'] = delivery_gap.dt.days

In [104]:
# Regression target: the 'Items' column of each order.
target_column = 'Items'

# Predictor columns. The raw date columns are represented only through the
# derived 'Lead Time'; 'Order Type' and 'Customer Order Code' are left out.
# NOTE(review): 'Value' may be closely tied to 'Items' (possible leakage) and
# 'Description' looks redundant with 'Product Code' in the preview — confirm.
feature_columns = [
    'Lead Time',
    'Customer Country Code',
    'Product Code',
    'Description',
    'Value',
    'Currency',
    'Route',
]

y = df[target_column]
X = df[feature_columns]

In [105]:
# Columns routed to the numeric preprocessing branch.
numeric_features = [
    'Lead Time',
    'Value',
]

# Columns routed to the categorical (one-hot) preprocessing branch.
categorical_features = [
    'Customer Country Code',
    'Product Code',
    'Description',
    'Currency',
    'Route',
]

In [106]:
# Numeric branch: fill missing values with the column mean, then standardize.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(numeric_steps)

In [107]:
# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. Categories not seen during fit are ignored at
# transform time (handle_unknown='ignore') instead of raising.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(categorical_steps)

In [108]:
# Route each group of columns to its own preprocessing branch.
column_branches = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_branches)

In [109]:
# Full Lasso model: shared preprocessing followed by an L1-regularized
# linear regressor. The step name 'regressor' is what the grid search
# parameter key 'regressor__alpha' refers to.
lasso_steps = [
    ('preprocessor', preprocessor),
    ('regressor', Lasso()),
]
lasso_model = Pipeline(lasso_steps)

In [110]:
# Full Ridge model: shared preprocessing followed by an L2-regularized
# linear regressor. The step name 'regressor' is what the grid search
# parameter key 'regressor__alpha' refers to.
ridge_steps = [
    ('preprocessor', preprocessor),
    ('regressor', Ridge()),
]
ridge_model = Pipeline(ridge_steps)

In [111]:
# Hold out half of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
split_parts = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [112]:
# Candidate regularization strengths, shared by both models.
alphas = [0.001, 0.01, 0.1, 1, 10, 100]
lasso_param_grid = dict(regressor__alpha=alphas)
ridge_param_grid = dict(regressor__alpha=alphas)

# Both searches use 5-fold cross-validation and pick the alpha with the
# best (least negative) mean squared error.
# NOTE(review): selection uses MSE while the later evaluation reports MAE —
# confirm this mismatch is intended.
search_settings = dict(cv=5, scoring='neg_mean_squared_error')
lasso_grid = GridSearchCV(lasso_model, lasso_param_grid, **search_settings)
ridge_grid = GridSearchCV(ridge_model, ridge_param_grid, **search_settings)

# Fit on the training half only; the test half stays untouched until evaluation.
lasso_grid.fit(X_train, y_train)
ridge_grid.fit(X_train, y_train)

In [113]:
# Report the alpha selected by cross-validation for each model.
for label, search in (
    ("Best Lasso Hyperparameters:", lasso_grid),
    ("Best Ridge Hyperparameters:", ridge_grid),
):
    print(label, search.best_params_)

Best Lasso Hyperparameters: {'regressor__alpha': 0.01}
Best Ridge Hyperparameters: {'regressor__alpha': 1}


In [114]:
# Score each tuned model on the held-out test half using mean absolute error.
# Calling predict on the fitted search objects, as done here.

# Lasso
lasso_predictions = lasso_grid.predict(X_test)
lasso_mae = mean_absolute_error(y_test, lasso_predictions)

# Ridge
ridge_predictions = ridge_grid.predict(X_test)
ridge_mae = mean_absolute_error(y_test, ridge_predictions)

print(f'Lasso Mean Absolute Error: {lasso_mae}')
print(f'Ridge Mean Absolute Error: {ridge_mae}')

Lasso Mean Absolute Error: 2.314310326878284
Ridge Mean Absolute Error: 2.2911281918844977
