###Import Necessary Libraries

In [65]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

## Reading Dataset

In [66]:
df = pd.read_csv('/content/drive/MyDrive/Train.csv')
df_test = pd.read_csv('/content/drive/MyDrive/Test.csv')

In [67]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550176 entries, 0 to 550175
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   ProductType        550176 non-null  object 
 1   Manufacturer       550176 non-null  object 
 2   Area Code          550176 non-null  object 
 3   Sourcing Channel   550176 non-null  object 
 4   Product Size       550176 non-null  object 
 5   Product Type       550176 non-null  object 
 6   Month of Sourcing  550176 non-null  object 
 7   Sourcing Cost      550176 non-null  float64
dtypes: float64(1), object(7)
memory usage: 33.6+ MB


In [64]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [68]:
df.head(1)

Unnamed: 0,ProductType,Manufacturer,Area Code,Sourcing Channel,Product Size,Product Type,Month of Sourcing,Sourcing Cost
0,NTM3,X1,A28,WHOLESALE,Large,Powder,May-21,10.16


In [69]:
df['Month of Sourcing'] = pd.to_datetime(df['Month of Sourcing'], format='%b-%y')


In [None]:
df['Year of Sourcing'] = df['Month of Sourcing'].dt.year
df['Day of Sourcing'] = df['Month of Sourcing'].dt.day
df["Month of Sourcing"] = df["Month of Sourcing"].dt.month


In [None]:
df.head(1)

Unnamed: 0,ProductType,Manufacturer,Area Code,Sourcing Channel,Product Size,Product Type,Month of Sourcing,Sourcing Cost,Year of Sourcing,Day of Sourcing
0,NTM3,X1,A28,WHOLESALE,Large,Powder,5,10.16,2021,1


## Identifying Outliers

In [None]:
from scipy.stats import zscore

def outliers_zscore(data):
    #Identifying outliers using Z-scores
    zs = zscore(data)
    outliers = data[(zs > 3) | (zs < -3)]
    return outliers

def outliers_iqr(data):
    #Identifying outliers using Interquartile Range
    Q1 = data.quantile(0.25)
    Q3 = data.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5  IQR
    upper_bound = Q3 + 1.5  IQR
    outliers = data[(data < lower_bound) | (data > upper_bound)]
    return outliers

def outliers_boxplot(column, data):
    #Visualizing outliers using a boxplot for a specific column in a DataFrame
    plt.figure(figsize=(10, 6))
    plt.boxplot(data[column], vert=False)
    plt.title(f'Boxplot of {column}')
    plt.xlabel('Values')
    plt.show()

SyntaxError: invalid decimal literal (<ipython-input-63-6a48aee715d5>, line 14)

In [None]:
for column in df.columns:
    if df[column].dtype in ['int64', 'float64']:  # Modify as needed for different data types
        num_outliers_zscore = len(outliers_zscore(df[column]))
        num_outliers_iqr = len(outliers_iqr(df[column]))
        print(f"Column: {column}")
        print(f"  Number of outliers by Z-score: {num_outliers_zscore}")
        print(f"  Number of outliers by IQR: {num_outliers_iqr}")

In [None]:
for column in df.columns:
    if df[column].dtype in ['int64', 'float64']:
        outliers_boxplot(column, df)


In [None]:
df['Sourcing Cost'].describe()

In [None]:
df_with_outliers = df.copy()

## Removing Outliers

In [None]:
def remove_outliers_trimming(df, column):
    # Will remove all outliers regardless
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5  IQR
    upper_bound = Q3 + 1.5  IQR

    # Condition for non-outliers
    condition = (df[column] >= lower_bound) & (df[column] <= upper_bound)
    return df[condition]

In [None]:
def remove_outliers_capping(df, column):
    # Will set them to nearest values within acceptable rage of IQR
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5  IQR
    upper_bound = Q3 + 1.5  IQR

    df[column] = df[column].apply(lambda x: upper_bound if x > upper_bound else (lower_bound if x < lower_bound else x))
    return df

In [None]:
df_capped = remove_outliers_capping(df, 'Sourcing Cost')
df_capped.info()

In [None]:
def bin_data(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5  IQR
    upper_bound = Q3 + 1.5  IQR

    bins = [df[column].min(), lower_bound, Q1, Q3, upper_bound, df[column].max()]
    labels = ['Extreme Low', 'Low', 'Moderate', 'High', 'Extreme High']
    df['binned'] = pd.cut(df[column], bins=bins, labels=labels, include_lowest=True)
    return df

def remove_extreme_bins(df, column_bin):
    # Filter out rows where the bin label is 'Extreme High' or 'Extreme Low'
    condition = ~df[column_bin].isin(['Extreme High', 'Extreme Low'])
    return df[condition]

In [None]:
df = df_capped.copy()

###Used Capping Instead of Binning as it is more suitable for given finincial data




In [None]:
columns_with_null = df.columns[df.isnull().any()].tolist()
columns_with_null

## Feature Selection and EDA

In [None]:
categorical_cols = ['ProductType', 'Manufacturer', 'Area Code', 'Sourcing Channel', 'Product Size', 'Product Type']

In [None]:
from sklearn.preprocessing import LabelEncoder
label_mappings = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_mappings[col] = dict(zip(le.classes_, le.transform(le.classes_)))
# Now label_mappings contains the mappings of original categorical values to encoded numerical values


In [None]:
df.head(1)

In [None]:
import seaborn as sns

def plot_correlation_matrix(df):
    corr_matrix = df.corr()
    plt.figure(figsize=(10,8))
    sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm')
    plt.title('Correlation Matrix')
    plt.show()
plot_correlation_matrix(df)

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

def select_features_kbest(X, y, k):
    # Select top k features based on the F-score
    selector = SelectKBest(score_func=f_classif, k=k)
    selector.fit(X, y)
    scores = pd.DataFrame({
        'Feature': X.columns,
        'Score': selector.scores_
    }).sort_values(by='Score', ascending=False)
    return scores

X = df.drop('Sourcing Cost', axis=1)
y = df['Sourcing Cost']
top_features = select_features_kbest(X, y, k=5)
print(top_features)

In [None]:
plt.figure(figsize=(10, 6))
sns.barplot(data=top_features, x='Score', y='Feature', orient='h')
plt.title('Feature Importance Scores')
plt.xlabel('Importance Score')
plt.ylabel('Features')
plt.show()


We infer that, to determin Sourcing Cost
1. Manufacturer is the strongest feature followed by Sourcing Channel
2. Year and month of sourcing ans 'ProductType' are also strong
3. Area code, Product size, 'Product Type' are realtively weak
4. Day of sourcing is the weakest

###Graphs

In [None]:
df_capped.info()

In [None]:
# Function to plot a histogram for a numerical column
def plot_histogram(data, column, bins=30):
    plt.figure(figsize=(10, 6))
    sns.histplot(data[column], bins=bins, kde=True)
    plt.title(f'Distribution of {column}')
    plt.xlabel(column)
    plt.ylabel('Frequency')
    plt.show()

# To plot a count plot for a categorical column
def plot_count(data, column):
    plt.figure(figsize=(12, 8))
    sns.countplot(data=data, y=column, order = data[column].value_counts().index)
    plt.title(f'Distribution of {column}')
    plt.xlabel('Count')
    plt.ylabel(column)
    plt.show()
# Function to plot a time series plot for a dataset with time and value columns
def plot_time_series(data, time_column, value_column):
    plt.figure(figsize=(14, 7))
    sns.lineplot(data=data, x=time_column, y=value_column, marker='o')
    plt.title(f'Time Series Plot of {value_column} Over {time_column}')
    plt.xlabel(time_column)
    plt.ylabel(value_column)
    plt.show()

In [None]:
plot_histogram(df_capped, 'Sourcing Cost')

In [None]:
# Plot count of each category in different columns
plot_count(df_capped, 'ProductType')
plot_count(df_capped, 'Manufacturer')
plot_count(df_capped, 'Area Code')
plot_count(df_capped, 'Sourcing Channel')
plot_count(df_capped, 'Product Size')
plot_count(df_capped, 'Product Type')

In [None]:
#Plot time series graph
plot_time_series(df_capped, 'Month of Sourcing', 'Sourcing Cost')
plot_time_series(df_capped, 'Year of Sourcing', 'Sourcing Cost')

In [None]:
df_test.head(5)

In [None]:
#Plots line plot for Sourcing cost
from matplotlib import pyplot as plt
df_test['Sourcing Cost'].plot(kind='line', figsize=(8, 4), title='Sourcing Cost')
plt.gca().spines[['top', 'right']].set_visible(False)

In [None]:
#group the data by 'ProductType', calculate the size of each group, and plots the result as a horizontal bar plot
from matplotlib import pyplot as plt
import seaborn as sns
df_test.groupby('ProductType').size().plot(kind='barh', color=sns.palettes.mpl_palette('Dark2'))
plt.gca().spines[['top', 'right',]].set_visible(False)

In [None]:
#Histogram for Sourcing Cost
from matplotlib import pyplot as plt
df_test['Sourcing Cost'].plot(kind='hist', bins=20, title='Sourcing Cost')
plt.gca().spines[['top', 'right',]].set_visible(False)

#Modelling


###Random Forest Regressor along with Label encoding technique

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import LabelEncoder

train_data = pd.read_csv('/content/drive/MyDrive/Train.csv')
test_data = pd.read_csv('/content/drive/MyDrive/Test.csv')

# Data preprocessing
encoder = LabelEncoder()
train_data['Month of Sourcing'] = pd.to_datetime(train_data['Month of Sourcing'], format='%b-%y')
train_data['Month'] = train_data['Month of Sourcing'].dt.month
train_data['Year'] = train_data['Month of Sourcing'].dt.year
train_data['ProductType'] = encoder.fit_transform(train_data['ProductType'])
train_data['Manufacturer'] = encoder.fit_transform(train_data['Manufacturer'])
train_data['Area Code'] = encoder.fit_transform(train_data['Area Code'])
train_data['Sourcing Channel'] = encoder.fit_transform(train_data['Sourcing Channel'])
train_data['Product Size'] = encoder.fit_transform(train_data['Product Size'])
train_data['Product Type'] = encoder.fit_transform(train_data['Product Type'])

test_data['Month of Sourcing'] = pd.to_datetime(test_data['Month of Sourcing'], format='%b-%y')
test_data['Month'] = test_data['Month of Sourcing'].dt.month
test_data['Year'] = test_data['Month of Sourcing'].dt.year
test_data['ProductType'] = encoder.fit_transform(test_data['ProductType'])
test_data['Manufacturer'] = encoder.fit_transform(test_data['Manufacturer'])
test_data['Area Code'] = encoder.fit_transform(test_data['Area Code'])
test_data['Sourcing Channel'] = encoder.fit_transform(test_data['Sourcing Channel'])
test_data['Product Size'] = encoder.fit_transform(test_data['Product Size'])
test_data['Product Type'] = encoder.fit_transform(test_data['Product Type'])

# Defining features and target variable
features = ['ProductType', 'Manufacturer', 'Area Code', 'Sourcing Channel', 'Product Size', 'Product Type', 'Month', 'Year']
target = 'Sourcing Cost'

# Split the data
X_train = train_data[features]
y_train = train_data[target]

# Random Forest Regressor model
model_rf = RandomForestRegressor(n_estimators=100, random_state=42)
model_rf.fit(X_train, y_train)

X_test = test_data[features]

# Predictions on test data
y_pred = model_rf.predict(X_test)

test_data['Predicted Sourcing Cost'] = y_pred

print(test_data)


###Applied K fold cross validation technique but was not efficcient

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score

# Load the data
train_data = pd.read_csv('/content/drive/MyDrive/Train.csv')
test_data = pd.read_csv('/content/drive/MyDrive/Test.csv')

# Data preprocessing
encoder = LabelEncoder()

def preprocess_data(data):
    data['Month of Sourcing'] = pd.to_datetime(data['Month of Sourcing'], format='%b-%y')
    data['Month'] = data['Month of Sourcing'].dt.month
    data['Year'] = data['Month of Sourcing'].dt.year
    data['ProductType'] = encoder.fit_transform(data['ProductType'])
    data['Manufacturer'] = encoder.fit_transform(data['Manufacturer'])
    data['Area Code'] = encoder.fit_transform(data['Area Code'])
    data['Sourcing Channel'] = encoder.fit_transform(data['Sourcing Channel'])
    data['Product Size'] = encoder.fit_transform(data['Product Size'])
    data['Product Type'] = encoder.fit_transform(data['Product Type'])
    return data

train_data = preprocess_data(train_data)
test_data = preprocess_data(test_data)

# Defining features and target variable
features = ['ProductType', 'Manufacturer', 'Area Code', 'Sourcing Channel', 'Product Size', 'Product Type', 'Month', 'Year']
target = 'Sourcing Cost'

# Split the data
X_train = train_data[features]
y_train = train_data[target]

# Random Forest Regressor model
model_rf = RandomForestRegressor(n_estimators=100, random_state=42)

# K-fold Cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(model_rf, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')

# Fit the model
model_rf.fit(X_train, y_train)

# Predictions on test data
X_test = test_data[features]
y_pred = model_rf.predict(X_test)

test_data['Predicted Sourcing Cost'] = y_pred

print("Cross-validation MSE scores:", -cv_scores)
print("Mean CV MSE:", -cv_scores.mean())
print(test_data)


In [None]:
#Print acatual vs Predicted for Random Forest Regressor
# Extract actual values from the test dataset
y_actual = test_data['Sourcing Cost']

# Calculate evaluation metrics
mse = mean_squared_error(y_actual, y_pred)
mae = mean_absolute_error(y_actual, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_actual, y_pred)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R2) Score: {r2}")

comparison_df = pd.DataFrame({'Actual': y_actual, 'Predicted': y_pred})
print(comparison_df.head())


 GradientBoostingRegressor, ExtraTreesRegressor

In [None]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor

# Initialization
models = {
    'Gradient Boosting Regressor': GradientBoostingRegressor(n_estimators=100, random_state=42),
    'Extra Trees Regressor': ExtraTreesRegressor(n_estimators=100, random_state=42)
}

# Train and evaluate each model
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_actual, y_pred)
    mae = mean_absolute_error(y_actual, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_actual, y_pred)

    print(f"Model: {name}")
    print(f"Mean Squared Error (MSE): {mse}")
    print(f"Mean Absolute Error (MAE): {mae}")
    print(f"Root Mean Squared Error (RMSE): {rmse}")
    print(f"R-squared (R2) Score: {r2}")
    print()

    # Saving the model to compare later
    test_data[f'Predicted Sourcing Cost - {name}'] = y_pred


SVR

In [None]:
from sklearn.svm import SVR

# Initialize SVR model
model_svr = SVR(kernel='rbf', C=1.0, epsilon=0.1)

# Train SVR model
model_svr.fit(X_train, y_train)

# Predict using SVR model
y_pred_svr = model_svr.predict(X_test)

# Evaluate SVR model
mse_svr = mean_squared_error(y_actual, y_pred_svr)
mae_svr = mean_absolute_error(y_actual, y_pred_svr)
rmse_svr = np.sqrt(mse_svr)
r2_svr = r2_score(y_actual, y_pred_svr)

print("Support Vector Regressor (SVR) Model Evaluation:")
print(f"Mean Squared Error (MSE): {mse_svr}")
print(f"Mean Absolute Error (MAE): {mae_svr}")
print(f"Root Mean Squared Error (RMSE): {rmse_svr}")
print(f"R-squared (R2) Score: {r2_svr}")

# Save predicted values for comparison
test_data['Predicted Sourcing Cost - SVR'] = y_pred_svr


In [None]:
pip install catboost

###Other Regression models

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

# Initialization
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'ElasticNet Regression': ElasticNet(),
    'K-Nearest Neighbors': KNeighborsRegressor(),
    'Decision Tree Regressor': DecisionTreeRegressor(),
    'XGBoost Regressor': XGBRegressor(),
    'LightGBM Regressor': LGBMRegressor(),
    'CatBoost Regressor': CatBoostRegressor(verbose=0)
}

# Train and evaluate each model
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_actual, y_pred)
    mae = mean_absolute_error(y_actual, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_actual, y_pred)

    print(f"Model: {name}")
    print(f"Mean Squared Error (MSE): {mse}")
    print(f"Mean Absolute Error (MAE): {mae}")
    print(f"Root Mean Squared Error (RMSE): {rmse}")
    print(f"R-squared (R2) Score: {r2}")

    # Prints actual vs. predicted values
    print("Actual vs. Predicted:")
    comparison_df = pd.DataFrame({'Actual': y_actual, 'Predicted': y_pred})
    print(comparison_df.head())

    print('\n' + '-'50 + '\n')


XGBoost stands out for its effectiveness due to several key features:

Gradient Boosting Technique: Sequential model building corrects errors, reducing bias and variance for better generalization.

Regularization: L1/L2 regularization prevents overfitting, promoting simpler, more generalizable models.

Tree Pruning: Eliminates non-contributing splits, fostering simpler, interpretable trees.

Handling Missing Values: Built-in handling reduces reliance on imputation, preserving data integrity.

Feature Importance: Provides insights for informed feature selection and engineering, enhancing model accuracy.

Parallel Processing: Efficiently scales for large datasets, leveraging CPU cores for faster training.

Optimized Implementation: C++ implementation ensures speed and memory efficiency, reducing training times.

Tuning Flexibility: Wide hyperparameter range allows tailored optimization for diverse datasets.

Ensemble Approach: Combines weak learners for robust, accurate predictions.

Competitive Success: Proven track record in ML competitions underscores its versatility and effectiveness.

#Deep Learning Models

###LSTM

In [None]:
from keras.models import Sequential
from keras.layers import LSTM, Dense

# Initialize LSTM model
lstm_model = Sequential()
lstm_model.add(LSTM(units=50, return_sequences=True, input_shape=(X_train.shape[1], 1)))
lstm_model.add(LSTM(units=50))
lstm_model.add(Dense(units=1))
lstm_model.compile(optimizer='adam', loss='mean_squared_error')

# Train LSTM model
lstm_model.fit(X_train, y_train, epochs=3, batch_size=32, verbose=0)

# Evaluate LSTM model on the test set
y_pred_lstm = lstm_model.predict(X_test)

# Calculate evaluation metrics
mse_lstm = mean_squared_error(y_actual, y_pred_lstm)
mae_lstm = mean_absolute_error(y_actual, y_pred_lstm)
rmse_lstm = np.sqrt(mse_lstm)
r2_lstm = r2_score(y_actual, y_pred_lstm)

# Print evaluation metrics
print("LSTM Model Metrics:")
print(f"Mean Squared Error (MSE): {mse_lstm}")
print(f"Mean Absolute Error (MAE): {mae_lstm}")
print(f"Root Mean Squared Error (RMSE): {rmse_lstm}")
print(f"R-squared (R2) Score: {r2_lstm}\n")
test_data['Predicted Sourcing Cost - LSTM'] = y_pred_lstm


###Sequential Model

In [None]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers


# Load the data
train_data = pd.read_csv('/content/drive/MyDrive/Train.csv')
test_data = pd.read_csv('/content/drive/MyDrive/Test.csv')

# Data preprocessing
encoder = LabelEncoder()
scaler = MinMaxScaler()

def preprocess_data(data):
    data['Month of Sourcing'] = pd.to_datetime(data['Month of Sourcing'], format='%b-%y')
    data['Month'] = data['Month of Sourcing'].dt.month
    data['Year'] = data['Month of Sourcing'].dt.year
    data['ProductType'] = encoder.fit_transform(data['ProductType'])
    data['Manufacturer'] = encoder.fit_transform(data['Manufacturer'])
    data['Area Code'] = encoder.fit_transform(data['Area Code'])
    data['Sourcing Channel'] = encoder.fit_transform(data['Sourcing Channel'])
    data['Product Size'] = encoder.fit_transform(data['Product Size'])
    data['Product Type'] = encoder.fit_transform(data['Product Type'])
    data['Sourcing Cost'] = scaler.fit_transform(data[['Sourcing Cost']])
    return data

train_data = preprocess_data(train_data)
test_data = preprocess_data(test_data)

# Defining features and target variable
features = ['ProductType', 'Manufacturer', 'Area Code', 'Sourcing Channel', 'Product Size', 'Product Type', 'Month', 'Year']
target = 'Sourcing Cost'

# Split the data
X = train_data[features]
y = train_data[target]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the neural network model
model = keras.Sequential([
    layers.Dense(128, activation='relu', input_shape=[len(features)]),
    layers.Dense(64, activation='relu'),
    layers.Dense(1)  # Output layer for regression
])

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=4, batch_size=32, verbose=1)

# Predictions on test data
X_test = test_data[features]
y_pred_scaled = model.predict(X_test)
y_pred = scaler.inverse_transform(y_pred_scaled).flatten()  # Inverse transform to get actual values

# Get the actual values for the test data
y_actual = scaler.inverse_transform(test_data[[target]]).flatten()

# Print actual vs predicted
print("Actual vs Predicted:")
comparison_df = pd.DataFrame({'Actual': y_actual, 'Predicted': y_pred})
print(comparison_df.head())


In [None]:
from sklearn.preprocessing import  StandardScaler
# Load the data
train_data = pd.read_csv('/content/drive/MyDrive/Train.csv')
test_data = pd.read_csv('/content/drive/MyDrive/Test.csv')

# Data preprocessing
encoder = LabelEncoder()
scaler = StandardScaler()

def preprocess_data(data):
    data['Month of Sourcing'] = pd.to_datetime(data['Month of Sourcing'], format='%b-%y')
    data['Month'] = data['Month of Sourcing'].dt.month
    data['Year'] = data['Month of Sourcing'].dt.year
    data['ProductType'] = encoder.fit_transform(data['ProductType'])
    data['Manufacturer'] = encoder.fit_transform(data['Manufacturer'])
    data['Area Code'] = encoder.fit_transform(data['Area Code'])
    data['Sourcing Channel'] = encoder.fit_transform(data['Sourcing Channel'])
    data['Product Size'] = encoder.fit_transform(data['Product Size'])
    data['Product Type'] = encoder.fit_transform(data['Product Type'])
    data['Sourcing Cost'] = scaler.fit_transform(data[['Sourcing Cost']])
    return data

train_data = preprocess_data(train_data)
test_data = preprocess_data(test_data)

# Defining features and target variable
features = ['ProductType', 'Manufacturer', 'Area Code', 'Sourcing Channel', 'Product Size', 'Product Type', 'Month', 'Year']
target = 'Sourcing Cost'

# Split the data
X = train_data[features]
y = train_data[target]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the neural network model
model = keras.Sequential([
    layers.Dense(128, activation='relu', input_shape=[len(features)]),
    layers.Dense(64, activation='relu'),
    layers.Dense(1)  # Output layer for regression
])

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=4, batch_size=32, verbose=1)

# Predictions on test data
X_test = test_data[features]
y_pred_scaled = model.predict(X_test)
y_pred = scaler.inverse_transform(y_pred_scaled).flatten()  # Inverse transform to get actual values

# Get the actual values for the test data
y_actual = scaler.inverse_transform(test_data[[target]]).flatten()

# Print actual vs predicted
print("Actual vs Predicted:")
comparison_df = pd.DataFrame({'Actual': y_actual, 'Predicted': y_pred})
print(comparison_df.head())

# Multilayer Perceptron (MLP) Model
mlp_model = keras.Sequential([
    layers.Dense(128, activation='relu', input_shape=[len(features)]),
    layers.Dense(64, activation='relu'),
    layers.Dense(32, activation='relu'),
    layers.Dense(1)  # Output layer for regression
])
mlp_model.compile(optimizer='adam', loss='mean_squared_error')
mlp_history = mlp_model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=4, batch_size=32, verbose=1)
mlp_y_pred_scaled = mlp_model.predict(X_test)
mlp_y_pred = scaler.inverse_transform(mlp_y_pred_scaled).flatten()
print("Actual vs Predicted (MLP):")
mlp_comparison_df = pd.DataFrame({'Actual': y_actual, 'Predicted': mlp_y_pred})
print(mlp_comparison_df.head())


1. Deep learning models underperform due to requirements of large datasets for effective generalization and complexities in hyperparameter tuning, feature engineering, and managing overfitting, leading to longer training times and computational demands.

2. Lack of interpretability and challenges in handling limited data diversity further hinder deep learning model performance, necessitating careful consideration of suitability for specific tasks and availability of resources for training and optimization.

##Based on the evaluation metrics and comparison of different regression models including Random Forest, Gradient Boosting, Extra Trees, and Support Vector Regression (SVR), it can be concluded that XGBoost (Extreme Gradient Boosting) performs the best for the given task of predicting sourcing costs. This conclusion is drawn from several factors:

1. Performance Metrics: XGBoost consistently shows lower Mean Squared Error (MSE), Mean Absolute Error (MAE), and Root Mean Squared Error (RMSE) compared to other models, indicating better accuracy in predicting sourcing costs.

2. R-squared (R2) Score: XGBoost also demonstrates higher R-squared (R2) scores, suggesting better overall goodness of fit and explaining more variance in the data compared to alternative models.

3. Robustness: XGBoost's robustness is evidenced by its performance across multiple evaluation metrics and its ability to handle complex relationships and non-linear patterns in the data.

4. Consistency: XGBoost consistently outperforms other models across different datasets and scenarios, making it a reliable choice for predictive modeling tasks.

5. Ensemble Learning: XGBoost's ensemble approach, combining predictions from multiple weak learners (decision trees), helps in reducing bias and variance, leading to more accurate and stable predictions.

In conclusion, based on the evaluation results and considerations of accuracy, robustness, and consistency, XGBoost emerges as the best model for predicting sourcing costs.