In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.metrics import explained_variance_score

def calculate_mape(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

def calculate_cod(y_true, y_pred):
    y_mean = np.mean(y_true)
    tss = np.sum((y_true - y_mean) ** 2)
    rss = np.sum((y_true - y_pred) ** 2)
    r_squared = 1 - (rss / tss)
    return r_squared

data = pd.read_excel('C:\\Users\\Lenovo\\Desktop\\DS Internship\\Data\\Modelling Regression\\DS Internship - Modeling - Data.xlsx')
features = data.drop(columns=['Store ID', 'Sales', 'Centre Type', 'Climate'])
target_variable = 'Sales'
X = features
y = data[target_variable]

print(X.shape)  
print(y.shape)  



(691, 33)
(691,)


In [24]:
data_encoded = pd.get_dummies(data, columns=['Pop class', 'Centre Type'])
features = data_encoded.drop(columns=['Store ID', 'Sales', 'Climate'])
target_variable = 'Sales'
X = features
y = data_encoded[target_variable]

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
data = data.dropna()
categorical_cols = ['Pop class', 'Centre Type', 'Climate', 'Guys Segmentation', 'Girls Segmentation', 'ChangeMade', 'ChangeDate']
data_encoded = pd.get_dummies(data, columns=categorical_cols)
features = data_encoded.drop(columns=['Store ID', 'Sales'])
target_variable = 'Sales'
X = features
y = data_encoded[target_variable]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)

In [26]:
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"R squared: {r2}")

coefficients = model.coef_
feature_names = X.columns
coef_df = pd.DataFrame({'Feature': feature_names, 'Coeff': coefficients})
coef_df['Abs_Coeff'] = np.abs(coef_df['Coeff'])
coef_df = coef_df.sort_values(by='Abs_Coeff', ascending=False)
print("\nKey Drivers:")
print(coef_df)

Mean Absolute Error: 463560.367053044
Mean Squared Error: 337926202408.6728
Root Mean Squared Error: 581314.2028272428
R squared: -0.3644601243805452

Key Drivers:
                            Feature         Coeff     Abs_Coeff
120  ChangeDate_2015-09-17 00:00:00 -1.226955e+06  1.226955e+06
199             ChangeDate_00:00:00  1.009694e+06  1.009694e+06
56   ChangeDate_2008-11-17 00:00:00  9.918119e+05  9.918119e+05
142  ChangeDate_2016-03-03 00:00:00 -9.575769e+05  9.575769e+05
195  ChangeDate_2019-04-04 00:00:00  9.249128e+05  9.249128e+05
..                              ...           ...           ...
70   ChangeDate_2011-02-25 00:00:00 -2.403443e-01  2.403443e-01
12    Average Household Income CrYr -1.055762e-01  1.055762e-01
17        Wealth: Average Household  1.007191e-03  1.007191e-03
15    Family Inc: Aggregate Average -4.333507e-05  4.333507e-05
192  ChangeDate_2018-02-03 00:00:00  0.000000e+00  0.000000e+00

[200 rows x 3 columns]


In [27]:
mape = calculate_mape(y_test, y_pred)
print(f"MAPE: {mape:.2f}%")

MAPE: 26.33%


In [28]:
medae = median_absolute_error(y_test, y_pred)
print(f"Median Absolute Error: {medae:.2f}")

Median Absolute Error: 377020.06


In [29]:
cod = calculate_cod(y_test, y_pred)
print(f"Coefficient of Determination: {cod:.2f}")

Coefficient of Determination: -0.36


In [30]:
evs = explained_variance_score(y_test, y_pred)
print(f"Explained Variance Score: {evs:.2f}")

Explained Variance Score: -0.23


In [31]:
model_summary = pd.DataFrame({
    'Metric': ['Mean Absolute Error', 'Mean Squared Error', 'Root Mean Squared Error', 'R-squared',
                'Coefficient of Determination', 'Explained Variance Score',
               'Median Absolute Error'],
    'Value': [mae, mse, rmse, r2, cod, evs, medae]
})

In [32]:
model_summary.to_excel('Model_Summary.xlsx', index=False,float_format='%.2f')