In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RepeatedKFold, cross_validate, GridSearchCV
from sklearn.metrics import make_scorer, mean_absolute_percentage_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from time import time

In [3]:
# Read the training table; the file is a semicolon-separated CSV.
df = pd.read_csv('../data/preprocessed/newborntrain_processed.csv', sep=";")

# Split the table into the regression target and the remaining feature columns.
Y = df["newborn_weight"]
X = df.drop(columns="newborn_weight")

# Sanity check: shapes of the features, the target and the full table (one per line).
print(X.shape, Y.shape, df.shape, sep="\n")

(2398116, 17)
(2398116,)
(2398116, 18)


In [4]:

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=12345,
)

# Fit a linear regression on the training portion (fit() returns the estimator itself).
model = LinearRegression().fit(X_train, Y_train)

# Predict the target for the held-out rows.
Y_pred = model.predict(X_test)

# Calculate MAPE
def MAPE(actual, predicted):
    """Return the mean absolute percentage error between two sequences, in percent.

    Parameters
    ----------
    actual : array-like
        Observed target values. A zero entry makes the corresponding ratio
        infinite (or NaN when the prediction is also zero), because every
        error is divided by the observed value.
    predicted : array-like
        Predicted values, in the same order as ``actual``.

    Returns
    -------
    float
        ``mean(|actual - predicted| / |actual|) * 100``. NaN values in either
        input propagate to the result.
    """
    # Convert to plain float arrays so the arithmetic is positional. Two pandas
    # Series with different indexes would otherwise be aligned by label and
    # produce NaN, and plain Python lists would not support subtraction at all.
    actual = np.asarray(actual, dtype=float)
    predicted = np.asarray(predicted, dtype=float)
    return np.mean(np.abs((actual - predicted) / actual)) * 100

# Score the hold-out predictions against the true test labels.
mape = MAPE(actual=Y_test, predicted=Y_pred)

# Report the score, then the raw predictions and labels below a separator line.
print("MAPE:", mape)
print("___________________________________________________")
print("Predictions:", Y_pred)
print("Test Labels:", Y_test)

MAPE: 16.334670509013254
___________________________________________________
Predictions: [2914.66001744 3287.80322058 3269.25002464 ... 3338.7281342  3424.42564003
 3107.3356979 ]
Test Labels: 1031901    3260
741167     2570
296333     3035
2259380    2650
2311228    3212
           ... 
695416     4065
947094     2380
248881     2977
493071     3588
1370920    2758
Name: newborn_weight, Length: 479624, dtype: int64


## GridSearch

In [5]:
# Grid search results to df
def gs_to_df(grid_search):
    """Summarise a fitted grid search as a DataFrame with one row per candidate.

    Parameters
    ----------
    grid_search : object
        A fitted search object exposing ``cv_results_`` (a mapping with a
        ``'params'`` list of dicts and per-candidate score arrays). When it
        also exposes ``param_grid`` (a dict or a list of dicts), the parameter
        columns follow the key order of that grid.

    Returns
    -------
    pandas.DataFrame
        The parameter columns first, followed by whichever of
        ``mean_train_score``, ``std_train_score``, ``mean_test_score`` and
        ``std_test_score`` are present in ``cv_results_``. Rows are in
        candidate order with a 0..n-1 index. A parameter that a candidate does
        not define is left as NaN.
    """
    cv_results = grid_search.cv_results_
    candidates = cv_results["params"]

    # Read the parameter names from the search object itself rather than from a
    # notebook-level ``param_grid`` variable, so the function does not depend on
    # which cells have already been executed.
    grids = getattr(grid_search, "param_grid", None)
    if isinstance(grids, dict):
        grids = [grids]
    param_names = []
    # Fall back to the candidate dicts when no grid is available.
    for grid in grids or candidates:
        for name in grid:
            if name not in param_names:
                param_names.append(name)

    # Train scores only exist when the search was run with return_train_score=True,
    # so keep just the score columns that were actually recorded.
    score_names = [
        name
        for name in ("mean_train_score", "std_train_score",
                     "mean_test_score", "std_test_score")
        if name in cv_results
    ]

    # Build all rows first and create the frame once, instead of growing it
    # row by row.
    records = [
        {**params, **{name: cv_results[name][i] for name in score_names}}
        for i, params in enumerate(candidates)
    ]
    return pd.DataFrame(records, columns=param_names + score_names)

In [8]:
# Quick look at the first candidate's parameters and the per-candidate test-score std.
# NOTE(review): grid_search is created in a cell that appears further down in this
# notebook, so this cell only works after that cell has been run.
(
    grid_search.cv_results_['params'][0],
    grid_search.cv_results_['std_test_score'],
)

({'copy_X': True, 'fit_intercept': True, 'normalize': True},
 array([0.0005991 , 0.00059872, 0.00059872, 0.00059872, 0.0005991 ,
        0.00059872, 0.00059872, 0.00059872]))

In [7]:
start_time = time()
# Grid
# `normalize` was deprecated in scikit-learn 1.0 and removed in 1.2, where passing it
# makes the search fail with an invalid-parameter error. Only search over the boolean
# options that the installed LinearRegression actually exposes; on older versions this
# produces exactly the original three-parameter grid, in the same key order.
supported_params = model.get_params()
param_grid = [{
    name: [True, False]
    for name in ('fit_intercept', 'normalize', 'copy_X')
    if name in supported_params
}]
# Cross Validation: 5 folds repeated twice, with a fixed seed for reproducible splits
rkf = RepeatedKFold(n_splits=5, n_repeats=2, random_state=12345)
# Grid Search: scored by negative MAPE (higher is better); train scores are kept so
# they can be compared with the test scores afterwards
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring="neg_mean_absolute_percentage_error",
                            n_jobs=-1, cv=rkf, return_train_score=True)
# NOTE(review): the search is fitted on the full X, Y, which includes the rows held
# out earlier as X_test / Y_test - confirm this is intended.
grid_search.fit(X,Y)
print("--- %s seconds ---" % (time() - start_time))



--- 321.79034399986267 seconds ---


In [10]:
# Tabulate the grid-search results (note: this rebinds `df`, replacing the loaded data).
df = gs_to_df(grid_search=grid_search)

# Rank the candidates from best to worst mean test score, save the ranking as a
# semicolon-separated CSV, and show the same ranking as the cell output.
ranked = df.sort_values(by="mean_test_score", ascending=False)
ranked.to_csv('./Validation/LR_metrics_2_5_cv.csv', index=False, sep=';')
ranked

Unnamed: 0,fit_intercept,normalize,copy_X,mean_train_score,std_train_score,mean_test_score,std_test_score
1,True,False,True,-0.16312,0.000148,-0.163121,0.000599
2,False,True,True,-0.16312,0.000148,-0.163121,0.000599
3,False,False,True,-0.16312,0.000148,-0.163121,0.000599
5,True,False,False,-0.16312,0.000148,-0.163121,0.000599
6,False,True,False,-0.16312,0.000148,-0.163121,0.000599
7,False,False,False,-0.16312,0.000148,-0.163121,0.000599
0,True,True,True,-0.16312,0.000148,-0.163121,0.000599
4,True,True,False,-0.16312,0.000148,-0.163121,0.000599


1. To check for overfitting, compare the scores on the training and test sets: the standard deviation of the scores should be very low on both.
2. The main sign of overfitting is a very low error on the training set combined with a noticeably higher error on the test set.

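In this case, mean_train_score and mean_test_score are practically the same (about -0.163, i.e. a MAPE of roughly 16.3%), and the standard deviation is very small on both, although smaller on the training folds (0.000148) than on the test folds (0.000599). Thus the cross-validation splits give the same error percentage as the original train/test split (MAPE of about 16.33%), so there is no sign of overfitting.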