## Model Evaluation and Refinement

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Location of the automobile dataset used throughout this notebook (module_5_auto.csv).
auto_csv = (
    'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/'
    'IBMDeveloperSkillsNetwork-DA0101EN-SkillsNetwork/labs/Data%20files/module_5_auto.csv'
)

# Read the CSV straight from the URL and preview the first five rows.
df = pd.read_csv(auto_csv)
df.head(5)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,symboling,normalized-losses,make,aspiration,num-of-doors,body-style,drive-wheels,engine-location,...,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price,city-L/100km,horsepower-binned,diesel,gas
0,0,0,3,122,alfa-romero,std,two,convertible,rwd,front,...,9.0,111.0,5000.0,21,27,13495.0,11.190476,Medium,0,1
1,1,1,3,122,alfa-romero,std,two,convertible,rwd,front,...,9.0,111.0,5000.0,21,27,16500.0,11.190476,Medium,0,1
2,2,2,1,122,alfa-romero,std,two,hatchback,rwd,front,...,9.0,154.0,5000.0,19,26,16500.0,12.368421,Medium,0,1
3,3,3,2,164,audi,std,four,sedan,fwd,front,...,10.0,102.0,5500.0,24,30,13950.0,9.791667,Medium,0,1
4,4,4,2,164,audi,std,four,sedan,4wd,front,...,8.0,115.0,5500.0,18,22,17450.0,13.055556,Medium,0,1


In [4]:
# Keep only the numeric columns, since the regression models below cannot
# consume string/categorical features directly.
# select_dtypes is the public pandas API for this; the previous call to the
# private DataFrame._get_numeric_data() may change or disappear between pandas
# releases. 'bool' is listed so that the same set of columns is retained as
# with the private helper.
df = df.select_dtypes(include=['number', 'bool'])
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,symboling,normalized-losses,wheel-base,length,width,height,curb-weight,engine-size,...,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price,city-L/100km,diesel,gas
0,0,0,3,122,88.6,0.811148,0.890278,48.8,2548,130,...,2.68,9.0,111.0,5000.0,21,27,13495.0,11.190476,0,1
1,1,1,3,122,88.6,0.811148,0.890278,48.8,2548,130,...,2.68,9.0,111.0,5000.0,21,27,16500.0,11.190476,0,1
2,2,2,1,122,94.5,0.822681,0.909722,52.4,2823,152,...,3.47,9.0,154.0,5000.0,19,26,16500.0,12.368421,0,1
3,3,3,2,164,99.8,0.84863,0.919444,54.3,2337,109,...,3.4,10.0,102.0,5500.0,24,30,13950.0,9.791667,0,1
4,4,4,2,164,99.4,0.84863,0.922222,54.3,2824,136,...,3.4,8.0,115.0,5500.0,18,22,17450.0,13.055556,0,1


In [5]:
# Remove the two 'Unnamed' columns (presumably leftover row indices from
# earlier CSV exports -- they carry no predictive information).
# Reassigning instead of inplace=True, together with errors='ignore', keeps the
# cell idempotent: re-running it after the columns are already gone no longer
# raises a KeyError.
df = df.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'], errors='ignore')
df.head()

Unnamed: 0,symboling,normalized-losses,wheel-base,length,width,height,curb-weight,engine-size,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price,city-L/100km,diesel,gas
0,3,122,88.6,0.811148,0.890278,48.8,2548,130,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0,11.190476,0,1
1,3,122,88.6,0.811148,0.890278,48.8,2548,130,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0,11.190476,0,1
2,1,122,94.5,0.822681,0.909722,52.4,2823,152,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0,12.368421,0,1
3,2,164,99.8,0.84863,0.919444,54.3,2337,109,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0,9.791667,0,1
4,2,164,99.4,0.84863,0.922222,54.3,2824,136,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0,13.055556,0,1


In [7]:
#plotting libraries
from ipywidgets import interact, interactive, fixed, interact_manual

## Plotting functions

In [8]:
def DistributionPlot(RedFunction, BlueFunction, RedName, BlueName, Title):
    """Overlay the kernel-density estimates of two samples on one figure.

    Parameters
    ----------
    RedFunction : values whose density is drawn in red.
    BlueFunction : values whose density is drawn in blue.
    RedName : legend label for the red curve.
    BlueName : legend label for the blue curve.
    Title : figure title.

    The figure is shown and then closed; nothing is returned.
    """
    # No cell in this notebook imports seaborn, so on a fresh kernel the bare
    # `sns` name raised NameError. Importing here makes the function
    # self-contained.
    import seaborn as sns

    width = 12
    height = 10
    plt.figure(figsize=(width, height))

    # Draw both densities on the same axes so they can be compared directly.
    ax = sns.kdeplot(RedFunction, color="r", label=RedName)
    sns.kdeplot(BlueFunction, color="b", label=BlueName, ax=ax)

    plt.title(Title)
    plt.xlabel('Price (in dollars)')
    plt.ylabel('Proportion of Cars')
    # The labels passed to kdeplot are only displayed once a legend is
    # requested explicitly.
    ax.legend()
    plt.show()
    plt.close()

In [9]:
def PollyPlot(xtrain, xtest, y_train, y_test, lr, poly_transform):
    """Plot training points, test points and the model's predicted curve.

    Parameters
    ----------
    xtrain, xtest : training / testing feature data; must expose ``.values``
        (only the overall minimum and maximum are read from it).
    y_train, y_test : target values plotted against ``xtrain`` / ``xtest``.
    lr : regression object; its ``predict`` method is called on the
        transformed grid.
    poly_transform : polynomial transformation object; its ``fit_transform``
        method is applied to the plotting grid.
    """
    plt.figure(figsize=(12, 10))

    # The prediction curve spans the combined range of both feature sets,
    # sampled every 0.1 units (upper bound excluded by np.arange).
    lower = min(xtrain.values.min(), xtest.values.min())
    upper = max(xtrain.values.max(), xtest.values.max())
    grid = np.arange(lower, upper, 0.1)

    plt.plot(xtrain, y_train, 'ro', label='Training Data')
    plt.plot(xtest, y_test, 'go', label='Test Data')

    # NOTE: fit_transform (not transform) is called here, so poly_transform is
    # re-fitted on the plotting grid as a side effect of drawing the curve.
    grid_features = poly_transform.fit_transform(grid.reshape(-1, 1))
    plt.plot(grid, lr.predict(grid_features), label='Predicted Function')

    plt.ylim([-10000, 60000])
    plt.ylabel('Price')
    plt.legend()

## Training and Testing

In [10]:
# Target variable: the 'price' column, kept as its own Series.
y_data = df.loc[:, 'price']

In [11]:
# Feature matrix: every remaining column except the target 'price'.
x_data = df.drop(columns='price')

### Split the data into training and testing sets

In [12]:
# Randomly partition the rows into a training set and a held-out test set.
from sklearn.model_selection import train_test_split

# test_size=0.10 reserves 10% of the rows for testing; random_state fixes the
# shuffle so the same split is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    test_size=0.10,
    random_state=1,
)

print("number of test samples :", len(x_test))
print("number of training samples:", len(x_train))

number of test samples : 21
number of training samples: 180


Split the dataset such that 40% of the data samples will be used for testing & the random state = 0.

In [13]:
# Second split: hold out 40% of the rows for testing, with a different seed.
x_train1, x_test1, y_train1, y_test1 = train_test_split(
    x_data,
    y_data,
    test_size=0.40,
    random_state=0,
)

print("number of test samples :", len(x_test1))
print("number of training samples:", len(x_train1))

number of test samples : 81
number of training samples: 120


In [14]:
from sklearn.linear_model import LinearRegression

In [15]:
# Simple linear regression of price on horsepower, fitted on the 90% training split.
lre = LinearRegression().fit(x_train[['horsepower']], y_train)

# score() returns R^2; evaluate on both splits to compare in-sample and
# out-of-sample fit.
r2_train = lre.score(x_train[['horsepower']], y_train)
r2_test = lre.score(x_test[['horsepower']], y_test)

print('the R-square value of the test data is: ', r2_test)
print('the R-square value of the train data is: ', r2_train)

the R-square value of the test data is:  0.36358755750788263
the R-square value of the train data is:  0.6619724197515104


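R-squared is higher on the training data than on the test data, which suggests the model may not generalize well to new data (possible overfitting). Note, however, that the test set here contains only 21 samples, so the test score is very sensitive to which rows happen to land in it.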

Overfitting occurs when a model learns not only the underlying patterns in the training data but also the noise and random fluctuations. This leads to excellent performance on the training data but poor generalization to new, unseen data.

Find the R-squared on the test data using 40% of the dataset for testing.

In [16]:
# Same horsepower-only model, this time fitted on the 60% training split.
# Note that this rebinds `lre`, which later cells reuse.
lre = LinearRegression().fit(x_train1[['horsepower']], y_train1)

# R^2 on the training rows and on the 40% held-out rows.
r2_train1 = lre.score(x_train1[['horsepower']], y_train1)
r2_test1 = lre.score(x_test1[['horsepower']], y_test1)

print('the R-square value of the test data with 40% of the data set used for testing is: ', r2_test1)
print('the R-square value of the train data with 40% of the data set used for testing is: ', r2_train1)

the R-square value of the test data with 40% of the data set used for testing is:  0.7139364665406973
the R-square value of the train data with 40% of the data set used for testing is:  0.5754067463583004


## Cross-Validation

In [17]:
from sklearn.model_selection import cross_val_score

We pass in the model object, the feature ("horsepower"), and the target data (y_data). The parameter 'cv' determines the number of folds; in this case, it is 4.

In [18]:
# 4-fold cross-validation of the horsepower-only model on the full dataset;
# the result holds one score per fold.
Rcross = cross_val_score(
    lre,
    x_data[['horsepower']],
    y_data,
    cv=4,
)

The default scoring is R^2. Each element in the array is the R^2 value obtained on one fold:

In [19]:
# Display the per-fold scores returned by cross_val_score.
Rcross

array([0.7746232 , 0.51716687, 0.74785353, 0.04839605])

Calculate the mean and standard deviation of the estimate

In [20]:
# Summarise the cross-validation scores: their average and their spread across folds.
print(
    "The mean of the folds are", np.mean(Rcross),
    "and the standard deviation is", np.std(Rcross),
)

The mean of the folds are 0.522009915042119 and the standard deviation is 0.291183944475603


We can use negative mean squared error as a score by setting the 'scoring' parameter to 'neg_mean_squared_error'.

In [21]:
# The 'neg_mean_squared_error' scorer returns negated MSE values, so negate
# the result to recover the positive per-fold mean squared errors.
-cross_val_score(
    lre,
    x_data[['horsepower']],
    y_data,
    cv=4,
    scoring='neg_mean_squared_error',
)

array([20254142.84026702, 43745493.2650517 , 12539630.34014931,
       17561927.72247591])