In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from ipywidgets import interact, interactive, fixed, interact_manual
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict

# Remote location of the automobile dataset used throughout this lab.
path = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DA0101EN-SkillsNetwork/labs/Data%20files/module_5_auto.csv'

df = pd.read_csv(path)
# Keep a local copy of the raw data. index=False stops pandas from writing the
# row index as an extra unnamed column, which would otherwise come back as yet
# another 'Unnamed: N' column whenever the local copy is re-read.
df.to_csv('module_5_auto.csv', index=False)

# Keep only the numeric (and boolean) columns, using the public API rather
# than the private DataFrame._get_numeric_data().
df = df.select_dtypes(include=['number', 'bool'])
print(df.head())

def DistributionPlot(RedFunction, BlueFunction, RedName, BlueName, Title):
    """Overlay the kernel density estimates of two samples on one figure.

    RedFunction: values whose density is drawn in red
    BlueFunction: values whose density is drawn in blue
    RedName: legend label for the red curve
    BlueName: legend label for the blue curve
    Title: title of the figure

    The figure is shown and then closed; nothing is returned.
    """
    plt.figure(figsize=(12, 10))

    # Draw both curves on the same axes so they can be compared directly.
    ax = sns.kdeplot(RedFunction, color="r", label=RedName)
    sns.kdeplot(BlueFunction, color="b", label=BlueName, ax=ax)

    plt.title(Title)
    plt.xlabel('Price (in dollars)')
    plt.ylabel('Proportion of Cars')
    # The labels passed to kdeplot only become visible once a legend is drawn.
    plt.legend()
    plt.show()
    plt.close()
    
def PollyPlot(xtrain, xtest, y_train, y_test, lr, poly_transform):
    """Plot the training and test points together with the model's curve.

    xtrain, y_train: training data
    xtest, y_test: testing data
    lr: linear regression object
    poly_transform: polynomial transformation object

    The curve is evaluated on a grid with step 0.1 spanning the combined
    range of xtrain and xtest. Note that poly_transform.fit_transform is
    called on that grid, so the transformer is re-fitted as a side effect.
    """
    plt.figure(figsize=(12, 10))

    # Span the sampling grid over the combined range of both feature sets.
    lower = min(xtrain.values.min(), xtest.values.min())
    upper = max(xtrain.values.max(), xtest.values.max())
    grid = np.arange(lower, upper, 0.1)

    plt.plot(xtrain, y_train, 'ro', label='Training Data')
    plt.plot(xtest, y_test, 'go', label='Test Data')

    # Expand the grid into polynomial features before asking the model for
    # its predictions along the curve.
    grid_features = poly_transform.fit_transform(grid.reshape(-1, 1))
    plt.plot(grid, lr.predict(grid_features), label='Predicted Function')

    plt.ylim([-10000, 60000])
    plt.ylabel('Price')
    plt.legend()

#SECTION - Part 1: Training and Testing
# An important step in testing your model is to split your data into training and testing data. We will place the target data price in a separate Series y_data:

y_data = df['price']

x_data = df.drop('price', axis=1)

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.10, random_state=1)

print('Number of test samples: ', x_test.shape[0])
print('Number of training samples: ', x_train.shape[0])

# The test_size parameter sets the proportion of data that is split into the testing set. In the above, the testing set is 10% of the total dataset.

#SECTION - Question #1):
# Use the function 'train_test_split' to split up the dataset such that 40% of the data samples will be utilized for testing. Set the parameter 'random_state' equal to zero. The output of the function should be the following: 'x_train1', 'x_test1', 'y_train1', and 'y_test1'.

x_train1, x_test1, y_train1, y_test1 = train_test_split(x_data, y_data, test_size=0.40, random_state=0)

print('Number of test samples: ', x_test1.shape[0])
print('Number of training samples: ', x_train1.shape[0])

# Create a Linear Regression object:
lre = LinearRegression()

# We fit the model using the feature 'horsepower':
lre.fit(x_train[['horsepower']], y_train)

# Let's calculate the R^2 on the test data. The score is printed explicitly:
# a bare expression in the middle of a cell (or in a script) is never displayed.
print('R^2 on the test data: ', lre.score(x_test[['horsepower']], y_test))

# For comparison, the R^2 on the training data. We can see the R^2 is much smaller using the test data compared to the training data.
print('R^2 on the training data: ', lre.score(x_train[['horsepower']], y_train))

#!SECTION

#SECTION - Question #2):
# Find the R^2 on the test data using 40% of the dataset for testing.
x_train1, x_test1, y_train1, y_test1 = train_test_split(x_data, y_data, test_size=0.40, random_state=0)

lre.fit(x_train1[['horsepower']], y_train1)
# Score on the held-out 40% (x_test1 / y_test1), as the question asks, not on
# the data the model was just fitted on.
print('R^2 on the test data (40% test split): ', lre.score(x_test1[['horsepower']], y_test1))

#!SECTION

# Sometimes you do not have sufficient testing data; as a result, you may want to perform cross-validation. Let's go over several methods that you can use for cross-validation.

#SECTION - Cross-Validation Score
# cross_val_score is imported from the module model_selection at the top of the file.

# We input the object, the feature ('horsepower'), and the target data (y_data). The parameter 'cv' determines the number of folds. In this case, it is 4.
Rcross = cross_val_score(lre, x_data[['horsepower']], y_data, cv=4)

# The default scoring is R^2. Each element in the array has the average R^2 value for the fold:
print(Rcross)

# We can calculate the average and standard deviation of our estimate:
print("The mean of the folds are ", Rcross.mean(), " and the standard deviation is ", Rcross.std())

# We can use negative squared error as a score by setting the parameter 'scoring' metric to 'neg_mean_squared_error'.
# Multiplying by -1 turns the negated scores back into ordinary mean squared errors.
mse_folds = -1 * cross_val_score(lre, x_data[['horsepower']], y_data, cv=4, scoring='neg_mean_squared_error')
print('Mean squared error per fold: ', mse_folds)

#SECTION - Question #3):
# Calculate the average R^2 using two folds, then find the average R^2 for the second fold utilizing the 'horsepower' feature:
Rc = cross_val_score(lre, x_data[['horsepower']], y_data, cv=2)
print('Average R^2 over two folds: ', Rc.mean())
print('R^2 of the second fold: ', Rc[1])

# You can also use the function 'cross_val_predict' to predict the output. The function splits up the data into the specified number of folds, with one fold for testing and the other folds used for training.

# We input the object, the feature 'horsepower', and the target data y_data. The parameter 'cv' determines the number of folds. In this case, it is 4. We can produce an output:

yhat = cross_val_predict(lre, x_data[['horsepower']], y_data, cv=4)
# Left as the last expression of the cell so the notebook displays it.
yhat[0:5]

#!SECTION

#!SECTION

#!SECTION

#SECTION - Part 2: Overfitting, Underfitting and Model Selection
# It turns out that the test data, sometimes referred to as the 'out of sample data', is a much better measure of how well your model performs in the real world. One reason for this is overfitting.
#!SECTION

   Unnamed: 0.1  Unnamed: 0  symboling  ...  city-L/100km  diesel  gas
0             0           0          3  ...     11.190476       0    1
1             1           1          3  ...     11.190476       0    1
2             2           2          1  ...     12.368421       0    1
3             3           3          2  ...      9.791667       0    1
4             4           4          2  ...     13.055556       0    1

[5 rows x 21 columns]
Number of test samples:  21
Number of training samples:  180
Number of test samples:  81
Number of training samples:  120
[0.7746232  0.51716687 0.74785353 0.04839605]
The mean of the folds are  0.522009915042119  and the standard deviation is  0.291183944475603


array([14141.63807508, 14141.63807508, 20814.29423473, 12745.03562306,
       14762.35027598])