In [1]:
import sys 
import io
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
from matplotlib import rcParams
import seaborn as sns 
from scipy.stats import zscore
from sklearn.linear_model import LinearRegression 
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score 
from sklearn.model_selection import train_test_split, cross_val_score
# switch to seaborn default stylistic parameters
sns.set()
sns.set_context('notebook') 
#Read the Kuiper's 2008 car data

#We'll drop the Model and Trim features, as we are interested in making predictions without identifying the exact kind of car.



In [2]:
url="https://raw.githubusercontent.com/grbruns/cst383/master/kuiper-2008-cars.csv"
raw=requests.get(url).content
df=pd.read_csv(io.StringIO(raw.decode('utf-8')))
df.drop(['Model', 'Trim'], inplace=True, axis=1)

In [3]:
# Converting all categorical variables to numeric by Using pandas.get_dummies

In [4]:
print("shape before",df.shape)

shape before (804, 10)


In [5]:
df=pd.get_dummies(df)

In [6]:
print("shape after",df.shape)

shape after (804, 19)


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 804 entries, 0 to 803
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Price             804 non-null    float64
 1   Mileage           804 non-null    int64  
 2   Cylinder          804 non-null    int64  
 3   Liter             804 non-null    float64
 4   Doors             804 non-null    int64  
 5   Cruise            804 non-null    int64  
 6   Sound             804 non-null    int64  
 7   Leather           804 non-null    int64  
 8   Make_Buick        804 non-null    uint8  
 9   Make_Cadillac     804 non-null    uint8  
 10  Make_Chevrolet    804 non-null    uint8  
 11  Make_Pontiac      804 non-null    uint8  
 12  Make_SAAB         804 non-null    uint8  
 13  Make_Saturn       804 non-null    uint8  
 14  Type_Convertible  804 non-null    uint8  
 15  Type_Coupe        804 non-null    uint8  
 16  Type_Hatchback    804 non-null    uint8  
 1

In [8]:
#Let's build a linear model in which we use all of the features as predictors, and look at the values of all the coefficients.

#@ Make a model with all features. First, create X and y where y contains 'Price' values and X contains the other columns of df. Next, perform a test train split to get X_train, X_test, etc. Then create a linear model using LinearRegression.  Call your model reg5. Finally, print the coefficients of your model (use a loop in printing all coefficients except the intercept).

In [9]:
y=df['Price']
X=df.drop(['Price'],axis=1)

In [10]:
print(X.shape)
print(y.shape)

(804, 18)
(804,)


In [11]:
#Performed train test split

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [13]:
# Build linear regression model

In [14]:
reg5 = LinearRegression()
reg5.fit(X_train, y_train)

LinearRegression()

In [15]:
#printing all co-efficient except intercept

In [16]:
print("Intercept",reg5.intercept_)
for idx, col_name in enumerate(X_train.columns):
    print("The coefficient for {} is {}".format(col_name, reg5.coef_[idx]))

Intercept 24381.25120421961
The coefficient for Mileage is -0.18866401986252573
The coefficient for Cylinder is -1083.7314516474241
The coefficient for Liter is 5516.671090142473
The coefficient for Doors is -2111.1571748414913
The coefficient for Cruise is 46.92629184544157
The coefficient for Sound is 312.91010690041156
The coefficient for Leather is 201.4515278548885
The coefficient for Make_Buick is -3605.154414258059
The coefficient for Make_Cadillac is 12681.304751536256
The coefficient for Make_Chevrolet is -5526.686848128268
The coefficient for Make_Pontiac is -5406.748381338428
The coefficient for Make_SAAB is 6739.172934639635
The coefficient for Make_Saturn is -4881.888042451127
The coefficient for Type_Convertible is 6645.217168619957
The coefficient for Type_Coupe is -5589.6385811992095
The coefficient for Type_Hatchback is -1724.2878207654387
The coefficient for Type_Sedan is -1630.9970931924336
The coefficient for Type_Wagon is 2299.706326537131


In [17]:
##@  Print the r-squared value for your model based on the training data and also print the RMSE based on the test data.

In [18]:
y_pred = reg5.predict(X_test)
r2_score=r2_score(y_test,y_pred)
print("r2_score",r2_score)

r2_score 0.9377078006373982


In [19]:
mse = mean_squared_error(y_test, y_pred)
from math import sqrt
rms = sqrt(mse)
print("Root mean squared",rms)

Root mean squared 2346.2905173925437


### Polynomial linear regression model

In [20]:
#We can add even more features to the model by adding derived features. Let's try the PolynomialFeatures class.

#@ From your NumPy array X create an extended data set X_poly using PolynomialFeatures with degree=2.
#Assign your PolynomialFeatures object to variable pf.

In [23]:
poly = PolynomialFeatures(degree=2, interaction_only=True)
X_poly = poly.fit_transform(X)

In [25]:
## reg6 model creation

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.3, random_state=1)

In [27]:
# Build linear regression model

In [28]:
reg6 = LinearRegression()
reg6.fit(X_train, y_train)

LinearRegression()

In [15]:
#printing all co-efficient except intercept

In [31]:
y_pred = reg6.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
from math import sqrt
rms = sqrt(mse)
print("Root mean squared",rms)

Root mean squared 1602.6613457057192


In [32]:
X_0 = X_train[:,[0]]

In [33]:
# compute negated mean square error scores using 5-fold cross validation
scores = cross_val_score(LinearRegression(), X_0, y_train, scoring='neg_mean_squared_error', cv=5)

In [34]:
# work out the average root mean squared error.  We need to first negate the scores, because they are negative MSE, not MSE.
rmse = np.sqrt(-scores.mean())

In [35]:
print('RMSE for feature 0 only: {:.2f}'. format(rmse))

RMSE for feature 0 only: 10087.54


In [38]:
#RMSE for each scoreand K-fold

In [74]:
pf = pd.DataFrame([])
kfold = KFold(n_splits=5,shuffle=True, random_state=3)
results = cross_val_score(reg6, X_train,y_train, scoring='neg_mean_squared_error', cv=kfold)
print(results)
for i,j in enumerate(results):
    rmse_=np.sqrt(-results[i].mean())
    pf.append(pd.DataFrame({'i_min': i,'rmse_min':rmse_}, index=[0]), ignore_index=True)
    print("Rmse",rmse_)

[-2938027.39378651 -2259773.28294089 -2272890.58908102 -2550797.38765178
 -4366738.55350587]
Rmse 1714.0674997754647
Rmse 1503.2542309738876
Rmse 1507.6108878225252
Rmse 1597.121594510506
Rmse 2089.674269714271
