We first import all the libraries used in this notebook, then load the given data into a DataFrame and proceed with the solution.

In [None]:
import os

import numpy as np
import pandas as pd

In [None]:
import statsmodels.api as sm
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
import warnings
# Silence every warning for the rest of the session (no category or module filter is given).
# NOTE(review): this also hides deprecation and pandas copy warnings raised by later cells — confirm that is intended.
warnings.filterwarnings('ignore')

In [None]:
# Location of the input CSV. Set the CARPRICE_CSV environment variable to point at the
# file on another machine; the original author's path stays the default so existing
# runs behave exactly as before.
DATA_PATH = os.environ.get("CARPRICE_CSV", "c:/users/Asus/Downloads/CarPrice_Assignment.csv")
data = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows to sanity-check the load
data.head()

In [None]:
# Column names, dtypes and non-null counts
data.info()

## We can see from the above output that there are no NULL values in any column of the table.

In [None]:
# Summary statistics, to understand the variation in the data

data.describe()

In [None]:
# Keep only the company: the first space-separated token of the "company model" string
# replaces the full name in "CarName"

name_tokens = data['CarName'].str.split(" ")
data['CarName'] = name_tokens.str[0]

In [None]:
# Check whether every car_ID is unique: the result is True if any car_ID value is repeated

boolean = data["car_ID"].duplicated().any()
boolean

In [None]:
# As there is no duplicated value in car_ID, we can make it the index of the dataframe
# This also clarifies that there is no duplicated row, i.e. no repetition of data
# (set_index needs "car_ID" to still be a column, so this cell cannot be run twice in a row)

data = data.set_index("car_ID")

In [None]:
# For every column, show its name plus the type and value of the entry in the first row

for position, column_name in enumerate(data.columns):
  first_value = data.iloc[0, position]
  print(column_name, type(first_value), first_value)

In [None]:
# Distinct raw values of the two count-like columns
for count_col in ("cylindernumber", "doornumber"):
  print(data[count_col].unique())

In [None]:
# "doornumber" and "cylindernumber" are numerical features that are written down as
# number words (strings), so they are converted into numerals


var_list =  ['doornumber', 'cylindernumber']

def str_to_num(x, default=13):
    """Convert a number spelled out in English ("one" .. "twelve") to an integer.

    Parameters
    ----------
    x : str or any
        Word to convert. Matching ignores case and surrounding whitespace.
        A non-string value (for example a count that an earlier run of the
        conversion cell already turned into a number) is returned unchanged
        instead of raising AttributeError.
    default : any, optional
        Value returned for a string that is not one of the twelve known
        words. Defaults to 13, the fallback the function has always used.

    Returns
    -------
    int or any
        1..12 for a known word, ``default`` for an unknown string, or ``x``
        itself when ``x`` is not a string.
    """
    if not isinstance(x, str):
        return x

    word_to_number = {
        "one": 1, "two": 2, "three": 3, "four": 4,
        "five": 5, "six": 6, "seven": 7, "eight": 8,
        "nine": 9, "ten": 10, "eleven": 11, "twelve": 12,
    }
    # Normalise once rather than once per comparison.
    return word_to_number.get(x.lower().strip(), default)

# Replace the spelled-out counts with their numeric values, one column at a time
for count_column in var_list:
  data[count_column] = data[count_column].apply(str_to_num)

In [None]:
# Number of unique values in each column

print(data.nunique())

In [None]:
# List the binary (two-class) columns, followed by the values each of them holds

is_two_valued = data.nunique() == 2
binary_columns = is_two_valued[is_two_valued]
print(binary_columns.index)
for binary_col in binary_columns.index:
  print(data[binary_col].unique())

In [None]:
# "aspiration", "enginelocation" and "fueltype" are the remaining two-valued string
# columns, so they are encoded as 0 and 1.
# Each numeric code is listed next to its raw label so that re-running this cell on
# already-encoded data leaves it unchanged; the earlier per-row lambdas silently reset
# aspiration/enginelocation to all 0 and fueltype to all 1 on a second run.

data['aspiration'] = data['aspiration'].isin(['turbo', 1]).astype(int)         # 'turbo' -> 1, anything else -> 0
data['enginelocation'] = data['enginelocation'].isin(['rear', 1]).astype(int)  # 'rear' -> 1, anything else -> 0
data['fueltype'] = (~data['fueltype'].isin(['diesel', 0])).astype(int)         # 'diesel' -> 0, anything else -> 1

In [None]:
# Normalise the company names: lowercase, then trim surrounding whitespace

data['CarName'] = data['CarName'].str.lower().str.strip()

In [None]:
# Distinct company names after cleaning, to spot misspellings

data["CarName"].unique()

In [None]:
# Some company names are misspelled or abbreviated, so map them to one canonical spelling

def map_car_name(name_car):
  """Return the canonical company name for a known misspelling or alias.

  Any name without a known correction is returned unchanged.
  """
  corrections = {
    "porcshce": "porsche",
    'maxda': 'mazda',
    'toyouta': 'toyota',
    'vokswagen': 'volkswagen',
    'vw': 'volkswagen',
  }
  return corrections.get(name_car, name_car)

# Apply the spelling corrections to every row
data["CarName"] = data["CarName"].map(map_car_name)

In [None]:
# Price distribution per car company

fig, ax = plt.subplots(figsize=(20, 15))
sns.boxplot(data = data, x = 'CarName', y = 'price', ax = ax)

We can clearly see that some companies have a much higher median price, and a wider range of prices, than others. This suggests that the CarName variable is strongly associated with price, or at least that it will play a role in determining the cost of the vehicle.

In [None]:
# Create dummy dataframes for the categorical variables with multiple classes,
# starting with "CarName"; "nissan" is dropped as it does not seem significant in
# determining the price of the vehicle

d_1 = pd.get_dummies(data['CarName']).drop(columns = ['nissan'])

In [None]:
# Classes present in "carbody"

data["carbody"].unique()

In [None]:
# Price distribution per "carbody" class

fig, ax = plt.subplots(figsize=(10, 8))
sns.boxplot(data = data, x = 'carbody', y = 'price', ax = ax)

In [None]:
# Dummy dataframe for "carbody"; "hatchback" is dropped as its contribution to price
# may not be significant

d_2 = pd.get_dummies(data['carbody']).drop(columns = ['hatchback'])

In [None]:
# Classes present in "enginetype"

data["enginetype"].unique()

In [None]:
# Price distribution per "enginetype" class

fig, ax = plt.subplots(figsize=(10, 8))
sns.boxplot(data = data, x = 'enginetype', y = 'price', ax = ax)

In [None]:
# Dummy dataframe for "enginetype"; the first class, "dohc", is dropped because its price
# range is almost the same as that of many other classes, so price is not expected to depend on it

d_3 = pd.get_dummies(data['enginetype']).drop(columns = ['dohc'])

In [None]:
# Classes present in "drivewheel"

data["drivewheel"].unique()

In [None]:
# Price distribution per "drivewheel" class

fig, ax = plt.subplots(figsize=(10, 8))
sns.boxplot(data = data, x = 'drivewheel', y = 'price', ax = ax)

In [None]:
# Dummy dataframe for "drivewheel"; "4wd" is dropped because its price range is almost
# the same as that of the "fwd" class, so only one of those two is kept

d_4 = pd.get_dummies(data['drivewheel']).drop(columns = ['4wd'])

In [None]:
# Classes present in "fuelsystem"

data["fuelsystem"].unique()

In [None]:
# Price distribution per "fuelsystem" class

fig, ax = plt.subplots(figsize=(10, 8))
sns.boxplot(data = data, x = 'fuelsystem', y = 'price', ax = ax)

In [None]:
# Dummy dataframe for "fuelsystem"; "1bbl" is dropped because it and "2bbl" have almost the
# same variance, so its absence should not affect the analysis or the prediction of the price

d_5 = pd.get_dummies(data['fuelsystem']).drop(columns = ['1bbl'])

When dropping one dummy column per categorical variable, it is important to drop a class that seems harmless to the performance of the model; this may, but need not, be the first class. Dropping an informative class instead can hurt the performance of the model drastically, so do not assume that keeping a column is equivalent to dropping it and keeping its counterparts.

In [None]:
# Combine all the dummy frames with the main frame.
# pandas >= 2.0 makes get_dummies return bool columns; mixed with the numeric columns
# they turn X.values into an object array, which statsmodels OLS and
# variance_inflation_factor reject. Casting the dummies to integer 0/1 keeps the
# notebook working on both old and new pandas.

dummy_frames = [d_1, d_2, d_3, d_4, d_5]
data = pd.concat([data] + [frame.astype(int) for frame in dummy_frames], axis = 1)

In [None]:
# Remove the original categorical columns now that their dummy columns are in place

dummy_source_columns = ['CarName', 'carbody','enginetype', 'drivewheel','fuelsystem']
data = data.drop(columns = dummy_source_columns)

In [None]:
# Check how the new dummy columns fit into the dataframe and whether there is any issue

data.info()

In [None]:
# Now that there is no issue with the data, split it into training (70%) and testing (30%) sets

from sklearn.model_selection import train_test_split

# The split itself is pinned by random_state=100; the global NumPy seed is set as well
np.random.seed(0)
df_train, df_test = train_test_split(data, train_size = 0.7, test_size = 0.3, random_state = 100)

In [None]:
# Now lets perform scaling of the training data in the forthcoming cells and check the values after the same

from sklearn.preprocessing import MinMaxScaler

In [None]:
# Min-max scaler with default settings; fitted on the training data below and reused for the test data
scaler = MinMaxScaler()

In [None]:
# Min-max scale the quantitative columns (including the target "price"); the scaler is
# fitted on the training split only.
# NOTE(review): "symboling", "doornumber" and "cylindernumber" are not in this list, so
# they keep their original scale — confirm that is intended.

quantitative_var = [
    "wheelbase", "carlength", "carwidth", "carheight", "curbweight",
    "enginesize", "boreratio", "stroke", "compressionratio", "horsepower",
    "peakrpm", "citympg", "highwaympg", "price",
]
train_scaled = scaler.fit_transform(df_train[quantitative_var])
df_train[quantitative_var] = train_scaled

In [None]:
# Inspect the training rows after scaling
df_train.head()

In [None]:
# Correlation heatmap of every variable considered for the model

train_corr = df_train.corr()
fig, ax = plt.subplots(figsize = (50, 50))
sns.heatmap(train_corr, annot = True, cmap="RdYlGn", ax = ax)
plt.show()

The conclusion we can draw from these values is that the variables highly correlated with "price" are:
["+mpfi", "-2bbl", "+rwd", "-fwd", "-citympg","-highwaympg", "+horsepower", "+boreratio", "+enginesize", "+cylindernumber", "+curbweight", "+carwidth", "+carlength", "+wheelbase"]

The sign indicates positive or negative correlation, so we can expect some of these variables, as well as some car names, to be present in the final model.

Also to be noted is that the following variables are highly correlated to each other:
*   wheelbase, carlength, carwidth, carheight, curbweight, enginesize, boreratio
*   citympg, highwaympg

Another observation concerns "mercury", "spfi" and "mfi": no correlations are plotted for these variables at all.



In [None]:
# Correlation heatmap restricted to the quantitative variables, to see more closely
# where multicollinearity might occur

quantitative_corr = df_train[quantitative_var].corr()
fig, ax = plt.subplots(figsize = (15, 15))
sns.heatmap(quantitative_corr, annot = True, cmap="RdYlGn", ax = ax)
plt.show()

We can again see the same observations as before and hence draw a few conclusions:

*   When highly correlated variables clash (holding opposite signs of coefficients, or being candidates for elimination on the basis of p-values or VIF), we should remove the variable with the least correlation to price.
*   If, in our final model, correlated variables hold opposite signs, which is counterintuitive, we can conclude that the model is inaccurate for inference.



In [None]:
# Start the training by separating the target from the predictors

# pop removes "price" from df_train in place, so X_train (the same object as df_train) no longer contains it
y_train = df_train.pop('price')
X_train = df_train

In [None]:
# Summary statistics of the training predictors
X_train.describe()

As the correlation matrix suggested earlier, the rows for "mercury", "spfi" and "mfi" are empty because these columns contain no value other than 0 in the training data. We can remove these columns from the model, as they carry no variation that it could learn from.

In [None]:
# Begin with a linear regression model using statsmodels.api and observe the statistics it offers together with VIF,
# to eliminate variables that appear insignificant based on p-values/t-values.
# The elimination will sometimes be based on VIF values and predominantly on p-values/t-values.

# The procedure followed is manual feature elimination: any variable that shows the highest p-value (>= 0.05) and
# highest VIF (strictly for >= 10, moderately for values between 5 and 10) will be eliminated.

# The coefficients are also checked, and a variable is eliminated if the model is either counter-intuitive in it
# or not interested in it.

# This process continues until both the p-value and VIF criteria are satisfied at least moderately while preserving
# performance on training; the model is finally tested using the test dataset.

# X is a copy, so dropping columns from it later does not alter X_train
X = X_train.copy()
X_train_lm = sm.add_constant(X)

lr = sm.OLS(y_train, X_train_lm).fit()
lr.summary()

In [None]:
# VIF of every current predictor, rounded to 2 decimals and sorted from highest to lowest.
# NOTE(review): X has no intercept column here (the constant is only added to X_train_lm),
# so these VIFs are presumably computed without a constant — confirm that is intended.
vif = (
    pd.DataFrame({
        'Columns': X.columns,
        'VIF': [variance_inflation_factor(X.values, idx) for idx in range(X.shape[1])],
    })
    .round({'VIF': 2})
    .sort_values(by = "VIF", ascending = False)
)
vif

In [None]:
# Elimination step: drop 'ohcv', refit OLS with an intercept and show the summary (drop criterion not recorded here)
X = X.drop(columns=['ohcv'])
X_train_lm = sm.add_constant(X)
lr = sm.OLS(y_train, X_train_lm).fit()
lr.summary()

In [None]:
# The coefficients of 'mfi' and 'spfi' are 0, and so are all the values of those columns in the training set,
# so both are removed before refitting.

X = X.drop(columns = ['mfi','spfi'])
X_train_lm = sm.add_constant(X)
lr = sm.OLS(y_train, X_train_lm).fit()
lr.summary()

In [None]:
# Elimination step: drop 'jaguar', refit OLS with an intercept and show the summary (drop criterion not recorded here)
X = X.drop(columns = ['jaguar'])
X_train_lm = sm.add_constant(X)
lr = sm.OLS(y_train, X_train_lm).fit()
lr.summary()

In [None]:
# Elimination step: drop 'mpfi', refit OLS with an intercept and show the summary (drop criterion not recorded here)
X = X.drop(columns = ['mpfi'])
X_train_lm = sm.add_constant(X)
lr = sm.OLS(y_train, X_train_lm).fit()
lr.summary()

In [None]:
# Elimination step: drop 'spdi', refit OLS with an intercept and show the summary (drop criterion not recorded here)
X = X.drop(columns = ['spdi'])
X_train_lm = sm.add_constant(X)
lr = sm.OLS(y_train, X_train_lm).fit()
lr.summary()

In [None]:
# Elimination step: drop 'symboling', refit OLS with an intercept and show the summary (drop criterion not recorded here)
X = X.drop(columns=['symboling'])
X_train_lm = sm.add_constant(X)
lr = sm.OLS(y_train, X_train_lm).fit()
lr.summary()

In [None]:
# Elimination step: drop 'rwd', refit OLS with an intercept and show the summary (drop criterion not recorded here)
X = X.drop(columns = ['rwd'])
X_train_lm = sm.add_constant(X)
lr = sm.OLS(y_train, X_train_lm).fit()
lr.summary()

In [None]:
# Elimination step: drop 'isuzu', refit OLS with an intercept and show the summary (drop criterion not recorded here)
X = X.drop(columns = 'isuzu')
X_train_lm = sm.add_constant(X)
lr = sm.OLS(y_train, X_train_lm).fit()
lr.summary()

In [None]:
# Elimination step: drop 'citympg', refit OLS with an intercept and show the summary (drop criterion not recorded here)
X = X.drop(columns = 'citympg')
X_train_lm = sm.add_constant(X)
lr = sm.OLS(y_train, X_train_lm).fit()
lr.summary()

In [None]:
# VIF of every remaining predictor, rounded to 2 decimals and sorted from highest to lowest
vif = (
    pd.DataFrame({
        'Columns': X.columns,
        'VIF': [variance_inflation_factor(X.values, idx) for idx in range(X.shape[1])],
    })
    .round({'VIF': 2})
    .sort_values(by = "VIF", ascending = False)
)
vif

In [None]:
# Elimination step: drop 'mercury', refit OLS with an intercept and show the summary (drop criterion not recorded here)
X = X.drop(columns = 'mercury')
X_train_lm = sm.add_constant(X)
lr = sm.OLS(y_train, X_train_lm).fit()
lr.summary()

In [None]:
# Elimination step: drop 'hardtop', refit OLS with an intercept and show the summary (drop criterion not recorded here)
X = X.drop(columns = 'hardtop')
X_train_lm = sm.add_constant(X)
lr = sm.OLS(y_train, X_train_lm).fit()
lr.summary()

In [None]:
# Elimination step: drop 'volkswagen', refit OLS with an intercept and show the summary (drop criterion not recorded here)
X = X.drop(columns = 'volkswagen')
X_train_lm = sm.add_constant(X)
lr = sm.OLS(y_train, X_train_lm).fit()
lr.summary()

In [None]:
# VIF of every remaining predictor, rounded to 2 decimals and sorted from highest to lowest
vif = (
    pd.DataFrame({
        'Columns': X.columns,
        'VIF': [variance_inflation_factor(X.values, idx) for idx in range(X.shape[1])],
    })
    .round({'VIF': 2})
    .sort_values(by = "VIF", ascending = False)
)
vif

In [None]:
# Elimination step: drop 'subaru', refit OLS with an intercept and show the summary (drop criterion not recorded here)
X = X.drop(columns = 'subaru')
X_train_lm = sm.add_constant(X)
lr = sm.OLS(y_train, X_train_lm).fit()
lr.summary()

In [None]:
# Elimination step: drop 'audi', refit OLS with an intercept and show the summary (drop criterion not recorded here)
X = X.drop(columns = 'audi')
X_train_lm = sm.add_constant(X)
lr = sm.OLS(y_train, X_train_lm).fit()
lr.summary()

In [None]:
# Elimination step: drop 'doornumber', refit OLS with an intercept and show the summary (drop criterion not recorded here)
X = X.drop(columns = 'doornumber')
X_train_lm = sm.add_constant(X)
lr = sm.OLS(y_train, X_train_lm).fit()
lr.summary()

In [None]:
# Elimination step: drop 'mazda', refit OLS with an intercept and show the summary (drop criterion not recorded here)
X = X.drop(columns = 'mazda')
X_train_lm = sm.add_constant(X)
lr = sm.OLS(y_train, X_train_lm).fit()
lr.summary()

In [None]:
# Elimination step: drop 'toyota', refit OLS with an intercept and show the summary (drop criterion not recorded here)
X = X.drop(columns = 'toyota')
X_train_lm = sm.add_constant(X)
lr = sm.OLS(y_train, X_train_lm).fit()
lr.summary()

In [None]:
# Elimination step: drop 'alfa-romero', refit OLS with an intercept and show the summary (drop criterion not recorded here)
X = X.drop(columns = 'alfa-romero')
X_train_lm = sm.add_constant(X)
lr = sm.OLS(y_train, X_train_lm).fit()
lr.summary()

In [None]:
# Elimination step: drop 'renault', refit OLS with an intercept and show the summary (drop criterion not recorded here)
X = X.drop(columns = 'renault')
X_train_lm = sm.add_constant(X)
lr = sm.OLS(y_train, X_train_lm).fit()
lr.summary()

In [None]:
# Elimination step: drop 'fueltype', refit OLS with an intercept and show the summary (drop criterion not recorded here)
X = X.drop(columns = 'fueltype')
X_train_lm = sm.add_constant(X)
lr = sm.OLS(y_train, X_train_lm).fit()
lr.summary()

In [None]:
# Elimination step: drop 'horsepower', refit OLS with an intercept and show the summary (drop criterion not recorded here)
X = X.drop(columns = 'horsepower')
X_train_lm = sm.add_constant(X)
lr = sm.OLS(y_train, X_train_lm).fit()
lr.summary()

In [None]:
# Elimination step: drop '4bbl', refit OLS with an intercept and show the summary (drop criterion not recorded here)
X = X.drop(columns = '4bbl')
X_train_lm = sm.add_constant(X)
lr = sm.OLS(y_train, X_train_lm).fit()
lr.summary()

In [None]:
# VIF of every remaining predictor, rounded to 2 decimals and sorted from highest to lowest
vif = (
    pd.DataFrame({
        'Columns': X.columns,
        'VIF': [variance_inflation_factor(X.values, idx) for idx in range(X.shape[1])],
    })
    .round({'VIF': 2})
    .sort_values(by = "VIF", ascending = False)
)
vif

In [None]:
# Elimination step: drop 'compressionratio', refit OLS with an intercept and show the summary (drop criterion not recorded here)
X = X.drop(columns = 'compressionratio')
X_train_lm = sm.add_constant(X)
lr = sm.OLS(y_train, X_train_lm).fit()
lr.summary()

In [None]:
# Elimination step: drop 'idi', refit OLS with an intercept and show the summary (drop criterion not recorded here)
X = X.drop(columns = 'idi')
X_train_lm = sm.add_constant(X)
lr = sm.OLS(y_train, X_train_lm).fit()
lr.summary()

In [None]:
# Elimination step: drop 'fwd', refit OLS with an intercept and show the summary (drop criterion not recorded here)
X = X.drop(columns = 'fwd')
X_train_lm = sm.add_constant(X)
lr = sm.OLS(y_train, X_train_lm).fit()
lr.summary()

In [None]:
# VIF of every remaining predictor, rounded to 2 decimals and sorted from highest to lowest
vif = (
    pd.DataFrame({
        'Columns': X.columns,
        'VIF': [variance_inflation_factor(X.values, idx) for idx in range(X.shape[1])],
    })
    .round({'VIF': 2})
    .sort_values(by = "VIF", ascending = False)
)
vif

In [None]:
# Elimination step: drop 'carlength', refit OLS with an intercept and show the summary (drop criterion not recorded here)
X = X.drop(columns = 'carlength')
X_train_lm = sm.add_constant(X)
lr = sm.OLS(y_train, X_train_lm).fit()
lr.summary()

In [None]:
# VIF of every remaining predictor, rounded to 2 decimals and sorted from highest to lowest
vif = (
    pd.DataFrame({
        'Columns': X.columns,
        'VIF': [variance_inflation_factor(X.values, idx) for idx in range(X.shape[1])],
    })
    .round({'VIF': 2})
    .sort_values(by = "VIF", ascending = False)
)
vif

In [None]:
# Elimination step: drop 'wagon', refit OLS with an intercept and show the summary (drop criterion not recorded here)
X = X.drop(columns = 'wagon')
X_train_lm = sm.add_constant(X)
lr = sm.OLS(y_train, X_train_lm).fit()
lr.summary()

In [None]:
# Elimination step: drop 'sedan', refit OLS with an intercept and show the summary (drop criterion not recorded here)
X = X.drop(columns = 'sedan')
X_train_lm = sm.add_constant(X)
lr = sm.OLS(y_train, X_train_lm).fit()
lr.summary()

In [None]:
# VIF of every remaining predictor, rounded to 2 decimals and sorted from highest to lowest
vif = (
    pd.DataFrame({
        'Columns': X.columns,
        'VIF': [variance_inflation_factor(X.values, idx) for idx in range(X.shape[1])],
    })
    .round({'VIF': 2})
    .sort_values(by = "VIF", ascending = False)
)
vif

In [None]:
# Elimination step: drop 'cylindernumber', refit OLS with an intercept and show the summary (drop criterion not recorded here)
X = X.drop(columns = 'cylindernumber')
X_train_lm = sm.add_constant(X)
lr = sm.OLS(y_train, X_train_lm).fit()
lr.summary()

In [None]:
# Elimination step: drop 'dohcv', refit OLS with an intercept and show the summary (drop criterion not recorded here)
X = X.drop(columns = 'dohcv')
X_train_lm = sm.add_constant(X)
lr = sm.OLS(y_train, X_train_lm).fit()
lr.summary()

In [None]:
# Elimination step: drop 'stroke', refit OLS with an intercept and show the summary (drop criterion not recorded here)
X = X.drop(columns = 'stroke')
X_train_lm = sm.add_constant(X)
lr = sm.OLS(y_train, X_train_lm).fit()
lr.summary()

In [None]:
# VIF of every remaining predictor, rounded to 2 decimals and sorted from highest to lowest
vif = (
    pd.DataFrame({
        'Columns': X.columns,
        'VIF': [variance_inflation_factor(X.values, idx) for idx in range(X.shape[1])],
    })
    .round({'VIF': 2})
    .sort_values(by = "VIF", ascending = False)
)
vif

In [None]:
# Elimination step: drop 'curbweight', refit OLS with an intercept and show the summary (drop criterion not recorded here)
X = X.drop(columns = 'curbweight')
X_train_lm = sm.add_constant(X)
lr = sm.OLS(y_train, X_train_lm).fit()
lr.summary()

In [None]:
# Elimination step: drop 'highwaympg', refit OLS with an intercept and show the summary (drop criterion not recorded here)
X = X.drop(columns = 'highwaympg')
X_train_lm = sm.add_constant(X)
lr = sm.OLS(y_train, X_train_lm).fit()
lr.summary()

In [None]:
# Elimination step: drop 'chevrolet', refit OLS with an intercept and show the summary (drop criterion not recorded here)
X = X.drop(columns = 'chevrolet')
X_train_lm = sm.add_constant(X)
lr = sm.OLS(y_train, X_train_lm).fit()
lr.summary()

In [None]:
# VIF of every remaining predictor, rounded to 2 decimals and sorted from highest to lowest
vif = (
    pd.DataFrame({
        'Columns': X.columns,
        'VIF': [variance_inflation_factor(X.values, idx) for idx in range(X.shape[1])],
    })
    .round({'VIF': 2})
    .sort_values(by = "VIF", ascending = False)
)
vif

In [None]:
# Elimination step: drop 'carwidth', refit OLS with an intercept and show the summary (drop criterion not recorded here)
X = X.drop(columns = 'carwidth')
X_train_lm = sm.add_constant(X)
lr = sm.OLS(y_train, X_train_lm).fit()
lr.summary()

In [None]:
# Elimination step: drop 'volvo', refit OLS with an intercept and show the summary (drop criterion not recorded here)
X = X.drop(columns = 'volvo')
X_train_lm = sm.add_constant(X)
lr = sm.OLS(y_train, X_train_lm).fit()
lr.summary()

In [None]:
# Elimination step: drop 'honda', refit OLS with an intercept and show the summary (drop criterion not recorded here)
X = X.drop(columns = 'honda')
X_train_lm = sm.add_constant(X)
lr = sm.OLS(y_train, X_train_lm).fit()
lr.summary()

In [None]:
# VIF of every remaining predictor, rounded to 2 decimals and sorted from highest to lowest
vif = (
    pd.DataFrame({
        'Columns': X.columns,
        'VIF': [variance_inflation_factor(X.values, idx) for idx in range(X.shape[1])],
    })
    .round({'VIF': 2})
    .sort_values(by = "VIF", ascending = False)
)
vif

In [None]:
# Elimination step: drop 'wheelbase', refit OLS with an intercept and show the summary (drop criterion not recorded here)
X = X.drop(columns = 'wheelbase')
X_train_lm = sm.add_constant(X)
lr = sm.OLS(y_train, X_train_lm).fit()
lr.summary()

In [None]:
# Elimination step: drop 'enginelocation', refit OLS with an intercept and show the summary (drop criterion not recorded here)
X = X.drop(columns = 'enginelocation')
X_train_lm = sm.add_constant(X)
lr = sm.OLS(y_train, X_train_lm).fit()
lr.summary()

In [None]:
# Elimination step: drop '2bbl', refit OLS with an intercept and show the summary (drop criterion not recorded here)
X = X.drop(columns = '2bbl')
X_train_lm = sm.add_constant(X)
lr = sm.OLS(y_train, X_train_lm).fit()
lr.summary()

In [None]:
# Elimination step: drop 'plymouth', refit OLS with an intercept and show the summary (drop criterion not recorded here)
X = X.drop(columns = 'plymouth')
X_train_lm = sm.add_constant(X)
lr = sm.OLS(y_train, X_train_lm).fit()
lr.summary()

In [None]:
# Elimination step: drop 'dodge', refit OLS with an intercept and show the summary (drop criterion not recorded here)
X = X.drop(columns = 'dodge')
X_train_lm = sm.add_constant(X)
lr = sm.OLS(y_train, X_train_lm).fit()
lr.summary()

In [None]:
# VIF of every remaining predictor, rounded to 2 decimals and sorted from highest to lowest
vif = (
    pd.DataFrame({
        'Columns': X.columns,
        'VIF': [variance_inflation_factor(X.values, idx) for idx in range(X.shape[1])],
    })
    .round({'VIF': 2})
    .sort_values(by = "VIF", ascending = False)
)
vif

In [None]:
# Elimination step: drop 'boreratio', refit OLS with an intercept and show the summary (drop criterion not recorded here)
X = X.drop(columns = 'boreratio')
X_train_lm = sm.add_constant(X)
lr = sm.OLS(y_train, X_train_lm).fit()
lr.summary()

In [None]:
# VIF of every remaining predictor, rounded to 2 decimals and sorted from highest to lowest
vif = (
    pd.DataFrame({
        'Columns': X.columns,
        'VIF': [variance_inflation_factor(X.values, idx) for idx in range(X.shape[1])],
    })
    .round({'VIF': 2})
    .sort_values(by = "VIF", ascending = False)
)
vif

In [None]:
# Elimination step: drop 'carheight', refit OLS with an intercept and show the summary (drop criterion not recorded here)
X = X.drop(columns = 'carheight')
X_train_lm = sm.add_constant(X)
lr = sm.OLS(y_train, X_train_lm).fit()
lr.summary()

In [None]:
# Elimination step: drop 'peugeot', refit OLS with an intercept and show the summary (drop criterion not recorded here)
X = X.drop(columns = 'peugeot')
X_train_lm = sm.add_constant(X)
lr = sm.OLS(y_train, X_train_lm).fit()
lr.summary()

In [None]:
# VIF of every remaining predictor, rounded to 2 decimals and sorted from highest to lowest
vif = (
    pd.DataFrame({
        'Columns': X.columns,
        'VIF': [variance_inflation_factor(X.values, idx) for idx in range(X.shape[1])],
    })
    .round({'VIF': 2})
    .sort_values(by = "VIF", ascending = False)
)
vif

In [None]:
# Elimination step: drop 'ohcf', refit OLS with an intercept and show the summary (drop criterion not recorded here)
X = X.drop(columns = 'ohcf')
X_train_lm = sm.add_constant(X)
lr = sm.OLS(y_train, X_train_lm).fit()
lr.summary()

In [None]:
# VIF of every remaining predictor, rounded to 2 decimals and sorted from highest to lowest
vif = (
    pd.DataFrame({
        'Columns': X.columns,
        'VIF': [variance_inflation_factor(X.values, idx) for idx in range(X.shape[1])],
    })
    .round({'VIF': 2})
    .sort_values(by = "VIF", ascending = False)
)
vif

In [None]:
# Elimination step: drop 'l', refit OLS with an intercept and show the summary (drop criterion not recorded here)
X = X.drop(columns = 'l')
X_train_lm = sm.add_constant(X)
lr = sm.OLS(y_train, X_train_lm).fit()
lr.summary()

In [None]:
# Elimination step: drop 'ohc', refit OLS with an intercept and show the summary (drop criterion not recorded here)
X = X.drop(columns = 'ohc')
X_train_lm = sm.add_constant(X)
lr = sm.OLS(y_train, X_train_lm).fit()
lr.summary()

In [None]:
# VIF of every remaining predictor, rounded to 2 decimals and sorted from highest to lowest
vif = (
    pd.DataFrame({
        'Columns': X.columns,
        'VIF': [variance_inflation_factor(X.values, idx) for idx in range(X.shape[1])],
    })
    .round({'VIF': 2})
    .sort_values(by = "VIF", ascending = False)
)
vif

In [None]:
# At this point every remaining variable has a p-value <= 0.05 (except "convertible", which deviates very slightly)
# and a VIF strictly < 5, so the remaining features will be considered in the final model

X.columns

In [None]:
# Variables kept by the final model; reused later to select the same columns from the test data

manual_sequential_elimination_var = X.columns

In [None]:
# Predicted values on the training data, used for residual evaluation
# (X_train_lm is the design matrix from the most recent refit, so it matches lr)

y_train_price = lr.predict(X_train_lm)

In [None]:
# Distribution of the training errors: check that they are spread around 0 and fit a normal distribution
# NOTE(review): sns.distplot is deprecated in recent seaborn releases — consider histplot/displot.

residuals = y_train - y_train_price
fig = plt.figure()
sns.distplot(residuals, bins = 20)
fig.suptitle('Error Terms', fontsize = 15)
plt.xlabel('Errors', fontsize = 12)

It is observed that the error terms form an approximately normal distribution centred on the origin and fit that distribution well enough; hence the model performs well without any bias towards the training data.

In [None]:
# Scale the test data with the "scaler" already fitted on the training data
# (transform only, so nothing is re-fitted on the test split)

df_test[quantitative_var] = scaler.transform(df_test[quantitative_var])

In [None]:
# Summary statistics of the scaled test data
df_test.describe()

In [None]:
# pop removes "price" from df_test in place, so X_test (the same object as df_test) no longer contains it
y_test = df_test.pop('price')
X_test = df_test

In [None]:
# Slice the test data to the variables of the final model, add the intercept column and
# predict the final values of the regression.
# has_constant='add' forces the intercept column to be added: with the default ('skip'),
# add_constant leaves the frame untouched when a selected column happens to be a non-zero
# constant in the test split, and predict would then receive one column fewer than the
# model was fitted on.

X_test_sem = X_test[manual_sequential_elimination_var]
X_test_sem = sm.add_constant(X_test_sem, has_constant='add')
y_pred_sem = lr.predict(X_test_sem)

In [None]:
# Predicted values plotted against the actual values of the test data

fig, ax = plt.subplots()
ax.scatter(y_test, y_pred_sem)
fig.suptitle('y_test vs y_pred', fontsize = 20)
ax.set_xlabel('y_test', fontsize = 18)
ax.set_ylabel('y_pred', fontsize = 16)

## R2 Score

In [None]:


from sklearn.metrics import r2_score
# R-squared of the final model's predictions on the held-out test data
r2_score(y_test, y_pred_sem)

# Conclusion Remarks

Based on this exercise, we can say that our model depends on:

*   'enginesize' by a factor of '1.0521' 
*   'aspiration' by a factor of '0.1000'
*   'peakrpm' by a factor of '0.099' 
*   'bmw' by a factor of '0.2574'
*   'buick' by a factor of '0.1772'
*   'mitsubishi' by a factor of '-0.0999' 
*   'porsche' by a factor of '0.2352'
*   'rotor' by a factor of '0.2156' 
*   'saab' by a factor of '0.0803'
*   'convertible' by a factor of '0.0681' 
*   a constant of '-0.1177' 

The signs indicate the direction of the effect each variable has on the price, and the coefficients signify the magnitude of that impact. We can undoubtedly say that the brand name of the company does impact the price of the car, and most probably even the quality of the car.

Also based on the final R-squared(= 0.907, adj R-sq = 0.900) of the training data and the final R-squared(= 0.894) of the testing data we can say that the model is not overfitted on the training data as the variation between them is negligible.



## Final Equation

The equation of the hyperplane formed by our model is:

PRICE = (*ENGINESIZE*)1.0521 +(*ASPIRATION*)0.1 +(*PEAKRPM*)0.099 +(*BMW*)0.2574 +(*BUICK*)0.1772 -(*MITSUBISHI*)0.0999 + (*PORSCHE*)0.2352 +(*ROTOR*)0.2156 +(*SAAB*)0.0803 +(*CONVERTIBLE*)0.0681 - 0.1177