## Car Data: Regression Analysis
Predicting the price of cars

### Loading libraries

In [None]:
# importing libraries
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

from plotnine import *
import seaborn as sns

### Loading the Dataset

In [None]:
# # loading dataset (Google Colab only: uncomment the two lines below to upload the file)
# from google.colab import files
# uploaded = files.upload()

In [None]:
# load the car data from the CSV file in the working directory
cars = pd.read_csv(filepath_or_buffer='cars.csv')

### Descriptive Analytics

In [None]:
# preview the first five rows
cars.head(n=5)

In [None]:
# how many cars there are for each door count
cars['doors'].value_counts()

In [None]:
# keeping only cars with more than 2 doors (drops the 2-door cars)
# .copy() makes the filtered frame independent of the original one, so the
# column assignments in later cells do not trigger SettingWithCopyWarning
cars = cars.query('doors > 2').copy()

In [None]:
# door-count distribution again, now that the rows have been filtered
cars['doors'].value_counts()

In [None]:
# preview the last five rows
cars.tail(n=5)

In [None]:
# dataset info, before the categorical conversions done in the next cells
cars.info()

In [None]:
# number of unique values per column
cars.nunique()

In [None]:
# treat metallic_color as a categorical variable
cars['metallic_color'] = cars['metallic_color'].astype('category')

In [None]:
# treat automatic as a categorical variable
cars['automatic'] = cars['automatic'].astype('category')

In [None]:
# treat doors as a categorical variable
cars['doors'] = cars['doors'].astype('category')

In [None]:
# dataset info again, after the three categorical conversions above
cars.info()

### Graphical Summary

In [None]:
# histogram of price
(
    ggplot(data=cars, mapping=aes(x='price'))
    + geom_histogram(fill='lightblue', color='black')
)

In [None]:
# scatterplot: price (y) against age (x)
(
    ggplot(data=cars, mapping=aes(x='age', y='price'))
    + geom_point(color='red')
)

In [None]:
# scatterplot: price (y) against km (x)
(
    ggplot(data=cars, mapping=aes(x='km', y='price'))
    + geom_point(color='blue')
)

In [None]:
# HW: scatterplot of price (y) against hp (x)
(
    ggplot(data=cars, mapping=aes(x='hp', y='price'))
    + geom_point(color='green')
)

In [None]:
# HW: scatterplot of price (y) against cc (x)
(
    ggplot(data=cars, mapping=aes(x='cc', y='price'))
    + geom_point(color='yellow')
)

In [None]:
# HW: scatterplot of price (y) against tax (x)
(
    ggplot(data=cars, mapping=aes(x='tax', y='price'))
    + geom_point(color='purple')
)

In [None]:
# HW: scatterplot of price (y) against weight (x)
(
    ggplot(data=cars, mapping=aes(x='weight', y='price'))
    + geom_point(color='gold')
)

In [None]:
# boxplot of price for each fuel_type, one outline colour per group
(
    ggplot(data=cars, mapping=aes(x='fuel_type', y='price', color='fuel_type'))
    + geom_boxplot()
)

In [None]:
# boxplot of price for each metallic_color value, one outline colour per group
(
    ggplot(data=cars, mapping=aes(x='metallic_color', y='price', color='metallic_color'))
    + geom_boxplot()
)

In [None]:
# boxplot of price for each automatic value, one outline colour per group
(
    ggplot(data=cars, mapping=aes(x='automatic', y='price', color='automatic'))
    + geom_boxplot()
)

In [None]:
# boxplot of price for each doors value, one outline colour per group
(
    ggplot(data=cars, mapping=aes(x='doors', y='price', color='doors'))
    + geom_boxplot()
)

In [None]:
# correlation matrix of the numeric columns only
# (recent pandas versions raise on text columns instead of silently
# skipping them, so the numeric columns are selected explicitly)
cars.select_dtypes(include='number').corr()

In [None]:
# pairwise scatterplot matrix, with kernel density estimates on the diagonal
sns.pairplot(data=cars, diag_kind='kde')

### Predictive Analytics: Choosing Predictors

In [None]:
# columns used as model inputs
predictors = [
    'age',
    'km',
    'weight',
    'fuel_type',
    'metallic_color',
    'doors',
]

In [None]:
# name of the column the model is trained to predict
target = 'price'

In [None]:
# build the design matrix: dummy-encode the selected predictors, dropping the
# first level of each encoded variable
X = pd.get_dummies(data=cars.loc[:, predictors], drop_first=True)
X.head(n=5)

In [None]:
# response vector, followed by a peek at its first five values
y = cars.loc[:, target]
y.iloc[0:5]

### Predictive Analytics: Creating Training and Test Datasets

In [None]:
# hold out 20% of the rows for validation; the seed is fixed so the split is reproducible
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

In [None]:
# training predictors: info() prints its summary directly, and head() goes
# last because a notebook only renders the value of a cell's final expression
X_train.info()
X_train.head()

In [None]:
# first five training targets
y_train.head(n=5)

In [None]:
# summary of the validation predictors
X_val.info()

In [None]:
# first five validation targets
y_val.head(n=5)

In [None]:
# standardize the predictors: the scaler is fitted on the training split only
# and then applied to both splits
# NOTE(review): X_train_std / X_val_std are not referenced by the model cells
# visible in this notebook, which fit and predict on the unscaled X_train / X_val
from sklearn.preprocessing import StandardScaler
sc = StandardScaler().fit(X_train)
X_train_std, X_val_std = sc.transform(X_train), sc.transform(X_val)

### Linear Regression Model

In [None]:
# import the libraries
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# linear regression model with scikit-learn's default settings (not yet fitted)
lin_reg = LinearRegression()

In [None]:
# fit the model on the (unscaled) training split
lin_reg.fit(X=X_train, y=y_train)

In [None]:
# predicted prices for the validation split
y_pred = lin_reg.predict(X=X_val)

In [None]:
# MSE: mean squared error on the validation split
mean_squared_error(y_true=y_val, y_pred=y_pred)

In [None]:
# RMSE: square root of the MSE
mean_squared_error(y_true=y_val, y_pred=y_pred) ** 0.5

In [None]:
# R squared on the validation split (at most 1; can be negative on held-out data)
r2_score(y_true=y_val, y_pred=y_pred)

In [None]:
# actual validation prices (x axis) against predicted prices (y axis)
plt.scatter(x=y_val, y=y_pred)

### Cross-validation Regression

In [None]:
# standardize the predictors; here the scaler is fitted on every row of X
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_std = sc.fit_transform(X)

In [None]:
# Cross-validation (10 folds)
# The scaler is part of the pipeline, so it is re-fitted on the training part
# of every fold and the held-out rows never influence the preprocessing.
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

cv_model = make_pipeline(StandardScaler(), lin_reg)
scores = cross_val_score(cv_model, X, y, cv = 10, scoring ='neg_root_mean_squared_error')
# the scorer reports RMSE negated, hence the sign flip when printing
print(scores * -1)
print('Mean RMSE', scores.mean()*-1)