# C964 - Randeep Jalli
## Vehicle Price prediction



In [None]:
# Load every third-party dependency used by the notebook up front.
# NOTE(review): the handler below only catches SyntaxError. A missing package
# raises ImportError/ModuleNotFoundError, which is not caught here and will
# still propagate. If the SyntaxError branch were ever taken, the aliases
# below would be left undefined for later cells -- confirm this guard is
# intended.
try:
    import pandas as pandas_instance
    import seaborn as seaborn_instance
    import matplotlib.pyplot as pyplot_instance
    from sklearn.ensemble import ExtraTreesRegressor
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import r2_score
    from sklearn.linear_model import LinearRegression
    import ipysheet
    import ipywidgets
except SyntaxError:
    pass




_________________

## Data Ingest


To begin, we ingest our data from the comma-separated values (CSV) file included with this project.
A few rows and columns are included below for reference.

_________________

In [None]:
# Load the raw listings from the CSV bundled with the project, then preview
# the first rows as the cell output.
dataFrame= pandas_instance.read_csv('data/car data.csv')
dataFrame.head()


_________________


Let's see what the shape of the dataset is.


In [None]:
# (row count, column count) of the loaded DataFrame.
dataFrame.shape


_________________


In order to clean the dataset, we replace values with equivalents that are already reflected elsewhere in the data.



_________________


In [None]:
# Series.replace returns a new Series and leaves the original untouched, so
# the result has to be assigned back to the column for the clean-up to take
# effect on dataFrame.
dataFrame['Seller_Type'] = dataFrame['Seller_Type'].replace(['Ebay Motors Merchant'], 'Dealer')
dataFrame['Transmission'] = dataFrame['Transmission'].replace(['6-Speed'], 'Manual')


_________________

Below we can see all the different unique feature types available in the dataset.



_________________



In [None]:
# Report the distinct values found in each categorical column, framing every
# report with a horizontal rule above and below.
divider = "__________________________________________________________________________________________________"
categorical_columns = ('Fuel_Type', 'Seller_Type', 'Transmission', 'Owner')
for feature in categorical_columns:
    distinct_values = dataFrame[feature].unique()
    print(divider)
    print(f"Feature Type: {feature}")
    print(f"Unique Values: {distinct_values!s}")
    print(divider)





_________________

Check that the dataset contains no null or NaN ("Not a Number") values. The count of missing values per column is shown below.






In [None]:
# Number of missing (null/NaN) entries in each column.
dataFrame.isnull().sum()


_________________

Summary statistics for the DataFrame we created, followed by its column names, are shown below:

In [None]:
# Descriptive statistics for the DataFrame's columns.
dataFrame.describe()

In [None]:
# Column labels available in the raw DataFrame.
dataFrame.columns


_________________

## Data Cleaning
In order to predict the final price we need to select only the columns we want to be in our new DataFrame.



In [None]:

# Keep only the columns needed for price prediction. The explicit .copy()
# makes cleaned_dataset an independent DataFrame, so the column assignments
# and the in-place drop applied to it in later cells do not raise pandas'
# SettingWithCopyWarning or operate on a temporary selection of dataFrame.
cleaned_dataset = dataFrame[
    ['Year', 'Selling_Price', 'Present_Price', 'Kms_Driven', 'Fuel_Type', 'Seller_Type', 'Transmission', 'Owner']
].copy()




_________________


## Data Wrangling

To capture the age of each car, we add a column holding the reference year (fixed at 2020) and subtract the car's model year from it to produce a new `age` column.

In [None]:

# Stamp the reference year on every row, then derive each car's age as the
# difference between that year and the model year.
cleaned_dataset['Current_Year'] = 2020
cleaned_dataset['age'] = cleaned_dataset['Current_Year'].sub(cleaned_dataset['Year'])



A sample of the rows from our cleaned DataSet that now includes the age column.

In [None]:
# Preview the first rows, now including the Current_Year and age columns.
cleaned_dataset.head()




Now that `age` has been computed, we remove the `Year` and `Current_Year` columns from our dataset, since they are no longer needed.

In [None]:
# Discard the two helper columns in place; only the derived age is kept.
cleaned_dataset.drop(columns=['Year', 'Current_Year'], inplace=True)

In [None]:
# Preview the first rows after Year and Current_Year have been dropped.
cleaned_dataset.head()




_________________

## Data Encoding

In order for the model to use the categorical features as inputs, we perform "one-hot encoding", which converts each categorical value into a numeric indicator column.



In [None]:
# One-hot encode the categorical columns. drop_first=True omits the first
# level of each category, so that level acts as the implicit baseline.
cleaned_dataset= pandas_instance.get_dummies(cleaned_dataset,drop_first=True)

_________________

A sample of the rows from our encoded DataFrame.

In [None]:
# Preview the first rows of the encoded frame.
cleaned_dataset.head()




_________________




## Sampled Columns
The pairwise correlation matrix of the columns in our encoded DataFrame.


In [None]:
# Pairwise correlation between the columns of the encoded frame.
cleaned_dataset.corr()


_________________




## Pair Plot


Below we see a Pair Plot, showing various correlative measures of our data.


In [None]:
# Draw the pairwise grid first, then place a large bold title above it.
# pair_plot ends up bound to the value returned by suptitle(), not the grid.
pair_grid = seaborn_instance.pairplot(cleaned_dataset)
pair_plot = pair_grid.fig.suptitle(
    'Pair Plot of Columns and Correlations',
    fontsize=50,
    weight='bold',
    y=1.1,
)



_________________


## Heatmap

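In this heatmap, each cell shows the correlation between a pair of features: yellow cells indicate a low (or negative) correlation, while blue cells indicate a high positive correlation. The `Selling_Price` row shows how strongly each feature correlates with selling price.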



In [None]:


%matplotlib inline

# Compute the correlation matrix once and plot it directly. Re-selecting the
# matrix's own index from cleaned_dataset and calling corr() again yields the
# same matrix, so the second computation was redundant.
correlation_matrix = cleaned_dataset.corr()
# Kept for any later cell that may reference it; it is simply every column
# that took part in the correlation.
top_features = correlation_matrix.index
pyplot_instance.figure(figsize=(20, 20))
# heatmap is bound to the value returned by set_title(), not the Axes.
heatmap = seaborn_instance.heatmap(correlation_matrix, annot=True, cmap="YlGnBu").set_title('Heatmap Of Features')

_________________



## Data Separation

Below we separate our dataset into dependent and independent variables.

In [None]:
# Split the encoded frame into model inputs and the prediction target.
# NOTE(review): the variable names are swapped relative to the usual
# terminology. `dependentFeature` holds every column after the first and is
# passed as X to the models in later cells, while `independentFeature` holds
# the first column and is passed as y. The names are left unchanged because
# later cells reference them.
dependentFeature= cleaned_dataset.iloc[:,1:]
independentFeature= cleaned_dataset.iloc[:,0]



_________________

In [None]:
#dependentFeature.head()

In [None]:
#independentFeature.head()



_________________

## Feature Analysis

We create an Extra-Trees regressor, which fits a set of randomized decision trees, to show the importance of the various features.


In [None]:


# Fit an Extra-Trees ensemble purely to rank feature importance.
# NOTE(review): the estimator is built with default arguments and no
# random_state, so the importances may differ from run to run -- confirm
# whether reproducibility matters here.
model = ExtraTreesRegressor()
model.fit(dependentFeature,independentFeature)


In [None]:
# Raw importance scores, in the same order as dependentFeature's columns.
print(model.feature_importances_)






_________________

## Important Features
Below we can see a bar chart of the five features that the Extra-Trees model ranks as most important for predicting selling price.

In [None]:

# Label each importance score with its column name, then chart the five
# largest as horizontal bars.
importantFeatures = pandas_instance.Series(
    data=model.feature_importances_,
    index=dependentFeature.columns,
)
importantFeatures.nlargest(n=5).plot.barh()
pyplot_instance.show()

_________________



In order to gather enough data for testing the model, we split the dataset into 80% train and 20% test data.





In [None]:

# Hold out 20% of the rows for evaluation; the remaining 80% train the model.
X_train, X_test, y_train, y_test = train_test_split(
    dependentFeature,
    independentFeature,
    test_size=0.2,
)

In [None]:
# (row count, column count) of the training inputs.
X_train.shape





_________________


Below we can see the main logic of the model: a `LinearRegression` estimator is created and fitted on the training data to produce the trained model.

In [None]:

# Create the linear model that the price calculator uses for predictions.
simple_regression= LinearRegression()

# Fit on the training split only; the test split is kept for evaluation.
simple_regression.fit(X_train,y_train)



_________________




We create a prediction from the trained model using the test data we created above.

In [None]:
# Predicted target values for the held-out test inputs.
y_pred= simple_regression.predict(X_test)




_________________


We show a distribution plot of the difference between the actual values in the test dataset and the predicted values.
This represents the error.

In [None]:
# Distribution of the residuals (actual minus predicted) on the test split.
# NOTE(review): seaborn deprecated distplot in v0.11 in favour of
# histplot/displot -- confirm the installed seaborn version still provides it.
seaborn_instance.distplot(y_test-y_pred)

In [None]:
# Despite its name, `error` holds the value returned by r2_score for the test
# split; it is reported as a percentage rounded to two decimals.
error = r2_score(y_test, y_pred)
accuracy_percent = round(error * 100, 2)
print(f"Accuracy: {accuracy_percent!s}")


_________________



# Intelligent Vehicle Price Predictor
## Calculator

Below we have our Selling Price Prediction Calculator.
This Calculator allows the user to predict the selling price of their car by tweaking the value for various features.
In order to predict the selling price for a car, please change a value in one of the fields and press enter.
The Final Price value will populate automatically. Please note you MUST use the units that are noted in the fields below.


In [None]:
# Build the calculator: an 8-row, 2-column sheet with one labelled input cell
# per model feature (rows 0-6) and a read-only output cell (row 7).
sheet = ipysheet.sheet(rows=8, columns=2, column_headers=False, row_headers=False)
present_price = ipysheet.cell(0, 1, 4.2, label_left='Present Price in Ten Thousand Dollar Increments, i.e 4.2 is $42000', type='numeric')
Kms_Driven = ipysheet.cell(1, 1, 10000, label_left='Kilometers Driven ', type='numeric')
Owner = ipysheet.cell(2, 1, 3, label_left='Number of Previous Owners in Integer, i.e 1 is 1 Previous Owner', type='numeric')
age = ipysheet.cell(3, 1, 3, label_left='Number of Years Old in Integer, i.e 1 is 1 year old', type='numeric')
fuel_type = ipysheet.cell(4, 1, 0, label_left='Fuel Type ( use 1 for Diesel 0 for Gasoline)', type='numeric')
# Label corrected: this cell selects the seller type, not the fuel type.
seller_type = ipysheet.cell(5, 1, 0, label_left='Seller Type ( use 1 for Individual 0 for Other)', type='numeric')
transmission_type = ipysheet.cell(6, 1, 1, label_left='Gearbox Type ( use 1 for Manual 0 for Automatic)', type='numeric')
# Output cell; the hard-coded value is only what is displayed until the first
# recalculation overwrites it.
price = ipysheet.cell(7, 1, 51565.26, label_left='Final Price', read_only=True)



def calculate(change):
    """Recompute the predicted selling price from the current sheet inputs.

    Registered as the ``observe`` callback of every input cell, so it runs
    each time one of those values changes.

    Args:
        change: Change notification passed in by the widget framework. It is
            not inspected; the current value of every input cell is read
            directly instead.

    Side effects:
        Writes the model's prediction, multiplied by 10000 and rounded to two
        decimals, into ``price.value``.
    """
    # Keys must match the training column names exactly (note the capital P
    # in 'Present_Price'). The fuel type is expanded into its two indicator
    # columns; any input other than 1 (Diesel) or 0 (Petrol) leaves both at 0
    # instead of producing a row with missing columns.
    tempdata = {
        'Present_Price': [present_price.value],
        'Kms_Driven': [Kms_Driven.value],
        'Owner': [Owner.value],
        'age': [age.value],
        'Fuel_Type_Diesel': [1 if fuel_type.value == 1 else 0],
        'Fuel_Type_Petrol': [1 if fuel_type.value == 0 else 0],
        'Seller_Type_Individual': [seller_type.value],
        'Transmission_Manual': [transmission_type.value],
    }
    # Align the single-row frame with the exact columns, in the exact order,
    # that the model was trained on. Any training column not supplied above
    # is filled with 0.
    temp_test = pandas_instance.DataFrame(tempdata).reindex(
        columns=X_train.columns, fill_value=0
    )
    # simple_regression was already fitted on X_train/y_train when it was
    # created, so it is not refitted on every input change.
    temp_pred = simple_regression.predict(temp_test)
    price.value = round(temp_pred[0] * 10000, 2)

# Recalculate the price whenever the value of any input cell changes.
for input_cell in (
    present_price,
    Kms_Driven,
    Owner,
    age,
    fuel_type,
    seller_type,
    transmission_type,
):
    input_cell.observe(calculate, 'value')

# Last expression of the cell: render the sheet inside a vertical box.
ipywidgets.VBox([sheet])
