Source: https://github.com/krishnaik06/Advanced-House-Price-Prediction-/blob/master/Feature%20Engineering.ipynb

In [59]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Imports preprocessing functions
%run ./jlu_preprocessing.ipynb

## Functions

#### Algorithms

In [2]:
def predictPrice(X_train, X_test, Y_train, Y_test):
    """Fit a linear regression on the training split and print its MSE on both splits.

    Args:
        X_train: Feature matrix the model is fitted on.
        X_test: Feature matrix that is only predicted on, never fitted.
        Y_train: Targets aligned with ``X_train``.
        Y_test: Targets aligned with ``X_test``.

    Returns:
        None. The two mean squared errors are written to stdout only.
    """
    regressor = LinearRegression()
    regressor.fit(X_train, Y_train)

    # Predict both splits up front, then report each error in turn.
    fitted, held_out = regressor.predict(X_train), regressor.predict(X_test)

    for label, actual, predicted in (
        ("Train MSE: ", Y_train, fitted),
        ("Test MSE: ", Y_test, held_out),
    ):
        print(label, mean_squared_error(actual, predicted))

## Code

In [3]:
# Read the full listings CSV once and time the read so readers know what this
# cell costs. perf_counter() is a monotonic clock, so the measured duration is
# not affected by system clock adjustments that happen while the read runs.
begin = time.perf_counter()
original_df = pd.read_csv('used_cars_data.csv')
end = time.perf_counter()

print("Time taken: ", end - begin)

  original_df = pd.read_csv('used_cars_data.csv')


Time taken:  147.53482699394226


In [55]:
# Sample a portion of the rows.
# A fixed random_state makes the 30% subset -- and every number derived from it
# further down -- identical on each Restart & Run All.

# perf_counter() is monotonic, so the elapsed time cannot be skewed by system
# clock adjustments.
begin = time.perf_counter()
df = original_df.sample(frac=0.30, replace=False, random_state=42, ignore_index=True)
end = time.perf_counter()

print("Time taken: ", end - begin)

print(df.shape)
df.head()

Time taken:  152.47291493415833
(900012, 66)


Unnamed: 0,vin,back_legroom,bed,bed_height,bed_length,body_type,cabin,city,city_fuel_economy,combine_fuel_economy,...,transmission,transmission_display,trimId,trim_name,vehicle_damage_category,wheel_system,wheel_system_display,wheelbase,width,year
0,2FMPK4J95LBB05676,40.6 in,,,,SUV / Crossover,,Algona,21.0,,...,A,Automatic,t87981,SEL AWD,,AWD,All-Wheel Drive,112.2 in,85.8 in,2020
1,19XFC2E80LE024497,37.4 in,,,,Sedan,,Hartford,25.0,,...,M,6-Speed Manual,t88291,Sport Sedan FWD,,FWD,Front-Wheel Drive,106.3 in,70.8 in,2020
2,WDDGJ5HB7DG036950,33 in,,,,Coupe,,Plano,20.0,,...,A,Automatic,t44966,C 350 Coupe,,RWD,Rear-Wheel Drive,108.7 in,69.7 in,2013
3,1FT8X3BN4LED43748,33.5 in,,--,81.9 in,Pickup Truck,Extended Cab,Plymouth,,,...,A,Automatic,t86000,XLT SuperCab 4WD,,4WD,Four-Wheel Drive,148 in,105.9 in,2020
4,KMHD84LF0LU106068,35.7 in,,,,Sedan,,Peoria,30.0,,...,CVT,Continuously Variable Transmission,t85688,Limited Sedan FWD,,FWD,Front-Wheel Drive,106.3 in,70.9 in,2020


#### Initial cleaning

In [56]:
# Initial cleanup pass over the sampled frame. cleanColumnsAndRows is not
# defined in this notebook -- presumably it comes from jlu_preprocessing.ipynb
# via the %run in the import cell (TODO confirm). The shape tuple shown in this
# cell's output is not printed by this line, so presumably the helper emits it.
# NOTE(review): df is overwritten here, so re-running this cell feeds the
# already-cleaned frame back into the helper.
df = cleanColumnsAndRows(df)

(900008, 50)


#### Process columns with years

In [57]:
# Replace the "year" column with whatever processYears returns for the frame.
# NOTE(review): in the output saved with this notebook, rows that showed 2020
# and 2013 before this cell show 70 and 63 after it, so the helper appears to
# re-base years as an offset -- confirm against its definition (presumably in
# jlu_preprocessing.ipynb).
df["year"] = processYears(df)
df.head()

Unnamed: 0,vin,back_legroom,body_type,city,city_fuel_economy,daysonmarket,dealer_zip,description,engine_cylinders,engine_displacement,...,torque,transmission,transmission_display,trimId,trim_name,wheel_system,wheel_system_display,wheelbase,width,year
0,2FMPK4J95LBB05676,40.6 in,SUV / Crossover,Algona,21.0,5,50511,,I4,2000.0,...,"280 lb-ft @ 3,000 RPM",A,Automatic,t87981,SEL AWD,AWD,All-Wheel Drive,112.2 in,85.8 in,70
1,19XFC2E80LE024497,37.4 in,Sedan,Hartford,25.0,25,6120,"[!@@Additional Info@@!]Cloth Seat Trim,Radio: ...",I4,2000.0,...,"138 lb-ft @ 4,200 RPM",M,6-Speed Manual,t88291,Sport Sedan FWD,FWD,Front-Wheel Drive,106.3 in,70.8 in,70
2,WDDGJ5HB7DG036950,33 in,Coupe,Plano,20.0,44,75074,"Located in Plano, Texas, Plano Pre-Owned Auto ...",V6,3500.0,...,"273 lb-ft @ 3,500 RPM",A,Automatic,t44966,C 350 Coupe,RWD,Rear-Wheel Drive,108.7 in,69.7 in,63
3,1FT8X3BN4LED43748,33.5 in,Pickup Truck,Plymouth,,172,2360,"Colonial Ford of Plymouth, Our sales teams wil...",V8,7300.0,...,"430 lb-ft @ 3,800 RPM",A,Automatic,t86000,XLT SuperCab 4WD,4WD,Four-Wheel Drive,148 in,105.9 in,70
4,KMHD84LF0LU106068,35.7 in,Sedan,Peoria,30.0,51,85382,Want our best deal on this Hyundai? Don't have...,I4,2000.0,...,"132 lb-ft @ 4,500 RPM",CVT,Continuously Variable Transmission,t85688,Limited Sedan FWD,FWD,Front-Wheel Drive,106.3 in,70.9 in,70


#### Standardize columns of differing data types

In [60]:
# convertColumnsToNumericalAndStandardize returns two values: the transformed
# frame, which replaces df, and price_col, which the modelling cell later uses
# as the regression target. Its definition is not in this notebook --
# presumably jlu_preprocessing.ipynb (TODO confirm).
# NOTE(review): this runs on the whole frame before train_test_split, so if the
# helper computes scaling statistics, the held-out rows contribute to them.
df, price_col = convertColumnsToNumericalAndStandardize(df)
df.head()

year


Unnamed: 0,body_type,city,city_fuel_economy,daysonmarket,engine_cylinders,engine_displacement,engine_type,exterior_color,franchise_dealer,franchise_make,...,model_name,savings_amount,seller_rating,sp_name,transmission,transmission_display,trim_name,wheel_system,wheel_system_display,year
0,0.078626,-1.679077,-0.2111158,-0.65104,-0.831436,-0.738657,-0.831436,-1.683763,0.484166,-0.825886,...,-0.625338,-0.51511,1.431866,0.533579,-0.39067,0.626877,0.496307,-0.194193,-1.183673,0.545981
1,0.863385,-0.415102,0.2824262,-0.468024,-0.831436,-0.738657,-0.831436,1.625607,0.484166,-0.566914,...,-0.944581,-0.51511,-0.204145,0.234136,3.589087,-1.110251,0.888673,0.538449,0.529667,0.545981
2,-3.060411,0.744057,-0.3345013,-0.294158,0.704881,0.406776,0.704881,1.456004,-2.065409,1.37538,...,-1.187675,-0.270842,-0.138705,0.80107,-0.39067,0.626877,-0.953855,1.271092,1.386337,-1.137117
3,-0.706133,0.759471,1.31506e-15,0.877146,1.253566,3.308538,1.253566,-1.660868,0.484166,-0.825886,...,-0.461323,-0.51511,0.322223,-0.9314,-0.39067,0.626877,1.4132,-1.659478,-0.327003,0.545981
4,0.863385,0.694731,0.8993536,-0.230103,-0.831436,-0.738657,-0.831436,0.987206,0.484166,-0.502171,...,-0.610694,-0.51511,-0.1744,0.171646,0.935915,0.750958,0.03246,0.538449,0.529667,0.545981


#### Run algorithm

In [61]:
# Hold out 20% of the rows for evaluation. random_state pins the shuffle so the
# reported train/test MSE is reproducible from one run to the next.
X_train, X_test, Y_train, Y_test = train_test_split(
    df.to_numpy(), price_col.to_numpy(), test_size=0.2, random_state=42
)

predictPrice(X_train, X_test, Y_train, Y_test)

Train MSE:  162874164.6093245
Test MSE:  158103883.20438397
