Source: https://github.com/krishnaik06/Advanced-House-Price-Prediction-/blob/master/Feature%20Engineering.ipynb

In [1]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
#from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
# Imports preprocessing functions
%run ./v1_jlu_preprocessing.ipynb

## Functions

#### Algorithms

In [2]:
def predictPrice(X_train, X_test, Y_train, Y_test):
    """Fit a default SVR on the training split and print train/test MSE.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices for the training and held-out splits.
    Y_train, Y_test : array-like
        Target values aligned with ``X_train`` and ``X_test``.

    Returns
    -------
    None
        The two scores are only printed; the fitted model is not returned.
    """
    # Alternative estimator left disabled in the original: Ridge(alpha=100).
    model = SVR().fit(X_train, Y_train)

    # Predict on both splits (train first, then test) before any scoring.
    predictions = (model.predict(X_train), model.predict(X_test))
    targets = (Y_train, Y_test)
    labels = ("Train MSE: ", "Test MSE: ")

    for label, actual, predicted in zip(labels, targets, predictions):
        print(label, mean_squared_error(actual, predicted))

## Code

In [3]:
# Load the full listings CSV and report how long the read takes.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() follows the wall clock and can jump if the system
# time is adjusted while this long read is running.
begin = time.perf_counter()
# NOTE(review): the saved output echoes this statement the way a pandas
# warning does (possibly a mixed-dtype DtypeWarning) — consider passing an
# explicit dtype= mapping once the column schema is confirmed.
original_df = pd.read_csv('used_cars_data.csv')
end = time.perf_counter()

print("Time taken: ", end - begin)

  original_df = pd.read_csv('used_cars_data.csv')


Time taken:  84.75465965270996


In [4]:
# Sample a portion of the rows

# Draw 30% of the rows without replacement and renumber the index from 0.
# random_state pins the draw so the subset — and every score computed from it
# further down — is identical after Restart Kernel -> Run All.
# perf_counter() is used because it is the monotonic clock meant for timing.
begin = time.perf_counter()
df = original_df.sample(frac=0.30, replace=False, ignore_index=True, random_state=42)
end = time.perf_counter()

print("Time taken: ", end - begin)

print(df.shape)
df.head()

Time taken:  1.5143764019012451
(900012, 66)


Unnamed: 0,vin,back_legroom,bed,bed_height,bed_length,body_type,cabin,city,city_fuel_economy,combine_fuel_economy,...,transmission,transmission_display,trimId,trim_name,vehicle_damage_category,wheel_system,wheel_system_display,wheelbase,width,year
0,1GCHSBEA8G1388111,28.6 in,,--,74 in,Pickup Truck,,Blue Springs,20.0,,...,A,6-Speed Automatic,t58421,Work Truck Extended Cab LB RWD,,4X2,4X2,128.3 in,83.9 in,2016
1,4S3BNAH6XH3041500,38.1 in,,,,Sedan,,Accident,25.0,,...,A,Automatic,t68623,2.5i Premium,,AWD,All-Wheel Drive,108.3 in,81.3 in,2017
2,JTHBK1GG3H2251251,40 in,,,,Sedan,,Ramsey,21.0,,...,A,Automatic,t69510,350 FWD,,FWD,Front-Wheel Drive,111 in,71.7 in,2017
3,3CZRU5H35LM716248,39.3 in,,,,SUV / Crossover,,El Paso,28.0,,...,A,Automatic,t89656,LX FWD,,FWD,Front-Wheel Drive,102.8 in,69.8 in,2020
4,1N6ED0EA5LN703724,33.6 in,,--,59.5 in,Pickup Truck,,Mcallen,,,...,A,9-Speed Automatic,t94251,SV Crew Cab RWD,,4X2,4X2,125.9 in,72.8 in,2020


#### Initial cleaning

In [5]:
# Run the cleaning helper on the sampled frame and rebind df to its result.
# cleanColumnsAndRows is not defined in this notebook — presumably it comes
# from the %run of v1_jlu_preprocessing.ipynb (TODO confirm).
# The saved output (900008, 50), compared with the earlier shape (900012, 66),
# indicates 4 rows and 16 columns were removed in that run.
# NOTE(review): df is overwritten, so re-running this cell on its own feeds
# already-cleaned data back into the helper.
df= cleanColumnsAndRows(df)

(900008, 50)


In [6]:
# Sanity check: display the type of the object returned by the cleaning step.
type(df)

pandas.core.frame.DataFrame

#### Process columns with years

In [7]:
# Overwrite the "year" column with the result of processYears(df).
# processYears is not defined in this notebook — presumably it is loaded by the
# %run of the preprocessing notebook (TODO confirm).
# In the saved outputs the same rows go from 2016/2017/2020 to 66/67/70, which
# looks like an offset from 1950 — verify against the helper.
df["year"] = processYears(df)
df.head()

Unnamed: 0,vin,back_legroom,body_type,city,city_fuel_economy,daysonmarket,dealer_zip,description,engine_cylinders,engine_displacement,...,torque,transmission,transmission_display,trimId,trim_name,wheel_system,wheel_system_display,wheelbase,width,year
0,1GCHSBEA8G1388111,28.6 in,Pickup Truck,Blue Springs,20.0,2,64015,Recent Arrival! 2016 Chevrolet Colorado Work T...,I4,2800.0,...,"269 lb-ft @ 4,300 RPM",A,6-Speed Automatic,t58421,Work Truck Extended Cab LB RWD,4X2,4X2,128.3 in,83.9 in,66
1,4S3BNAH6XH3041500,38.1 in,Sedan,Accident,25.0,14,21520,CARFAX 1-Owner! No Accidents! Premium! AWD! CV...,H4,2500.0,...,"174 lb-ft @ 4,000 RPM",A,Automatic,t68623,2.5i Premium,AWD,All-Wheel Drive,108.3 in,81.3 in,67
2,JTHBK1GG3H2251251,40 in,Sedan,Ramsey,21.0,22,7446,"CARFAX 1-Owner, GREAT MILES 33,428! FUEL EFFIC...",V6,3500.0,...,"248 lb-ft @ 4,700 RPM",A,Automatic,t69510,350 FWD,FWD,Front-Wheel Drive,111 in,71.7 in,67
3,3CZRU5H35LM716248,39.3 in,SUV / Crossover,El Paso,28.0,28,79936,,I4,1800.0,...,"127 lb-ft @ 4,300 RPM",A,Automatic,t89656,LX FWD,FWD,Front-Wheel Drive,102.8 in,69.8 in,70
4,1N6ED0EA5LN703724,33.6 in,Pickup Truck,Mcallen,,49,78501,New Price! Front Bucket Seats 4-Way Driver Sea...,V6,3800.0,...,"281 lb-ft @ 4,400 RPM",A,9-Speed Automatic,t94251,SV Crew Cab RWD,4X2,4X2,125.9 in,72.8 in,70


In [8]:
# Display the transformed year column to check its values and dtype.
df['year']

0         66
1         67
2         67
3         70
4         70
          ..
900003    70
900004    64
900005    68
900006    68
900007    64
Name: year, Length: 900008, dtype: int64

In [9]:
# Create a StandardScaler instance bound to the global name `scaler`.
# None of the visible cells reference `scaler` directly.
# NOTE(review): presumably a helper loaded via %run reads it as a global —
# confirm; if not, this cell is unused.
scaler = StandardScaler()

#### Standardize columns of differing data types

In [10]:
# Call the preprocessing helper, which returns two values: the feature frame
# (rebound to df) and a separate price column used later as the target.
# The helper is not defined in this notebook; judging by its name and the saved
# output it encodes every column numerically and standardizes it — TODO confirm.
# NOTE(review): if the helper computes scaling statistics on the whole frame,
# the later train/test split is evaluated on data that influenced the scaling;
# consider fitting the scaler on the training split only.
# NOTE(review): the stray "year" line in the saved output is presumably a debug
# print inside the helper.
df,price_col = convertColumnsToNumericalAndStandardize(df)
df.head()

year


Unnamed: 0,body_type,city,city_fuel_economy,daysonmarket,engine_cylinders,engine_displacement,engine_type,exterior_color,franchise_dealer,franchise_make,...,model_name,savings_amount,seller_rating,sp_name,transmission,transmission_display,trim_name,wheel_system,wheel_system_display,year
0,-0.705817,-1.433385,-0.3350822,-0.679069,-0.845608,-0.127969,-0.845608,1.340238,-2.066648,1.376852,...,-0.905356,-0.338186,0.180639,1.54295,-0.389458,-1.616276,1.244783,-0.925472,-2.041043,-0.416608
1,0.864884,-1.704765,0.2909043,-0.568963,-1.332966,-0.357235,-1.332966,1.322717,-2.066648,1.376852,...,0.13524,0.799861,1.431934,-1.60957,-0.389458,0.624078,-1.811657,-0.193105,-1.184062,-0.17602
2,0.864884,0.839133,-0.2098849,-0.495559,0.713936,0.406986,0.713936,-0.680003,0.483875,-0.047719,...,-0.65468,-0.05721,-0.375409,0.857245,-0.389458,0.624078,-1.46257,0.539261,0.5299,-0.17602
3,0.079533,-0.811441,0.6664962,-0.440506,-0.845608,-0.892191,-0.845608,-0.117555,0.483875,-0.565745,...,-0.097947,-0.519217,0.871289,-0.63219,-0.389458,0.624078,-0.112492,0.539261,0.5299,0.545745
4,-0.705817,0.165681,4.447902e-16,-0.247821,0.713936,0.636253,0.713936,-0.339341,0.483875,0.53506,...,-0.319474,-0.519217,-1.737867,-1.348955,-0.389458,0.388251,0.763567,-0.925472,-2.041043,0.545745


#### Run algorithm

In [11]:
# X_train, X_test, Y_train, Y_test = train_test_split(df.to_numpy(), price_col.to_numpy(), test_size=0.2)

# predictPrice(X_train, X_test, Y_train, Y_test)

#LinReg Result

In [12]:
# X_train, X_test, Y_train, Y_test = train_test_split(df.to_numpy(), price_col.to_numpy(), test_size=0.2)

# predictPrice(X_train, X_test, Y_train, Y_test)

# #SVR Result

In [13]:
# Hold out 20% of the rows for evaluation.
# random_state pins the shuffle so the same rows land in each split on every
# run; without it the reported train/test MSE changes on each execution.
X_train, X_test, Y_train, Y_test = train_test_split(
    df.to_numpy(), price_col.to_numpy(), test_size=0.2, random_state=42
)

In [14]:
# param_grid = {'alpha':[1.5,2,2.5,3,3.5,4,10,20,30,40,50,100], 'solver':['auto', 'svd']}

In [15]:
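# NOTE: GridSearchCV needs an unfitted estimator as its first argument (e.g. Ridge(),
# which matches the alpha/solver grid above); predictPrice(...) returns None, so the
# line below fails if it is re-enabled as written.
# grid =  GridSearchCV(predictPrice(X_train, X_test, Y_train, Y_test), param_grid)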

In [16]:
# grid.fit(X_train, Y_train)

In [17]:
# print(grid.best_params_)
# grid_predictions = grid.predict(X_test)

In [None]:
# Fit the model on the training split and print the train and test MSE.
# NOTE(review): this cell was saved without an execution count or output.
# predictPrice fits a default SVR and X_train holds about 80% of 900008 rows;
# kernel SVR training cost is reported to grow more than quadratically with the
# number of samples — confirm this finishes in reasonable time, or subsample
# first / switch to a linear model.
predictPrice(X_train, X_test, Y_train, Y_test)