Source: https://github.com/krishnaik06/Advanced-House-Price-Prediction-/blob/master/Feature%20Engineering.ipynb

In [1]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
#from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
# Imports preprocessing functions
%run ./v1_jlu_preprocessing.ipynb

## Functions

#### Algorithms

In [38]:
def predictPrice(X_train, X_test, Y_train, Y_test, regr=None, random_state=None):
    """Fit a regressor on the training split and print train/test MSE and R^2.

    Parameters
    ----------
    X_train, X_test
        Feature inputs for the training and test splits; passed unchanged to
        the estimator's ``fit`` / ``predict``.
    Y_train, Y_test
        Target values corresponding to ``X_train`` and ``X_test``.
    regr : estimator, optional
        Unfitted estimator exposing ``fit`` and ``predict`` (for example
        ``Ridge(alpha=100)`` or ``SVR()``). When omitted, the SGDRegressor
        configuration used by the original notebook is built.
    random_state : int, optional
        Seed forwarded to the default SGDRegressor so repeated runs give the
        same fit. Ignored when ``regr`` is supplied. The default ``None``
        keeps the original unseeded behaviour.

    Returns
    -------
    None
        The four metrics are printed, not returned.
    """
    if regr is None:
        # Default model; alternatives can now be passed in via ``regr``
        # instead of editing this function.
        regr = SGDRegressor(
            max_iter=10000,
            learning_rate='adaptive',
            n_iter_no_change=100,
            alpha=0.0002,
            random_state=random_state,
        )
    regr.fit(X_train, Y_train)

    train_pred = regr.predict(X_train)
    test_pred = regr.predict(X_test)

    print("Train MSE: ", mean_squared_error(Y_train, train_pred))
    print("Test MSE: ", mean_squared_error(Y_test, test_pred))

    print("Train r2: ", r2_score(Y_train, train_pred))
    print("Test r2: ", r2_score(Y_test, test_pred))

## Code

In [3]:
# Load the raw listings and report how long the read takes.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals, so the reported duration is not affected by system clock changes.
# NOTE(review): the recorded output shows a warning pointing at this read_csv
# call — presumably a mixed-dtype warning; consider passing explicit dtype=
# for the affected columns — confirm against the full warning text.
begin = time.perf_counter()
original_df = pd.read_csv('used_cars_data.csv')
end = time.perf_counter()

print("Time taken: ", end - begin)

  original_df = pd.read_csv('used_cars_data.csv')


Time taken:  78.71398568153381


In [4]:
# Sample a portion of the rows

# A fixed random_state makes the 30% subsample identical on every run, so all
# downstream shapes and metrics can be reproduced after a kernel restart.
# perf_counter() is used because it is the clock intended for timing intervals.
begin = time.perf_counter()
df = original_df.sample(frac=0.30, replace=False, ignore_index=True, random_state=42)
end = time.perf_counter()

print("Time taken: ", end - begin)

print(df.shape)
df.head()

Time taken:  1.8219010829925537
(900012, 66)


Unnamed: 0,vin,back_legroom,bed,bed_height,bed_length,body_type,cabin,city,city_fuel_economy,combine_fuel_economy,...,transmission,transmission_display,trimId,trim_name,vehicle_damage_category,wheel_system,wheel_system_display,wheelbase,width,year
0,ZACCJABB7JPH68486,35.1 in,,,,SUV / Crossover,,San Antonio,22.0,,...,A,9-Speed Automatic,t76091,Altitude,,FWD,Front-Wheel Drive,101.2 in,79.6 in,2018
1,2T1BURHE0KC240613,41.4 in,,,,Sedan,,Los Angeles,28.0,,...,A,Automatic,t78889,LE,,FWD,Front-Wheel Drive,106.3 in,69.9 in,2019
2,3C4NJCBB1LT257033,38.3 in,,,,SUV / Crossover,,Beaumont,22.0,,...,A,Automatic,t91747,Sun and Safety Edition FWD,,FWD,Front-Wheel Drive,103.8 in,80 in,2020
3,4S4BSADC6G3359872,38.1 in,,,,Wagon,,Sterling,25.0,,...,CVT,Continuously Variable Transmission,t58700,2.5i Premium,,AWD,All-Wheel Drive,108.1 in,81.3 in,2016
4,1C4RJFBG4LC423667,38.6 in,,,,SUV / Crossover,,Marysville,18.0,,...,A,8-Speed Automatic,t87086,Limited 4WD,,4WD,Four-Wheel Drive,114.7 in,84.8 in,2020


#### Initial cleaning

In [5]:
# Initial cleaning. cleanColumnsAndRows is not defined in this notebook;
# presumably it comes from the %run of v1_jlu_preprocessing.ipynb — confirm.
# The recorded output goes from (900012, 66) before this call to (900008, 50).
df = cleanColumnsAndRows(df)

(900008, 50)


In [6]:
# Sanity check: show the type of the object returned by the cleaning step.
type(df)

pandas.core.frame.DataFrame

#### Process columns with years

In [7]:
# Replace the raw model year with the value computed by processYears.
# The recorded outputs show e.g. 2018 -> 68 and 2020 -> 70, so this looks like
# an offset from a base year — confirm in the preprocessing notebook.
processed_years = processYears(df)
df["year"] = processed_years
df.head()

Unnamed: 0,vin,back_legroom,body_type,city,city_fuel_economy,daysonmarket,dealer_zip,description,engine_cylinders,engine_displacement,...,torque,transmission,transmission_display,trimId,trim_name,wheel_system,wheel_system_display,wheelbase,width,year
0,ZACCJABB7JPH68486,35.1 in,SUV / Crossover,San Antonio,22.0,41,78230,"115V Auxiliary Power Outlet, 3.734 Final Drive...",I4 Flex Fuel Vehicle,2400.0,...,"175 lb-ft @ 3,900 RPM",A,9-Speed Automatic,t76091,Altitude,FWD,Front-Wheel Drive,101.2 in,79.6 in,68
1,2T1BURHE0KC240613,41.4 in,Sedan,Los Angeles,28.0,27,90045,Test drive your selected car on your own for u...,I4,1800.0,...,"128 lb-ft @ 4,400 RPM",A,Automatic,t78889,LE,FWD,Front-Wheel Drive,106.3 in,69.9 in,69
2,3C4NJCBB1LT257033,38.3 in,SUV / Crossover,Beaumont,22.0,49,77701,Contact Mike Smith Plex Chrysler Jeep Dodge to...,I4,2400.0,...,"175 lb-ft @ 3,900 RPM",A,Automatic,t91747,Sun and Safety Edition FWD,FWD,Front-Wheel Drive,103.8 in,80 in,70
3,4S4BSADC6G3359872,38.1 in,Wagon,Sterling,25.0,25,20166,"2016 Subaru Outback 2.5i Premium AWD Awd, Powe...",H4,2500.0,...,"174 lb-ft @ 4,000 RPM",CVT,Continuously Variable Transmission,t58700,2.5i Premium,AWD,All-Wheel Drive,108.1 in,81.3 in,66
4,1C4RJFBG4LC423667,38.6 in,SUV / Crossover,Marysville,18.0,36,43040,Diamond Black Crystal Pearlcoat 2020 Jeep Gran...,V6,3600.0,...,"390 lb-ft @ 4,250 RPM",A,8-Speed Automatic,t87086,Limited 4WD,4WD,Four-Wheel Drive,114.7 in,84.8 in,70


In [8]:
# Inspect the processed year column.
df["year"]

0         68
1         69
2         70
3         66
4         70
          ..
900003    70
900004    69
900005    69
900006    70
900007    68
Name: year, Length: 900008, dtype: int64

In [9]:
# Module-level scaler instance. It is not referenced again in this notebook;
# NOTE(review): presumably read as a global by the preprocessing helpers
# loaded via %run — confirm before renaming or removing.
scaler = StandardScaler()

#### Standardize columns of differing data types

In [10]:
# Convert the remaining columns to numeric form and standardize them; the
# helper returns the transformed frame together with the price column, which
# is used later as the regression target.
# NOTE(review): if the helper fits its scaling on the full frame, the later
# train/test split sees statistics from the test rows — confirm.
df, price_col = convertColumnsToNumericalAndStandardize(df)
df.head()

year


Unnamed: 0,body_type,city,city_fuel_economy,daysonmarket,engine_cylinders,engine_displacement,engine_type,exterior_color,franchise_dealer,franchise_make,...,model_name,savings_amount,seller_rating,sp_name,transmission,transmission_display,trim_name,wheel_system,wheel_system_display,year
0,0.080768,1.027448,-0.084482,-0.32149,-0.532415,-0.436033,-0.532415,-1.639907,-2.066417,1.377022,...,0.892439,-0.344557,0.579296,0.031306,-0.389599,0.38937,-1.05033,0.539362,0.52964,0.065236
1,0.865506,0.020708,0.660682,-0.450482,-0.847658,-0.893853,-0.847658,-0.976211,-2.066417,1.377022,...,-0.853416,1.671806,0.066414,-1.449475,-0.389599,0.62538,-0.236434,0.539362,0.52964,0.307066
2,0.080768,-1.525273,-0.084482,-0.24778,-0.847658,-0.436033,-0.847658,0.470816,0.48393,-0.307229,...,-0.899818,-0.512975,-0.174485,0.48337,-0.389599,0.62538,0.922704,0.539362,0.52964,0.548897
3,2.434982,1.285883,0.2881,-0.46891,-1.373061,-0.35973,-1.373061,-0.255045,0.48393,1.117907,...,0.500927,-0.252439,-0.531948,1.293064,0.937465,0.743385,-1.813229,-0.19255,-1.183933,-0.418425
4,0.080768,0.125625,-0.581258,-0.367559,0.728553,0.479607,0.728553,-0.719499,0.48393,-0.307229,...,-0.151594,-0.512975,-0.078245,-0.88981,-0.389599,-0.200654,-0.009502,-1.656374,-0.327147,0.548897


#### Run algorithm

In [11]:
# X_train, X_test, Y_train, Y_test = train_test_split(df.to_numpy(), price_col.to_numpy(), test_size=0.2)

# predictPrice(X_train, X_test, Y_train, Y_test)

#LinReg Result

In [12]:
# X_train, X_test, Y_train, Y_test = train_test_split(df.to_numpy(), price_col.to_numpy(), test_size=0.2)

# predictPrice(X_train, X_test, Y_train, Y_test)

# #SVR Result

In [13]:
# Hold out 20% of the rows for testing. The fixed random_state keeps the
# split, and therefore the reported metrics, reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    df.to_numpy(), price_col.to_numpy(), test_size=0.2, random_state=42
)

In [14]:
# param_grid = {'alpha':[1.5,2,2.5,3,3.5,4,10,20,30,40,50,100], 'solver':['auto', 'svd']}

In [15]:
# grid = GridSearchCV(Ridge(), param_grid)  # GridSearchCV needs an estimator; predictPrice(...) returns None

In [16]:
# grid.fit(X_train, Y_train)

In [None]:
# print(grid.best_params_)
# grid_predictions = grid.predict(X_test)

In [39]:
# Fit the model on the training split and print MSE / R^2 for both splits.
predictPrice(
    X_train=X_train,
    X_test=X_test,
    Y_train=Y_train,
    Y_test=Y_test,
)

Train MSE:  177722749.78835636
Test MSE:  147071172.44254407
Train r2:  0.5509364097941885
Test r2:  0.5978825080203756
