Source: https://github.com/krishnaik06/Advanced-House-Price-Prediction-/blob/master/Feature%20Engineering.ipynb

In [23]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error

# Imports preprocessing functions
%run ./jlu_preprocessing.ipynb

## Functions

#### Algorithms

In [2]:
def tuneParams(data, labels, alphas=None, n_folds=5):
    """Plot the mean cross-validated score of a Lasso model for several alphas.

    For every candidate alpha a ``Lasso(alpha=alpha, selection='random')`` is
    evaluated with ``cross_validate`` and the mean of ``cv['test_score']`` is
    recorded. No ``scoring`` argument is passed, so the score is whatever
    ``cross_validate`` uses by default for the estimator.

    Parameters
    ----------
    data
        Feature matrix, passed unchanged to ``cross_validate``.
    labels
        Target values, passed unchanged to ``cross_validate``.
    alphas : iterable of float, optional
        Regularisation strengths to try. Defaults to ten evenly spaced values
        from 0.1 to 1.0 inclusive.
    n_folds : int, optional
        Number of cross-validation folds (default 5).

    Returns
    -------
    None
        The function only draws a plot.
    """
    if alphas is None:
        # linspace pins both the end point and the number of candidates, so the
        # grid does not depend on float rounding of an arange stop value.
        alphas = np.linspace(0.1, 1.0, 10)
    alphas = list(alphas)

    scores = []
    for alpha in alphas:
        model = Lasso(alpha=alpha, selection='random')
        cv = cross_validate(model, data, labels, cv=n_folds, n_jobs=-1, error_score='raise')
        scores.append(np.mean(cv['test_score']))

    # Plot against the alpha values themselves so the x-axis reads as the
    # regularisation strength instead of the position in the list.
    plt.plot(alphas, scores)
    plt.xlabel("alpha")
    plt.ylabel("mean CV test score")
    plt.title("Lasso: mean CV score vs. alpha")
    plt.show()

In [3]:
def crossVal(model, data, labels):
    """Run 5-fold cross-validation for ``model`` and plot the per-fold test scores.

    Parameters
    ----------
    model
        Estimator handed to ``cross_validate``.
    data
        Feature matrix, passed unchanged to ``cross_validate``.
    labels
        Target values, passed unchanged to ``cross_validate``.

    Returns
    -------
    dict
        The result of ``cross_validate`` (it contains at least the
        ``'test_score'`` entry plotted here), so that a caller written as
        ``cv = crossVal(...)`` receives the scores instead of ``None``.
    """
    cv = cross_validate(model, data, labels, cv=5, n_jobs=-1, error_score='raise')

    plt.plot(cv['test_score'])
    plt.xlabel("fold")
    plt.ylabel("test score")
    plt.show()

    return cv

In [4]:
def predictPrice(X_train, X_test, Y_train, Y_test, model=None):
    """Fit a regression model and print its train and test mean squared error.

    Parameters
    ----------
    X_train, X_test
        Feature sets used for fitting and for held-out evaluation.
    Y_train, Y_test
        Targets matching ``X_train`` and ``X_test``.
    model : optional
        Any estimator exposing ``fit(X, y)`` and ``predict(X)``, for example
        ``Lasso()`` or ``ElasticNet()``. When omitted a fresh
        ``LinearRegression()`` is used, which is what this function always did
        before the parameter existed.

    Returns
    -------
    None
        The two MSE values are printed, not returned.
    """
    # - - - - Models - - - -
    if model is None:
        model = LinearRegression()

    # - - - - Fit and Predict - - - -
    # Predict with the estimator that was fitted, without relying on what
    # fit() happens to return.
    model.fit(X_train, Y_train)
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)

    print("Train MSE: ", mean_squared_error(Y_train, train_pred))
    print("Test MSE: ", mean_squared_error(Y_test, test_pred))

## Code

In [5]:
# Load the raw CSV and report how long the read took. perf_counter() is a
# monotonic clock, so the elapsed time cannot be skewed by a system clock
# adjustment during this multi-minute read.
# NOTE(review): the saved output echoes the read_csv line, which looks like the
# tail of a pandas warning (possibly a mixed-dtype DtypeWarning) — confirm, and
# consider passing explicit dtype= for the affected columns.
begin = time.perf_counter()
original_df = pd.read_csv('used_cars_data.csv')
end = time.perf_counter()

print("Time taken: ", end - begin)

  original_df = pd.read_csv('used_cars_data.csv')


Time taken:  203.3028380870819


In [6]:
# Sample a portion of the rows

PERCENT = 0.50    # fraction of rows to keep (0.50 keeps half of them)
SAMPLE_SEED = 42  # fixed seed so every run draws the same rows

begin = time.perf_counter()
sampled_df = original_df.sample(
    frac=PERCENT,
    replace=False,
    random_state=SAMPLE_SEED,
    ignore_index=True,
)
end = time.perf_counter()

print("Time taken: ", end - begin)

Time taken:  19.401051998138428


In [24]:
# `df` is bound to the same DataFrame object as `sampled_df` (no copy is made),
# so any in-place change made through one name is visible through the other.
df = sampled_df

# Show the (rows, columns) shape, then preview the first rows as the cell output.
print(df.shape)
df.head()

(1500020, 66)


Unnamed: 0,vin,back_legroom,bed,bed_height,bed_length,body_type,cabin,city,city_fuel_economy,combine_fuel_economy,...,transmission,transmission_display,trimId,trim_name,vehicle_damage_category,wheel_system,wheel_system_display,wheelbase,width,year
0,3TMCZ5AN9HM116493,32.6 in,,--,60.5 in,Pickup Truck,,Grimes,,,...,,,t68366,TRD Sport V6 Double Cab 4WD,,4WD,Four-Wheel Drive,127.4 in,75.2 in,2017
1,3C4PDCAB7JT526800,36.1 in,,,,SUV / Crossover,,Brunswick,19.0,,...,A,4-Speed Automatic,t75441,SE FWD,,FWD,Front-Wheel Drive,113.8 in,83.7 in,2018
2,3N1CN8EV5LL871682,31 in,,,,Sedan,,Avondale,32.0,,...,CVT,Continuously Variable Transmission,t88093,SV FWD,,FWD,Front-Wheel Drive,103.1 in,68.5 in,2020
3,3FA6P0HD3LR249531,38.3 in,,,,Sedan,,Columbus,23.0,,...,A,Automatic,t86414,SE FWD,,FWD,Front-Wheel Drive,112.2 in,83.5 in,2020
4,1GCGSCEN1M1117102,35.8 in,,--,61.7 in,Pickup Truck,,Daphne,18.0,,...,A,8-Speed Automatic,t91842,LT Crew Cab RWD,,4X2,4X2,128.3 in,83.9 in,2021


#### Initial cleaning

In [25]:
# cleanColumnsAndRows is not defined in this notebook; presumably it comes from
# jlu_preprocessing.ipynb via the %run in the import cell — TODO confirm.
# The second return value, price_col, is later passed to train_test_split as
# the target.
df, price_col = cleanColumnsAndRows(df)

(1500013, 56)


#### Standardize columns of differing data types

In [26]:
# NOTE(review): neither convertColumnsToNumericalAndStandardize nor
# `standardizer` is defined in this notebook; presumably both are created by
# jlu_preprocessing.ipynb — confirm, otherwise this cell raises NameError on a
# fresh kernel.
df = convertColumnsToNumericalAndStandardize(df, standardizer)
df.head()

year


Unnamed: 0,back_legroom,body_type,city_fuel_economy,daysonmarket,engine_cylinders,engine_displacement,engine_type,exterior_color,fleet,frame_damaged,...,theft_title,torque,transmission,transmission_display,trim_name,wheel_system,wheel_system_display,wheelbase,width,year
0,-1.356553,-0.707759,-4.404494e-16,-0.625936,0.883542,0.405862,0.883542,-1.312191,-0.35408,-0.071068,...,-0.042061,0.037221,-0.401412,0.653256,1.085045,-1.760499,-0.279705,1.001458,-0.332716,-0.203927
1,-0.405871,0.095953,-0.4582689,-0.515457,-0.801336,-0.434321,-0.801336,1.381573,-0.35408,-0.071068,...,-0.042061,-1.226279,-0.401412,-2.878932,0.507286,0.676659,0.702916,-0.025056,0.983645,0.061427
2,-1.79115,0.899666,1.153413,-0.009093,-0.801336,-1.045362,-0.801336,-0.332641,-0.35408,-0.071068,...,-0.042061,-1.768578,1.52345,0.770996,0.842008,0.676659,0.702916,-1.002688,-1.382671,0.592136
3,0.1917,0.899666,0.03763309,-0.45101,-0.801336,-0.35794,-0.801336,-1.508836,-0.35408,-0.071068,...,-0.042061,-0.97097,-0.401412,0.653256,0.507286,0.676659,0.702916,-0.171701,0.967974,0.592136
4,-0.487358,-0.707759,-0.5822444,-0.53387,0.883542,0.482242,0.883542,0.977622,-0.35408,-0.071068,...,-0.042061,0.182579,-0.401412,-0.288661,-0.113648,-0.948113,-2.244948,1.040563,1.014987,0.85749


#### Run algorithm

In [27]:
# Hold out 20% of the rows for testing. A fixed random_state makes the split,
# and therefore the reported train/test MSE, repeatable from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(df, price_col, test_size=0.2, random_state=42)

In [28]:
# tuneParams(X_train, Y_train)

In [29]:
# Fit the model on the training split and print the train and test MSE.
predictPrice(X_train, X_test, Y_train, Y_test)

Train MSE:  148576831.2011206
Test MSE:  194019960.8040664


In [30]:
# Results recorded with 50% of the rows sampled (PERCENT = 0.50)

# 40% threshold
# Train MSE:  161769242.97192186
# Test MSE:  141033085.0802493

# 50% threshold
# Train MSE:  148576831.2011206
# Test MSE:  194019960.8040664