Source: https://github.com/krishnaik06/Advanced-House-Price-Prediction-/blob/master/Feature%20Engineering.ipynb

In [109]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error

# Imports preprocessing functions
%run ./jlu_preprocessing.ipynb

## Functions

#### Algorithms

In [53]:
def tuneParams(data, labels):
    """Sweep the Lasso ``alpha`` and plot the mean cross-validated score per value.

    For each alpha in 0.1, 0.2, ..., 1.0 a ``Lasso(selection='random')`` model is
    cross-validated on ``data``/``labels`` and the mean of its per-fold test
    scores is recorded. Because no ``scoring`` argument is passed, the score is
    whatever the estimator's default scorer produces.

    Parameters
    ----------
    data : array-like or DataFrame
        Feature matrix passed straight to ``cross_validate``.
    labels : array-like or Series
        Target values aligned row-for-row with ``data``.

    Returns
    -------
    None
        The result is shown as a plot of mean score against alpha.

    Raises
    ------
    Exception
        Any error raised while fitting or scoring a fold is propagated
        (``error_score='raise'``).
    """
    N = 5  # number of cross-validation folds
    params = np.arange(0.1, 1.1, 0.1)
    scores = []

    for param in params:
        # NOTE(review): selection='random' without a random_state makes the
        # coordinate-descent order (and possibly the scores) vary between runs.
        model = Lasso(alpha=param, selection='random')
        cv = cross_validate(model, data, labels, cv=N, n_jobs=-1, error_score='raise')
        scores.append(np.mean(cv['test_score']))

    # Plot against the alpha values themselves so the x-axis is readable;
    # plotting `scores` alone would put the list index (0..9) on the x-axis.
    plt.plot(params, scores)
    plt.xlabel('alpha')
    plt.ylabel('mean CV test score')
    plt.show()

In [54]:
def crossVal(model, data, labels):
    """Run 5-fold cross-validation for ``model`` and plot the per-fold test scores.

    Parameters
    ----------
    model : estimator
        Unfitted estimator handed to ``cross_validate``.
    data : array-like or DataFrame
        Feature matrix.
    labels : array-like or Series
        Target values aligned row-for-row with ``data``.

    Returns
    -------
    dict
        The dictionary returned by ``cross_validate`` (it contains at least the
        ``'test_score'`` array plotted here), so callers that write
        ``cv = crossVal(...)`` receive the results instead of ``None``.

    Raises
    ------
    Exception
        Any error raised while fitting or scoring a fold is propagated
        (``error_score='raise'``).
    """
    cv = cross_validate(model, data, labels, cv=5, n_jobs=-1, error_score='raise')

    plt.plot(cv['test_score'])
    plt.xlabel('fold')
    plt.ylabel('test score')
    plt.show()

    return cv

In [55]:
def predictPrice(X_train, X_test, Y_train, Y_test):
    """Fit a linear regression on the training split and print train/test MSE.

    Parameters
    ----------
    X_train, X_test : feature matrices for the training and held-out splits.
    Y_train, Y_test : target values aligned with the corresponding features.

    Returns
    -------
    None
        The two mean-squared-error values are printed, not returned.
    """
    # Estimator under evaluation; Lasso() and ElasticNet() are drop-in alternatives.
    regressor = LinearRegression()

    # Optional check before fitting: crossVal(regressor, X_train, Y_train)

    regressor.fit(X_train, Y_train)

    # Both predictions are computed up front, then reported in train/test order.
    evaluations = {
        "Train MSE: ": (Y_train, regressor.predict(X_train)),
        "Test MSE: ": (Y_test, regressor.predict(X_test)),
    }
    for label, (actual, predicted) in evaluations.items():
        print(label, mean_squared_error(actual, predicted))

## Code

In [56]:
# Load the full dataset and report how long the read took.
# perf_counter() is a monotonic clock intended for measuring intervals, so the
# elapsed time cannot be skewed by system clock adjustments during the read.
# NOTE(review): the recorded output shows pandas emitting a warning on the
# read_csv line (message text not preserved) — if it is a mixed-dtype warning,
# consider passing an explicit `dtype=` mapping; TODO confirm.
begin = time.perf_counter()
original_df = pd.read_csv('used_cars_data.csv')
end = time.perf_counter()

print("Time taken: ", end - begin)

  original_df = pd.read_csv('used_cars_data.csv')


Time taken:  179.0605068206787


In [94]:
# Sample a portion of the rows (40%, without replacement, fresh 0..n-1 index).
# A fixed random_state makes the sample — and every number derived from it
# further down — reproducible across kernel restarts.

begin = time.perf_counter()
sampled_df = original_df.sample(frac=0.40, replace=False, ignore_index=True, random_state=42)
end = time.perf_counter()

# `df` is the working name used by the cells below; it is the same object as
# `sampled_df`, not a copy.
df = sampled_df

print("Time taken: ", end - begin)

print(df.shape)
df.head()

Time taken:  96.41422724723816
(1200016, 66)


Unnamed: 0,vin,back_legroom,bed,bed_height,bed_length,body_type,cabin,city,city_fuel_economy,combine_fuel_economy,...,transmission,transmission_display,trimId,trim_name,vehicle_damage_category,wheel_system,wheel_system_display,wheelbase,width,year
0,1C4RJFBG2LC359631,38.6 in,,,,SUV / Crossover,,San Jose,18.0,,...,A,8-Speed Automatic,t87086,Limited 4WD,,4WD,Four-Wheel Drive,114.7 in,84.8 in,2020
1,2T3Y1RFV9LW097223,37.8 in,,,,SUV / Crossover,,Galveston,27.0,,...,A,8-Speed Automatic,t89230,Limited FWD,,FWD,Front-Wheel Drive,105.9 in,73 in,2020
2,1FMCU0F60LUB66346,40.7 in,,,,SUV / Crossover,,San Antonio,27.0,,...,A,Automatic,t86087,S FWD,,FWD,Front-Wheel Drive,106.7 in,85.6 in,2020
3,1C4RDJDG1EC552640,38.6 in,,,,SUV / Crossover,,Shelby Township,17.0,,...,A,8-Speed Automatic,t51867,Limited AWD,,AWD,All-Wheel Drive,119.8 in,85.5 in,2014
4,1FTEW1E45LFC29300,43.6 in,,--,67.1 in,Pickup Truck,,Indian Trail,16.0,,...,A,Automatic,t87739,XLT SuperCrew 4WD,,4WD,Four-Wheel Drive,145 in,96.8 in,2020


#### Initial cleaning

In [95]:
# Clean starting from `sampled_df` (the object `df` was bound to by the sampling
# cell) instead of from `df` itself, so re-running this cell does not feed an
# already-cleaned frame back into the helper.
# NOTE(review): assumes cleanColumnsAndRows does not modify its argument in
# place — TODO confirm against jlu_preprocessing.ipynb.
df, price_col = cleanColumnsAndRows(sampled_df)

(1200009, 50)


#### Standardize columns of differing data types

In [110]:
# Apply convertColumnsToNumericalAndStandardize to the cleaned frame and preview
# the result. The helper is not defined in this notebook; presumably it comes
# from the `%run ./jlu_preprocessing.ipynb` in the imports cell — TODO confirm.
# NOTE(review): this rebinds `df` from its own previous value, so running the
# cell a second time would pass already-transformed data back into the helper.
# Re-run the cleaning cell first (or restart and run all) before re-executing.
df = convertColumnsToNumericalAndStandardize(df)
df.head()

years


Unnamed: 0,back_legroom,body_type,city,city_fuel_economy,daysonmarket,engine_cylinders,engine_displacement,engine_type,exterior_color,franchise_make,...,savings_amount,seller_rating,sp_name,trim_name,wheel_system,wheel_system_display,wheelbase,width,year,years
0,0.286006,0.079732,1.034201,-0.583853,-0.321226,0.663735,0.483415,0.663735,-0.404221,-0.308055,...,-0.510693,-0.349278,0.631573,-0.00337,-1.657857,-0.327246,-0.126733,0.901486,0.543752,0.592686
1,0.020768,0.079732,-0.594404,0.536442,-0.6152,-0.835344,-0.357008,-0.835344,1.061154,-0.567191,...,-0.510693,0.558827,-0.957286,0.025083,0.538488,0.529386,-0.792282,-0.764015,0.543752,0.592686
2,0.982254,0.079732,1.025004,0.536442,-0.486586,-0.942421,-1.121029,-0.942421,-1.283874,-0.826327,...,-0.510693,-0.530899,0.317629,0.394004,0.538488,0.529386,-0.728132,1.001991,0.543752,0.592686
3,0.286006,0.079732,1.136132,-0.70833,-0.578453,0.663735,0.483415,0.663735,1.029404,1.376326,...,0.418359,0.94023,-0.773419,0.006275,-0.193627,-1.183878,0.242125,0.987633,-0.890444,-1.000966
4,1.94374,-0.705967,-0.270215,-0.832807,-0.523333,0.663735,0.407013,0.663735,-1.390227,-0.826327,...,-0.510693,0.122936,-0.855572,1.424572,-1.657857,-0.327246,1.196345,1.705522,0.543752,0.592686


#### Run algorithm

In [111]:
# Hold out 20% of the rows for testing. The fixed random_state keeps the split
# (and therefore the reported train/test MSE) reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(df, price_col, test_size=0.2, random_state=42)

In [112]:
# Optional: uncomment to sweep the Lasso alpha on the training split and plot the CV scores.
# tuneParams(X_train, Y_train)

In [113]:
# Fit the regression on the training split and print the train and test MSE.
# NOTE(review): in the recorded output the train MSE is higher than the test
# MSE; this may be split variance or a few extreme prices landing in the
# training rows — worth investigating before drawing conclusions.
predictPrice(X_train, X_test, Y_train, Y_test)

Train MSE:  168484785.3484085
Test MSE:  129426978.74264418
