Source: https://github.com/krishnaik06/Advanced-House-Price-Prediction-/blob/master/Feature%20Engineering.ipynb

In [98]:
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet, Ridge
from sklearn.metrics import mean_squared_error

# Imports preprocessing functions
%run ./jlu_preprocessing.ipynb

## Functions

#### Algorithms

In [2]:
def tuneParams(data, labels):
    """Plot the mean cross-validated score of Lasso for a grid of alpha values.

    For every alpha in 0.1, 0.2, ..., 1.0 a ``Lasso(alpha, selection='random')``
    model is cross-validated on ``(data, labels)`` and the mean of its
    per-fold ``test_score`` values is recorded. The means are then plotted
    against the alpha values that produced them.

    Parameters
    ----------
    data : array-like
        Feature matrix passed straight to ``cross_validate``.
    labels : array-like
        Target values passed straight to ``cross_validate``.

    Returns
    -------
    None
        The function only draws a plot.

    Raises
    ------
    Exception
        Any error raised while fitting a fold is propagated
        (``error_score='raise'``).
    """
    n_folds = 5
    params = np.arange(0.1, 1.1, 0.1)
    scores = []

    for param in params:
        model = Lasso(alpha=param, selection='random')
        # No `scoring` is given, so the estimator's own default scorer is used.
        cv = cross_validate(model, data, labels, cv=n_folds, n_jobs=-1, error_score='raise')
        scores.append(np.mean(cv['test_score']))

    # Plot against the alpha values themselves so the x-axis can be read
    # directly as the hyper-parameter (previously it was the list index 0..9).
    plt.plot(params, scores)
    plt.xlabel("alpha")
    plt.ylabel("mean CV test score")
    plt.show()

In [3]:
def crossVal(model, data, labels):
    """Run 5-fold cross-validation for ``model`` and plot the per-fold scores.

    Parameters
    ----------
    model : estimator
        Unfitted estimator handed to ``cross_validate``.
    data : array-like
        Feature matrix.
    labels : array-like
        Target values.

    Returns
    -------
    dict
        The dictionary produced by ``cross_validate``; its ``'test_score'``
        entry is what gets plotted. Returning it lets callers written as
        ``cv = crossVal(...)`` actually receive the results instead of ``None``.

    Raises
    ------
    Exception
        Any error raised while fitting a fold is propagated
        (``error_score='raise'``).
    """
    cv = cross_validate(model, data, labels, cv=5, n_jobs=-1, error_score='raise')

    plt.plot(cv['test_score'])
    plt.xlabel("fold")
    plt.ylabel("test score")
    plt.show()

    return cv

In [4]:
def predictPrice(X_train, X_test, Y_train, Y_test, model=None):
    """Fit a regressor on the training split and print train/test MSE.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices for the training and test splits.
    Y_train, Y_test : array-like
        Target values for the training and test splits.
    model : estimator with ``fit``/``predict``, optional
        Estimator to evaluate (e.g. ``Lasso()`` or ``ElasticNet()``).
        Defaults to a fresh ``LinearRegression()``, which is the model this
        function always used before the parameter existed, so existing
        four-argument calls behave exactly as before.

    Returns
    -------
    None
        The two mean-squared errors are printed, not returned.
    """
# - - - - Models - - - -
    if model is None:
        model = LinearRegression()

# - - - - Fit and Predict - - - -
    clf = model.fit(X_train, Y_train)
    train_pred = clf.predict(X_train)
    test_pred = clf.predict(X_test)

    print("Train MSE: ", mean_squared_error(Y_train, train_pred))
    print("Test MSE: ", mean_squared_error(Y_test, test_pred))

In [35]:
def cleanFeatures(df):
    """Split a listings frame into model features and the price target.

    The ``price`` column is pulled out first, then three groups of columns
    (identifiers, string/categorical columns, and location + price) are
    dropped from a copy of the frame. The input frame itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``price`` and every column named in the groups below.

    Returns
    -------
    tuple
        ``(features, price)`` where ``features`` is the frame without the
        dropped columns and ``price`` is the original ``price`` Series.

    Raises
    ------
    KeyError
        If ``price`` or any column scheduled for removal is missing.
    """
    # Capture the target before its column is removed further down.
    target = df["price"]

    # Columns to be removed can be adjusted
    removable = {
        "identifiers": ["vin", "sp_id", "listing_id", "trimId"],
        "strings": ["description", "major_options", "main_picture_url", "city",
                    "listed_date", "dealer_zip", "listing_color", "fuel_type",
                    "franchise_dealer"],
        "numeric": ["latitude", "longitude", "price"],
    }

    features = df
    for group in removable.values():
        features = features.drop(columns=group)

    return features, target

In [87]:
def cleanCarRow(df, normalizer):
    """Convert the columns of ``df`` to numeric values, column by column.

    ``year`` is first replaced by the result of ``processYears(df)``. Then,
    for every column, the first non-null value decides the treatment:

    * a string matching the measurement pattern -> ``convertMeasurementStrings``
    * any other string, or a bool                -> ``encoder.fit_transform``
    * anything else, or an all-null column       -> left untouched

    The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert; mutated by this function.
    normalizer : object
        Currently unused: the per-column normalisation step is disabled.
        Kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    ``processYears``, ``convertMeasurementStrings``, ``encoder`` and
    ``clear_output`` are not defined in this notebook; presumably they come
    from the ``%run`` of the preprocessing notebook -- TODO confirm.
    """
    import re  # local import: this notebook's import cell does not bring in `re`

    # Raw string so `\d` reaches the regex engine without triggering Python's
    # invalid-escape-sequence warning; the pattern text itself is unchanged.
    # NOTE(review): the `.` is unescaped (matches any character) and the
    # pattern requires "in" directly after the digits, so a value such as
    # "45.2 in" (with a space, as printed later in this notebook) does not
    # match -- confirm this is the intended format.
    measurement_pattern = re.compile(r'((\d{1,}.\d{1,}))(in)((--)?)')

    df["year"] = processYears(df)

    for c in df.columns:
        # Progress indicator: only the column currently being processed is shown.
        clear_output(wait=True)
        print(c)

        item = df[c].dropna(axis=0).tolist()
        if not item:
            # Column holds nothing but nulls; nothing to infer a type from.
            continue

        first = item[0]
        if isinstance(first, str) and measurement_pattern.search(first) is not None:
            df[c] = convertMeasurementStrings(df, c)
        elif isinstance(first, (str, bool)):
            df[c] = encoder.fit_transform(df[c].to_numpy().reshape(-1, 1))

        # Normalisation is intentionally disabled for now:
        # df[c] = normalizer.fit_transform(df[c].to_numpy().reshape(-1,1))

    return df

## Code

In [5]:
# Load the full listings CSV and time the read (about 147 s in the recorded run).
# perf_counter() is a monotonic clock meant for measuring elapsed time; time.time()
# follows the wall clock and can jump if the system time is adjusted.
begin = time.perf_counter()
original_df = pd.read_csv('used_cars_data.csv')
end = time.perf_counter()

print("Time taken: ", end - begin)

  original_df = pd.read_csv('used_cars_data.csv')


Time taken:  147.29619526863098


In [95]:
# Sample a portion of the rows

PERCENT = 0.5  # fraction of rows to keep (0.5 == 50 %), passed to `frac`
SAMPLE_SEED = 42  # fixed seed so the sample, and every metric derived from it, is reproducible

begin = time.perf_counter()
sampled_df = original_df.sample(
    frac=PERCENT,
    replace=False,
    ignore_index=True,
    random_state=SAMPLE_SEED,
)
end = time.perf_counter()

print("Time taken: ", end - begin)

Time taken:  4.153172016143799


#### Clean columns and rows

In [99]:
# Split the sampled rows into the feature frame `df` and the target `price_col`.
# cleanColumnsAndRows is not defined in this notebook; presumably it comes from
# the %run of jlu_preprocessing.ipynb -- TODO confirm.
df, price_col = cleanColumnsAndRows(sampled_df)

(30000, 56)


#### Standardize columns of differing data types

In [101]:
# NOTE(review): `df` is overwritten with its own transformed version, so this
# cell is not idempotent -- re-create `df` before running it a second time.
# convertColumnsToNumericalAndStandardize and `standardizer` are not defined in
# this notebook; presumably they come from the preprocessing notebook -- TODO confirm.
df = convertColumnsToNumericalAndStandardize(df, standardizer)
print(df.size)
# Keep head() as the last expression: in the middle of a cell its result is
# computed and silently discarded instead of being displayed.
df.head()

year
1200000


#### Run algorithm

In [102]:
# Hold out 20 % of the rows for testing. The seed is fixed so that the MSE
# figures of the different models below are computed on the same split and can
# be compared across runs.
X_train, X_test, Y_train, Y_test = train_test_split(df, price_col, test_size=0.2, random_state=42)

In [11]:
# tuneParams(X_train, Y_train)

In [105]:
# Model 1: linear regression with default settings.
# These are the same steps predictPrice() performs, written out inline.

model_1 = LinearRegression()
# Alternatives that were tried in this slot (now covered by models 2 and 3):
# model_1 = Lasso()
# model_1 = ElasticNet()

# - - - - Cross Validation - - - -
# Optional: uncomment to plot the per-fold scores on the training split.
# cv = crossVal(model_1, X_train, Y_train)

# - - - - Fit and Predict - - - -
# `clf` holds the fitted model returned by fit(); it is reassigned by each model cell.
clf = model_1.fit(X_train, Y_train)
train_pred = clf.predict(X_train)
test_pred = clf.predict(X_test)

# Error on the data the model was fitted on vs. the held-out 20 %.
print("Train MSE: ", mean_squared_error(Y_train, train_pred))
print("Test MSE: ", mean_squared_error(Y_test, test_pred))

In [None]:
# Model 2: Lasso with default hyper-parameters.
model_2 = Lasso()

# - - - - Cross Validation - - - -
# Optional: uncomment to plot the per-fold scores on the training split.
# cv = crossVal(model_2, X_train, Y_train)

# - - - - Fit and Predict - - - -
# `clf`, `train_pred` and `test_pred` overwrite the values left by the previous model cell.
clf = model_2.fit(X_train, Y_train)
train_pred = clf.predict(X_train)
test_pred = clf.predict(X_test)
# Error on the data the model was fitted on vs. the held-out split.
print("Train MSE: ", mean_squared_error(Y_train, train_pred))
print("Test MSE: ", mean_squared_error(Y_test, test_pred))

In [None]:
# Model 3: ElasticNet with default hyper-parameters.
model_3 = ElasticNet()

# - - - - Cross Validation - - - -
# Optional: uncomment to plot the per-fold scores on the training split.
# cv = crossVal(model_3, X_train, Y_train)

# - - - - Fit and Predict - - - -
# `clf`, `train_pred` and `test_pred` overwrite the values left by the previous model cell.
clf = model_3.fit(X_train, Y_train)
train_pred = clf.predict(X_train)
test_pred = clf.predict(X_test)
# Error on the data the model was fitted on vs. the held-out split.
print("Train MSE: ", mean_squared_error(Y_train, train_pred))
print("Test MSE: ", mean_squared_error(Y_test, test_pred))

In [None]:
# Model 4: Ridge regression with alpha=100.
# Ridge must be imported from sklearn.linear_model alongside the other models;
# without that import this cell fails with a NameError.
model_4 = Ridge(alpha=100)

# - - - - Fit and Predict - - - -
# `regr` is kept as the name of the fitted model for any later cell that uses it.
regr = model_4.fit(X_train, Y_train)
train_pred = regr.predict(X_train)
test_pred = regr.predict(X_test)
print("Train MSE: ", mean_squared_error(Y_train, train_pred))
print("Test MSE: ", mean_squared_error(Y_test, test_pred))

In [None]:
# model 5

In [None]:
# model 6

In [12]:
# Fit predictPrice's LinearRegression on the split above and print train/test MSE.
predictPrice(X_train, X_test, Y_train, Y_test)

Train MSE:  163355780.02483574
Test MSE:  171922106.46942857


#### Get Random Car

In [56]:
# Draw a single listing from the full (unsampled) dataset. No random_state is
# passed, so a different car is picked on every run.
randomCar = original_df.sample(n=1,ignore_index=True)
# Print that one row as a Series (one line per column).
print(randomCar.iloc[0])




vin                     1C6RREFGXLN333769
back_legroom                      45.2 in
bed                                   NaN
bed_height                             --
bed_length                        67.4 in
                              ...        
wheel_system                          4X2
wheel_system_display                  4X2
wheelbase                        144.6 in
width                             82.1 in
year                                 2020
Name: 2853669, Length: 66, dtype: object


#### Clean Features and Normalize

In [88]:
# Drop the non-feature columns from the random car and keep its price separately.
cleanCar, price = cleanFeatures(randomCar)
# NOTE(review): the recorded output shows 0.0 for every non-null column. That
# would be consistent with the helper re-fitting `standardizer` on this single
# row instead of reusing the statistics of the training data -- confirm against
# convertColumnsToNumericalAndStandardize, which is not defined in this notebook.
cleanCar = convertColumnsToNumericalAndStandardize(cleanCar, standardizer)

# Print the transformed row as a Series (one line per column).
print(cleanCar.iloc[0])

year
back_legroom               0.0
bed                        NaN
bed_height                 0.0
bed_length                 0.0
body_type                  0.0
cabin                      NaN
city_fuel_economy          0.0
combine_fuel_economy       NaN
daysonmarket               0.0
engine_cylinders           0.0
engine_displacement        0.0
engine_type                0.0
exterior_color             0.0
fleet                      NaN
frame_damaged              NaN
franchise_make             0.0
front_legroom              0.0
fuel_tank_volume           0.0
has_accidents              NaN
height                     0.0
highway_fuel_economy       0.0
horsepower                 0.0
interior_color             0.0
isCab                      NaN
is_certified               NaN
is_cpo                     NaN
is_new                     0.0
is_oemcpo                  NaN
length                     0.0
make_name                  0.0
maximum_seating            0.0
mileage                    0.0
mod

In [74]:
# Print the price column that cleanFeatures split off from the random car.
print(price)


2853669    34900.0
Name: price, dtype: float64


In [13]:
# Sampled 50%

# 40% threshold
# Train MSE:  161769242.97192186
# Test MSE:  141033085.0802493

# 50% threshold
# Train MSE:  148576831.2011206
# Test MSE:  194019960.8040664