Source: https://github.com/krishnaik06/Advanced-House-Price-Prediction-/blob/master/Feature%20Engineering.ipynb

In [1]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

## Functions

#### Preprocessing

In [2]:
def fillWithMean(col):
    c = col.dropna(axis=0).tolist()
    mean = np.mean(c)
    
    return [mean if pd.isnull(x) else x for x in col]

In [3]:
def fillWithMode(col):
    col = col.tolist()
    occ = dict()
    
    for val in col:
        occ[val] = 0
        
    for val in col:
        occ[val] += 1
        
    occ = list(sorted(occ.items(), key=lambda x: x[1], reverse=True))
        
    return [occ[0][0] if pd.isnull(x) else x for x in col]

In [4]:
def convertBool(col):
    col = col.tolist()
    for index in range(len(col)):
        if pd.isnull(col[index]) or not col[index]:
            col[index] = 0
        else:
            col[index] = 1

    return col

In [5]:
def rescaleNumericalValues(col):
    col = np.array(col.tolist()).reshape(-1,1)
    ss = StandardScaler()
    return ss.fit_transform(col).reshape(-1)

In [6]:
def encodeCategoricalValues(col):
    col = col.tolist()
    
    mapping = dict()
    s = set(col)
    
    i = 0
    for val in s:
        mapping[val] = i
        i += 1
    
    l = []
    
    for val in col:
        l.append(mapping[val])
    
    return l

In [7]:
def processYears(yrs):
    # These variables can be tweaked
    startYear = 1960
    endYear = 2010
    
    yrs = df["year"].tolist()
    yrs = list(map(lambda x: x - startYear, yrs))
    yrs = list(map(lambda x: max(x, 0), yrs))

    return yrs

#### Algorithms

In [8]:
def predictPrice(X_train, X_test, Y_train, Y_test):
    linReg = LinearRegression().fit(X_train, Y_train)
    
    train_pred = linReg.predict(X_train)
    test_pred = linReg.predict(X_test)
    
    print("Train MSE: ", mean_squared_error(Y_train, train_pred))
    print("Test MSE: ", mean_squared_error(Y_test, test_pred))

## Code

In [9]:
begin = time.time()
original_df = pd.read_csv('used_cars_data.csv')
end = time.time()

print("Time taken: ", end - begin)

  original_df = pd.read_csv('used_cars_data.csv')


Time taken:  187.26581501960754


In [46]:
# Sample a portion of the rows

begin = time.time()
df = original_df.sample(frac=0.30, replace=False, ignore_index=True)
end = time.time()

print("Time taken: ", end - begin)

print(df.shape)
df.head()

Time taken:  618.5535287857056
(900012, 66)


Unnamed: 0,vin,back_legroom,bed,bed_height,bed_length,body_type,cabin,city,city_fuel_economy,combine_fuel_economy,...,transmission,transmission_display,trimId,trim_name,vehicle_damage_category,wheel_system,wheel_system_display,wheelbase,width,year
0,2FMHK6D80KBA13375,44.3 in,,,,SUV / Crossover,,City Of Industry,16.0,,...,A,Automatic,t78248,Limited AWD,,AWD,All-Wheel Drive,117.9 in,88.8 in,2019
1,1FMSK7DH9LGC48217,39 in,,,,SUV / Crossover,,Sunnyvale,21.0,,...,A,Automatic,t84880,XLT RWD,,RWD,Rear-Wheel Drive,119.1 in,89.3 in,2020
2,1FADP3N24DL133685,33.2 in,,,,Hatchback,,Vacaville,26.0,,...,M,5-Speed Manual,t44808,Titanium Hatchback,,FWD,Front-Wheel Drive,104.3 in,71.8 in,2013
3,JF2GPADC5HH242530,35.4 in,,,,SUV / Crossover,,Butte,26.0,,...,CVT,Continuously Variable Transmission,t69670,Premium,,AWD,All-Wheel Drive,103.7 in,78.9 in,2017
4,1GNSCTKL8MR114488,42 in,,,,SUV / Crossover,,Mcallen,15.0,,...,A,Automatic,t91866,High Country RWD,,4X2,4X2,120.9 in,81 in,2021


#### Initial cleaning

In [47]:
# Remove rows with duplicate vehicle id numbers

df.drop_duplicates(subset=["vin"], inplace=True)
print(df.shape)

(900009, 66)


In [49]:
# Take subset of columns
cols_subset = ["engine_type", "fuel_type", "model_name", "wheel_system", "horsepower", "price"]
df = df[cols_subset]
print(df.shape)
df.head()

(900009, 6)


Unnamed: 0,engine_type,fuel_type,model_name,wheel_system,horsepower,price
0,V6,Gasoline,Flex,AWD,287.0,23995.0
1,I4,Gasoline,Explorer,RWD,300.0,42310.0
2,I4 Flex Fuel Vehicle,Flex Fuel Vehicle,Focus,FWD,160.0,11994.0
3,H4,Gasoline,Crosstrek,AWD,148.0,23995.0
4,V8,Gasoline,Tahoe,4X2,420.0,78094.0


In [50]:
# Remove columns with all NaN's
df = df.dropna(axis=1, how='all')
print(df.shape)
df.head()

(900009, 6)


Unnamed: 0,engine_type,fuel_type,model_name,wheel_system,horsepower,price
0,V6,Gasoline,Flex,AWD,287.0,23995.0
1,I4,Gasoline,Explorer,RWD,300.0,42310.0
2,I4 Flex Fuel Vehicle,Flex Fuel Vehicle,Focus,FWD,160.0,11994.0
3,H4,Gasoline,Crosstrek,AWD,148.0,23995.0
4,V8,Gasoline,Tahoe,4X2,420.0,78094.0


#### Process columns with years

In [51]:
if "year" in df.columns:
    indices = []
    for index, ele in enumerate(df["year"]):
        if pd.isnull(ele):
            indices.append(index)

    print(df.shape)
    df = df.drop(labels=indices, axis=0)
    print(df.shape)

    df["year"] = processYears(df["year"])

    df.head()

#### Standardize columns of differing data types

In [52]:
for c in df.columns[:-1]:
    item = df[c].dropna(axis=0).tolist()[0]
    
    if type(item) is str:
        df[c] = fillWithMode(df[c])
        df[c] = encodeCategoricalValues(df[c])
    elif type(item) is bool:
        df[c] = convertBool(df[c])
    
    df[c] = fillWithMean(df[c])
    df[c] = rescaleNumericalValues(df[c])

df.head()

Unnamed: 0,engine_type,fuel_type,model_name,wheel_system,horsepower,price
0,1.344867,0.194912,0.729072,1.354393,0.443772,23995.0
1,-0.903425,0.194912,-0.98787,0.187116,0.591704,42310.0
2,-0.699035,-1.103901,0.067364,-0.98016,-1.00141,11994.0
3,1.004217,0.194912,1.376204,1.354393,-1.137962,23995.0
4,0.799827,0.194912,-0.608919,-0.396522,1.95723,78094.0


#### Run algorithm

In [53]:
cols = df.columns
X_columns = cols[:-1]
Y_column = cols[-1]

X_train, X_test, Y_train, Y_test = train_test_split(df[X_columns].to_numpy(), df[Y_column].to_numpy(), test_size=0.2)

predictPrice(X_train, X_test, Y_train, Y_test)

Train MSE:  277562544.3588832
Test MSE:  259759581.85332206
