In [24]:
import math
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import mutual_info_regression
from sklearn.preprocessing import StandardScaler, OrdinalEncoder

from IPython.display import clear_output

## Preprocessing

In [16]:
# Pipeline for categorical columns: ordinal-encode the values first, then
# fill any entries that are still missing with the most frequent value.
encoder = Pipeline([
    ("encoder", OrdinalEncoder()),
    ("imputer", SimpleImputer(strategy="most_frequent")),
])

In [17]:
# Pipeline for numeric columns: replace missing entries with the column
# mean, then apply StandardScaler.
standardizer = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
])

In [18]:
def cleanColumnsAndRows(df, min_non_null_frac=0.7):
    """Drop duplicate vehicles and columns that are too sparsely populated.

    Args:
        df: DataFrame containing a ``vin`` column.
        min_non_null_frac: Fraction of the (de-duplicated) row count that a
            column must have as non-missing values in order to be kept.
            With the default of 0.7, a column is removed once more than
            roughly 30% of its entries are missing.

    Returns:
        A new DataFrame with a fresh 0..n-1 index; the input is not modified.
        The resulting shape is printed as a side effect.
    """
    # Remove rows with duplicate vehicle id numbers (the first occurrence is kept).
    # NOTE(review): pandas treats missing values as equal when de-duplicating,
    # so rows without a vin collapse to a single row - confirm this is intended.
    df = df.drop_duplicates(subset=["vin"], ignore_index=True)

    # Keep only the columns that have at least `thresh` non-missing entries
    thresh = math.floor(len(df) * min_non_null_frac)
    df = df.dropna(axis=1, thresh=thresh)
    print(df.shape)

    return df

In [33]:
def convertColumnsToNumericalAndStandardize(df):
    """Encode non-numeric columns and standardize every remaining feature.

    The ``price`` column is set aside, a fixed list of columns is dropped,
    and each remaining column is passed through the module-level ``encoder``
    pipeline (only when its first non-missing value is a ``str`` or ``bool``)
    and then through the module-level ``standardizer`` pipeline. Both
    pipelines are re-fitted for every column. Columns that contain no
    observed values at all are removed from the result.

    Args:
        df: DataFrame that contains a ``price`` column.

    Returns:
        A ``(features, price_col)`` tuple: the transformed DataFrame and the
        ``price`` Series exactly as it appeared in the input.

    Raises:
        KeyError: if ``df`` has no ``price`` column.
    """
    price_col = df["price"]

    # Columns to be removed can be adjusted
    cols_to_drop = ["vin", "back_legroom", "description", "dealer_zip", "front_legroom",
                    "latitude", "longitude", "length", "listed_date", "major_options",
                    "maximum_seating", "power", "torque", "main_picture_url", "trimId",
                    "listing_id", "sp_id", "price", "wheelbase", "width"]

    # errors="ignore": an earlier cleaning step may already have removed some
    # of these columns, which must not abort the whole conversion.
    df = df.drop(columns=cols_to_drop, errors="ignore")

    all_missing_cols = []
    for c in df.columns:
        clear_output(wait=True)
        print(c)

        observed = df[c].dropna()
        if observed.empty:
            # Nothing to infer a type from and nothing to impute with.
            all_missing_cols.append(c)
            continue

        # Only the first non-missing value decides whether the column is
        # treated as categorical; .tolist() yields a plain Python scalar.
        item = observed.iloc[:1].tolist()[0]

        if isinstance(item, (str, bool)):
            # An object array keeps missing entries as NaN. Building a plain
            # string array instead would turn them into the literal text
            # 'nan', which the encoder would treat as a real category and
            # the imputer would never get to fill.
            raw = df[c].to_numpy(dtype=object).reshape(-1, 1)
            df[c] = encoder.fit_transform(raw).ravel()

        values = df[c].to_numpy().reshape(-1, 1)
        df[c] = standardizer.fit_transform(values).ravel()

    if all_missing_cols:
        df = df.drop(columns=all_missing_cols)

    return df, price_col

In [30]:
def processYears(df, startYear=1950):
    """Convert the ``year`` column into non-negative offsets from ``startYear``.

    Rows whose ``year`` is missing are skipped. The input frame is not
    modified.

    Args:
        df: DataFrame with a numeric ``year`` column.
        startYear: Baseline year subtracted from every entry. Years earlier
            than the baseline are clamped to an offset of 0.

    Returns:
        list: one offset per non-missing ``year`` value, in row order.
    """
    # Filter on the values themselves instead of dropping rows by their
    # position: position and index label only coincide for a 0..n-1 index.
    years = df["year"].dropna()

    return (years - startYear).clip(lower=0).tolist()