In [15]:
import math
import re

import numpy as np
import pandas as pd
from IPython.display import clear_output
from sklearn.feature_selection import mutual_info_regression, VarianceThreshold
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OrdinalEncoder

## Preprocessing

In [None]:
# Imputation strategy handed to SimpleImputer by the numeric pipelines
# (`standardizer` and `minMaxScaler`) defined in the following cells.
NUMERICAL_STRATEGY = "mean"

In [16]:
# Pipeline applied to string / boolean columns: turn each distinct value into
# an ordinal code, then replace whatever is still missing with the most
# frequent code in the column.
encoder = Pipeline(
    steps=[
        ("encoder", OrdinalEncoder()),
        ("imputer", SimpleImputer(strategy="most_frequent")),
    ]
)

In [17]:
# Numeric pipeline: fill missing values using NUMERICAL_STRATEGY, then apply
# StandardScaler to the imputed values.
standardizer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy=NUMERICAL_STRATEGY)),
        ("scaler", StandardScaler()),
    ]
)

In [18]:
# Numeric pipeline: fill missing values using NUMERICAL_STRATEGY, then rescale
# the imputed values into the fixed range [-50, 50].
minMaxScaler = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy=NUMERICAL_STRATEGY)),
        ("scaler", MinMaxScaler(feature_range=(-50, 50))),
    ]
)

In [27]:
def cleanColumnsAndRows(df, min_non_null_frac=0.5):
    """Deduplicate rows, prune sparse and unwanted columns, and split off the price.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. It must contain a ``"price"`` column that survives the
        sparsity filter.
    min_non_null_frac : float, default 0.5
        A column is kept only if it holds at least
        ``floor(len(df) * min_non_null_frac)`` non-null values, where
        ``len(df)`` is measured after duplicate rows are removed.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.Series)
        The frame without the hard-coded id / string / numeric drop lists
        (which include ``"price"``), and the ``"price"`` column itself.

    Raises
    ------
    KeyError
        If ``"price"`` is absent or was removed by the sparsity filter.
    """
    # Remove duplicated rows and renumber the index 0..n-1.
    df = df.drop_duplicates(ignore_index=True)

    # Remove columns that do not reach the required number of non-null values.
    min_non_null = math.floor(len(df) * min_non_null_frac)
    df = df.dropna(axis=1, thresh=min_non_null)
    print(df.shape)

    # Capture the price before it is removed from the frame below.
    price_col = df["price"]

    # Columns to be removed can be adjusted
    id_cols = ["vin", "sp_id", "listing_id", "trimId"]
    str_cols = ["description", "major_options", "main_picture_url", "city",
                "listed_date", "dealer_zip", "listing_color", "fuel_type", "franchise_dealer"]
    int_cols = ["latitude", "longitude", "price"]

    # errors="ignore": the sparsity filter may already have removed some of
    # these columns (or the input may never have had them); that must not
    # abort the cleaning step.
    df = df.drop(columns=id_cols + str_cols + int_cols, errors="ignore")

    return df, price_col

In [20]:
# def filterThresholdVariance(df):
#     PERCENT = 0.8
#     vt = VarianceThreshold(threshold=(PERCENT * (1-PERCENT)))
#     df = vt.fit_transform(df)
#     return pd.DataFrame(df)

In [26]:
# Detects inch measurements. The optional whitespace before "in" lets values
# written as "<number> in" match; convertMeasurementStrings reads the number
# as the first space-separated token, so that is the form it can parse.
# NOTE(review): the "." is unescaped and therefore matches any character, as
# in the original pattern - confirm whether a literal decimal point was meant.
_INCH_MEASUREMENT_RE = re.compile(r'((\d{1,}.\d{1,}))\s?(in)((--)?)')


def convertColumnsToNumericalAndStandardize(df, normalizer):
    """Convert every column of ``df`` to numbers and normalise it, in place.

    The ``"year"`` column is first rewritten via ``processYears``. Then, for
    each column, the first non-null value decides the treatment:

    * a string matching the inch-measurement pattern -> parsed with
      ``convertMeasurementStrings``;
    * any other string, or a bool -> encoded with the module-level
      ``encoder`` pipeline (refit for each column);
    * anything else -> left as is.

    Finally the column is passed through ``normalizer.fit_transform``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. It is modified in place and also returned. Must
        contain a ``"year"`` column.
    normalizer : object
        Anything exposing ``fit_transform`` on an ``(n, 1)`` array; it is
        refit for each column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with every column replaced.

    Raises
    ------
    IndexError
        If a column contains no non-null value, since its type cannot be
        inferred.
    """
    # processYears yields one value per non-null year, so write the results
    # back only onto those rows. Assigning the list to the whole column fails
    # with a length mismatch as soon as a single year is missing. Rows without
    # a year keep their null and reach the normalizer unchanged.
    has_year = df["year"].notna()
    if has_year.any():
        df.loc[has_year, "year"] = processYears(df)

    for c in df.columns:
        # Progress display: show only the column currently being processed.
        clear_output(wait=True)
        print(c)

        # tolist() yields plain Python objects, so a bool-dtype column gives a
        # real `bool` here rather than a numpy scalar.
        item = df[c].dropna().head(1).tolist()[0]

        if isinstance(item, str):
            if _INCH_MEASUREMENT_RE.search(item) is not None:
                df[c] = convertMeasurementStrings(df, c)
            else:
                df[c] = encoder.fit_transform(df[c].to_numpy().reshape(-1, 1))
        elif isinstance(item, bool):
            df[c] = encoder.fit_transform(df[c].to_numpy().reshape(-1, 1))

        df[c] = normalizer.fit_transform(df[c].to_numpy().reshape(-1, 1))

    return df

In [22]:
def processYears(df, startYear=1995):
    """Return the non-null ``"year"`` values as offsets from ``startYear``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``"year"`` column. It is not modified.
    startYear : int, default 1995
        Reference year subtracted from every value.

    Returns
    -------
    list
        ``max(year - startYear, 0)`` for each non-null year, in row order.
        Null years are skipped, so the list can be shorter than ``df``.
    """
    # Drop nulls on the column itself. Collecting positions with enumerate and
    # passing them to DataFrame.drop(labels=...) is only correct when the index
    # happens to be 0..n-1; on any other index it raises KeyError or removes
    # the wrong rows.
    years = df["year"].dropna().tolist()

    # Years before the reference year are clamped to 0.
    return [max(year - startYear, 0) for year in years]

In [23]:
def convertMeasurementStrings(df, label):
    """Parse a column of measurement strings into a float column vector.

    Each entry is expected to look like ``"<number> <unit>"``; the number is
    the first whitespace-separated token. Null entries and entries without any
    digit (presumably placeholders such as ``"--"``) count as missing and are
    filled with the mean of the successfully parsed values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column. It is not modified.
    label : str
        Name of the column to parse.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(df), 1)``. If no entry could be parsed,
        every value is ``0.0``.

    Raises
    ------
    ValueError
        If an entry contains a digit but its first token is not a valid float.
    """
    def parse(ele):
        # NaN marks "no reading" so that a genuine 0.0 measurement is not
        # confused with a missing one.
        if pd.isnull(ele) or re.search(r"\d", ele) is None:
            return np.nan
        return float(ele.split()[0])

    values = np.array([parse(ele) for ele in df[label].tolist()], dtype=float)

    missing = np.isnan(values)
    if missing.any():
        # Average only the observed readings. Including placeholder zeros in
        # the mean would drag the fill value toward zero.
        fill = 0.0 if missing.all() else values[~missing].mean()
        values[missing] = fill

    return values.reshape(-1, 1)