In [85]:
import pandas as pd
import numpy as np
import os
df = pd.read_csv('scrap_price.csv')

drop_cols = ['ID', 'aspiration', 'enginelocation', 'enginetype', 'fuelsystem']
price_col = ['price']
df.drop(columns=drop_cols, inplace=True)

In [86]:
# Function to print object cols
def obj_cols(df, print = False) :
    obj_cols = df.select_dtypes('object').columns.to_list()
    if print :
        for col in obj_cols :
            print (col, " : ", df[col].unique())
    return list(set(obj_cols) - set(price_col))

def num_cols(df, print = False) :
    return list(set(df.columns) - set(obj_cols(df)) - set(price_col))

In [87]:
# Assuming these factors dont affect price
# Note : another wa
drop_cols = ['ID', 'aspiration', 'enginelocation', 'enginetype', 'fuelsystem']
# Transform name -> Manufacturer, which is first substring
df['name'] = df['name'].apply(lambda x : str(x).split()[0])

In [88]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

# Convert 'cylinder' to number as directly propotional to cost
def numstr_to_int(string) :
    if string == 'twelve' :
        return 12
    entries = ['one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight']
    return int(entries.index(string))

df['cylindernumber'] = df['cylindernumber'].apply(numstr_to_int)

def custom_name(feature, category) :
    return str(feature) + "_" + str(category)
# Convert other categorical data to binary encoding
categ_pipeline = Pipeline(steps= [
                            ('one-hot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
                                ]
                        )

from sklearn.preprocessing import FunctionTransformer
# Log transform since there are right skewed data
def log_transform(x) :
    return np.log(1+x)

LogTransformer = FunctionTransformer(func=log_transform,
                                     feature_names_out='one-to-one')
# Scale numeric data
num_pipeline = Pipeline(steps = [
                            ('scale', MinMaxScaler()),
                            ('log_transform', LogTransformer)
                                ]
                        )

In [89]:
# Drop outliers
outlier_threshold = 3
for col in num_cols(df):
    if col == 'price' :
        continue
    Q3, Q1 = df[col].quantile(0.75), df[col].quantile(0.25)
    IQR = Q3 - Q1
    threshold = 1.5
    outlier = df[(df[col] < Q1 - threshold * IQR) | (df[col] > Q3 + threshold * IQR)]
    pc_outlier = len(outlier) *100 / len(df)
    # print (col, " : ", pc_outlier)
    if pc_outlier < outlier_threshold and pc_outlier > 0 :
        df = df.drop(outlier.index)

In [90]:
# Apply column transformers
from sklearn.compose import ColumnTransformer
col_trans = ColumnTransformer(transformers = [
    ('passthru_price', 'passthrough', ['price']),
    ('num_pipeline', num_pipeline, num_cols(df)),
    ('categ_pipeline', categ_pipeline, obj_cols(df))
])
df1 = pd.DataFrame(col_trans.fit_transform(df), columns=col_trans.get_feature_names_out())
print(df1.columns)

(194, 21)  and  (194, 56)
Index(['passthru_price__price', 'num_pipeline__carheight',
       'num_pipeline__carwidth', 'num_pipeline__wheelbase',
       'num_pipeline__citympg', 'num_pipeline__cylindernumber',
       'num_pipeline__highwaympg', 'num_pipeline__carlength',
       'num_pipeline__enginesize', 'num_pipeline__compressionratio',
       'num_pipeline__boreratio', 'num_pipeline__stroke',
       'num_pipeline__curbweight', 'num_pipeline__horsepower',
       'num_pipeline__symboling', 'num_pipeline__peakrpm',
       'categ_pipeline__carbody_convertible',
       'categ_pipeline__carbody_hardtop', 'categ_pipeline__carbody_hatchback',
       'categ_pipeline__carbody_sedan', 'categ_pipeline__carbody_wagon',
       'categ_pipeline__name_Nissan', 'categ_pipeline__name_alfa-romero',
       'categ_pipeline__name_audi', 'categ_pipeline__name_bmw',
       'categ_pipeline__name_buick', 'categ_pipeline__name_chevrolet',
       'categ_pipeline__name_dodge', 'categ_pipeline__name_honda',
      

In [91]:
import matplotlib.pyplot as plt
def hist_plot(df) :
    for i in num_cols(df):
        plt.figure()
        # print(df[i].unique())
        plt.hist(df[i])
        plt.legend([i])
df1.to_csv('scrap_price_cleaned.csv')