In [1]:
import pandas as pd
import numpy as np

from pandas.core.dtypes.common import is_numeric_dtype, is_string_dtype

In [2]:
def get_numeric_col(df):
    numeric_col_list = []

    for col_name in df.columns:
        if is_numeric_dtype(df[col_name].dtypes):
            numeric_col_list.append(col_name)

    return numeric_col_list


def get_string_col(df):
    string_col_list = []

    for col_name in df.columns:
        if is_string_dtype(df[col_name].dtypes):
            string_col_list.append(col_name)

    return string_col_list


def outlier_iqr(df):
    numeric_col_list = get_numeric_col(df)
    total_size = df.shape[0]
    total_outlier_size = 0
    for col_name in numeric_col_list:
        q1, q3 = np.percentile(df[col_name], [25, 75])

        iqr = q3 - q1

        lower_bound = q1 - (iqr * 1.5)
        upper_bound = q3 + (iqr * 1.5)

        count = 0
        count += df[upper_bound < df[col_name]].shape[0]
        count += df[df[col_name] < lower_bound].shape[0]

        if count < total_size * 0.025:
            df = df[upper_bound >= df[col_name]]
            df = df[df[col_name] >= lower_bound]
            total_outlier_size += count

    return total_outlier_size, df

In [3]:
def check_dataset(df):
    print(df.info())
    print()
    print(f'number of "NULL" value: {df.isnull().sum().sum()}')
    df_drop_NAN = df.dropna(axis=0)
    print(f'droped row : {df.shape[0] -df_drop_NAN.shape[0]}', end='\n\n')

    num_outlier, df_drop_outlier = outlier_iqr(df_drop_NAN.copy())
    print(f'number of outlier : {num_outlier} / {df_drop_NAN.shape[0]} ---> {df_drop_NAN.shape[0] - num_outlier}')

    return df_drop_outlier

In [17]:
def makePriceToClass(df):
    df.loc[df['price']<=10000,'class']='A'
    df.loc[(df['price']>10000) & (df['price']<=20000),'class']='B'
    df.loc[(df['price']>20000) & (df['price']<=30000),'class']='C'
    df.loc[(df['price']>30000) & (df['price']<=50000),'class']='D'
    df.loc[(df['price']>50000) ,'class']='E'
    df.drop('price',axis=1)
    return df

In [4]:
df = pd.read_csv(r'.\dataset\vehicles.csv')

In [5]:
print(df.shape[0])
null_count_series = df.isnull().sum()
print(null_count_series)

426880
id                   0
url                  0
region               0
region_url           0
price                0
year              1205
manufacturer     17646
model             5277
condition       174104
cylinders       177678
fuel              3013
odometer          4400
title_status      8242
transmission      2556
VIN             161042
drive           130567
size            306361
type             92858
paint_color     130203
image_url           68
description         70
county          426880
state                0
lat               6549
long              6549
posting_date        68
dtype: int64


In [18]:
# ['condition', 'cylinders', 'VIN', 'drive', 'size', 'paint_color', 'county']
has_many_null_col_list = list(null_count_series[null_count_series > 100000].index)
drop_col_list = ['url', 'image_url', 'posting_date', 'description', 'region_url', 'id']
drop_col_list += has_many_null_col_list

df_drop_unusful = df.drop(drop_col_list, axis=1)

df_drop_outlier = check_dataset(df_drop_unusful.copy())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 426880 entries, 0 to 426879
Data columns (total 13 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   region        426880 non-null  object 
 1   price         426880 non-null  int64  
 2   year          425675 non-null  float64
 3   manufacturer  409234 non-null  object 
 4   model         421603 non-null  object 
 5   fuel          423867 non-null  object 
 6   odometer      422480 non-null  float64
 7   title_status  418638 non-null  object 
 8   transmission  424324 non-null  object 
 9   type          334022 non-null  object 
 10  state         426880 non-null  object 
 11  lat           420331 non-null  float64
 12  long          420331 non-null  float64
dtypes: float64(4), int64(1), object(8)
memory usage: 42.3+ MB
None

number of "NULL" value: 148295
droped row : 125786

region(categorical column) has "404" different value
manufacturer(categorical column) has "41" different val

In [23]:
year_threshold = 1990
df_after_thresh = df_drop_outlier[df_drop_outlier['year'] >= year_threshold]
print(df_after_thresh.head(10))

    region  price    year manufacturer                        model fuel  \
27  auburn  33590  2014.0          gmc     sierra 1500 crew cab slt  gas   
28  auburn  22590  2010.0    chevrolet               silverado 1500  gas   
29  auburn  39590  2020.0    chevrolet          silverado 1500 crew  gas   
30  auburn  30990  2017.0       toyota         tundra double cab sr  gas   
31  auburn  15000  2013.0         ford                    f-150 xlt  gas   
32  auburn  27990  2012.0          gmc  sierra 2500 hd extended cab  gas   
33  auburn  34590  2016.0    chevrolet        silverado 1500 double  gas   
34  auburn  35000  2019.0       toyota                       tacoma  gas   
35  auburn  29990  2016.0    chevrolet        colorado extended cab  gas   
36  auburn  38590  2011.0    chevrolet         corvette grand sport  gas   

    odometer title_status transmission    type state      lat       long  
27   57923.0        clean        other  pickup    al  32.5900 -85.480000  
28   71229.0 

In [24]:
for col_name in get_string_col(df_after_thresh):
        if df_after_thresh[col_name].dtype == 'object':
            print(f'{col_name}(categorical column) has "{len(df_after_thresh[col_name].unique())}" different value')

region(categorical column) has "403" different value
manufacturer(categorical column) has "40" different value
model(categorical column) has "16943" different value
fuel(categorical column) has "5" different value
title_status(categorical column) has "6" different value
transmission(categorical column) has "3" different value
type(categorical column) has "13" different value
state(categorical column) has "51" different value


In [25]:
# print(df_after_thresh['model'].unique())
df_after_thresh.drop('model', axis=1, inplace=True)

['sierra 1500 crew cab slt' 'silverado 1500' 'silverado 1500 crew' ...
 'f150, platinum' 'cruze, 2lt' 'gand wagoneer']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [26]:
df_after_thresh.to_csv(r'./dataset/afterPreprocessing.csv')

In [None]:
df_after_thresh = pd.read_csv(r'./dataset/afterPreprocessing.csv')

In [None]:
encoder=OrdinalEncoder()
df.drop(['region','region_url'], axis=1, inplace=True)
# print(df.info())
df_needEncoding=df[get_string_col(df_after_thresh)]
df_notEncoding=df[!get_string_col(df_after_thresh)]

df_encoding=pd.DataFrame(encoder.fit_transform(df_needEncoding), columns=get_string_col(df_after_thresh))

print(df_encoding.head(10))