# Readme

Predict car prices for dataset imports-85 (https://archive.ics.uci.edu/ml/datasets/automobile)

## Steps
1. Clean data
2. Select columns
3. Fit using KNN
4. Evaluate using k-fold cross validation, measure using RSME

***

# Imports

In [32]:
import pandas as pd

# Define functions

In [80]:
def calc_percentage_missing_values(df, column_name):
    return round((data[data[column_name]=='?'].shape[0] / data.shape[0])*100)

def get_columns_above_missing_values_threshold(df):
    missing_values = pd.DataFrame()
    col_name_missing_values = 'percentage_missing_values'
    
    
    for idx, column_name in enumerate(df.columns):
        missing_values.loc[idx, 'column_name'] = column_name
        missing_values.loc[idx, col_name_missing_values] = calc_percentage_missing_values(data, column_name)

    missing_values = missing_values[missing_values[col_name_missing_values] > 0]
    missing_values.sort_values(by=[col_name_missing_values], ascending=False, inplace=True)
    
    return missing_values

# Get the data

In [44]:
all_columns = [
    'symboling'
    , 'normalized-losses'
    , 'make'
    , 'fuel-type'
    , 'aspiration'
    , 'num-of-doors'
    , 'body-style'
    , 'drive-wheels'
    , 'engine-location'
    , 'wheel-base'
    , 'length'
    , 'width'
    , 'height'
    , 'curb-weight'
    , 'engine-type'
    , 'num-of-cylinders'
    , 'engine-size'
    , 'fuel-system'
    , 'bore'
    , 'stroke'
    , 'compression-ratio'
    , 'horsepower'
    , 'peak-rpm'
    , 'city-mpg'
    , 'highway-mpg'
    , 'price'
]

initially_relevant_columns = [
    'normalized-losses'
    , 'wheel-base'
    , 'length'
    , 'width'
    , 'height'
    , 'curb-weight'
    , 'num-of-cylinders'
    , 'engine-size'
    , 'bore'
    , 'stroke'
    , 'compression-ratio'
    , 'horsepower'
    , 'peak-rpm'
    , 'city-mpg'
    , 'highway-mpg'
    , 'price'
]


data = pd.read_csv('../data/imports-85.data', header=0, names=all_columns, usecols=all_columns)
data = data[initially_relevant_columns]

# Feature engineering
* Missing values are declared as "?"

In [45]:
data.head()

Unnamed: 0,normalized-losses,wheel-base,length,width,height,curb-weight,num-of-cylinders,engine-size,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,?,88.6,168.8,64.1,48.8,2548,four,130,3.47,2.68,9.0,111,5000,21,27,16500
1,?,94.5,171.2,65.5,52.4,2823,six,152,2.68,3.47,9.0,154,5000,19,26,16500
2,164,99.8,176.6,66.2,54.3,2337,four,109,3.19,3.4,10.0,102,5500,24,30,13950
3,164,99.4,176.6,66.4,54.3,2824,five,136,3.19,3.4,8.0,115,5500,18,22,17450
4,?,99.8,177.3,66.3,53.1,2507,five,136,3.19,3.4,8.5,110,5500,19,25,15250


## Handle missing values

* Will drop missing_values bc 20 % missing
* will remove rows where <'bore', 'stroke', 'price', 'horsepower', 'peak-rpm'> = "?"

In [81]:
missing_values = get_columns_above_missing_values_threshold(data)

missing_values

Unnamed: 0,column_name,percentage_missing_values


In [63]:
# errors='ignore' to make it repeatable
data.drop('normalized-losses', inplace=True, axis=1, errors='ignore')

Unnamed: 0,wheel-base,length,width,height,curb-weight,num-of-cylinders,engine-size,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,88.6,168.8,64.1,48.8,2548,four,130,3.47,2.68,9.0,111,5000,21,27,16500
1,94.5,171.2,65.5,52.4,2823,six,152,2.68,3.47,9.0,154,5000,19,26,16500
2,99.8,176.6,66.2,54.3,2337,four,109,3.19,3.4,10.0,102,5500,24,30,13950
3,99.4,176.6,66.4,54.3,2824,five,136,3.19,3.4,8.0,115,5500,18,22,17450
4,99.8,177.3,66.3,53.1,2507,five,136,3.19,3.4,8.5,110,5500,19,25,15250


In [74]:
data = data[data['bore']!='?']
data = data[data['horsepower']!='?']
data = data[data['price']!='?']

In [76]:
data.shape

(194, 15)