In [1]:
# IPython magic: render matplotlib figures inline, directly below the cell that draws them.
%matplotlib inline

In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

In [3]:
# UCI "imports-85" automobile data. The file has no header row, so the column
# names are supplied explicitly, in file order.
DATA_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"
COLUMN_NAMES = [
    'symboling', 'normalized_losses', 'make', 'fuel_type', 'aspiration',
    'num_of_doors', 'body_style', 'drive_wheels', 'engine_location',
    'wheel_base', 'length', 'width', 'height', 'curb_weight', 'engine_type',
    'num_of_cylinders', 'engine_size', 'fuel_system', 'bore', 'stroke',
    'compression_ratio', 'horsepower', 'peak_rpm', 'city_mpg', 'highway_mpg',
    'price',
]

car_df = pd.read_csv(DATA_URL, header=None, names=COLUMN_NAMES)

# (rows, columns) of the loaded frame.
car_df.shape

(205, 26)

In [4]:
# Preview the first two records, transposed so each attribute is shown on its own row.
car_df.head(2).T

Unnamed: 0,0,1
symboling,3,3
normalized_losses,?,?
make,alfa-romero,alfa-romero
fuel_type,gas,gas
aspiration,std,std
num_of_doors,two,two
body_style,convertible,convertible
drive_wheels,rwd,rwd
engine_location,front,front
wheel_base,88.6,88.6


In [5]:
# Show the dtype pandas inferred for each column; columns reported as `object`
# were not parsed as numbers.
car_df.dtypes

symboling              int64
normalized_losses     object
make                  object
fuel_type             object
aspiration            object
num_of_doors          object
body_style            object
drive_wheels          object
engine_location       object
wheel_base           float64
length               float64
width                float64
height               float64
curb_weight            int64
engine_type           object
num_of_cylinders      object
engine_size            int64
fuel_system           object
bore                  object
stroke                object
compression_ratio    float64
horsepower            object
peak_rpm              object
city_mpg               int64
highway_mpg            int64
price                 object
dtype: object

In [6]:
# Remove attributes excluded by a "low variance filter": a column that is mostly
# a single value is not a useful dimension.
# NOTE(review): some of these (e.g. make, body_style, normalized_losses) look
# like multi-valued or partly-missing columns rather than near-constant ones --
# confirm the intended selection criterion.
car_df = car_df.drop(
    columns=[
        'fuel_type',
        'engine_location',
        'num_of_doors',
        'body_style',
        'drive_wheels',
        'engine_type',
        'fuel_system',
        'aspiration',
        'normalized_losses',
        'make',
    ]
)

In [7]:
# Re-check the remaining columns and their dtypes after the drop.
car_df.dtypes

symboling              int64
wheel_base           float64
length               float64
width                float64
height               float64
curb_weight            int64
num_of_cylinders      object
engine_size            int64
bore                  object
stroke                object
compression_ratio    float64
horsepower            object
peak_rpm              object
city_mpg               int64
highway_mpg            int64
price                 object
dtype: object

In [22]:
# Derive a numeric cylinder count from the spelled-out `num_of_cylinders` text.
#
# Series.map performs a plain dictionary lookup, so the new column is numeric
# straight away. The previous dict-based Series.replace depended on pandas
# silently downcasting the object column afterwards, which newer pandas
# releases deprecate. Any value missing from the mapping becomes NaN.
#
# The earlier `value_counts()` call was removed: it was not the last expression
# of the cell, so its result was computed and discarded without being shown.
car_df['cylinders_count'] = car_df['num_of_cylinders'].map({
    'one': 1, 'two': 2, 'three': 3, 'four': 4,
    'five': 5, 'six': 6, 'seven': 7, 'eight': 8,
    'nine': 9, 'ten': 10, 'eleven': 11, 'twelve': 12,
})

In [23]:
# The raw file uses '?' as its placeholder for unknown values; turn every such
# cell into NaN so pandas treats it as missing.
car_df = car_df.replace(to_replace='?', value=np.nan)

# List every record that has at least one missing value.
car_df.loc[car_df.isna().any(axis='columns')]

Unnamed: 0,symboling,wheel_base,length,width,height,curb_weight,num_of_cylinders,engine_size,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price,cylinders_count
9,0,99.5,178.2,67.9,52.0,3053,five,131,3.13,3.4,7.0,160.0,5500.0,16,22,,5
44,1,94.5,155.9,63.6,52.0,1874,four,90,3.03,3.11,9.6,70.0,5400.0,38,43,,4
45,0,94.5,155.9,63.6,52.0,1909,four,90,3.03,3.11,9.6,70.0,5400.0,38,43,,4
55,3,95.3,169.0,65.7,49.6,2380,two,70,,,9.4,101.0,6000.0,17,23,10945.0,2
56,3,95.3,169.0,65.7,49.6,2380,two,70,,,9.4,101.0,6000.0,17,23,11845.0,2
57,3,95.3,169.0,65.7,49.6,2385,two,70,,,9.4,101.0,6000.0,17,23,13645.0,2
58,3,95.3,169.0,65.7,49.6,2500,two,80,,,9.4,135.0,6000.0,16,23,15645.0,2
129,1,98.4,175.7,72.3,50.5,3366,eight,203,3.94,3.11,10.0,288.0,5750.0,17,28,,8
130,0,96.1,181.5,66.5,55.2,2579,four,132,3.46,3.9,8.7,,,23,31,9295.0,4
131,2,96.1,176.8,66.6,50.5,2460,four,132,3.46,3.9,8.7,,,23,31,9895.0,4


In [24]:
# Cast the columns that were loaded as text, plus the derived cylinder count,
# to float64 so they can hold NaN and be used numerically.
car_df = car_df.astype(
    dict.fromkeys(
        ['bore', 'stroke', 'horsepower', 'peak_rpm', 'price', 'cylinders_count'],
        'float64',
    )
)

# Confirm the resulting dtypes.
car_df.dtypes

symboling              int64
wheel_base           float64
length               float64
width                float64
height               float64
curb_weight            int64
num_of_cylinders      object
engine_size            int64
bore                 float64
stroke               float64
compression_ratio    float64
horsepower           float64
peak_rpm             float64
city_mpg               int64
highway_mpg            int64
price                float64
cylinders_count        int64
dtype: object

In [26]:
# Fill NaN in the numeric columns below with each column's own median.
#
# `DataFrame.fillna` accepts a Series keyed by column label, so one call fills
# every listed column with its median and leaves all other columns untouched.
# This replaces six copy-pasted `car_df.<col> = ...` assignments and keeps the
# list of imputed columns in one place.
median_fill_columns = ['price', 'bore', 'horsepower', 'peak_rpm', 'stroke', 'cylinders_count']
car_df = car_df.fillna(car_df[median_fill_columns].median())