In [14]:
import pandas as pd
import numpy as np
url="http://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"
data_columns = ['symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration', 'num-of-doors', 'body-style', 'drive-wheels', 'engine-location', 'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type', 'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke', 'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price']

df=pd.read_csv(url ,  skiprows=2, index_col=False,header=None,names=data_columns)
df.replace('?',pd.NA,inplace=True)
df.dropna(subset=["symboling"],axis=0,inplace=True)
print(df.describe(include='all'))





         symboling normalized-losses    make fuel-type aspiration  \
count   203.000000               164     203       203        203   
unique         NaN                51      22         2          2   
top            NaN               161  toyota       gas        std   
freq           NaN                11      32       183        166   
mean      0.812808               NaN     NaN       NaN        NaN   
std       1.232575               NaN     NaN       NaN        NaN   
min      -2.000000               NaN     NaN       NaN        NaN   
25%       0.000000               NaN     NaN       NaN        NaN   
50%       1.000000               NaN     NaN       NaN        NaN   
75%       2.000000               NaN     NaN       NaN        NaN   
max       3.000000               NaN     NaN       NaN        NaN   

       num-of-doors body-style drive-wheels engine-location  wheel-base  ...  \
count           201        203          203             203  203.000000  ...   
unique     

In [15]:
mean=df["symboling"].mean()
df["symboling"].replace(np.nan,mean)

0      1
1      2
2      2
3      2
4      1
      ..
198   -1
199   -1
200   -1
201   -1
202   -1
Name: symboling, Length: 203, dtype: int64

In [16]:
print(df)

     symboling normalized-losses         make fuel-type aspiration  \
0            1              <NA>  alfa-romero       gas        std   
1            2               164         audi       gas        std   
2            2               164         audi       gas        std   
3            2              <NA>         audi       gas        std   
4            1               158         audi       gas        std   
..         ...               ...          ...       ...        ...   
198         -1                95        volvo       gas        std   
199         -1                95        volvo       gas      turbo   
200         -1                95        volvo       gas        std   
201         -1                95        volvo    diesel      turbo   
202         -1                95        volvo       gas      turbo   

    num-of-doors body-style drive-wheels engine-location  wheel-base  ...  \
0            two  hatchback          rwd           front        94.5  ...   
1    

In [17]:
print(df.info)

<bound method DataFrame.info of      symboling normalized-losses         make fuel-type aspiration  \
0            1              <NA>  alfa-romero       gas        std   
1            2               164         audi       gas        std   
2            2               164         audi       gas        std   
3            2              <NA>         audi       gas        std   
4            1               158         audi       gas        std   
..         ...               ...          ...       ...        ...   
198         -1                95        volvo       gas        std   
199         -1                95        volvo       gas      turbo   
200         -1                95        volvo       gas        std   
201         -1                95        volvo    diesel      turbo   
202         -1                95        volvo       gas      turbo   

    num-of-doors body-style drive-wheels engine-location  wheel-base  ...  \
0            two  hatchback          rwd          

In [18]:
print(df.head(2))


   symboling normalized-losses         make fuel-type aspiration num-of-doors  \
0          1              <NA>  alfa-romero       gas        std          two   
1          2               164         audi       gas        std         four   

  body-style drive-wheels engine-location  wheel-base  ...  engine-size  \
0  hatchback          rwd           front        94.5  ...          152   
1      sedan          fwd           front        99.8  ...          109   

   fuel-system  bore  stroke compression-ratio horsepower  peak-rpm city-mpg  \
0         mpfi  2.68    3.47               9.0        154      5000       19   
1         mpfi  3.19    3.40              10.0        102      5500       24   

  highway-mpg  price  
0          26  16500  
1          30  13950  

[2 rows x 26 columns]


In [19]:
print(df.tail(2))

     symboling normalized-losses   make fuel-type aspiration num-of-doors  \
201         -1                95  volvo    diesel      turbo         four   
202         -1                95  volvo       gas      turbo         four   

    body-style drive-wheels engine-location  wheel-base  ...  engine-size  \
201      sedan          rwd           front       109.1  ...          145   
202      sedan          rwd           front       109.1  ...          141   

     fuel-system  bore  stroke compression-ratio horsepower  peak-rpm  \
201          idi  3.01    3.40              23.0        106      4800   
202         mpfi  3.78    3.15               9.5        114      5400   

    city-mpg highway-mpg  price  
201       26          27  22470  
202       19          25  22625  

[2 rows x 26 columns]


In [20]:
bins=np.linspace(min(df['symboling']),max(df['symboling']),4)
g_names=['low','medium','high']
df['price_binned']=pd.cut(df['symboling'],bins,labels=g_names,include_lowest=True)
print(df['price_binned'])

0      medium
1        high
2        high
3        high
4      medium
        ...  
198       low
199       low
200       low
201       low
202       low
Name: price_binned, Length: 203, dtype: category
Categories (3, object): ['low' < 'medium' < 'high']


In [21]:
print(pd.get_dummies(df['fuel-type']))

     diesel    gas
0     False   True
1     False   True
2     False   True
3     False   True
4     False   True
..      ...    ...
198   False   True
199   False   True
200   False   True
201    True  False
202   False   True

[203 rows x 2 columns]


In [22]:
missing_data = df.isnull()
missing_data.head(5)

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price,price_binned
0,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [23]:
for column in missing_data.columns.values.tolist():
    print(column)
    print (missing_data[column].value_counts())
    print("")    

symboling
symboling
False    203
Name: count, dtype: int64

normalized-losses
normalized-losses
False    164
True      39
Name: count, dtype: int64

make
make
False    203
Name: count, dtype: int64

fuel-type
fuel-type
False    203
Name: count, dtype: int64

aspiration
aspiration
False    203
Name: count, dtype: int64

num-of-doors
num-of-doors
False    201
True       2
Name: count, dtype: int64

body-style
body-style
False    203
Name: count, dtype: int64

drive-wheels
drive-wheels
False    203
Name: count, dtype: int64

engine-location
engine-location
False    203
Name: count, dtype: int64

wheel-base
wheel-base
False    203
Name: count, dtype: int64

length
length
False    203
Name: count, dtype: int64

width
width
False    203
Name: count, dtype: int64

height
height
False    203
Name: count, dtype: int64

curb-weight
curb-weight
False    203
Name: count, dtype: int64

engine-type
engine-type
False    203
Name: count, dtype: int64

num-of-cylinders
num-of-cylinders
False    203
Nam

In [24]:
##avg_norm_loss = df["normalized-losses"].astype("float").mean(axis=0)
##print("Average of normalized-losses:", avg_norm_loss)

In [25]:
##df["normalized-losses"].replace(np.nan, avg_norm_loss, inplace=True)

In [26]:
df['symbol']=df['symboling'].tolist()
print(df['symbol'])

0      1
1      2
2      2
3      2
4      1
      ..
198   -1
199   -1
200   -1
201   -1
202   -1
Name: symbol, Length: 203, dtype: int64


In [27]:
df.rename(columns={'highway-mpg':'highway-L/100km'}, inplace=True)
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-L/100km,price,price_binned,symbol
0,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,2.68,3.47,9.0,154,5000,19,26,16500,medium,1
1,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,3.19,3.4,10.0,102,5500,24,30,13950,high,2
2,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,3.19,3.4,8.0,115,5500,18,22,17450,high,2
3,2,,audi,gas,std,two,sedan,fwd,front,99.8,...,3.19,3.4,8.5,110,5500,19,25,15250,high,2
4,1,158.0,audi,gas,std,four,sedan,fwd,front,105.8,...,3.19,3.4,8.5,110,5500,19,25,17710,medium,1
