In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')                  # Ignore/Do not display warnings

sns.set(color_codes=False)                          # Displays charts with background colors
%matplotlib inline

main_data= pd.read_csv('auto-mpg.csv');   # Load the data

In [2]:
# Preview the first five rows of the loaded frame.
main_data.head(n=5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [4]:
# Three per-column series, each indexed by main_data's column labels.
nulls_series = main_data.isna().sum()          # count of missing entries per column
nonnulls_series = main_data.notnull().sum()    # count of populated entries per column
column_datatypes = main_data.dtypes            # dtype of each column

# Lay the three series side by side, one labelled column per series.
summary_parts = [nulls_series, nonnulls_series, column_datatypes]
summary_labels = ["Nulls", "Non-Nulls", "Type"]
nulls_count_df = pd.concat(summary_parts, axis=1, keys=summary_labels, sort=True)

# Report the frame's dimensions first, then the per-column summary table.
shape_text = str(main_data.shape)
print("Rows x Columns = " + shape_text + "\n")
print(nulls_count_df)

Rows x Columns = (398, 9)

              Nulls  Non-Nulls     Type
acceleration      0        398  float64
car name          0        398   object
cylinders         0        398    int64
displacement      0        398  float64
horsepower        0        398   object
model year        0        398    int64
mpg               0        398  float64
origin            0        398    int64
weight            0        398    int64


In [5]:
# Remove the free-text 'car name' column and turn the numeric origin codes into
# readable region labels.
# errors='ignore' lets this cell be executed again right away: main_data is
# reassigned here, so on a second run 'car name' is already gone and a plain
# drop would raise KeyError.
main_data = main_data.drop(columns='car name', errors='ignore')
# replace() leaves values that are not keys of the mapping untouched, so labels
# written by a previous run of this cell pass through unchanged.
main_data['origin'] = main_data['origin'].replace({1:'america', 2:'europe', 3:'asia'})
main_data.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
0,18.0,8,307.0,130,3504,12.0,70,america
1,15.0,8,350.0,165,3693,11.5,70,america
2,18.0,8,318.0,150,3436,11.0,70,america
3,16.0,8,304.0,150,3433,12.0,70,america
4,17.0,8,302.0,140,3449,10.5,70,america


In [6]:
# One-hot encode the categorical 'origin' column into origin_* indicator columns.
# The guard keeps the cell re-runnable: after the first run 'origin' no longer
# exists, and get_dummies(columns=['origin']) would raise KeyError.
if 'origin' in main_data.columns:
    # Pin the indicator dtype to uint8 so the columns stay 0/1 integers;
    # pandas 2.0 changed the get_dummies default dtype to bool.
    main_data = pd.get_dummies(main_data, columns=['origin'], dtype=np.uint8)
main_data.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin_america,origin_asia,origin_europe
0,18.0,8,307.0,130,3504,12.0,70,1,0,0
1,15.0,8,350.0,165,3693,11.5,70,1,0,0
2,18.0,8,318.0,150,3436,11.0,70,1,0,0
3,16.0,8,304.0,150,3433,12.0,70,1,0,0
4,17.0,8,302.0,140,3449,10.5,70,1,0,0


In [7]:
# Summary statistics with the quartile cut-points spelled out explicitly.
main_data.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,mpg,cylinders,displacement,weight,acceleration,model year,origin_america,origin_asia,origin_europe
count,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0
mean,23.514573,5.454774,193.425879,2970.424623,15.56809,76.01005,0.625628,0.198492,0.175879
std,7.815984,1.701004,104.269838,846.841774,2.757689,3.697627,0.484569,0.399367,0.381197
min,9.0,3.0,68.0,1613.0,8.0,70.0,0.0,0.0,0.0
25%,17.5,4.0,104.25,2223.75,13.825,73.0,0.0,0.0,0.0
50%,23.0,4.0,148.5,2803.5,15.5,76.0,1.0,0.0,0.0
75%,29.0,8.0,262.0,3608.0,17.175,79.0,1.0,0.0,0.0
max,46.6,8.0,455.0,5140.0,24.8,82.0,1.0,1.0,1.0


In [11]:
# List each column's dtype (the output below shows horsepower stored as object).
main_data.dtypes

mpg               float64
cylinders           int64
displacement      float64
horsepower         object
weight              int64
acceleration      float64
model year          int64
origin_america      uint8
origin_asia         uint8
origin_europe       uint8
dtype: object

In [15]:
# Per-row flag: does the horsepower string consist only of digit characters?
# Held as a one-column DataFrame whose column is named 'horsepower'.
hpIsDigit = main_data['horsepower'].str.isdigit().to_frame()
# Display the rows whose horsepower entry failed the digit check.
main_data.loc[hpIsDigit['horsepower'].eq(False)]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin_america,origin_asia,origin_europe
32,25.0,4,98.0,?,2046,19.0,71,1,0,0
126,21.0,6,200.0,?,2875,17.0,74,1,0,0
330,40.9,4,85.0,?,1835,17.3,80,0,0,1
336,23.6,4,140.0,?,2905,14.3,80,1,0,0
354,34.5,4,100.0,?,2320,15.8,81,0,0,1
374,23.0,4,151.0,?,3035,20.5,82,1,0,0
