In [1]:
import pandas as pd
import numpy as np

# Pandas Data Cleaning

In [2]:
# Load the raw vehicles dataset.
# low_memory=False turns off pandas' chunked parsing so each column gets a single
# inferred dtype instead of possibly mixed types.
df = pd.read_csv("data/vehicles_messy.csv", low_memory=False)

In [3]:
# Preview the first rows of the raw frame
df.head()

Unnamed: 0,barrels08,barrelsA08,charge120,charge240,city08,city08U,cityA08,cityA08U,cityCD,cityE,...,mfrCode,c240Dscr,charge240b,c240bDscr,createdOn,modifiedOn,startStop,phevCity,phevHwy,phevComb
0,15.695714,0.0,0.0,0.0,19,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
1,29.964545,0.0,0.0,0.0,9,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
2,12.207778,0.0,0.0,0.0,23,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
3,29.964545,0.0,0.0,0.0,10,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
4,17.347895,0.0,0.0,0.0,17,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0


In [4]:
# Number of columns
len(df.columns)

83

In [5]:
# Total number of cells (rows x columns), not the number of rows
df.size

3140969

In [6]:
# Number of rows
len(df)

37843

In [7]:
# (number of rows, number of columns)
df.shape

(37843, 83)

## Data Cleaning Steps
1. Handle missing values
2. Look at information contained in columns (variance)
3. Detect outliers
4. Convert types
5. String Operations
6. Treat duplicates

## 1. Handle Missing Values

In [8]:
# Element-wise boolean mask: True where a value is missing
df.isna()

Unnamed: 0,barrels08,barrelsA08,charge120,charge240,city08,city08U,cityA08,cityA08U,cityCD,cityE,...,mfrCode,c240Dscr,charge240b,c240bDscr,createdOn,modifiedOn,startStop,phevCity,phevHwy,phevComb
0,False,False,False,False,False,False,False,False,False,False,...,True,True,False,True,False,False,True,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,True,True,False,True,False,False,True,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,True,True,False,True,False,False,True,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,True,True,False,True,False,False,True,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,True,True,False,True,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37838,False,False,False,False,False,False,False,False,False,False,...,True,True,False,True,False,False,True,False,False,False
37839,False,False,False,False,False,False,False,False,False,False,...,True,True,False,True,False,False,True,False,False,False
37840,False,False,False,False,False,False,False,False,False,False,...,True,True,False,True,False,False,True,False,False,False
37841,False,False,False,False,False,False,False,False,False,False,...,True,True,False,True,False,False,True,False,False,False


In [9]:
# Summing the boolean mask column-wise gives the number of missing values per column
df.isna().sum()

barrels08         0
barrelsA08        0
charge120         0
charge240         0
city08            0
              ...  
modifiedOn        0
startStop     31705
phevCity          0
phevHwy           0
phevComb          0
Length: 83, dtype: int64

In [10]:
# The per-column counts come back as a Series indexed by column name
type(df.isna().sum())

pandas.core.series.Series

In [11]:
# Keep the per-column missing counts so the following cells can reuse them
missing_values = df.isna().sum()

In [12]:
# Keep only the columns with at least one missing value (.gt(0) is the method form of > 0)
missing_values.loc[missing_values.gt(0)]

cylinders       123
displ           120
drive          1189
eng_dscr      15403
trany            11
guzzler       35562
trans_dscr    22796
tCharger      32657
sCharger      37177
atvType       34771
fuelType2     36435
rangeA        36440
evMotor       37281
mfrCode       30818
c240Dscr      37806
c240bDscr     37807
startStop     31705
dtype: int64

In [13]:
# Same selection as with .gt(0), written with the comparison operator
missing_values.loc[missing_values > 0]

cylinders       123
displ           120
drive          1189
eng_dscr      15403
trany            11
guzzler       35562
trans_dscr    22796
tCharger      32657
sCharger      37177
atvType       34771
fuelType2     36435
rangeA        36440
evMotor       37281
mfrCode       30818
c240Dscr      37806
c240bDscr     37807
startStop     31705
dtype: int64

In [14]:
# A boolean mask passed straight to [] selects the same entries as .loc does
missing_values[missing_values.gt(0)]

cylinders       123
displ           120
drive          1189
eng_dscr      15403
trany            11
guzzler       35562
trans_dscr    22796
tCharger      32657
sCharger      37177
atvType       34771
fuelType2     36435
rangeA        36440
evMotor       37281
mfrCode       30818
c240Dscr      37806
c240bDscr     37807
startStop     31705
dtype: int64

In [15]:
# Fraction of rows that are missing in each affected column
missing_values[missing_values.gt(0)]/len(df)

cylinders     0.003250
displ         0.003171
drive         0.031419
eng_dscr      0.407024
trany         0.000291
guzzler       0.939725
trans_dscr    0.602384
tCharger      0.862960
sCharger      0.982401
atvType       0.918823
fuelType2     0.962794
rangeA        0.962926
evMotor       0.985149
mfrCode       0.814365
c240Dscr      0.999022
c240bDscr     0.999049
startStop     0.837804
dtype: float64

Drop all the columns that have more than 50% null values. We do not think this information will be useful in further analysis. 

In [16]:
# drop() returns a new frame; the result is not assigned here, so df itself still has these columns
df.drop(columns=["c240bDscr", "c240Dscr"])

Unnamed: 0,barrels08,barrelsA08,charge120,charge240,city08,city08U,cityA08,cityA08U,cityCD,cityE,...,rangeA,evMotor,mfrCode,charge240b,createdOn,modifiedOn,startStop,phevCity,phevHwy,phevComb
0,15.695714,0.0,0.0,0.0,19,0.0,0,0.0,0.0,0.0,...,,,,0.0,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
1,29.964545,0.0,0.0,0.0,9,0.0,0,0.0,0.0,0.0,...,,,,0.0,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
2,12.207778,0.0,0.0,0.0,23,0.0,0,0.0,0.0,0.0,...,,,,0.0,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
3,29.964545,0.0,0.0,0.0,10,0.0,0,0.0,0.0,0.0,...,,,,0.0,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
4,17.347895,0.0,0.0,0.0,17,0.0,0,0.0,0.0,0.0,...,,,,0.0,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37838,14.982273,0.0,0.0,0.0,19,0.0,0,0.0,0.0,0.0,...,,,,0.0,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
37839,14.330870,0.0,0.0,0.0,20,0.0,0,0.0,0.0,0.0,...,,,,0.0,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
37840,15.695714,0.0,0.0,0.0,18,0.0,0,0.0,0.0,0.0,...,,,,0.0,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
37841,15.695714,0.0,0.0,0.0,18,0.0,0,0.0,0.0,0.0,...,,,,0.0,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0


In [17]:
# Store the missing-value fraction of each affected column so it can be thresholded
missing_values_ratios = missing_values[missing_values.gt(0)]/len(df)
missing_values_ratios

cylinders     0.003250
displ         0.003171
drive         0.031419
eng_dscr      0.407024
trany         0.000291
guzzler       0.939725
trans_dscr    0.602384
tCharger      0.862960
sCharger      0.982401
atvType       0.918823
fuelType2     0.962794
rangeA        0.962926
evMotor       0.985149
mfrCode       0.814365
c240Dscr      0.999022
c240bDscr     0.999049
startStop     0.837804
dtype: float64

In [18]:
# Labels of the columns where more than half of the rows are missing
missing_values_ratios.loc[missing_values_ratios.gt(0.5)].index

Index(['guzzler', 'trans_dscr', 'tCharger', 'sCharger', 'atvType', 'fuelType2',
       'rangeA', 'evMotor', 'mfrCode', 'c240Dscr', 'c240bDscr', 'startStop'],
      dtype='object')

In [19]:
# Drop the mostly-empty columns into a separate frame; the raw df is left untouched
df_copy = df.drop(columns=missing_values_ratios.loc[missing_values_ratios.gt(0.5)].index).copy()

In [20]:
# The original frame still contains the mostly-empty columns
df.head()

Unnamed: 0,barrels08,barrelsA08,charge120,charge240,city08,city08U,cityA08,cityA08U,cityCD,cityE,...,mfrCode,c240Dscr,charge240b,c240bDscr,createdOn,modifiedOn,startStop,phevCity,phevHwy,phevComb
0,15.695714,0.0,0.0,0.0,19,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
1,29.964545,0.0,0.0,0.0,9,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
2,12.207778,0.0,0.0,0.0,23,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
3,29.964545,0.0,0.0,0.0,10,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
4,17.347895,0.0,0.0,0.0,17,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0


In [21]:
# The cleaned copy no longer contains the dropped columns
df_copy.head()

Unnamed: 0,barrels08,barrelsA08,charge120,charge240,city08,city08U,cityA08,cityA08U,cityCD,cityE,...,UHighwayA,VClass,year,youSaveSpend,charge240b,createdOn,modifiedOn,phevCity,phevHwy,phevComb
0,15.695714,0.0,0.0,0.0,19,0.0,0,0.0,0.0,0.0,...,0.0,Two Seaters,1985,-1250,0.0,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,0,0,0
1,29.964545,0.0,0.0,0.0,9,0.0,0,0.0,0.0,0.0,...,0.0,Two Seaters,1985,-8500,0.0,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,0,0,0
2,12.207778,0.0,0.0,0.0,23,0.0,0,0.0,0.0,0.0,...,0.0,Subcompact Cars,1985,500,0.0,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,0,0,0
3,29.964545,0.0,0.0,0.0,10,0.0,0,0.0,0.0,0.0,...,0.0,Vans,1985,-8500,0.0,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,0,0,0
4,17.347895,0.0,0.0,0.0,17,0.0,0,0.0,0.0,0.0,...,0.0,Compact Cars,1993,-4000,0.0,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,0,0,0


In [25]:
# What about the columns that still have missing values, but aren't over 50%?
# Count the NAs once, then keep only the non-zero counts (the callable receives the counts Series).
df_copy.isna().sum().loc[lambda counts: counts > 0]

cylinders      123
displ          120
drive         1189
eng_dscr     15403
trany           11
dtype: int64

In [26]:
# we have the ability to wrap this in a function
def cols_missing_value(df):
    """Return the number of missing values per column, keeping only columns that have any.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.Series
        Indexed by column label; each value is the count of NA entries in
        that column. Columns without missing values are omitted.
    """
    # Count the NAs once and reuse the result for both the mask and the selection.
    missing_counts = df.isna().sum()
    return missing_counts[missing_counts > 0]

In [27]:
# Columns of df_copy that still contain missing values, with their counts
cols_missing_value(df_copy)

cylinders      123
displ          120
drive         1189
eng_dscr     15403
trany           11
dtype: int64

Investigate missing values in the `cylinders` column

In [34]:
# Columns used here to look for a pattern behind the missing `cylinders` values
relevant_columns = ["year", "make", "trany", "drive", "fuelType", "cylinders", "displ"]
# Rows where cylinders is missing, restricted to those columns (at most 30 rows)
df_copy.loc[df_copy["cylinders"].isna(), relevant_columns].head(30)

Unnamed: 0,year,make,trany,drive,fuelType,cylinders,displ
7138,2000,Nissan,,,Electricity,,
7139,2000,Toyota,,2-Wheel Drive,Electricity,,
8143,2001,Toyota,,2-Wheel Drive,Electricity,,
8144,2001,Ford,,,Electricity,,
8146,2001,Ford,,2-Wheel Drive,Electricity,,
8147,2001,Nissan,,,Electricity,,
9212,2002,Toyota,,2-Wheel Drive,Electricity,,
9213,2002,Ford,,2-Wheel Drive,Electricity,,
10329,2003,Toyota,,2-Wheel Drive,Electricity,,
21413,1985,Subaru,Manual 5-spd,4-Wheel Drive,Regular,,


In [35]:
# Fuel type breakdown of the rows with missing cylinders
df_copy.loc[df_copy["cylinders"].isna(), "fuelType"].value_counts()

Electricity    120
Regular          3
Name: fuelType, dtype: int64

In [39]:
# Where cylinders is NA and fuelType is "Electricity", set cylinders to 0.
# This assigns into df_copy in place.
df_copy.loc[(df_copy["cylinders"].isna()) & (df_copy["fuelType"] == "Electricity"), "cylinders"] = 0

In [40]:
# Re-check: which fuel types still have missing cylinders?
df_copy.loc[df_copy["cylinders"].isna(), "fuelType"].value_counts()

Regular    3
Name: fuelType, dtype: int64

In [41]:
# Inspect the rows that still have no cylinder count
df_copy.loc[df_copy["cylinders"].isna(), relevant_columns]

Unnamed: 0,year,make,trany,drive,fuelType,cylinders,displ
21413,1985,Subaru,Manual 5-spd,4-Wheel Drive,Regular,,
21414,1985,Subaru,Manual 5-spd,4-Wheel Drive,Regular,,
21506,1986,Mazda,Manual 5-spd,Rear-Wheel Drive,Regular,,1.3


Fill the remaining NAs in `cylinders` with 0

In [44]:
# NOTE(review): the rows still missing at this point are "Regular"-fuel vehicles (see the
# value_counts check earlier), so 0 acts as a placeholder for "unknown" rather than a real
# cylinder count - confirm this is acceptable for downstream analysis.
df_copy["cylinders"] = df_copy["cylinders"].fillna(0)

## 2. Look at information contained in a column (Cardinality/Variance)

#### Example: what if we had a column with only one value?

In [46]:
# assign() returns a new frame with a constant `vehicle` column (df_copy is not modified);
# value_counts() then shows that the column holds a single value
df_copy.assign(vehicle=True)["vehicle"].value_counts()

True    37843
Name: vehicle, dtype: int64

In [47]:
# Number of distinct values in the constant column; nunique() counts them directly
# instead of building the full value_counts() table just to take its length.
df_copy.assign(vehicle=True)["vehicle"].nunique()

1

This column would not give us any useful information, as it has only 1 unique value.

Look at other numerical columns:

In [52]:
# Sort by barrels08 to eyeball the smallest and largest values
df_copy[["barrels08"]].sort_values("barrels08")

Unnamed: 0,barrels08
25835,0.060000
24539,0.060000
27174,0.060000
28455,0.060000
14370,0.066429
...,...
31336,47.087143
7901,47.087143
21060,47.087143
33860,47.087143


In [53]:
# Summary statistics (count, mean, std, quartiles, min/max) for barrels08
df_copy[["barrels08"]].describe()

Unnamed: 0,barrels08
count,37843.0
mean,17.532506
std,4.57595
min,0.06
25%,14.33087
50%,17.347895
75%,20.600625
max,47.087143


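If 90% of the rows in a column have the same value, we might as well drop it. A simple check, which works when the repeated value is also the smallest value in the column (e.g. a column that is mostly 0):
- 1. for a given column, identify the minimum value
- 2. for the same column, identify the 90th percentile
- 3. if min == 90th percentile, drop the column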

In [59]:
# 1. minimum value of the column
min_b = df_copy["barrels08"].min()

# 2. 90th percentile of the same column
#    (np.percentile returns nan if the input contains NaN; barrels08 has no missing values)
n_perc = np.percentile(df_copy["barrels08"], 90)

# 3. equality would mean roughly 90% of the rows sit at the minimum value
min_b == n_perc

False