In [1]:
import pandas as pd
import numpy as np

# Pandas Data Cleaning

In [2]:
df = pd.read_csv("data/vehicles_messy.csv", low_memory=False)

In [3]:
df.head()

Unnamed: 0,barrels08,barrelsA08,charge120,charge240,city08,city08U,cityA08,cityA08U,cityCD,cityE,...,mfrCode,c240Dscr,charge240b,c240bDscr,createdOn,modifiedOn,startStop,phevCity,phevHwy,phevComb
0,15.695714,0.0,0.0,0.0,19,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
1,29.964545,0.0,0.0,0.0,9,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
2,12.207778,0.0,0.0,0.0,23,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
3,29.964545,0.0,0.0,0.0,10,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
4,17.347895,0.0,0.0,0.0,17,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0


In [4]:
#how many columns
len(df.columns)

83

In [5]:
#how many values
df.size

3140969

In [6]:
#no of rows
len(df)

37843

In [7]:
#no of rows and no of columns
df.shape

(37843, 83)

## Data Cleaning Steps
1. Handle missing values
2. Look at information contained in columns (variance)
3. Detect outliers
4. Convert types
5. String Operations
6. Treat duplicate

## 1. Handle Missing Values

In [8]:
df.isna()

Unnamed: 0,barrels08,barrelsA08,charge120,charge240,city08,city08U,cityA08,cityA08U,cityCD,cityE,...,mfrCode,c240Dscr,charge240b,c240bDscr,createdOn,modifiedOn,startStop,phevCity,phevHwy,phevComb
0,False,False,False,False,False,False,False,False,False,False,...,True,True,False,True,False,False,True,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,True,True,False,True,False,False,True,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,True,True,False,True,False,False,True,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,True,True,False,True,False,False,True,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,True,True,False,True,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37838,False,False,False,False,False,False,False,False,False,False,...,True,True,False,True,False,False,True,False,False,False
37839,False,False,False,False,False,False,False,False,False,False,...,True,True,False,True,False,False,True,False,False,False
37840,False,False,False,False,False,False,False,False,False,False,...,True,True,False,True,False,False,True,False,False,False
37841,False,False,False,False,False,False,False,False,False,False,...,True,True,False,True,False,False,True,False,False,False


In [9]:
df.isna().sum()

barrels08         0
barrelsA08        0
charge120         0
charge240         0
city08            0
              ...  
modifiedOn        0
startStop     31705
phevCity          0
phevHwy           0
phevComb          0
Length: 83, dtype: int64

In [10]:
type(df.isna().sum())

pandas.core.series.Series

In [11]:
missing_values = df.isna().sum()

In [12]:
missing_values.loc[missing_values.gt(0)]

cylinders       123
displ           120
drive          1189
eng_dscr      15403
trany            11
guzzler       35562
trans_dscr    22796
tCharger      32657
sCharger      37177
atvType       34771
fuelType2     36435
rangeA        36440
evMotor       37281
mfrCode       30818
c240Dscr      37806
c240bDscr     37807
startStop     31705
dtype: int64

In [13]:
missing_values.loc[missing_values > 0]

cylinders       123
displ           120
drive          1189
eng_dscr      15403
trany            11
guzzler       35562
trans_dscr    22796
tCharger      32657
sCharger      37177
atvType       34771
fuelType2     36435
rangeA        36440
evMotor       37281
mfrCode       30818
c240Dscr      37806
c240bDscr     37807
startStop     31705
dtype: int64

In [14]:
missing_values[missing_values.gt(0)]

cylinders       123
displ           120
drive          1189
eng_dscr      15403
trany            11
guzzler       35562
trans_dscr    22796
tCharger      32657
sCharger      37177
atvType       34771
fuelType2     36435
rangeA        36440
evMotor       37281
mfrCode       30818
c240Dscr      37806
c240bDscr     37807
startStop     31705
dtype: int64

In [15]:
missing_values[missing_values.gt(0)]/len(df)

cylinders     0.003250
displ         0.003171
drive         0.031419
eng_dscr      0.407024
trany         0.000291
guzzler       0.939725
trans_dscr    0.602384
tCharger      0.862960
sCharger      0.982401
atvType       0.918823
fuelType2     0.962794
rangeA        0.962926
evMotor       0.985149
mfrCode       0.814365
c240Dscr      0.999022
c240bDscr     0.999049
startStop     0.837804
dtype: float64

Drop all the columns that have more than 50% null values. We do not think this information will be useful in further analysis. 

In [16]:
df.drop(columns=["c240bDscr", "c240Dscr"])

Unnamed: 0,barrels08,barrelsA08,charge120,charge240,city08,city08U,cityA08,cityA08U,cityCD,cityE,...,rangeA,evMotor,mfrCode,charge240b,createdOn,modifiedOn,startStop,phevCity,phevHwy,phevComb
0,15.695714,0.0,0.0,0.0,19,0.0,0,0.0,0.0,0.0,...,,,,0.0,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
1,29.964545,0.0,0.0,0.0,9,0.0,0,0.0,0.0,0.0,...,,,,0.0,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
2,12.207778,0.0,0.0,0.0,23,0.0,0,0.0,0.0,0.0,...,,,,0.0,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
3,29.964545,0.0,0.0,0.0,10,0.0,0,0.0,0.0,0.0,...,,,,0.0,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
4,17.347895,0.0,0.0,0.0,17,0.0,0,0.0,0.0,0.0,...,,,,0.0,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37838,14.982273,0.0,0.0,0.0,19,0.0,0,0.0,0.0,0.0,...,,,,0.0,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
37839,14.330870,0.0,0.0,0.0,20,0.0,0,0.0,0.0,0.0,...,,,,0.0,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
37840,15.695714,0.0,0.0,0.0,18,0.0,0,0.0,0.0,0.0,...,,,,0.0,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
37841,15.695714,0.0,0.0,0.0,18,0.0,0,0.0,0.0,0.0,...,,,,0.0,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0


In [17]:
missing_values_ratios = missing_values[missing_values.gt(0)]/len(df)
missing_values_ratios

cylinders     0.003250
displ         0.003171
drive         0.031419
eng_dscr      0.407024
trany         0.000291
guzzler       0.939725
trans_dscr    0.602384
tCharger      0.862960
sCharger      0.982401
atvType       0.918823
fuelType2     0.962794
rangeA        0.962926
evMotor       0.985149
mfrCode       0.814365
c240Dscr      0.999022
c240bDscr     0.999049
startStop     0.837804
dtype: float64

In [18]:
missing_values_ratios.loc[missing_values_ratios.gt(0.5)].index

Index(['guzzler', 'trans_dscr', 'tCharger', 'sCharger', 'atvType', 'fuelType2',
       'rangeA', 'evMotor', 'mfrCode', 'c240Dscr', 'c240bDscr', 'startStop'],
      dtype='object')

In [19]:
df_copy = df.drop(columns=missing_values_ratios.loc[missing_values_ratios.gt(0.5)].index).copy()

In [20]:
df.head()

Unnamed: 0,barrels08,barrelsA08,charge120,charge240,city08,city08U,cityA08,cityA08U,cityCD,cityE,...,mfrCode,c240Dscr,charge240b,c240bDscr,createdOn,modifiedOn,startStop,phevCity,phevHwy,phevComb
0,15.695714,0.0,0.0,0.0,19,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
1,29.964545,0.0,0.0,0.0,9,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
2,12.207778,0.0,0.0,0.0,23,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
3,29.964545,0.0,0.0,0.0,10,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
4,17.347895,0.0,0.0,0.0,17,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0


In [21]:
df_copy.head()

Unnamed: 0,barrels08,barrelsA08,charge120,charge240,city08,city08U,cityA08,cityA08U,cityCD,cityE,...,UHighwayA,VClass,year,youSaveSpend,charge240b,createdOn,modifiedOn,phevCity,phevHwy,phevComb
0,15.695714,0.0,0.0,0.0,19,0.0,0,0.0,0.0,0.0,...,0.0,Two Seaters,1985,-1250,0.0,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,0,0,0
1,29.964545,0.0,0.0,0.0,9,0.0,0,0.0,0.0,0.0,...,0.0,Two Seaters,1985,-8500,0.0,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,0,0,0
2,12.207778,0.0,0.0,0.0,23,0.0,0,0.0,0.0,0.0,...,0.0,Subcompact Cars,1985,500,0.0,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,0,0,0
3,29.964545,0.0,0.0,0.0,10,0.0,0,0.0,0.0,0.0,...,0.0,Vans,1985,-8500,0.0,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,0,0,0
4,17.347895,0.0,0.0,0.0,17,0.0,0,0.0,0.0,0.0,...,0.0,Compact Cars,1993,-4000,0.0,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,0,0,0


In [22]:
#what about the columns that still have missing values, but aren't over 50%
(df_copy.isna().sum())[df_copy.isna().sum() > 0]

cylinders      123
displ          120
drive         1189
eng_dscr     15403
trany           11
dtype: int64

In [23]:
# we have the ability to wrap this in a function
def cols_missing_value(df):
    return (df.isna().sum())[df.isna().sum() > 0]

In [24]:
cols_missing_value(df_copy)

cylinders      123
displ          120
drive         1189
eng_dscr     15403
trany           11
dtype: int64

Investigate missing values in the `cylinders` column

In [25]:
relevant_columns = ["year", "make", "trany", "drive", "fuelType", "cylinders", "displ"]
df_copy.loc[df_copy["cylinders"].isna(), relevant_columns].head(30)

Unnamed: 0,year,make,trany,drive,fuelType,cylinders,displ
7138,2000,Nissan,,,Electricity,,
7139,2000,Toyota,,2-Wheel Drive,Electricity,,
8143,2001,Toyota,,2-Wheel Drive,Electricity,,
8144,2001,Ford,,,Electricity,,
8146,2001,Ford,,2-Wheel Drive,Electricity,,
8147,2001,Nissan,,,Electricity,,
9212,2002,Toyota,,2-Wheel Drive,Electricity,,
9213,2002,Ford,,2-Wheel Drive,Electricity,,
10329,2003,Toyota,,2-Wheel Drive,Electricity,,
21413,1985,Subaru,Manual 5-spd,4-Wheel Drive,Regular,,


In [26]:
df_copy.loc[df_copy["cylinders"].isna(), "fuelType"].value_counts()

Electricity    120
Regular          3
Name: fuelType, dtype: int64

In [27]:
#where cylinders is NA and where is fuelType Electricity
df_copy.loc[(df_copy["cylinders"].isna()) & (df_copy["fuelType"] == "Electricity"), "cylinders"] = 0

In [28]:
df_copy.loc[df_copy["cylinders"].isna(), "fuelType"].value_counts()

Regular    3
Name: fuelType, dtype: int64

In [29]:
df_copy.loc[df_copy["cylinders"].isna(), relevant_columns]

Unnamed: 0,year,make,trany,drive,fuelType,cylinders,displ
21413,1985,Subaru,Manual 5-spd,4-Wheel Drive,Regular,,
21414,1985,Subaru,Manual 5-spd,4-Wheel Drive,Regular,,
21506,1986,Mazda,Manual 5-spd,Rear-Wheel Drive,Regular,,1.3


Fill the remaining NAs in `cylinders` with 0

In [30]:
df_copy["cylinders"] = df_copy["cylinders"].fillna(0)

## 2. Look at information contained in a column (Cardinality/Variance)

#### Example, What if we had one column with only 1 value

In [31]:
df_copy.assign(vehicle=True)["vehicle"].value_counts()

True    37843
Name: vehicle, dtype: int64

In [32]:
len(df_copy.assign(vehicle=True)["vehicle"].value_counts())

1

This columns would not give us any useful information as it only has 1 unique value. 

Look at other numerical columns:

In [33]:
df_copy[["barrels08"]].sort_values("barrels08")

Unnamed: 0,barrels08
25835,0.060000
24539,0.060000
27174,0.060000
28455,0.060000
14370,0.066429
...,...
31336,47.087143
7901,47.087143
21060,47.087143
33860,47.087143


In [34]:
df_copy[["barrels08"]].describe()

Unnamed: 0,barrels08
count,37843.0
mean,17.532506
std,4.57595
min,0.06
25%,14.33087
50%,17.347895
75%,20.600625
max,47.087143


If 90% of the rows in the column have the same value, we might as well drop it. 
- 1. for a given column, identify the minimum value
- 2. for the same column, identify the 90% percentile
- 3. if min == 90th per, drop the column 

In [35]:
#1 
min_b = df_copy["barrels08"].min()

#2
n_perc = np.percentile(df_copy["barrels08"], 90)

#3
min_b == n_perc

False

In this case we would not drop `barrels08` because there is higher cardinality of the data. We now would like to apply this test to all numerical columns in the dataframe and drop those in which the test returns True.

In [36]:
df_copy.columns

Index(['barrels08', 'barrelsA08', 'charge120', 'charge240', 'city08',
       'city08U', 'cityA08', 'cityA08U', 'cityCD', 'cityE', 'cityUF', 'co2',
       'co2A', 'co2TailpipeAGpm', 'co2TailpipeGpm', 'comb08', 'comb08U',
       'combA08', 'combA08U', 'combE', 'combinedCD', 'combinedUF', 'cylinders',
       'displ', 'drive', 'engId', 'eng_dscr', 'feScore', 'fuelCost08',
       'fuelCostA08', 'fuelType', 'fuelType1', 'ghgScore', 'ghgScoreA',
       'highway08', 'highway08U', 'highwayA08', 'highwayA08U', 'highwayCD',
       'highwayE', 'highwayUF', 'hlv', 'hpv', 'id', 'lv2', 'lv4', 'make',
       'model', 'mpgData', 'phevBlended', 'pv2', 'pv4', 'range', 'rangeCity',
       'rangeCityA', 'rangeHwy', 'rangeHwyA', 'trany', 'UCity', 'UCityA',
       'UHighway', 'UHighwayA', 'VClass', 'year', 'youSaveSpend', 'guzzler',
       'trans_dscr', 'tCharger', 'sCharger', 'atvType', 'fuelType2', 'rangeA',
       'evMotor', 'mfrCode', 'c240Dscr', 'charge240b', 'c240bDscr',
       'createdOn', 'modifiedOn

How to get a list of all numerical columns from a DataFrame

In [37]:
df_copy.select_dtypes(include=[np.number]).columns

Index(['barrels08', 'barrelsA08', 'charge120', 'charge240', 'city08',
       'city08U', 'cityA08', 'cityA08U', 'cityCD', 'cityE', 'cityUF', 'co2',
       'co2A', 'co2TailpipeAGpm', 'co2TailpipeGpm', 'comb08', 'comb08U',
       'combA08', 'combA08U', 'combE', 'combinedCD', 'combinedUF', 'cylinders',
       'displ', 'engId', 'feScore', 'fuelCost08', 'fuelCostA08', 'ghgScore',
       'ghgScoreA', 'highway08', 'highway08U', 'highwayA08', 'highwayA08U',
       'highwayCD', 'highwayE', 'highwayUF', 'hlv', 'hpv', 'id', 'lv2', 'lv4',
       'pv2', 'pv4', 'range', 'rangeCity', 'rangeCityA', 'rangeHwy',
       'rangeHwyA', 'UCity', 'UCityA', 'UHighway', 'UHighwayA', 'year',
       'youSaveSpend', 'charge240b', 'phevCity', 'phevHwy', 'phevComb'],
      dtype='object')

In [41]:
#getting a list of all columns that we would drop
to_drop = []

for col in df_copy.select_dtypes(include=[np.number]).columns:
    min_value = df_copy[col].min()
    perc_value = np.percentile(df_copy[col], 90)
    if min_value == perc_value:
        to_drop.append(col)

In [42]:
to_drop

['barrelsA08',
 'charge120',
 'charge240',
 'cityA08',
 'cityA08U',
 'cityCD',
 'cityE',
 'cityUF',
 'co2A',
 'co2TailpipeAGpm',
 'combA08',
 'combA08U',
 'combE',
 'combinedCD',
 'combinedUF',
 'fuelCostA08',
 'ghgScoreA',
 'highwayA08',
 'highwayA08U',
 'highwayCD',
 'highwayE',
 'highwayUF',
 'range',
 'rangeCity',
 'rangeCityA',
 'rangeHwy',
 'rangeHwyA',
 'UCityA',
 'UHighwayA',
 'charge240b',
 'phevCity',
 'phevHwy',
 'phevComb']

Alright, let's drop them

In [43]:
df_copy = df_copy.drop(columns=to_drop)

## 3. Detect Outliers
Using interquartile Range (IQR)

1. For a given column, calculate the IQR (75th perc - 25th perc)
2. Determine a tolerance factor so we do not remove important data variation
3. Determine the lower and upper threshold based on the IQR and tolerance factor
4. Label all values that are beyond the thresholds as outliers

In [45]:
df_copy[["barrels08"]].describe()

Unnamed: 0,barrels08
count,37843.0
mean,17.532506
std,4.57595
min,0.06
25%,14.33087
50%,17.347895
75%,20.600625
max,47.087143


In [48]:
#transposing in order to find the 25% and 75%
df_copy[["barrels08"]].describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
barrels08,37843.0,17.532506,4.57595,0.06,14.33087,17.347895,20.600625,47.087143


In [52]:
barrels_desc = df_copy[["barrels08"]].describe().transpose()

In [53]:
#creating an IQR column value based off of the IQR formula
barrels_desc = barrels_desc.assign(IQR=barrels_desc["75%"] - barrels_desc["25%"])

In [54]:
barrels_desc.loc["barrels08", "25%"]

14.330869565217393

In [55]:
# Steps 2 and 3
tol_factor = 3
#multiply the tolerance factor by the IQR and add to 75% (for upper bound) and subtract from 25% (for lower bound)
#python observes PEMDAS (order of operations), so we do not need to put multiplication in parenthesis
barrels_lower_threshold = barrels_desc.loc["barrels08", "25%"] - tol_factor * barrels_desc.loc["barrels08", "IQR"]
barrels_upper_threshold = barrels_desc.loc["barrels08", "75%"] + tol_factor * barrels_desc.loc["barrels08", "IQR"]

In [56]:
barrels_lower_threshold

-4.47839673913043

In [57]:
barrels_upper_threshold

39.40989130434782

In this scenario, we would label all rows as 'outliers' if their value for `barrels08` is beyond the thresholds. That is, if `barrels08` is larger than 39.41 or lower `-4.48`.

Our next step will be to build this test so it will work on our entire dataframe.

For all numerical columns:
- Iterate over all numerical columns
- for each column, calculate the IQR
- Based on the IQR, define the lower and uper threshold for acceptable values
- Identify all outliers
- Append the DataFrame outliers with the rows that were identified as outliers

In [59]:
#adding a new column 'outlier_column' to keep track of the column that caused a row to be an outlier
cols = list(df_copy).append("outlier_column")
#create a new dataframe in which we will append the outlier rows
outliers = pd.DataFrame(columns=cols)

for col in df_copy.select_dtypes(include=[np.number]).columns:
    descr = df_copy[[col]].describe().transpose()
    iqr = descr.loc[col, "75%"] - descr.loc[col, "25%"]
    cutoff = iqr *12 #making our threshold very large so we only remove outliers, not variation
    lower_threshold = descr.loc[col, "25%"] - cutoff
    upper_threshold = descr.loc[col, "75%"] + cutoff
    
    results = df.loc[(df[col] < lower_threshold) | (df[col] > upper_threshold)].copy()
    outliers = pd.concat([outliers, results.assign(outlier_column=col)], sort=False)

In [61]:
outliers.shape

(62529, 84)

In [62]:
# get the number of rows that are not duplicated
len(outliers.drop_duplicates())

62529

It appears that there are no duplicated rows, but lets check the length of our dataframe

In [63]:
df_copy.shape

(37843, 38)

There are more outliers than there are rows in our original df_copy. Let's check a couple other ways to see if there are repeted rows

In [65]:
len(set(outliers.index))

16479

In [66]:
outliers.index.to_series().value_counts()

29750    11
30142    11
28607    11
25715    11
30972    11
         ..
26905     1
953       1
27750     1
1251      1
115       1
Length: 16479, dtype: int64

In [67]:
# investigating a single case
outliers.loc[29750,]

Unnamed: 0,barrels08,barrelsA08,charge120,charge240,city08,city08U,cityA08,cityA08U,cityCD,cityE,...,c240Dscr,charge240b,c240bDscr,createdOn,modifiedOn,startStop,phevCity,phevHwy,phevComb,outlier_column
29750,0.18,0.0,0.0,8.0,126,126.4057,0,0.0,0.0,26.6635,...,3.6 kW charger,5.0,6.6 kW charger,Mon Oct 19 00:00:00 EDT 2015,Wed Dec 02 00:00:00 EST 2015,N,0,0,0,city08
29750,0.18,0.0,0.0,8.0,126,126.4057,0,0.0,0.0,26.6635,...,3.6 kW charger,5.0,6.6 kW charger,Mon Oct 19 00:00:00 EDT 2015,Wed Dec 02 00:00:00 EST 2015,N,0,0,0,city08U
29750,0.18,0.0,0.0,8.0,126,126.4057,0,0.0,0.0,26.6635,...,3.6 kW charger,5.0,6.6 kW charger,Mon Oct 19 00:00:00 EDT 2015,Wed Dec 02 00:00:00 EST 2015,N,0,0,0,co2
29750,0.18,0.0,0.0,8.0,126,126.4057,0,0.0,0.0,26.6635,...,3.6 kW charger,5.0,6.6 kW charger,Mon Oct 19 00:00:00 EDT 2015,Wed Dec 02 00:00:00 EST 2015,N,0,0,0,comb08
29750,0.18,0.0,0.0,8.0,126,126.4057,0,0.0,0.0,26.6635,...,3.6 kW charger,5.0,6.6 kW charger,Mon Oct 19 00:00:00 EDT 2015,Wed Dec 02 00:00:00 EST 2015,N,0,0,0,comb08U
29750,0.18,0.0,0.0,8.0,126,126.4057,0,0.0,0.0,26.6635,...,3.6 kW charger,5.0,6.6 kW charger,Mon Oct 19 00:00:00 EDT 2015,Wed Dec 02 00:00:00 EST 2015,N,0,0,0,feScore
29750,0.18,0.0,0.0,8.0,126,126.4057,0,0.0,0.0,26.6635,...,3.6 kW charger,5.0,6.6 kW charger,Mon Oct 19 00:00:00 EDT 2015,Wed Dec 02 00:00:00 EST 2015,N,0,0,0,ghgScore
29750,0.18,0.0,0.0,8.0,126,126.4057,0,0.0,0.0,26.6635,...,3.6 kW charger,5.0,6.6 kW charger,Mon Oct 19 00:00:00 EDT 2015,Wed Dec 02 00:00:00 EST 2015,N,0,0,0,highway08U
29750,0.18,0.0,0.0,8.0,126,126.4057,0,0.0,0.0,26.6635,...,3.6 kW charger,5.0,6.6 kW charger,Mon Oct 19 00:00:00 EDT 2015,Wed Dec 02 00:00:00 EST 2015,N,0,0,0,hlv
29750,0.18,0.0,0.0,8.0,126,126.4057,0,0.0,0.0,26.6635,...,3.6 kW charger,5.0,6.6 kW charger,Mon Oct 19 00:00:00 EDT 2015,Wed Dec 02 00:00:00 EST 2015,N,0,0,0,hpv


The rows are not deduplicating because the the outlier column. The values in this column are not idential, so it does not recognize it as a duplicate. 

The next step in analysis would be to decide in how many columns a row must be an outlier in order to drop it. 
HW: Decide what your cut off point would be, and try to drop only rows that are outliers in that many columns.

## 4. Covert data types

In [78]:
df_copy.dtypes

barrels08         float64
city08              int64
city08U           float64
co2                 int64
co2TailpipeGpm    float64
comb08              int64
comb08U           float64
cylinders         float64
displ             float64
drive              object
engId               int64
eng_dscr           object
feScore             int64
fuelCost08          int64
fuelType           object
fuelType1          object
ghgScore            int64
highway08           int64
highway08U        float64
hlv                 int64
hpv                 int64
id                  int64
lv2                 int64
lv4                 int64
make               object
model              object
mpgData            object
phevBlended          bool
pv2                 int64
pv4                 int64
trany              object
UCity             float64
UHighway          float64
VClass             object
year                int64
youSaveSpend        int64
createdOn          object
modifiedOn         object
dtype: objec

Make `year` a datetime object

In [79]:
df_copy["year"] = pd.to_datetime(df["year"], format="%Y") #explicitly pass the string format

In [80]:
df_copy.dtypes

barrels08                float64
city08                     int64
city08U                  float64
co2                        int64
co2TailpipeGpm           float64
comb08                     int64
comb08U                  float64
cylinders                float64
displ                    float64
drive                     object
engId                      int64
eng_dscr                  object
feScore                    int64
fuelCost08                 int64
fuelType                  object
fuelType1                 object
ghgScore                   int64
highway08                  int64
highway08U               float64
hlv                        int64
hpv                        int64
id                         int64
lv2                        int64
lv4                        int64
make                      object
model                     object
mpgData                   object
phevBlended                 bool
pv2                        int64
pv4                        int64
trany     

## 5. String Cleaning

In [81]:
df_copy["trany"].value_counts()

Automatic 4-spd                     11042
Manual 5-spd                         8311
Automatic 3-spd                      3151
Automatic (S6)                       2638
Manual 6-spd                         2429
Automatic 5-spd                      2184
Manual 4-spd                         1483
Automatic 6-spd                      1432
Automatic (S8)                        960
Automatic (S5)                        824
Automatic (variable gear ratios)      681
Automatic 7-spd                       663
Automatic (S7)                        261
Auto(AM-S7)                           256
Automatic 8-spd                       243
Automatic (S4)                        233
Auto(AM7)                             160
Auto(AV-S6)                           145
Auto(AM6)                             110
Automatic (A1)                        109
Auto(AM-S6)                            92
Automatic 9-spd                        90
Manual 3-spd                           77
Manual 7-spd                      

In [82]:
len(df_copy["trany"].value_counts())

46

There probably aren't that many different types of transmissions. We will try to clean this column so that it is more realistic

In [84]:
df_copy["trany"].str.replace("Automatic", "Auto").value_counts()

Auto 4-spd                     11042
Manual 5-spd                    8311
Auto 3-spd                      3151
Auto (S6)                       2638
Manual 6-spd                    2429
Auto 5-spd                      2184
Manual 4-spd                    1483
Auto 6-spd                      1432
Auto (S8)                        960
Auto (S5)                        824
Auto (variable gear ratios)      681
Auto 7-spd                       663
Auto (S7)                        261
Auto(AM-S7)                      256
Auto 8-spd                       243
Auto (S4)                        233
Auto(AM7)                        160
Auto(AV-S6)                      145
Auto(AM6)                        110
Auto (A1)                        109
Auto(AM-S6)                       92
Auto 9-spd                        90
Manual 3-spd                      77
Manual 7-spd                      68
Auto(AV-S7)                       63
Auto(AV-S8)                       26
Auto (S9)                         26
M

In [85]:
len(df_copy["trany"].str.replace("Automatic", "Auto").value_counts())

44

We were able to consolidate 2 values. Next step will be to try to remove other things that make values that are equivilant but have different string representations.

In [87]:
#checking to see if these changes made a difference in order to decide if we should keep them
len(df_copy["trany"]
    .str.replace("Automatic", "Auto")
    .str.replace("Auto\(", "Auto ")
    .str.replace("Manual\(", "Manual ")
    .str.replace("(", "")
    .str.replace(")", "")
    .str.replace("-", "").value_counts())

38

In [101]:
#made enough of a difference and we decide to keep them
clean_trany = (df_copy["trany"]
                .str.replace("Automatic", "Auto")
                .str.replace("Auto\(", "Auto ")
                .str.replace("Manual\(", "Manual ")
                .str.replace("(", "")
                .str.replace(")", "")
                .str.replace("-", ""))

In [102]:
clean_trany.value_counts()

Auto 4spd                    11042
Manual 5spd                   8311
Auto 3spd                     3151
Auto S6                       2638
Manual 6spd                   2429
Auto 5spd                     2184
Manual 4spd                   1483
Auto 6spd                     1433
Auto S8                        960
Auto S5                        824
Auto variable gear ratios      681
Auto 7spd                      663
Auto S7                        261
Auto AMS7                      256
Auto 8spd                      243
Auto S4                        233
Auto AM7                       160
Auto AVS6                      155
Auto AM6                       111
Auto A1                        110
Auto AMS6                       92
Auto 9spd                       90
Manual 3spd                     77
Manual 7spd                     68
Auto AVS7                       63
Auto AVS8                       27
Auto S9                         26
Manual 4spd Doubled             17
Auto AM5            

In [103]:
#splitting the information into two columns, the first column will be the transmition type 
#and the second column will be the number of gears
clean_trany = (clean_trany
               .str.split(" ", expand=True)
               .drop(columns=[2,3])
               .rename(columns={0: "tranType",
                                1: "gears"}))

In [104]:
clean_trany

Unnamed: 0,tranType,gears
0,Manual,5spd
1,Manual,5spd
2,Manual,5spd
3,Auto,3spd
4,Manual,5spd
...,...,...
37838,Auto,4spd
37839,Manual,5spd
37840,Auto,4spd
37841,Manual,5spd


In [105]:
clean_trany["gears"] = clean_trany["gears"].str.replace(r"\D", "")

In [106]:
#replacing empty strings with NA values
clean_trany[clean_trany["gears"] == ""] = pd.NA

In [107]:
#checking the number of rows that have null values
clean_trany.isna().sum()

tranType    698
gears       698
dtype: int64

In [109]:
#replacing the previous trany column with these two columns
df_copy = (df_copy
            .assign(tranType=clean_trany["tranType"])
            .assign(gears=clean_trany["gears"])
            .drop(columns="trany"))

In [114]:
df_copy["tranType"].value_counts()

Auto      24756
Manual    12389
Name: tranType, dtype: int64

In [115]:
df_copy["gears"].value_counts()

4    12777
5    11334
6     6862
3     3230
7     1474
8     1241
9      117
1      110
Name: gears, dtype: int64

## 6. Drop Duplicates
Trying to build up the work flow to create a fucntion that will ask if we want to drop columms or not

In [116]:
print(f"Before dropping duplicates, our DataFrame had {len(df_copy)} rows")

print(f"After dropping duplicates, our DataFrame had {len(df_copy.drop_duplicates())} rows")

Before dropping duplicates, our DataFrame had 37843 rows
After dropping duplicates, our DataFrame had 37843 rows


In [117]:
df_copy.drop_duplicates(["gears"])


Unnamed: 0,barrels08,city08,city08U,co2,co2TailpipeGpm,comb08,comb08U,cylinders,displ,drive,...,pv4,UCity,UHighway,VClass,year,youSaveSpend,createdOn,modifiedOn,tranType,gears
0,15.695714,19,0.0,-1,423.190476,21,0.0,4.0,2.0,Rear-Wheel Drive,...,0,23.3333,35.0,Two Seaters,1985-01-01,-1250,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,Manual,5.0
3,29.964545,10,0.0,-1,807.909091,11,0.0,8.0,5.2,Rear-Wheel Drive,...,0,12.2222,16.6667,Vans,1985-01-01,-8500,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,Auto,3.0
9,13.1844,23,0.0,-1,355.48,25,0.0,4.0,1.8,Front-Wheel Drive,...,89,29.0,42.0,Compact Cars,1993-01-01,0,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,Auto,4.0
692,18.311667,15,0.0,-1,493.722222,18,0.0,8.0,5.7,Rear-Wheel Drive,...,0,19.0,32.0,Two Seaters,1994-01-01,-4500,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,Manual,6.0
2903,10.300313,30,0.0,-1,277.71875,32,0.0,4.0,1.6,Front-Wheel Drive,...,0,39.0,50.0,Subcompact Cars,1996-01-01,1500,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,
10412,19.388824,14,0.0,-1,522.764706,17,0.0,8.0,5.0,Rear-Wheel Drive,...,0,17.4,29.1,Two Seaters,2004-01-01,-5250,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,Auto,7.0
14939,16.4805,17,0.0,-1,444.35,20,0.0,8.0,4.6,Rear-Wheel Drive,...,103,20.8489,34.6499,Midsize Cars,2007-01-01,-3250,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,Auto,8.0
23026,0.348,62,0.0,0,0.0,58,0.0,0.0,,2-Wheel Drive,...,0,88.6842,76.5909,Standard Pickup Trucks 2WD,2001-01-01,1000,Tue Jan 01 00:00:00 EST 2013,Fri Apr 11 00:00:00 EDT 2014,Auto,1.0
26790,14.982273,19,19.1415,399,399.0,22,22.3746,6.0,3.2,Front-Wheel Drive,...,0,24.1,39.5,Small Sport Utility Vehicle 2WD,2014-01-01,-750,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,Auto,9.0


In [118]:
def check_duplicates(df, cols):
    """
    Function to drop duplicates from a df based on a list of columns
    """
    # 1. count no of rows in df before dropping
    # 2. drop duplicate rows based on list of columns
    # 3. count no of rows in df after dropping
    # 4. print no of rows that would be dropped

In [120]:
def check_duplicates(df, cols, return_df=False):
    """
    Function to drop duplicates from a df based on a list of columns
    """
    # 1. count no of rows in df before dropping
    before = len(df)
    
    # 2. drop duplicate rows based on list of columns
    df_after_dropping = df.drop_duplicates(cols)
    
    # 3. count no of rows in df after dropping
    after = len(df_after_dropping)
    
    # 4. calculate percentage of rows that would be dropped
    perc = int(((before - after) / before) * 100)
    
    # 4. print no of rows that would be dropped
    if return_df:
        print(f"This list of columns would drop {before - after} rows. {perc}% of all rows.")
        return df_after_dropping
    else:
        print(f"This list of columns would drop {before - after} rows. {perc}% of all rows.")

In [121]:
check_duplicates(df_copy, ["gears"])


This list of columns would drop 37834 rows. 99% of all rows.


In [123]:
columns = ["make", "model", "year", "displ", "cylinders", "tranType", "gears", "drive",
           "VClass", "fuelType", "barrels08", "city08"]

check_duplicates(df_copy, columns)

This list of columns would drop 1355 rows. 3% of all rows.


Create a function that takes a DataFrame and a list of column names and:
1. Calcuates the number of rows that would get dropped (check_duplicates()) and tell the user
2. Ask the user if they want to see the rows that would get dropped
    - If yes, return and exit
    - If not, ask if the user wants to drop the rows
        - if yes, drop the rows and return the new DataFrame
        - if no, exit

In [124]:
def securely_drop_duplicates(df, cols):
    #Calculate the number of rows that would get dropped (check_duplicates()) and tell the user
    df_after_drop = check_duplicates(df, cols, return_df=True)
    
    #Generate a new df with the rows that would get dropped
    index_before = set(df[cols].index)
    index_after = set(df_after_drop.index)
    index_diff = index_before - index_after
    
    df_dropped_rows = df[cols].loc[index_diff].copy()
    
    #Ask the user, if they want to see the rows that would get dropped
    response1 = input("Before dropping, do you want me to return the rows that would get dropped?")
    
    #2.1. if yes, return and exit
    if response1 == "yes":
        return df_dropped_rows
        
    #2.1. if no, ask if user wants to drop the rows
    elif response1 == "no":
        response2 = input("Do you want me to go ahead and drop the rows?")
    
        #2.1.1 if yes, drop the rows and return new DataFrame
        if response2 == "yes":
            return df_after_drop
        
        #2.1.2. if no, exit
        elif response2 == "no":
            return

In [126]:
securely_drop_duplicates(df_copy, columns)


This list of columns would drop 1355 rows. 3% of all rows.
Before dropping, do you want me to return the rows that would get dropped? [yes|no]yes


Unnamed: 0,make,model,year,displ,cylinders,tranType,gears,drive,VClass,fuelType,barrels08,city08
32778,Cadillac,Fleetwood/DeVille (FWD),1985-01-01,4.3,6.0,Auto,4,Front-Wheel Drive,Large Cars,Diesel,16.616739,19
32782,Pontiac,Firefly,1989-01-01,1.0,3.0,Auto,3,Front-Wheel Drive,Subcompact Cars,Regular,9.694412,32
32784,Pontiac,Firefly,1989-01-01,1.0,3.0,Manual,5,Front-Wheel Drive,Subcompact Cars,Regular,8.039268,38
16402,GMC,C15 Pickup 2WD,1986-01-01,5.0,8.0,Auto,4,Rear-Wheel Drive,Standard Pickup Trucks,Regular,21.974000,14
8222,Mitsubishi,Mirage,1985-01-01,1.6,4.0,Auto,3,Front-Wheel Drive,Subcompact Cars,Premium,14.982273,21
...,...,...,...,...,...,...,...,...,...,...,...,...
24538,Ferrari,California,2012-01-01,4.3,8.0,Auto,7,Rear-Wheel Drive,Two Seaters,Premium,21.974000,13
24541,Ferrari,FF,2012-01-01,6.3,12.0,Auto,7,Part-time 4-Wheel Drive,Midsize Cars,Premium,25.354615,11
32744,Jaguar,XJS Coupe,1989-01-01,5.3,12.0,Auto,3,Rear-Wheel Drive,Subcompact Cars,Regular,25.354615,11
24565,Honda,Accord,1987-01-01,2.0,4.0,Auto,4,Front-Wheel Drive,Compact Cars,Regular,14.330870,21


In [127]:
securely_drop_duplicates(df_copy, columns)


This list of columns would drop 1355 rows. 3% of all rows.
Before dropping, do you want me to return the rows that would get dropped? [yes|no]No
Before dropping, do you want me to return the rows that would get dropped? [yes|no]no
Do you want me to go ahead and drop the rows?yes


Unnamed: 0,barrels08,city08,city08U,co2,co2TailpipeGpm,comb08,comb08U,cylinders,displ,drive,...,pv4,UCity,UHighway,VClass,year,youSaveSpend,createdOn,modifiedOn,tranType,gears
0,15.695714,19,0.0,-1,423.190476,21,0.0,4.0,2.0,Rear-Wheel Drive,...,0,23.3333,35.0000,Two Seaters,1985-01-01,-1250,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,Manual,5
1,29.964545,9,0.0,-1,807.909091,11,0.0,12.0,4.9,Rear-Wheel Drive,...,0,11.0000,19.0000,Two Seaters,1985-01-01,-8500,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,Manual,5
2,12.207778,23,0.0,-1,329.148148,27,0.0,4.0,2.2,Front-Wheel Drive,...,0,29.0000,47.0000,Subcompact Cars,1985-01-01,500,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,Manual,5
3,29.964545,10,0.0,-1,807.909091,11,0.0,8.0,5.2,Rear-Wheel Drive,...,0,12.2222,16.6667,Vans,1985-01-01,-8500,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,Auto,3
4,17.347895,17,0.0,-1,467.736842,19,0.0,4.0,2.2,4-Wheel or All-Wheel Drive,...,90,21.0000,32.0000,Compact Cars,1993-01-01,-4000,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,Manual,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37838,14.982273,19,0.0,-1,403.954545,22,0.0,4.0,2.2,Front-Wheel Drive,...,90,24.0000,37.0000,Compact Cars,1993-01-01,-750,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,Auto,4
37839,14.330870,20,0.0,-1,386.391304,23,0.0,4.0,2.2,Front-Wheel Drive,...,90,25.0000,39.0000,Compact Cars,1993-01-01,-500,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,Manual,5
37840,15.695714,18,0.0,-1,423.190476,21,0.0,4.0,2.2,4-Wheel or All-Wheel Drive,...,90,23.0000,34.0000,Compact Cars,1993-01-01,-1250,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,Auto,4
37841,15.695714,18,0.0,-1,423.190476,21,0.0,4.0,2.2,4-Wheel or All-Wheel Drive,...,90,23.0000,34.0000,Compact Cars,1993-01-01,-1250,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,Manual,5


In [129]:
df_new = securely_drop_duplicates(df_copy, columns)


This list of columns would drop 1355 rows. 3% of all rows.
Before dropping, do you want me to return the rows that would get dropped? [yes|no]no
Do you want me to go ahead and drop the rows?yes


In [130]:
df_new

Unnamed: 0,barrels08,city08,city08U,co2,co2TailpipeGpm,comb08,comb08U,cylinders,displ,drive,...,pv4,UCity,UHighway,VClass,year,youSaveSpend,createdOn,modifiedOn,tranType,gears
0,15.695714,19,0.0,-1,423.190476,21,0.0,4.0,2.0,Rear-Wheel Drive,...,0,23.3333,35.0000,Two Seaters,1985-01-01,-1250,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,Manual,5
1,29.964545,9,0.0,-1,807.909091,11,0.0,12.0,4.9,Rear-Wheel Drive,...,0,11.0000,19.0000,Two Seaters,1985-01-01,-8500,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,Manual,5
2,12.207778,23,0.0,-1,329.148148,27,0.0,4.0,2.2,Front-Wheel Drive,...,0,29.0000,47.0000,Subcompact Cars,1985-01-01,500,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,Manual,5
3,29.964545,10,0.0,-1,807.909091,11,0.0,8.0,5.2,Rear-Wheel Drive,...,0,12.2222,16.6667,Vans,1985-01-01,-8500,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,Auto,3
4,17.347895,17,0.0,-1,467.736842,19,0.0,4.0,2.2,4-Wheel or All-Wheel Drive,...,90,21.0000,32.0000,Compact Cars,1993-01-01,-4000,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,Manual,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37838,14.982273,19,0.0,-1,403.954545,22,0.0,4.0,2.2,Front-Wheel Drive,...,90,24.0000,37.0000,Compact Cars,1993-01-01,-750,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,Auto,4
37839,14.330870,20,0.0,-1,386.391304,23,0.0,4.0,2.2,Front-Wheel Drive,...,90,25.0000,39.0000,Compact Cars,1993-01-01,-500,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,Manual,5
37840,15.695714,18,0.0,-1,423.190476,21,0.0,4.0,2.2,4-Wheel or All-Wheel Drive,...,90,23.0000,34.0000,Compact Cars,1993-01-01,-1250,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,Auto,4
37841,15.695714,18,0.0,-1,423.190476,21,0.0,4.0,2.2,4-Wheel or All-Wheel Drive,...,90,23.0000,34.0000,Compact Cars,1993-01-01,-1250,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,Manual,5
