## Setup

In [93]:
import pandas as pd
import numpy as np

import requests
from io import StringIO

import string

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

In [2]:
# file_name = 'data/nyc-parking-violations-2020.csv'
# usecol = ['Plate ID',  'Registration State', 'Vehicle Make', 'Vehicle Color', 'Violation Time', 'Street Name']
# df = pd.read_csv(file_name, usecols=usecol, low_memory=False)

In [57]:
# df.head()

In [58]:
# df.shape

## EXERCISE 25. Parking cleanup

### 25.1 Data exploration

In [11]:
# Remove rows with any missing data (i.e., a NaN value)
df_all_removed = df.dropna()

In [13]:
df.shape[0] - df_all_removed.shape[0]

447359

In [14]:
# Let’s instead assume that a ticket can only be dismissed if the license plate,
# state, car make, and/or street name are missing. Remove rows that are missing
# one or more of these.
df_some_removed = df.dropna(subset=['Plate ID', 'Registration State', 'Vehicle Make', 'Street Name'])

In [16]:
df.shape[0] - df_some_removed.shape[0]

63785

In [17]:
# Let's instead assume that a ticket can only be dismissed if the license plate, state, and/or
# street name are all there. Remove rows that are missing one or more of these. How many rows remain?
df_some_removed_v2 = df.dropna(subset=['Plate ID', 'Registration State', 'Street Name'])

In [18]:
df.shape[0] - df_some_removed_v2.shape[0]

1618

### 25.2-4 Beyond the exercise

In [5]:
at_least_three_df = df.dropna(subset=['Plate ID', 'Registration State', 'Vehicle Make', 'Street Name'],
                           thresh=3)
df.shape[0] - at_least_three_df.shape[0]

253

In [7]:
# Which of the columns you’ve imported has the greatest number of NaN values?
nan_counts = df.isna().sum()
nan_counts.sort_values(ascending=False)

Vehicle Color         391982
Vehicle Make           62420
Street Name             1417
Violation Time           278
Plate ID                 202
Registration State         0
dtype: int64

In [8]:
# Null data is bad, but there is plenty of bad non-null data, too. For example,
# many cars with BLANKPLATE as a plate ID were ticketed. Turn these into NaN val-
# ues, and rerun the previous query.
df['Plate ID'] = df['Plate ID'].replace('BLANKPLATE', np.nan)
nan_counts = df.isna().sum()
nan_counts.sort_values(ascending=False)

Vehicle Color         391982
Vehicle Make           62420
Plate ID                9084
Street Name             1417
Violation Time           278
Registration State         0
dtype: int64

## EXERCISE 26. Celebrity deaths

### 26.1 Data exploration

#### 26.1.2 Create a new month column

In [15]:
file_name = 'data/celebrity_deaths_2016.csv'
usecol = ['dateofdeath', 'age']

df = pd.read_csv(file_name, usecols=usecol, low_memory=False)

df.head(), df.dtypes

(  dateofdeath age
 0  2016-01-01  71
 1  2016-01-01  74
 2  2016-01-01  79
 3  2016-01-01  45
 4  2016-01-01  83,
 dateofdeath    object
 age            object
 dtype: object)

In [16]:
# (2) Create a new month column containing the month from the dateofdeath column
df['dateofdeath'] = pd.to_datetime(df['dateofdeath'])

df.head(), df.dtypes

(  dateofdeath age
 0  2016-01-01  71
 1  2016-01-01  74
 2  2016-01-01  79
 3  2016-01-01  45
 4  2016-01-01  83,
 dateofdeath    datetime64[ns]
 age                    object
 dtype: object)

In [24]:
df['dateofdeath'].dt.month

0        1
1        1
2        1
3        1
4        1
        ..
6538    12
6539    12
6540    12
6541    12
6542    12
Name: dateofdeath, Length: 6543, dtype: int32

In [25]:
df['month'] = df['dateofdeath'].dt.month

In [26]:
df.head(), df.dtypes

(  dateofdeath age  month
 0  2016-01-01  71      1
 1  2016-01-01  74      1
 2  2016-01-01  79      1
 3  2016-01-01  45      1
 4  2016-01-01  83      1,
 dateofdeath    datetime64[ns]
 age                    object
 month                   int32
 dtype: object)

#### 26.1.3 Make the month column the index of the data frame

In [28]:
# (3) Make the month column the index of the data frame
df = df.set_index('month')

In [29]:
df.head()

Unnamed: 0_level_0,dateofdeath,age
month,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2016-01-01,71
1,2016-01-01,74
1,2016-01-01,79
1,2016-01-01,45
1,2016-01-01,83


In [30]:
df.shape

(6543, 2)

In [36]:
df.index.is_monotonic_increasing

False

#### 26.1.4 Sort the data frame by the index

In [37]:
# (4) Sort the data frame by the index
df = df.sort_index()

In [38]:
df.index.is_monotonic_increasing

True

#### 26.1.5 Clean all nonintegers from the age column

In [90]:
file_name = 'data/celebrity_deaths_2016.csv'
usecol = ['dateofdeath', 'age']

df = pd.read_csv(file_name, usecols=usecol, low_memory=False)

# Make the month column the index of the data frame
df['dateofdeath'] = pd.to_datetime(df['dateofdeath'])
df['month'] = df['dateofdeath'].dt.month
df = df.set_index('month')

# Sort the data frame by the index
df = df.sort_index()

df.head()

Unnamed: 0_level_0,dateofdeath,age
month,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2016-01-01,71
1,2016-01-21,47
1,2016-01-21,87
1,2016-01-21,90
1,2016-01-21,73


In [93]:
# Check the nan values in the age column
nan_counts = df['age'].isna().sum()
print(f'Number of NaN values in the age column: {nan_counts}')

# Compute the percentage of NaN values in the age column using mean
nan_percentage = df['age'].isna().mean()
print(f'Percentage of NaN values in the age column: {nan_percentage:.2}')

# Fill NaN values in the age column with an empty string
df['age'] = df['age'].fillna('')

# Convert the age column to string type
df['age'] = df['age'].astype(str)

df['age'].dtype

Number of NaN values in the age column: 0
Percentage of NaN values in the age column: 0.0


dtype('O')

In [98]:
df['age'].unique()

array(['71', '47', '87', '90', '73', '37', '59', '75', '97', '78', '67',
       '82', '85', '86', '88', '92', '55', '102', '96', '93', '101', '25',
       '21', '74', '76', '72', '83', '95', '45', '62', '68', '94', '100',
       '84', '91', '52', '79', '', '34', '58', '66', '99', '60', '16',
       '80', '81', '15', '53', '70', '89', '26', '61', '64', '77', '31',
       '14', '44', '98', '112', '43', '56', '33', '57', '65', '69', '48',
       '54', '23', '40', '49', '50', '35', '32', ' 8889', '38', '22',
       '51', '28', '63', '20', '103', ' 6869', '24', '39', '107', '30',
       '41', ' 3031', '46', ' 5253', '11', '42', ' 4445', '12', ' 6364',
       '29', '27', ' 9293', '36', ' 5759', ' 6768', '104', ' 7980', '19',
       ' 3435', ' 68-69', ' 7677', ' 2526', '9', ' c. 48', '17', ' 2930',
       ' 9192', '110', '116', ' 7374', ' 9394', ' 8081', '109', ' c._180',
       '106', ' 4243', '113', ' 51-52', ' 77/78', ' 6667', ' 38/39',
       ' 61/62', ' 8485', ' 6970', ' 59-60', '18', ' 

In [102]:
# Explore non-numeric values in the age column using masking
non_numeric_mask = ~df['age'].str.isnumeric()
numeric_mask = ~non_numeric_mask
df.loc[non_numeric_mask, 'age'].unique()

array(['', ' 8889', ' 6869', ' 3031', ' 5253', ' 4445', ' 6364', ' 9293',
       ' 5759', ' 6768', ' 7980', ' 3435', ' 68-69', ' 7677', ' 2526',
       ' c. 48', ' 2930', ' 9192', ' 7374', ' 9394', ' 8081', ' c._180',
       ' 4243', ' 51-52', ' 77/78', ' 6667', ' 38/39', ' 61/62', ' 8485',
       ' 6970', ' 59-60', ' 50s', ' 56 or 57', ' 7778', ' 8182'],
      dtype=object)

In [103]:
# Remove all non-numeric values from the age column using masking
df = df[numeric_mask]

In [104]:
df['age'].unique()

array(['71', '47', '87', '90', '73', '37', '59', '75', '97', '78', '67',
       '82', '85', '86', '88', '92', '55', '102', '96', '93', '101', '25',
       '21', '74', '76', '72', '83', '95', '45', '62', '68', '94', '100',
       '84', '91', '52', '79', '34', '58', '66', '99', '60', '16', '80',
       '81', '15', '53', '70', '89', '26', '61', '64', '77', '31', '14',
       '44', '98', '112', '43', '56', '33', '57', '65', '69', '48', '54',
       '23', '40', '49', '50', '35', '32', '38', '22', '51', '28', '63',
       '20', '103', '24', '39', '107', '30', '41', '46', '11', '42', '12',
       '29', '27', '36', '104', '19', '9', '17', '110', '116', '109',
       '106', '113', '18', '105', '7', '108'], dtype=object)

In [105]:
df.shape

(6481, 2)

In [124]:
file_name = 'data/celebrity_deaths_2016.csv'
usecol = ['dateofdeath', 'age']

df = pd.read_csv(file_name, usecols=usecol, low_memory=False)

# Make the month column the index of the data frame
df['dateofdeath'] = pd.to_datetime(df['dateofdeath'])
df['month'] = df['dateofdeath'].dt.month
df = df.set_index('month')

# Sort the data frame by the index
df = df.sort_index()

df.head()

Unnamed: 0_level_0,dateofdeath,age
month,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2016-01-01,71
1,2016-01-21,47
1,2016-01-21,87
1,2016-01-21,90
1,2016-01-21,73


In [125]:
df.shape

(6543, 2)

In [126]:
df['age'] = pd.to_numeric(df['age'], errors='coerce')

In [127]:
df.shape, df['age'].dtype

((6543, 2), dtype('float64'))

In [128]:
np.set_printoptions(suppress=True)
df['age'].unique()

array([  71.,   47.,   87.,   90.,   73.,   37.,   59.,   75.,   97.,
         78.,   67.,   82.,   85.,   86.,   88.,   92.,   55.,  102.,
         96.,   93.,  101.,   25.,   21.,   74.,   76.,   72.,   83.,
         95.,   45.,   62.,   68.,   94.,  100.,   84.,   91.,   52.,
         79.,   nan,   34.,   58.,   66.,   99.,   60.,   16.,   80.,
         81.,   15.,   53.,   70.,   89.,   26.,   61.,   64.,   77.,
         31.,   14.,   44.,   98.,  112.,   43.,   56.,   33.,   57.,
         65.,   69.,   48.,   54.,   23.,   40.,   49.,   50.,   35.,
         32., 8889.,   38.,   22.,   51.,   28.,   63.,   20.,  103.,
       6869.,   24.,   39.,  107.,   30.,   41., 3031.,   46., 5253.,
         11.,   42., 4445.,   12., 6364.,   29.,   27., 9293.,   36.,
       5759., 6768.,  104., 7980.,   19., 3435., 7677., 2526.,    9.,
         17., 2930., 9192.,  110.,  116., 7374., 9394., 8081.,  109.,
        106., 4243.,  113., 6667., 8485., 6970.,   18.,  105.,    7.,
       7778., 8182.,

In [129]:
np.set_printoptions(suppress=False)

In [130]:
df = df.dropna(subset=['age'])

# Remove all ages of 100 and above
df = df[df['age'] < 120]

In [131]:
df['age'].unique()

array([ 71.,  47.,  87.,  90.,  73.,  37.,  59.,  75.,  97.,  78.,  67.,
        82.,  85.,  86.,  88.,  92.,  55., 102.,  96.,  93., 101.,  25.,
        21.,  74.,  76.,  72.,  83.,  95.,  45.,  62.,  68.,  94., 100.,
        84.,  91.,  52.,  79.,  34.,  58.,  66.,  99.,  60.,  16.,  80.,
        81.,  15.,  53.,  70.,  89.,  26.,  61.,  64.,  77.,  31.,  14.,
        44.,  98., 112.,  43.,  56.,  33.,  57.,  65.,  69.,  48.,  54.,
        23.,  40.,  49.,  50.,  35.,  32.,  38.,  22.,  51.,  28.,  63.,
        20., 103.,  24.,  39., 107.,  30.,  41.,  46.,  11.,  42.,  12.,
        29.,  27.,  36., 104.,  19.,   9.,  17., 110., 116., 109., 106.,
       113.,  18., 105.,   7., 108.])

In [132]:
df.shape, df.dtypes

((6481, 2),
 dateofdeath    datetime64[ns]
 age                   float64
 dtype: object)

In [123]:
# 6 Turn the age column into an integer value
df['age'] = df['age'].astype(int)
df.dtypes

dateofdeath    datetime64[ns]
age                     int64
dtype: object

In [135]:
# (7) Find the average age of celebrities who died during that period from Feb - July
average_age = df.loc[2:7, 'age'].mean()
print(f'Average age of celebrities who died from Feb to July: {average_age:.2f}')

Average age of celebrities who died from Feb to July: 77.18


### 26.2 What was the average age of death from Feb. 15 through July 15?

In [10]:
file_name = 'data/celebrity_deaths_2016.csv'
usecol = ['dateofdeath', 'age']

df = pd.read_csv(file_name, usecols=usecol, low_memory=False)

# Make the month column the index of the data frame
df['dateofdeath'] = pd.to_datetime(df['dateofdeath'])
df['month'] = df['dateofdeath'].dt.month
df = df.set_index('month')

# Sort the data frame by the index
df = df.sort_index()

# Convert the age column to numeric type, coercing errors to NaN
df['age'] = pd.to_numeric(df['age'], errors='coerce')
# Remove all NaN values from the age column
df = df.dropna(subset=['age'])
# Remove all ages of 120 and above
df = df[df['age'] < 120]

df.head()

Unnamed: 0_level_0,dateofdeath,age
month,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2016-01-01,71.0
1,2016-01-21,47.0
1,2016-01-21,87.0
1,2016-01-21,90.0
1,2016-01-21,73.0


In [11]:
# Add a new column, day, from the day of the month in which the celebrity died
df['day'] = df['dateofdeath'].dt.day

# Reset the index to include the day column
df = df.reset_index()

# Then create a multi-index (from month and day)
df = df.set_index(['month', 'day'])
df = df.sort_index()

df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,dateofdeath,age
month,day,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,2016-01-01,71.0
1,1,2016-01-01,93.0
1,1,2016-01-01,74.0
1,1,2016-01-01,79.0
1,1,2016-01-01,45.0


In [12]:
# What was the average age of death from Feb. 15 through July 15?
feb_15 = (2, 15)
jul_15 = (7, 15)
average_age = df.loc[feb_15:jul_15, 'age'].mean()
print(f'Average age of death from Feb. 15 through July 15: {average_age:.2f}')

Average age of death from Feb. 15 through July 15: 77.05


### 26.3 Find the five most common causes of death

In [13]:
file_name = 'data/celebrity_deaths_2016.csv'
usecol = ['dateofdeath', 'age', 'causeofdeath']

df = pd.read_csv(file_name, usecols=usecol, low_memory=False)

df.head()

Unnamed: 0,dateofdeath,age,causeofdeath
0,2016-01-01,71,brain cancer
1,2016-01-01,74,cancer
2,2016-01-01,79,cancer
3,2016-01-01,45,complications of a stroke
4,2016-01-01,83,heart failure


In [21]:
# Find the five most common causes of death
top_causes = df['causeofdeath'].value_counts().head(5)
print("Top 5 causes of death:")
for cause, count in top_causes.items():
    print(f"{cause:<19}: {count:>3}")

Top 5 causes of death:
 cancer            : 248
 heart attack      : 125
 traffic collision :  56
 lung cancer       :  51
 pneumonia         :  50


In [26]:
# Check the NaN values in the causeofdeath column
df['causeofdeath'].value_counts(dropna=False, normalize=False). head(5), df.shape

(causeofdeath
 NaN                   5008
  cancer                248
  heart attack          125
  traffic collision      56
  lung cancer            51
 Name: count, dtype: int64,
 (6543, 3))

In [25]:
# Check the NaN values in the causeofdeath column
df['causeofdeath'].value_counts(dropna=False, normalize=True). head(5)

causeofdeath
NaN                   0.765398
 cancer               0.037903
 heart attack         0.019104
 traffic collision    0.008559
 lung cancer          0.007795
Name: proportion, dtype: float64

In [27]:
# Now replace any NaN values in that column with the string 'unknown', 
# and again find the five most common causes of death
df['causeofdeath'] = df['causeofdeath'].fillna('unknown')   
top_causes = df['causeofdeath'].value_counts().head(5)
print("Top 5 causes of death after replacing NaN with 'unknown':")
for cause, count in top_causes.items():
    print(f"{cause:<19}: {count:>3}")

Top 5 causes of death after replacing NaN with 'unknown':
unknown            : 5008
 cancer            : 248
 heart attack      : 125
 traffic collision :  56
 lung cancer       :  51


### 26.4 If someone asks whether cancer is in the top 10 causes, what would you say?

In [7]:
file_name = 'data/celebrity_deaths_2016.csv'
usecol = ['dateofdeath', 'age', 'causeofdeath']

df = pd.read_csv(file_name, usecols=usecol, low_memory=False)

# Fill NaN values in the causeofdeath column with 'unknown'
df['causeofdeath'] = df['causeofdeath'].fillna('unknown')

df.head()

Unnamed: 0,dateofdeath,age,causeofdeath
0,2016-01-01,71,brain cancer
1,2016-01-01,74,cancer
2,2016-01-01,79,cancer
3,2016-01-01,45,complications of a stroke
4,2016-01-01,83,heart failure


In [13]:
# Compute the number of known causes of death
unknown_mask = df['causeofdeath'].str.contains('unknown')
known_cause_count = df[~unknown_mask].shape[0]
unknown_cause_count = df[unknown_mask].shape[0]
total_count = df.shape[0]

# Compute the number of 'cancer' causes of death
cancer_narrow_count =df['causeofdeath'].isin(['cancer']).sum()
cancer_wide_count = df['causeofdeath'].str.contains('cancer', case=False).sum()

print(f"Total number of known causes of death: {known_cause_count}")
print(f"Total number of unknown causes of death: {unknown_cause_count}")
print(f"Total number of causes of death: {total_count}")
print(f"\nNumber of causes of death that are exactly 'cancer': {cancer_narrow_count}")
print(f"Number of causes of death that contain 'cancer': {cancer_wide_count}")

# Compute cancer_wide_count in percentage of known_cause_count
cancer_wide_percentage = (cancer_wide_count / known_cause_count) * 100
print(f"\nPercentage of causes of death that contain 'cancer': {cancer_wide_percentage:.2f}%")

Total number of known causes of death: 1535
Total number of unknown causes of death: 5008
Total number of causes of death: 6543

Number of causes of death that are exactly 'cancer': 8
Number of causes of death that contain 'cancer': 529

Percentage of causes of death that contain 'cancer': 34.46%


## EXERCISE 27. Titanic interpolation

### 27.1 Data exploration

In [15]:
# Load the Titanic dataset from an Excel file
file_name = 'data/titanic3.xls'
df = pd.read_excel(file_name)

df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [17]:
# Determine which columns contain null values
nan_counts = df.isna().sum()
mask = nan_counts > 0
nan_counts[mask].sort_values(ascending=False)

body         1188
cabin        1014
boat          823
home.dest     564
age           263
embarked        2
fare            1
dtype: int64

In [18]:
df.columns[mask]

Index(['age', 'fare', 'cabin', 'embarked', 'boat', 'body', 'home.dest'], dtype='object')

In [26]:
# A function to clean the titanic dataset
def clean_titanic_data(df):
    """    Cleans the Titanic dataset by removing rows with missing values in 'embarked' and 'fare' columns.
    Args:
        df (pd.DataFrame): The Titanic dataset.
    Returns:
        pd.DataFrame: The cleaned dataset with rows containing NaN in 'embarked' and 'fare' removed.
    """

    # Remove rows with any missing data for 'embarked' and 'fare'
    df = df.dropna(subset=['embarked', 'fare'])

    # Replace NaN values in 'age' with the mean age
    df.loc[:, 'age'] = df['age'].fillna(df['age'].mean())

    # Replace NaN values in 'home.dest' with the mode
    df.loc[:, 'home.dest'] = df['home.dest'].fillna(df['home.dest'].mode()[0])

    return df

In [27]:
# Load the Titanic dataset from an Excel file
file_name = 'data/titanic3.xls'
df = pd.read_excel(file_name)

# Clean the Titanic dataset
df = clean_titanic_data(df)

In [28]:
# Determine which columns contain null values
nan_counts = df.isna().sum()
mask = nan_counts > 0
nan_counts[mask].sort_values(ascending=False)

body     1186
cabin    1013
boat      822
dtype: int64

### 27.2 Replace NaN values in the `home.dest` column with the most common value from that person’s embarked column

In [46]:
# Load the Titanic dataset from an Excel file
file_name = 'data/titanic3.xls'
df = pd.read_excel(file_name)

# Remove rows with any missing data for 'embarked' and 'fare'
# df = df.dropna(subset=['embarked', 'fare'])

In [47]:
df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [48]:
df['embarked'].unique()

array(['S', 'C', nan, 'Q'], dtype=object)

In [49]:
# (1) Create a series (most_common_destinations) in which the index contains the
# unique values from the embarked column and the values are the most common
# destination for each value of embarked.
most_common_dest = df.groupby('embarked')['home.dest'].agg(lambda x: x.mode()[0])

most_common_dest

embarked
C           New York, NY
Q    Ireland Chicago, IL
S           New York, NY
Name: home.dest, dtype: object

In [50]:
# Let's do the same manually starting from an empty series
unique_embarked = df['embarked'].dropna().unique()
most_common_dest = pd.Series(index=unique_embarked, dtype='object')

for embarked in unique_embarked:

    # Get the most common destination for this embarked value
    mask = df['embarked'] == embarked
    max_value = df.loc[mask, 'home.dest'].mode()[0]

    # Assign the most common destination to the series
    most_common_dest[embarked] = max_value

most_common_dest

S           New York, NY
C           New York, NY
Q    Ireland Chicago, IL
dtype: object

In [55]:
# (2) (3) Use the most_common_destinations series to replace values in home.dest with
# the most common values for each embarkation point.
values_to_fill = df['embarked'].map(most_common_dest)
df['home.dest'] = df['home.dest'].fillna(values_to_fill)

In [56]:
df['home.dest'].isnull().sum()

np.int64(1)

## EXERCISE 28. Inconsistent data

In [85]:
file_name = 'data/nyc-parking-violations-2020.csv'
usecol = ['Plate ID',  'Registration State', 'Vehicle Make', 'Vehicle Color', 'Violation Time', 'Street Name']
df = pd.read_csv(file_name, usecols=usecol, low_memory=False)

In [86]:
df.head()

Unnamed: 0,Plate ID,Registration State,Vehicle Make,Violation Time,Street Name,Vehicle Color
0,J58JKX,NJ,HONDA,0523P,43 ST,BK
1,KRE6058,PA,ME/BE,0428P,UNION ST,BLK
2,444326R,NJ,LEXUS,0625A,CLERMONT AVENUE,BLACK
3,F728330,OH,CHEVR,1106A,DIVISION AVE,
4,FMY9090,NY,JEEP,1253A,GRAND ST,GREY


In [87]:
# Drop the column 'Violation Time'
df = df.drop(columns=['Violation Time'])

In [88]:
df.head()

Unnamed: 0,Plate ID,Registration State,Vehicle Make,Street Name,Vehicle Color
0,J58JKX,NJ,HONDA,43 ST,BK
1,KRE6058,PA,ME/BE,UNION ST,BLK
2,444326R,NJ,LEXUS,CLERMONT AVENUE,BLACK
3,F728330,OH,CHEVR,DIVISION AVE,
4,FMY9090,NY,JEEP,GRAND ST,GREY


### 28.1 Data exploration

In [89]:
# 2 Determine how many different vehicle colors (the Vehicle Color column) there are.
df['Vehicle Color'].unique()

array(['BK', 'BLK', 'BLACK', ..., 'WHGY8', 'ORYW', 'CH'],
      shape=(1897,), dtype=object)

In [90]:
# Look at the 30 most common colors, and identify colors that appear multiple
# times but are written differently. For example, the color WHITE is also written WT,
# WT., and WHT.
df['Vehicle Color'].value_counts().head(30)

Vehicle Color
WH       2344858
GY       2307704
BK       2066374
WHITE    1061234
BL        775124
RD        483298
BLACK     465110
GREY      306787
BROWN     292348
SILVE     191477
GR        182929
BLUE      178298
RED       161693
TN        120576
BR        102204
YW         98700
BLK        91539
OTHER      60245
GREEN      58765
GL         54851
GRY        46527
MR         42812
GRAY       40854
WHT        35433
YELLO      32792
WHI        29760
OR         28100
BK.        27830
WT         25583
WT.        24593
Name: count, dtype: int64

In [91]:
# 4 Prepare a Python dict in which the keys represent the various color-name inputs
# and the values represent the values you want them to have in the end. I suggest
# using longer names, such as WHITE, rather than shorter ones.
colormap = {'WH': 'WHITE', 
          'GY':'GRAY', 
             'BK':'BLACK',
             'BL':'BLUE',
             'RD':'RED', 
             'GR':'GRAY',
             'TN':'TAN',
             'BR':'BROWN', 
             'YW':'YELLO', 
             'BLK':'BLACK',
             'GRY':'GRAY', 
             'WHT':'WHITE', 
             'WHI':'WHITE', 
             'OR':'ORANG',
             'BK.':'BLACK',
             'WT':'WHITE',
            'WT.':'WHITE'}

In [69]:
# 5 Replace the existing (old) colors with your translations. How many colors are
# there now?
df['Vehicle Color'] = df['Vehicle Color'].replace(colormap)

In [92]:
df['Vehicle Color'].unique()

array(['BK', 'BLK', 'BLACK', ..., 'WHGY8', 'ORYW', 'CH'],
      shape=(1897,), dtype=object)

### 28.2 Beyond 1

In [72]:
df['Vehicle Make'].unique(), df['Vehicle Make'].value_counts()

(array(['HONDA', 'ME/BE', 'LEXUS', ..., 'KASAK', 'Harle', 'KIA ('],
       shape=(5211,), dtype=object),
 Vehicle Make
 TOYOT    1395273
 HONDA    1343265
 FORD     1328063
 NISSA    1119587
 CHEVR     711464
           ...   
 BEAVE          1
 NELSO          1
 HOWBY          1
 BONEE          1
 KIA (          1
 Name: count, Length: 5210, dtype: int64)

In [94]:
def clean_name(one_string):

    if not isinstance(one_string, str):
        return one_string

    output = ''
    
    for one_character in one_string.strip().upper():
        if one_character in string.ascii_uppercase:
            output += one_character

    return output

In [98]:
# Try this function on a few examples from df['Vehicle Make'].unique()
for make in df['Vehicle Make'].unique()[:200]:
    clean_make = clean_name(make)
    if clean_make != make:
        # Only print if the name was changed
        print(f"{make:>20} -> {clean_name(make):<20}")

               ME/BE -> MEBE                
                 nan -> nan                 
               RA RO -> RARO                
               NS/OT -> NSOT                
               BL/BI -> BLBI                
                 H D -> HD                  
               MO/VE -> MOVE                
               BMW/I -> BMWI                
               TA/TA -> TATA                
               KI/MO -> KIMO                
                 L/R -> LR                  
               NE/FL -> NEFL                
                 M-B -> MB                  
               RA/RO -> RARO                


In [99]:
# Apply the clean_name function to the Vehicle Make column
unique_count_before = df['Vehicle Make'].nunique()
df['Vehicle Make'] = df['Vehicle Make'].apply(clean_name)
unique_count_after = df['Vehicle Make'].nunique()
print(f"Unique vehicle makes before cleaning: {unique_count_before}")
print(f"Unique vehicle makes after cleaning: {unique_count_after}")

Unique vehicle makes before cleaning: 5210
Unique vehicle makes after cleaning: 4915


### 28.3 Beyond 2

In [102]:
# For example, it sometimes says E 110th St and sometimes says E 110 ST
s = df['Street Name'].dropna()

In [103]:
s[s.str.contains('110')].value_counts().head(10)

Street Name
W 110th St              2970
110th St                2388
E 110th St              2048
WB 110TH AVE/BRINKER     922
110th Ave                704
110 ST                    94
110th Rd                  93
W 110 ST                  87
E 110 ST                  71
SB 110TH ST @ 67TH D      65
Name: count, dtype: int64

In [104]:
s[s.str.contains('BWAY') | s.str.contains('BROADWAY')].value_counts().head(10)

Street Name
SB BROADWAY @ 252ND     21939
NB BROADWAY @ W 228T    13367
BROADWAY                10771
SB BROADWAY @ W 196T     6623
NB BROADWAY @ W 120T     5691
NB BROADWAY @ W 196T     2594
BROADWAY (S/B) @ SHE     1456
NB BROADWAY AVE @ W       963
EB BROADWAY AVE @ 63      885
NB BROADWAY @ WINEGA      431
Name: count, dtype: int64

### Beyond 3

In [105]:
df['Registration State'].value_counts()

Registration State
NY    9753643
NJ    1096110
PA     338779
FL     174056
CT     165205
       ...   
PE         18
SK          8
MX          7
NT          3
YT          2
Name: count, Length: 68, dtype: int64