CHAPTER 5

In [4]:
import pandas as pd

In [2]:
path = "data/nyc-parking-violations-2020.csv"
df = pd.read_csv(path, usecols=["Plate ID", "Registration State", "Vehicle Make", "Vehicle Color", "Violation Time", "Street Name"])

In [5]:
# rows total
total = len(df.index)
total

12495734

In [6]:
# rows without any nans
without_any_nans = len(df.dropna().index)
without_any_nans

12048375

In [7]:
# diff with 100$ cost
(total - without_any_nans) * 100


44735900

In [9]:
# loss with subset features
subset = ["Plate ID", "Registration State", "Vehicle Make", "Street Name"]
print(f'Loss: {(len(df) - len(df.dropna(subset=subset))) * 100}')

subset = ["Plate ID", "Registration State", "Street Name"]
print(f'Loss without car make: {(len(df.index) - len(df.dropna(subset=subset).index)) * 100}')

Loss: 6378500
Loss without car make: 161800


In [20]:
'''
How many rows would you eliminate if you required at least three non-null values
from the four columns Plate ID, Registration State, Vehicle Make, and
Street Name
'''

rows_with_at_least_3_non_nans = len(
    df[
        (
            df['Plate ID'].notnull().astype(int) + 
            df['Registration State'].notnull().astype(int) + 
            df['Vehicle Make'].notnull().astype(int) + 
            df['Street Name'].notnull().astype(int)
        ) >= 3
    ]
)
total - rows_with_at_least_3_non_nans

253

In [25]:
'''Which of the columns you've imported has the greatest number of NaN values'''
df.isnull().astype(int).sum()

Plate ID                 202
Registration State         0
Vehicle Make           62420
Violation Time           278
Street Name             1417
Vehicle Color         391982
dtype: int64

In [26]:
'''
Null data is bad, but there is plenty of bad non-null data, too. For example,
many cars with BLANKPLATE as a plate ID were ticketed. Turn these into NaN val-
ues, and rerun the previous query.
'''
df['Plate ID'] = df['Plate ID'].replace('BLANKPLATE', pd.NA)
df.isnull().astype(int).sum()

Plate ID                9084
Registration State         0
Vehicle Make           62420
Violation Time           278
Street Name             1417
Vehicle Color         391982
dtype: int64

In [8]:
df = None

2. The goal of this exercise is to find the average age of celebrities who died February–July 2016

In [23]:
path = "data/celebrity_deaths_2016.csv"
df = pd.read_csv(path, usecols=["dateofdeath", "age"], parse_dates=["dateofdeath"])

In [24]:
# add new column with month
df["month"] = df["dateofdeath"].map(lambda x: x.month)

In [25]:
# change index to month
df.reset_index(inplace=True)
df = df.set_index(keys=['month'], drop=True).drop(columns=["index"])

In [26]:
# sort df by index
df.sort_index(inplace=True)

In [27]:
# clean all nonintegers in age column
df["age"] = df["age"].replace(r"[a-zA-Z/ ._-]*", "", regex=True).str.slice(0, 2)

df["age"] = pd.to_numeric(df["age"], errors='coerce')
df.dropna(inplace=True)

# age to int
df["age"] = df["age"].astype(int)

In [28]:
# find avg age from feb to july
df.loc[2:7, "age"].mean()

np.float64(75.30396873120866)

In [34]:
# mean age value in period [2016-02-15, 2016-07-15]
df.reset_index(inplace=True, drop=False)
df.set_index(keys=["dateofdeath"], drop=True, inplace=True)
df.drop(columns=["index"], inplace=True)

In [38]:
import datetime as dt
start = df.index.searchsorted(dt.datetime(2016, 2, 15))
end = df.index.searchsorted(dt.datetime(2016, 7, 15))

In [None]:
df[start:end]['age'].mean()

In [48]:
df = pd.read_csv(path, usecols=["dateofdeath", "age", "causeofdeath"], parse_dates=["dateofdeath"])

In [50]:
#top 5 cause of death
df["causeofdeath"].value_counts()[:5]

causeofdeath
cancer               248
heart attack         125
traffic collision     56
lung cancer           51
pneumonia             50
Name: count, dtype: int64

In [53]:
#top 5 cause of death with unknown
df["causeofdeath"].replace(pd.NA, "unknown").value_counts()[:5]

causeofdeath
unknown               5008
 cancer                248
 heart attack          125
 traffic collision      56
 lung cancer            51
Name: count, dtype: int64

In [58]:
df=None

3. Fill in missing data from the famous Titanic data set

In [3]:
%pip install xlrd


Collecting xlrd
  Downloading xlrd-2.0.1-py2.py3-none-any.whl.metadata (3.4 kB)
Downloading xlrd-2.0.1-py2.py3-none-any.whl (96 kB)
Installing collected packages: xlrd
Successfully installed xlrd-2.0.1
Note: you may need to restart the kernel to use updated packages.


In [46]:
path = "data/titanic3.xls"
df = pd.read_excel(path)

# df.to_csv('data/titanic3.csv', index=False) # for arrow

In [47]:
# which columns contain null values
df.isna().sum()

pclass          0
survived        0
name            0
sex             0
age           263
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        2
boat          823
body         1188
home.dest     564
dtype: int64

In [48]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pclass     1309 non-null   int64  
 1   survived   1309 non-null   int64  
 2   name       1309 non-null   object 
 3   sex        1309 non-null   object 
 4   age        1046 non-null   float64
 5   sibsp      1309 non-null   int64  
 6   parch      1309 non-null   int64  
 7   ticket     1309 non-null   object 
 8   fare       1308 non-null   float64
 9   cabin      295 non-null    object 
 10  embarked   1307 non-null   object 
 11  boat       486 non-null    object 
 12  body       121 non-null    float64
 13  home.dest  745 non-null    object 
dtypes: float64(3), int64(4), object(7)
memory usage: 143.3+ KB


In [49]:
df["age"] = df["age"].interpolate()

In [50]:
df[df["fare"].isna()]  = df[df["fare"] < 400]['fare'].mean().astype(int)

In [51]:
df = df.dropna(subset=["embarked"])

In [52]:
# df["home.dest"] = df["home.dest"].fillna(df["home.dest"].mode()[0])


In [55]:
'''
replace NaN values in the home.dest column with the most common value 
from that person's embarked column
'''
most_common_destinations = pd.Series()

for name in df['embarked'].dropna().unique():
    a = df[df['embarked']==name]['home.dest'].value_counts()
    print(a)
    most_common_destinations.loc[name] = a
        
    
most_common_destinations      

home.dest
New York, NY                            29
London                                  14
Cornwall / Akron, OH                     9
Wiltshire, England Niagara Falls, NY     8
Sweden Winnipeg, MN                      7
                                        ..
Devonport, England                       1
Tokyo, Japan                             1
North Evington, England                  1
St Ives, Cornwall / Houghton, MI         1
Antwerp, Belgium / Stanton, OH           1
Name: count, Length: 293, dtype: int64
home.dest
New York, NY                             33
Paris, France                             7
Haverford, PA / Cooperstown, NY           5
Ottawa, ON                                5
Paris / Montreal, PQ                      4
                                         ..
?Havana, Cuba                             1
St James, Long Island, NY                 1
Gallipolis, Ohio / ? Paris / New York     1
Albany, NY                                1
Austria Niagara Falls, NY   

S     home.dest
New York, NY                        ...
C     home.dest
New York, NY                        ...
Q     home.dest
Ireland Chicago, IL                 ...
31          home.dest
31    1
Name: count, dtype: int64
dtype: object

In [150]:
df[['home.dest', 'embarked']]

Unnamed: 0,home.dest,embarked
0,"St Louis, MO",S
1,"Montreal, PQ / Chesterville, ON",S
2,"Montreal, PQ / Chesterville, ON",S
3,"Montreal, PQ / Chesterville, ON",S
4,"Montreal, PQ / Chesterville, ON",S
...,...,...
1304,,C
1305,,C
1306,,C
1307,,C


In [159]:
df['home.dest'] = df['home.dest'].fillna(df['embarked'])

In [152]:
df[['home.dest', 'embarked']]

Unnamed: 0,home.dest,embarked
0,"St Louis, MO",S
1,"Montreal, PQ / Chesterville, ON",S
2,"Montreal, PQ / Chesterville, ON",S
3,"Montreal, PQ / Chesterville, ON",S
4,"Montreal, PQ / Chesterville, ON",S
...,...,...
1304,C,C
1305,C,C
1306,C,C
1307,C,C


In [160]:
df['home.dest'] = df['home.dest'].replace(most_common_destinations)

In [161]:
df[['home.dest', 'embarked']]

Unnamed: 0,home.dest,embarked
0,"St Louis, MO",S
1,"Montreal, PQ / Chesterville, ON",S
2,"Montreal, PQ / Chesterville, ON",S
3,"Montreal, PQ / Chesterville, ON",S
4,"Montreal, PQ / Chesterville, ON",S
...,...,...
1304,"New York, NY",C
1305,"New York, NY",C
1306,"New York, NY",C
1307,"New York, NY",C


In [167]:
df = None

In [1]:
%pip install polars

Collecting polars
  Downloading polars-1.21.0-cp39-abi3-macosx_11_0_arm64.whl.metadata (14 kB)
Downloading polars-1.21.0-cp39-abi3-macosx_11_0_arm64.whl (28.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m28.0/28.0 MB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[?25hInstalling collected packages: polars
Successfully installed polars-1.21.0
Note: you may need to restart the kernel to use updated packages.


In [2]:
import polars as po

In [56]:
df = po.read_csv("data/titanic3.csv")

In [44]:
d1 = df.group_by("embarked", "home.dest").len().drop_nulls()
d2 = d1.select(po.col("embarked"), po.col("len")).group_by("embarked").max()

In [45]:
d1.join(d2, on=["len", "embarked"])

embarked,home.dest,len
str,str,u32
"""S""","""New York, NY""",29
"""Q""","""Ireland New York, NY""",4
"""Q""","""Ireland Chicago, IL""",4
"""C""","""New York, NY""",33


In [3]:
%pip install arrow
import pyarrow as pa

Note: you may need to restart the kernel to use updated packages.
