## Setup

In [141]:
import pandas as pd
import numpy as np

import requests
from io import StringIO

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

## EXERCISE 15. Weird taxi rides

In [2]:
df = pd.read_csv('data/nyc_taxi_2019-01.csv',
                usecols=['passenger_count', 'trip_distance',
                        'total_amount', 'payment_type'])

In [3]:
df.head()

Unnamed: 0,passenger_count,trip_distance,payment_type,total_amount
0,1,1.5,1,9.95
1,1,2.6,1,16.3
2,3,0.0,1,5.8
3,5,0.0,2,7.55
4,5,0.0,2,55.55


In [6]:
df.shape

(7667792, 4)

In [13]:
# Check for missing values
missing_values = df.isnull().sum()
print("Missing values in each column:")
print(missing_values)

Missing values in each column:
passenger_count    0
trip_distance      0
payment_type       0
total_amount       0
dtype: int64


### 15.1. Taxi dataset description

In [4]:
# How many taxi rides had more than eight passengers?
mask_passengers = df['passenger_count'] > 8
num_rides = df[mask_passengers].shape[0]
print(f'Number of taxi rides with more than 8 passengers: {num_rides}')

Number of taxi rides with more than 8 passengers: 9


In [5]:
# How many taxi rides had zero passengers?
mask_zero_passengers = df['passenger_count'] == 0
num_zero_passengers = df[mask_zero_passengers].shape[0]
print(f'Number of taxi rides with zero passengers: {num_zero_passengers}')

Number of taxi rides with zero passengers: 117381


In [8]:
# How many taxi rides were paid for in cash and cost over $1,000?
mask_cash = (df['payment_type'] == 2) & (df['total_amount'] > 1000)
num_cash_over_1000 = df[mask_cash].shape[0]
print(f'Number of taxi rides paid in cash and costing over $1,000: {num_cash_over_1000}')

Number of taxi rides paid in cash and costing over $1,000: 5


In [10]:
# How many rides cost less than $0?
mask_cost_less_than_zero = df['total_amount'] < 0
num_cost_less_than_zero = df[mask_cost_less_than_zero].shape[0]
print(f'Number of taxi rides costing less than $0: {num_cost_less_than_zero}')

Number of taxi rides costing less than $0: 7131


In [11]:
# How many rides traveled a below-average distance but cost an above-average amount?
mean_distance = df['trip_distance'].mean()
mean_amount = df['total_amount'].mean()
mask_below_avg_distance = df['trip_distance'] < mean_distance
mask_above_avg_amount = df['total_amount'] > mean_amount
num_below_avg_distance_above_avg_amount = df[mask_below_avg_distance & mask_above_avg_amount].shape[0]
print(f'Number of taxi rides with below-average distance and above-average amount: {num_below_avg_distance_above_avg_amount}')

Number of taxi rides with below-average distance and above-average amount: 411255


### 15.2. Taxi dataset description with query

In [15]:
# How many taxi rides had more than eight passengers?
query_passengers = 'passenger_count > 8'
num_rides_query = df.query(query_passengers).shape[0]
print(f'Number of taxi rides with more than 8 passengers (using query): {num_rides_query}')

Number of taxi rides with more than 8 passengers (using query): 9


In [16]:
# How many taxi rides had zero passengers?
query_zero_passengers = 'passenger_count == 0'
num_zero_passengers_query = df.query(query_zero_passengers).shape[0]
print(f'Number of taxi rides with zero passengers (using query): {num_zero_passengers_query}')

Number of taxi rides with zero passengers (using query): 117381


In [17]:
# How many taxi rides were paid for in cash and cost over $1,000?
query_cash = 'payment_type == 2 and total_amount > 1000'
num_cash_over_1000_query = df.query(query_cash).shape[0]
print(f'Number of taxi rides paid in cash and costing over $1,000 (using query): {num_cash_over_1000_query}')

Number of taxi rides paid in cash and costing over $1,000 (using query): 5


In [18]:
# How many rides cost less than $0?
query_cost_less_than_zero = 'total_amount < 0'
num_cost_less_than_zero_query = df.query(query_cost_less_than_zero).shape[0]
print(f'Number of taxi rides costing less than $0 (using query): {num_cost_less_than_zero_query}')

Number of taxi rides costing less than $0 (using query): 7131


In [19]:
# How many rides traveled a below-average distance but cost an above-average amount?
mean_distance = df['trip_distance'].mean()
mean_amount = df['total_amount'].mean()
query_below_avg_distance_above_avg_amount = f'trip_distance < {mean_distance} and total_amount > {mean_amount}'
num_below_avg_distance_above_avg_amount_query = df.query(query_below_avg_distance_above_avg_amount).shape[0]
print(f'Number of taxi rides with below-average distance and above-average amount (using query): {num_below_avg_distance_above_avg_amount_query}')

Number of taxi rides with below-average distance and above-average amount (using query): 411255


### 15.3. How many rides that cost less than $0 involved either a dispute (payment_type of 4) or a voided trip (payment_type of 6)?

In [20]:
# How many rides that cost less than $0 involved either a dispute (payment_type of 4) 
# or a voided trip (payment_type of 6)?
mask_cost_less_than_zero = df['total_amount'] < 0
mask_dispute_or_voided = (df['payment_type'] == 4) | (df['payment_type'] == 6)
num_dispute_or_voided = df[mask_cost_less_than_zero & mask_dispute_or_voided].shape[0]
print(f'Number of rides that cost less than $0 and involved either a dispute or a voided trip: {num_dispute_or_voided}')

Number of rides that cost less than $0 and involved either a dispute or a voided trip: 2666


In [21]:
df.loc[mask_cost_less_than_zero & mask_dispute_or_voided, 'payment_type'].value_counts()

payment_type
4    2666
Name: count, dtype: int64

### 15.4. What percentage normally pays in cash versus a credit card?

In [22]:
df['payment_type'].value_counts()

payment_type
1    5486027
2    2137415
3      33186
4      11164
Name: count, dtype: int64

In [23]:
df['payment_type'].value_counts(normalize=True) * 100 

payment_type
1    71.546372
2    27.875234
3     0.432797
4     0.145596
Name: proportion, dtype: float64

In [24]:
df['payment_type'].value_counts(normalize=True)[[1, 2]] * 100  # Cash vs Credit Card

payment_type
1    71.546372
2    27.875234
Name: proportion, dtype: float64

## EXERCISE 16. Pandemic taxis

In [2]:
df_2019_jul = pd.read_csv('data/nyc_taxi_2019-07.csv',
                usecols=['passenger_count', 
                        'total_amount', 'payment_type'])
df_2019_jul['year'] = 2019

df_2020_jul = pd.read_csv('data/nyc_taxi_2020-07.csv',
                usecols=['passenger_count', 
                        'total_amount', 'payment_type'])
df_2020_jul['year'] = 2020

df = pd.concat([df_2019_jul, df_2020_jul])

In [3]:
df.shape

(7110831, 4)

In [4]:
df.head()

Unnamed: 0,passenger_count,payment_type,total_amount,year
0,1.0,1.0,4.94,2019
1,1.0,2.0,20.3,2019
2,1.0,1.0,70.67,2019
3,1.0,1.0,66.36,2019
4,0.0,1.0,15.3,2019


### 16.1. Comparison of datasets for July 2019 and 2020

In [5]:
# How many rides were taken in 2019 and 2020, and what is the difference between these two figures?
df['year'].value_counts()

year
2019    6310419
2020     800412
Name: count, dtype: int64

In [6]:
mask_2019 = df['year'] == 2019
mask_2020 = df['year'] == 2020

In [7]:
rides_2019 = df[df['year'] == 2019].shape[0]
rides_2020 = df[df['year'] == 2020].shape[0]
difference = rides_2019 - rides_2020
print(f'Number of rides in 2019: {rides_2019:,}')
print(f'Number of rides in 2020: {rides_2020:,}')
print(f'Difference in rides between 2019 and 2020: {difference:,}')

Number of rides in 2019: 6,310,419
Number of rides in 2020: 800,412
Difference in rides between 2019 and 2020: 5,510,007


In [8]:
# How much money (in total) was collected in 2019 and 2020, and what was the difference between these two figures?
total_amount_2019 = df.loc[df['year'] == 2019, 'total_amount'].sum()
total_amount_2020 = df.loc[df['year'] == 2020, 'total_amount'].sum()
difference_amount = total_amount_2019 - total_amount_2020
print(f'Total amount collected in 2019: ${total_amount_2019:,.0f}')
print(f'Total amount collected in 2020: ${total_amount_2020:,.0f}')
print(f'Difference in total amount between 2019 and 2020: ${difference_amount:,.0f}')

Total amount collected in 2019: $123,761,823
Total amount collected in 2020: $14,912,844
Difference in total amount between 2019 and 2020: $108,848,979


In [9]:
# Did the proportion of trips with more than one passenger change dramatically?
df.loc[mask_2019, 'passenger_count'].value_counts(normalize=True)

passenger_count
1.0    0.697987
2.0    0.151958
3.0    0.044481
5.0    0.040510
6.0    0.024171
4.0    0.022259
0.0    0.018623
7.0    0.000005
8.0    0.000004
9.0    0.000003
Name: proportion, dtype: float64

In [10]:
df.loc[mask_2020, 'passenger_count'].value_counts(normalize=True)

passenger_count
1.0    0.767402
2.0    0.123243
3.0    0.029527
0.0    0.026446
5.0    0.023194
6.0    0.019587
4.0    0.010589
7.0    0.000007
8.0    0.000003
9.0    0.000001
Name: proportion, dtype: float64

In [11]:
proportion_more_than_one_passenger_2019 = 1 - df.loc[mask_2019, 'passenger_count'].value_counts(normalize=True)[1]
proportion_more_than_one_passenger_2020 = 1 - df.loc[mask_2020, 'passenger_count'].value_counts(normalize=True)[1]
print(f'Proportion of trips with more than one passenger in 2019: {proportion_more_than_one_passenger_2019*100:.2f}%')
print(f'Proportion of trips with more than one passenger in 2020: {proportion_more_than_one_passenger_2020*100:.2f}%')  
change = proportion_more_than_one_passenger_2020 - proportion_more_than_one_passenger_2019
print(f'Change in proportion of trips with more than one passenger from 2019 to 2020: {change:.2f}%')

Proportion of trips with more than one passenger in 2019: 30.20%
Proportion of trips with more than one passenger in 2020: 23.26%
Change in proportion of trips with more than one passenger from 2019 to 2020: -0.07%


In [12]:
# Did people use cash (i.e., payment_type of 2) less in 2020 than in 2019?
df.loc[mask_2019, 'payment_type'].value_counts(normalize=True)

payment_type
1.0    0.704883
2.0    0.287060
3.0    0.005654
4.0    0.002404
Name: proportion, dtype: float64

In [13]:
df.loc[mask_2020, 'payment_type'].value_counts(normalize=True)

payment_type
1.0    0.665705
2.0    0.320559
3.0    0.009245
4.0    0.004490
Name: proportion, dtype: float64

### 16.2. Use the corr method on df to find the correlations among the columns. How would you interpret these results?

In [14]:
df.corr()

Unnamed: 0,passenger_count,payment_type,total_amount,year
passenger_count,1.0,0.01641,0.014943,-0.049558
payment_type,0.01641,1.0,-0.138561,0.029277
total_amount,0.014943,-0.138561,1.0,-0.019706
year,-0.049558,0.029277,-0.019706,1.0


### 16.3. Difference in descriptive statistics for total_amount between 2019 and 2020.

In [17]:
df.loc[mask_2019, 'total_amount'].describe().round(2) - df.loc[mask_2020, 'total_amount'].describe().round(2)

count    5510007.00
mean           0.98
std            0.75
min          -53.20
25%            0.50
50%            0.60
75%            0.75
max         4672.45
Name: total_amount, dtype: float64

### 16.4. Zero-passenger trips

In [25]:
# How many zero-passenger trips were there in 2019 and 2020?
(df.loc[mask_2019, 'passenger_count'].value_counts(normalize=True)[0],
df.loc[mask_2020, 'passenger_count'].value_counts(normalize=True)[0])

(np.float64(0.018622599363335383), np.float64(0.026446482682882185))

## EXERCISE 17. Setting column types

### 17.1 Assigning dtypes

In [26]:
df = pd.read_csv('data/nyc_taxi_2020-01.csv',
                usecols=['passenger_count',
                         'total_amount' , 
                         'payment_type'],
                dtype={'passenger_count': np.float64, 
                       'total_amount': np.float64, 
                       'payment_type': np.float64})

In [27]:
df.shape

(6405008, 3)

In [28]:
df.head()

Unnamed: 0,passenger_count,payment_type,total_amount
0,1.0,1.0,11.27
1,1.0,1.0,12.3
2,1.0,1.0,10.8
3,1.0,1.0,8.16
4,1.0,2.0,4.8


In [32]:
# Check for missing values
missing_values = df.isnull().sum()
print("Missing values in each column:")
print(missing_values)

Missing values in each column:
passenger_count    65441
payment_type       65441
total_amount           0
dtype: int64


In [31]:
# Identify rows containing NaN values
df[df.isnull().any(axis=1)]

Unnamed: 0,passenger_count,payment_type,total_amount
6339567,,,54.60
6339568,,,30.11
6339569,,,27.91
6339570,,,29.63
6339571,,,28.83
...,...,...,...
6405003,,,21.14
6405004,,,62.46
6405005,,,51.90
6405006,,,30.22


In [35]:
# Drop rows with NaN values
df_cleaned = df.dropna()
print(f'Number of rows after dropping NaN values: {df_cleaned.shape[0]}')

# Check for missing values after dropping
missing_values_cleaned = df_cleaned.isnull().sum()
print("Missing values after dropping rows with NaN:")
print(missing_values_cleaned)

Number of rows after dropping NaN values: 6339567
Missing values after dropping rows with NaN:
passenger_count    0
payment_type       0
total_amount       0
dtype: int64


In [36]:
# Set dtype for passenger_count and payment_type to int
df['passenger_count'] = df['passenger_count'].astype('Int64')
df['payment_type'] = df['payment_type'].astype('Int8')

In [37]:
df.dtypes

passenger_count      Int64
payment_type          Int8
total_amount       float64
dtype: object

### 17.2-3 Data frame from four other columns

In [42]:
df = pd.read_csv('data/nyc_taxi_2020-01.csv',
                usecols=['VendorID', 'trip_distance', 'tip_amount', 'total_amount'],
                dtype={'VendorID':np.float32,
                       'trip_distance':np.float32, 
                       'tip_amount':np.float32,
                       'total_amount':np.float32})

In [43]:
# Check for missing values
missing_values = df.isnull().sum()
print("Missing values in each column:")
print(missing_values)

Missing values in each column:
VendorID         65441
trip_distance        0
tip_amount           0
total_amount         0
dtype: int64


In [44]:
# Get unique values in VendorID
unique_vendor_ids = df['VendorID'].unique()
print(f'Unique VendorIDs: {unique_vendor_ids}')

Unique VendorIDs: [ 1.  2. nan]


In [49]:
# Drop rows with NaN values
# df = df.dropna().copy()

# Fill NaN values with 3.0
df['VendorID'] = df['VendorID'].fillna(3.0)

# Check for missing values after filling
missing_values_filled = df['VendorID'].isnull().sum()
print(f"Missing values in VendorID: {missing_values_filled}")

Missing values in VendorID: 0


In [50]:
# Set the int type for VendorID to Int8
df['VendorID'] = df['VendorID'].astype('Int8')

### 17.4 Memory usage

In [51]:
df = pd.read_csv('data/nyc_taxi_2020-01.csv',
                usecols=['VendorID', 'trip_distance', 'tip_amount', 'total_amount'],
                dtype={'VendorID':np.float64,
                       'trip_distance':np.float64, 
                       'tip_amount':np.float64,
                       'total_amount':np.float64})

In [64]:
df.head()

Unnamed: 0,VendorID,trip_distance,tip_amount,total_amount
0,1.0,1.2,1.47,11.27
1,1.0,1.2,1.5,12.3
2,1.0,0.6,1.0,10.8
3,1.0,0.8,1.36,8.16
4,2.0,0.0,0.0,4.8


In [57]:
memory_usage_float_64 = df.memory_usage().sum() / 1024**2  # Memory usage in MB
print(f'Memory usage with float64: {memory_usage_float_64:,.0f} MB')

Memory usage with float64: 195 MB


In [63]:
# Get the maximum value in all columns
max_values = df.max()
print("Maximum values in each column:")
for col, max_val in max_values.items():
    print(f'{col}: {max_val:,.2f}')

Maximum values in each column:
VendorID: 2.00
trip_distance: 210,240.07
tip_amount: 1,100.00
total_amount: 4,268.30


In [70]:
# Get the maximum values for numpy integer types
types = [np.int8, np.int16, np.int32, np.int64]
for t in types:
    max_value = np.iinfo(t).max
    print(f'Maximum value for {t.__name__}: {max_value:,}')

# Get the maximum values for numpy float types
print("\nMaximum values for numpy float types:")
float_types = [np.float16, np.float32, np.float64]
for t in float_types:
    max_value = np.finfo(t).max
    print(f'Maximum value for {t.__name__}: {max_value:,.2f}')

Maximum value for int8: 127
Maximum value for int16: 32,767
Maximum value for int32: 2,147,483,647
Maximum value for int64: 9,223,372,036,854,775,807

Maximum values for numpy float types:
Maximum value for float16: 65,504.00
Maximum value for float32: 340,282,346,638,528,859,811,704,183,484,516,925,440.00
Maximum value for float64: 179,769,313,486,231,570,814,527,423,731,704,356,798,070,567,525,844,996,598,917,476,803,157,260,780,028,538,760,589,558,632,766,878,171,540,458,953,514,382,464,234,321,326,889,464,182,768,467,546,703,537,516,986,049,910,576,551,282,076,245,490,090,389,328,944,075,868,508,455,133,942,304,583,236,903,222,948,165,808,559,332,123,348,274,797,826,204,144,723,168,738,177,180,919,299,881,250,404,026,184,124,858,368.00


In [72]:
# Drop rows with NaN values
df_cleaned = df.dropna().copy()

# Set an appropriate integer type for all columns
df_cleaned['VendorID'] = df_cleaned['VendorID'].astype('Int8')
df_cleaned['trip_distance'] = df_cleaned['trip_distance'].astype('float32')
df_cleaned['tip_amount'] = df_cleaned['tip_amount'].astype('float16')
df_cleaned['total_amount'] = df_cleaned['total_amount'].astype('float16')

In [76]:
memory_usage_optimized = df_cleaned.memory_usage().sum() / 1024**2  # Memory usage in MB
print(f'Memory usage with float64: {memory_usage_optimized:,.0f} MB')

Memory usage with float64: 109 MB


In [78]:
# Compare memory usage before and after optimization for VendorID
memory_usage_float_64_vendor = df['VendorID'].memory_usage() / 1024**2  # Memory usage in MB
memory_usage_optimized_vendor = df_cleaned['VendorID'].memory_usage() / 1024**2  # Memory usage in MB
print(f'Memory usage for VendorID with float64: {memory_usage_float_64_vendor:,.2f} MB')
print(f'Memory usage for VendorID with Int8: {memory_usage_optimized_vendor:,.2f} MB')

Memory usage for VendorID with float64: 48.87 MB
Memory usage for VendorID with Int8: 60.46 MB


## EXERCISE 18. `passwd` to df

### 18.1 Read `passwd` into a dataframe

In [83]:
file_name = 'data/linux-etc-passwd.txt'
separator = ':' # Separate fields with a colon
comment = '#' # Skip lines starting with this character
num_lines_to_skip = 2  # Skip the first 2 lines
header_line = None # None means no header line
column_names = ['username', 'password', 'userid', 'groupid', 'name', 'homedir', 'shell'] # Column names to use
index_column = 'username' # Set the index column to 'username'

In [84]:
df = pd.read_csv(
    filepath_or_buffer=file_name,
    sep=separator,
    comment=comment,
    # skiprows=num_lines_to_skip,
    header=header_line,
    names=column_names,
    index_col=index_column
)

In [85]:
df.head()

Unnamed: 0_level_0,password,userid,groupid,name,homedir,shell
username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
root,x,0,0,root,/root,/bin/bash
daemon,x,1,1,daemon,/usr/sbin,/usr/sbin/nologin
bin,x,2,2,bin,/bin,/usr/sbin/nologin
sys,x,3,3,sys,/dev,/usr/sbin/nologin
sync,x,4,65534,sync,/bin,/bin/sync


### 18.2 Ignore the password and groupid fields so they don’t appear in the data frame

In [96]:
file_name = 'data/linux-etc-passwd.txt'
separator = ':' # Separate fields with a colon
comment = '#' # Skip lines starting with this character
num_lines_to_skip = 2  # Skip the first 2 lines
header_line = None # None means no header line
column_names = ['username', 'password', 'userid', 'groupid', 'name', 'homedir', 'shell'] # Column names to use
index_column = 'username' # Set the index column to 'username'

In [97]:
columns_to_drop = ['password', 'groupid']  # Columns to drop

In [98]:
df = pd.read_csv(
    filepath_or_buffer=file_name,
    sep=separator,
    comment=comment,
    # skiprows=num_lines_to_skip,
    header=header_line,
    names=column_names,
    index_col=index_column
)

In [99]:
df = df.drop(columns=columns_to_drop)

In [100]:
df.head()

Unnamed: 0_level_0,userid,name,homedir,shell
username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
root,0,root,/root,/bin/bash
daemon,1,daemon,/usr/sbin,/usr/sbin/nologin
bin,2,bin,/bin,/usr/sbin/nologin
sys,3,sys,/dev,/usr/sbin/nologin
sync,4,sync,/bin,/bin/sync


In [106]:
usecolumn = [col for col in column_names if col not in columns_to_drop ]
usecolumn

['username', 'userid', 'name', 'homedir', 'shell']

In [107]:
df = pd.read_csv(
    filepath_or_buffer=file_name,
    sep=separator,
    comment=comment,
    # skiprows=num_lines_to_skip,
    header=header_line,
    usecols=usecolumn,
    names=column_names,
    index_col=index_column
)

In [108]:
df.head()

Unnamed: 0_level_0,userid,name,homedir,shell
username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
root,0,root,/root,/bin/bash
daemon,1,daemon,/usr/sbin,/usr/sbin/nologin
bin,2,bin,/bin,/usr/sbin/nologin
sys,3,sys,/dev,/usr/sbin/nologin
sync,4,sync,/bin,/bin/sync


### 18.3 Nonspecial usernames

In [118]:
# Unix systems typically reserve user IDs below 1000 to special accounts. Show the
# nonspecial usernames in this passwd file.
mask_nonspecial = df['userid'] >= 1000
nonspecial_usernames = df.loc[mask_nonspecial, 'userid']
for username, userid in nonspecial_usernames.items():
    print(f"username:{username}, userid:{userid}")

username:nobody, userid:65534
username:user, userid:1000
username:reuven, userid:1001
username:genadi, userid:1002
username:shira, userid:1003
username:atara, userid:1004
username:shikma, userid:1005
username:amotz, userid:1006
username:git, userid:1007
username:deploy, userid:1008


### 18.4 Different shells in this file

In [119]:
df.head()

Unnamed: 0_level_0,userid,name,homedir,shell
username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
root,0,root,/root,/bin/bash
daemon,1,daemon,/usr/sbin,/usr/sbin/nologin
bin,2,bin,/bin,/usr/sbin/nologin
sys,3,sys,/dev,/usr/sbin/nologin
sync,4,sync,/bin,/bin/sync


In [120]:
df['shell'].unique()

array(['/bin/bash', '/usr/sbin/nologin', '/bin/sync', '/bin/false',
       '/bin/sh', '/bin/nologin'], dtype=object)

In [121]:
df['shell'].drop_duplicates()

username
root                    /bin/bash
daemon          /usr/sbin/nologin
sync                    /bin/sync
syslog                 /bin/false
debian-spamd              /bin/sh
gitlab-redis         /bin/nologin
Name: shell, dtype: object

## EXERCISE 19. Bitcoin values

### 19.1 Prices for Bitcoin

In [132]:
url_bitcoin = 'https://api.blockchain.info/charts/market-price?format=csv'

df = pd.read_csv(filepath_or_buffer=url_bitcoin,
                 header=None,  # No header row in the CSV
                 names=['timestamp', 'price'],  # Assign column names
                 parse_dates=['timestamp']  # Parse the timestamp column as datetime
                 )

In [133]:
# Format the price column to two decimal places and add a comma as a thousands separator
pd.options.display.float_format = '{:,.2f}'.format

In [134]:
df.head()

Unnamed: 0,timestamp,price
0,2024-07-01,62676.36
1,2024-07-02,62845.12
2,2024-07-03,62028.38
3,2024-07-04,60145.84
4,2024-07-05,57034.05


In [135]:
df.shape, df.dtypes

((366, 2),
 timestamp    datetime64[ns]
 price               float64
 dtype: object)

In [137]:
# The closing price for the most recent trading day
closing_price = df['price'].iloc[-1]
print(f'Closing price for the most recent trading day: ${closing_price:,.2f}')

Closing price for the most recent trading day: $107,130.31


In [138]:
# The lowest historical price and the date of that price
min_price = df['price'].min()
min_price_date = df.loc[df['price'] == min_price, 'timestamp'].iloc[0]
print(f'Lowest historical price: ${min_price:,.2f} on {min_price_date.date()}')

Lowest historical price: $53,951.72 on 2024-09-07


In [139]:
# The highest historical price and the date of that price
max_price = df['price'].max()
max_price_date = df.loc[df['price'] == max_price, 'timestamp'].iloc[0]
print(f'Highest historical price: ${max_price:,.2f} on {max_price_date.date()}')

Highest historical price: $111,722.19 on 2025-05-23


### 19.2 Without assigning the downloaded data to an interim variable, can you return the current value? 

In [140]:
# Return the current value without assigning the downloaded data to an interim variable
pd.read_csv(filepath_or_buffer=url_bitcoin,
            header=None,  # No header row in the CSV
            names=['timestamp', 'price'],  # Assign column names
            parse_dates=['timestamp']  # Parse the timestamp column as datetime
            )['price'].iloc[-1]

np.float64(107130.31)

### 19.3 Retrieve 1 year of historical S&P 500 data from Yahoo Finance

In [152]:
url = "https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)"
tables = pd.read_html(url)# The third table is usually the main one

In [158]:
type(tables), len(tables)

for i, table in enumerate(tables):
    print(f"Table {i}: type={type(table)}, shape={table.shape}")

for i, table in enumerate(tables):
    print(f"Table {i}: {table.head()}")

Table 0: type=<class 'pandas.core.frame.DataFrame'>, shape=(1, 1)
Table 1: type=<class 'pandas.core.frame.DataFrame'>, shape=(1, 3)
Table 2: type=<class 'pandas.core.frame.DataFrame'>, shape=(214, 7)
Table 3: type=<class 'pandas.core.frame.DataFrame'>, shape=(9, 2)
Table 4: type=<class 'pandas.core.frame.DataFrame'>, shape=(8, 2)
Table 5: type=<class 'pandas.core.frame.DataFrame'>, shape=(13, 2)
Table 6: type=<class 'pandas.core.frame.DataFrame'>, shape=(2, 2)
Table 0:                                                    0
0  Largest economies in the world by GDP (nominal...
Table 1:                                                    0  \
0  > $20 trillion $10–20 trillion $5–10 trillion ...   

                                                   1  \
0  $750 billion – $1 trillion $500–750 billion $2...   

                                                   2  
0  $50–100 billion $25–50 billion $5–25 billion <...  
Table 2:   Country/Territory IMF[1][12]            World Bank[13]          

In [159]:
df = tables[2]  # The third table is usually the main one
df.head()

Unnamed: 0_level_0,Country/Territory,IMF[1][12],IMF[1][12],World Bank[13],World Bank[13],United Nations[14],United Nations[14]
Unnamed: 0_level_1,Country/Territory,Forecast,Year,Estimate,Year,Estimate,Year
0,World,113795678,2025,105435540,2023,100834796,2022
1,United States,30507217,2025,27360935,2023,27720700,2023
2,China,19231705,[n 1]2025,17794782,[n 3]2023,17794782,[n 1]2023
3,Germany,4744804,2025,4456081,2023,4525704,2023
4,India,4187017,2025,3549919,2023,3575778,2023


In [160]:
df.shape

(214, 7)

In [161]:
df.columns

MultiIndex([( 'Country/Territory', 'Country/Territory'),
            (        'IMF[1][12]',          'Forecast'),
            (        'IMF[1][12]',              'Year'),
            (    'World Bank[13]',          'Estimate'),
            (    'World Bank[13]',              'Year'),
            ('United Nations[14]',          'Estimate'),
            ('United Nations[14]',              'Year')],
           )

In [162]:
import re

# Clean the first level of the MultiIndex
new_level_0 = [re.sub(r'\[.*?\]', '', col[0]).strip() for col in df.columns]
# Keep the second level as is (or clean if you want)
new_level_1 = [col[1] for col in df.columns]

# Assign new MultiIndex to columns
df.columns = pd.MultiIndex.from_tuples(zip(new_level_0, new_level_1))

In [163]:
df.columns

MultiIndex([('Country/Territory', 'Country/Territory'),
            (              'IMF',          'Forecast'),
            (              'IMF',              'Year'),
            (       'World Bank',          'Estimate'),
            (       'World Bank',              'Year'),
            (   'United Nations',          'Estimate'),
            (   'United Nations',              'Year')],
           )

In [173]:
df.head()

Unnamed: 0_level_0,Country/Territory,IMF,IMF,World Bank,World Bank,United Nations,United Nations
Unnamed: 0_level_1,Country/Territory,Forecast,Year,Estimate,Year,Estimate,Year
0,World,113795678,2025,105435540,2023,100834796,2022
1,United States,30507217,2025,27360935,2023,27720700,2023
2,China,19231705,[n 1]2025,17794782,[n 3]2023,17794782,[n 1]2023
3,Germany,4744804,2025,4456081,2023,4525704,2023
4,India,4187017,2025,3549919,2023,3575778,2023


In [197]:
data_columns = [('IMF', 'Forecast'), ('World Bank', 'Estimate'), ('United Nations', 'Estimate')]
imf_forecast_col = data_columns[0]
wb_estimate_col = data_columns[1]
un_estimate_col = data_columns[2]

dash_symbol = '—'

def check_for_dash(df, dash_symbol = '—'):

    for col in data_columns:
        contains_dash = df[col].str.contains('—').sum()
        print(f"{col[0]} - {col[1]}: {contains_dash} rows contain '-'")

print("Check for dash before replacement:")
check_for_dash(df, dash_symbol)

# Remove the dash symbol from the specified columns
for col in data_columns:
    df[col] = df[col].str.replace(dash_symbol, '0', regex=False)

print("\nCheck for dash after replacement:")
check_for_dash(df, dash_symbol)

Check for dash before replacement:
IMF - Forecast: 19 rows contain '-'
World Bank - Estimate: 12 rows contain '-'
United Nations - Estimate: 1 rows contain '-'

Check for dash after replacement:
IMF - Forecast: 0 rows contain '-'
World Bank - Estimate: 0 rows contain '-'
United Nations - Estimate: 0 rows contain '-'


In [201]:
# Convert the specified columns to integer type
for col in data_columns:
    df[col] = df[col].astype(np.float64)

# Check the data types of the columns
print("\nData types after conversion:")
print(df.dtypes)


Data types after conversion:
Country/Territory  Country/Territory     object
IMF                Forecast             float64
                   Year                  object
World Bank         Estimate             float64
                   Year                  object
United Nations     Estimate             float64
                   Year                  object
dtype: object


In [202]:
# Format Forecast and Estimate columns to add a comma as a thousands separator
pd.options.display.float_format = '{:,.0f}'.format 

In [203]:
df.head()

Unnamed: 0_level_0,Country/Territory,IMF,IMF,World Bank,World Bank,United Nations,United Nations
Unnamed: 0_level_1,Country/Territory,Forecast,Year,Estimate,Year,Estimate,Year
0,World,113795678,2025,105435540,2023,100834796,2022
1,United States,30507217,2025,27360935,2023,27720700,2023
2,China,19231705,[n 1]2025,17794782,[n 3]2023,17794782,[n 1]2023
3,Germany,4744804,2025,4456081,2023,4525704,2023
4,India,4187017,2025,3549919,2023,3575778,2023


## EXERCISE 20. Big cities

### 20.1 Data exploration

In [206]:
file_name = 'data/cities.json'
df = pd.read_json(file_name)

In [224]:
df.head(10)

Unnamed: 0,city,growth_from_2000_to_2013,latitude,longitude,population,rank,state
0,New York,4.8%,41,-74,8405837,1,New York
1,Los Angeles,4.8%,34,-118,3884307,2,California
2,Chicago,-6.1%,42,-88,2718782,3,Illinois
3,Houston,11.0%,30,-95,2195914,4,Texas
4,Philadelphia,2.6%,40,-75,1553165,5,Pennsylvania
5,Phoenix,14.0%,33,-112,1513367,6,Arizona
6,San Antonio,21.0%,29,-98,1409019,7,Texas
7,San Diego,10.5%,33,-117,1355896,8,California
8,Dallas,5.6%,33,-97,1257676,9,Texas
9,San Jose,10.5%,37,-122,998537,10,California


In [211]:
df.shape, df['city'].nunique()

((1000, 7), 925)

In [215]:
# Find the repeated cities
mask = df['city'].duplicated()
df['city'][mask].value_counts()

city
Springfield     4
Lancaster       3
Lakewood        3
Columbus        2
Concord         2
               ..
San Marcos      1
Bellevue        1
Aurora          1
Apple Valley    1
Florence        1
Name: count, Length: 62, dtype: int64

In [209]:
# What are the mean and median populations for these 1,000 largest cities?
df['population'].mean(), df['population'].median()

(np.float64(131132.443), np.float64(68207.0))

In [221]:
df['population'].describe()[['mean', '50%']]

mean   131,132
50%     68,207
Name: population, dtype: float64

In [219]:
# Along these lines, if you remove the 50 most populous cities, what happens to
# the mean population? What happens to the median?
population_thershold = df['population'].nlargest(50).min()
print(f'Population threshold for the 50 most populous cities: {population_thershold:,}')

mask_population = df['population'] < population_thershold
df.loc[mask_population, 'population'].mean(), df.loc[mask_population, 'population'].median()

Population threshold for the 50 most populous cities: 379,577


(np.float64(87027.38736842106), np.float64(65796.0))

In [222]:
df.loc[mask_population, 'population'].describe()[['mean', '50%']]

mean   87,027
50%    65,796
Name: population, dtype: float64

In [225]:
# What is the northernmost city, and where does it rank?
max_latitude = df['latitude'].max()
northernmost_city = df[df['latitude'] == max_latitude]
northernmost_city

Unnamed: 0,city,growth_from_2000_to_2013,latitude,longitude,population,rank,state
62,Anchorage,15.4%,61,-150,300950,63,Alaska


In [228]:
idx = df['latitude'].idxmax()
idx

62

In [230]:
df.loc[[idx]]

Unnamed: 0,city,growth_from_2000_to_2013,latitude,longitude,population,rank,state
62,Anchorage,15.4%,61,-150,300950,63,Alaska


In [232]:
df.loc[[idx], ['city', 'rank']]

Unnamed: 0,city,rank
62,Anchorage,63


In [236]:
# Which state has the largest number of cities on this list?
df['state'].nunique(), df['state'].value_counts()[:5]

(51,
 state
 California       212
 Texas             83
 Florida           73
 Illinois          52
 Massachusetts     36
 Name: count, dtype: int64)

### 20.2 Mean and median changes in city size between 2000 and 2013

In [238]:
df.head()

Unnamed: 0,city,growth_from_2000_to_2013,latitude,longitude,population,rank,state
0,New York,4.8%,41,-74,8405837,1,New York
1,Los Angeles,4.8%,34,-118,3884307,2,California
2,Chicago,-6.1%,42,-88,2718782,3,Illinois
3,Houston,11.0%,30,-95,2195914,4,Texas
4,Philadelphia,2.6%,40,-75,1553165,5,Pennsylvania


In [239]:
df.dtypes

city                         object
growth_from_2000_to_2013     object
latitude                    float64
longitude                   float64
population                    int64
rank                          int64
state                        object
dtype: object

In [240]:
# Check for missing values
missing_values = df.isnull().sum()
print("Missing values in each column:")
print(missing_values)

Missing values in each column:
city                        0
growth_from_2000_to_2013    0
latitude                    0
longitude                   0
population                  0
rank                        0
state                       0
dtype: int64


In [255]:
# Read the data from the json file
file_name = 'data/cities.json'
df = pd.read_json(file_name)

# Convert the growth_from_2000_to_2013 column into a floating-point number
growth_col = 'growth_from_2000_to_2013'

# Remove the percentage sign
df[growth_col] = df[growth_col].str.replace('%', '')

# Check for empty strings
df[growth_col] = df[growth_col].replace('', np.nan)

# Check for missing values after conversion
missing_values_after_conversion = df[growth_col].isnull().sum()
print(missing_values_after_conversion)

# Drop missing values
df[growth_col] = df[growth_col].dropna()

# Convert to float
df[growth_col] = df[growth_col].astype(float)

# Find the mean and median changes in city size between 2000 and 2013
df[growth_col].describe()[['mean', '50%']]

6


mean   23
50%    10
Name: growth_from_2000_to_2013, dtype: float64

### 20.3 How many cities had positive growth in this period, and how many had negative growth?

In [258]:
num_cities_positive_growth = df[df[growth_col] > 0].shape[0]
num_cities_negative_growth = df[df[growth_col] < 0].shape[0]
num_cities_with_zero_growth = df[df[growth_col] == 0].shape[0]
print(f'Number of cities with zero growth: {num_cities_with_zero_growth}')
print(f'Number of cities with positive growth: {num_cities_positive_growth}')
print(f'Number of cities with negative growth: {num_cities_negative_growth}')

Number of cities with zero growth: 5
Number of cities with positive growth: 847
Number of cities with negative growth: 142


In [262]:
# Find the city or cities with latitudes more than two standard deviations from the mean
mean_latitude = df['latitude'].mean()
std_latitude = df['latitude'].std()
mask_outliers = (df['latitude'] < mean_latitude - 2 * std_latitude) 
mask_outliers |= (df['latitude'] > mean_latitude + 2 * std_latitude)
outliers = df[mask_outliers]
outliers.shape

(52, 7)