## Setup

In [1]:
import pandas as pd
import numpy as np

import requests
from io import StringIO

import string

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

## EXERCISE 29. Longest taxi rides

### 29.1 Data Exploration

In [2]:
file_name = 'data/nyc_taxi_2019-01.csv'
use_cols=['passenger_count', 'trip_distance', 'total_amount']

df = pd.read_csv(file_name, usecols=use_cols)

df.head()

Unnamed: 0,passenger_count,trip_distance,total_amount
0,1,1.5,9.95
1,1,2.6,16.3
2,3,0.0,5.8
3,5,0.0,7.55
4,5,0.0,55.55


In [3]:
# Using a descending sort, find the average cost of the 20 longest (in distance) taxi rides in January 2019
df_longest_rides = df.sort_values(by='trip_distance', ascending=False).head(20)
df_longest_rides['total_amount'].mean().round(2).item()

290.01

In [4]:
# The same result can be achieved using the `nlargest` method
df_longest_rides = df.nlargest(20, 'trip_distance')
df_longest_rides['total_amount'].mean().round(2).item()

290.01

In [5]:
# Using an ascending sort, find the average cost of the 20 longest (in distance) taxi rides in January 2019
df_longest_rides = df.sort_values(by='trip_distance', ascending=True).tail(20)
df_longest_rides['total_amount'].mean().round(2).item()

290.01

In [6]:
# Sort by ascending passenger count and descending trip distance
df_sorted = df.sort_values(by=['passenger_count', 'trip_distance'], ascending=[True, False])

# What is the average price paid for the top 50 rides?
df_top_50 = df_sorted.head(50)
df_top_50['total_amount'].mean().round(2).item()

135.5

### 29.2 Beyond 1

In [7]:
df.head(), df.shape

(   passenger_count  trip_distance  total_amount
 0                1            1.5          9.95
 1                1            2.6         16.30
 2                3            0.0          5.80
 3                5            0.0          7.55
 4                5            0.0         55.55,
 (7667792, 3))

In [8]:
# In which five rides did people pay the most per mile? How far did people go on those trips?

# Remove rides with zero distance
mask_zero = df['trip_distance'] == 0
mask_non_zero = ~mask_zero
df = df[mask_non_zero]

# Calculate the price per mile
df['price_per_mile'] = df['total_amount'] / df['trip_distance']

# Sort by price per mile in descending order
df.sort_values(by='price_per_mile', ascending=False).head(5)[['total_amount', 'trip_distance', 'price_per_mile']]

Unnamed: 0,total_amount,trip_distance,price_per_mile
2499600,623261.66,2.4,259692.358333
478791,6667.45,0.1,66674.5
7099014,415.3,0.01,41530.0
6403254,322.3,0.01,32230.0
4136499,273.96,0.01,27396.0


### 29.3 Beyond 2

In [9]:
file_name = 'data/nyc_taxi_2019-01.csv'
use_cols=['passenger_count', 'trip_distance', 'total_amount']

df = pd.read_csv(file_name, usecols=use_cols)

df.head()

Unnamed: 0,passenger_count,trip_distance,total_amount
0,1,1.5,9.95
1,1,2.6,16.3
2,3,0.0,5.8
3,5,0.0,7.55
4,5,0.0,55.55


In [10]:
# Let’s assume that multipassenger rides are split evenly among the passengers.
# Given that assumption, in which 10 multipassenger rides did each individual
# pay the greatest amount?

# Add a new column with the price per passenger
df['price_per_passenger'] = df['total_amount'] / df['passenger_count']

# Filter for multipassenger rides
mask_multipassenger = df['passenger_count'] > 1
df_multipassenger = df[mask_multipassenger]

# Sort by price per passenger in descending order and get the top 10
df_multipassenger.sort_values(by='price_per_passenger', ascending=False).head(10)


Unnamed: 0,passenger_count,trip_distance,total_amount,price_per_passenger
2972145,2,19.9,589.96,294.98
3014027,2,16.6,560.76,280.38
3842620,2,110.04,515.82,257.91
7593395,2,83.61,449.32,224.66
149362,2,17.2,426.8,213.4
5726185,2,65.05,416.82,208.41
6857368,2,0.0,411.36,205.68
6496403,2,0.0,410.95,205.475
4751745,2,100.78,403.5,201.75
1154626,2,0.0,400.8,200.4


### 29.4 Beyond 3

In [14]:
# The same sorting of df_multipassenger but with ignore_index=True option
df_multipassenger.sort_values(by='price_per_passenger', ascending=False, ignore_index=True).loc[:10]

Unnamed: 0,passenger_count,trip_distance,total_amount,price_per_passenger
0,2,19.9,589.96,294.98
1,2,16.6,560.76,280.38
2,2,110.04,515.82,257.91
3,2,83.61,449.32,224.66
4,2,17.2,426.8,213.4
5,2,65.05,416.82,208.41
6,2,0.0,411.36,205.68
7,2,0.0,410.95,205.475
8,2,100.78,403.5,201.75
9,2,0.0,400.8,200.4


In [15]:
df_multipassenger.sort_values(by='price_per_passenger', ascending=False).reset_index(drop=True).loc[:10]

Unnamed: 0,passenger_count,trip_distance,total_amount,price_per_passenger
0,2,19.9,589.96,294.98
1,2,16.6,560.76,280.38
2,2,110.04,515.82,257.91
3,2,83.61,449.32,224.66
4,2,17.2,426.8,213.4
5,2,65.05,416.82,208.41
6,2,0.0,411.36,205.68
7,2,0.0,410.95,205.475
8,2,100.78,403.5,201.75
9,2,0.0,400.8,200.4


In [16]:
df_multipassenger.sort_values(by='price_per_passenger', ascending=False).reset_index(drop=False).loc[:10]

Unnamed: 0,index,passenger_count,trip_distance,total_amount,price_per_passenger
0,2972145,2,19.9,589.96,294.98
1,3014027,2,16.6,560.76,280.38
2,3842620,2,110.04,515.82,257.91
3,7593395,2,83.61,449.32,224.66
4,149362,2,17.2,426.8,213.4
5,5726185,2,65.05,416.82,208.41
6,6857368,2,0.0,411.36,205.68
7,6496403,2,0.0,410.95,205.475
8,4751745,2,100.78,403.5,201.75
9,1154626,2,0.0,400.8,200.4


## EXERCISE 30. Taxi ride comparison

### 30.1 Data exploration

In [19]:
filename = 'data/nyc_taxi_2019-01.csv'
use_cols = ['passenger_count', 'trip_distance', 'total_amount']

df = pd.read_csv(filename, usecols=use_cols)

In [20]:
df.head()

Unnamed: 0,passenger_count,trip_distance,total_amount
0,1,1.5,9.95
1,1,2.6,16.3
2,3,0.0,5.8
3,5,0.0,7.55
4,5,0.0,55.55


In [27]:
# For each number of passengers, find the mean cost of a taxi ride
df_grouped = df.groupby('passenger_count')['total_amount'].mean().sort_values(ascending=True).reset_index()
df_grouped

Unnamed: 0,passenger_count,total_amount
0,6,15.437892
1,5,15.54694
2,3,15.604015
3,1,15.609601
4,4,15.650307
5,2,15.831294
6,0,18.663658
7,9,31.094444
8,7,48.278421
9,8,64.105517


In [30]:
# Sort the results again by increasing the number of passengers
df_grouped.sort_values(by='passenger_count', ascending=True).reset_index(drop=True)

Unnamed: 0,passenger_count,total_amount
0,0,18.663658
1,1,15.609601
2,2,15.831294
3,3,15.604015
4,4,15.650307
5,5,15.54694
6,6,15.437892
7,7,48.278421
8,8,64.105517
9,9,31.094444


In [None]:
# Create a new column, trip_distance_group, in which the values are short (< 2
# miles), medium ( 2 miles and 10 miles), and long (> 10 miles)
def trip_distance_group(distance):
    if distance < 2:
        return 'short'
    elif distance <= 10:
        return 'medium'
    else:
        return 'long'
    
df['trip_distance_group'] = df['trip_distance'].apply(trip_distance_group)

# What is the average number of passengers per trip length category?
df_grouped = df.groupby('trip_distance_group')['passenger_count'].mean()

# Sort this result from highest (most passengers) to lowest (fewest passengers)
df_grouped.sort_values(ascending=False).reset_index()

Unnamed: 0,trip_distance_group,passenger_count
0,long,1.590035
1,medium,1.576764
2,short,1.559943


### 30.2 Beyond 1

In [38]:
file_name = 'data/nyc_taxi_2019-01.csv'
use_cols = ['passenger_count', 'trip_distance', 'total_amount']
df_2019 = pd.read_csv(file_name, usecols=use_cols)
# Add a new column with the year
df_2019['year'] = 2019

file_name = 'data/nyc_taxi_2020-01.csv'
use_cols = ['passenger_count', 'trip_distance', 'total_amount']
df_2020 = pd.read_csv(file_name, usecols=use_cols)
# Add a new column with the year
df_2020['year'] = 2020

# Concatenate the two dataframes
df = pd.concat([df_2019, df_2020])

In [39]:
df.head()

Unnamed: 0,passenger_count,trip_distance,total_amount,year
0,1.0,1.5,9.95,2019
1,1.0,2.6,16.3,2019
2,3.0,0.0,5.8,2019
3,5.0,0.0,7.55,2019
4,5.0,0.0,55.55,2019


In [40]:
# Use groupby to compare the average cost of a taxi in January from each of these two years
df_grouped = df.groupby('year')['total_amount'].mean()
df_grouped

year
2019    15.682222
2020    18.663149
Name: total_amount, dtype: float64

### 30.3 Beyond 2

In [42]:
# Create a two-level grouping, first by year and then by passenger_count
df_grouped = df.groupby(['year', 'passenger_count'])['total_amount'].mean()
df_grouped

year  passenger_count
2019  0.0                18.663658
      1.0                15.609601
      2.0                15.831294
      3.0                15.604015
      4.0                15.650307
      5.0                15.546940
      6.0                15.437892
      7.0                48.278421
      8.0                64.105517
      9.0                31.094444
2020  0.0                18.059724
      1.0                18.343110
      2.0                19.050504
      3.0                18.736862
      4.0                19.128092
      5.0                18.234443
      6.0                18.367962
      7.0                71.143103
      8.0                58.197059
      9.0                81.244211
Name: total_amount, dtype: float64

### Beyond 3

In [43]:
df.corr().sort_values('passenger_count')

Unnamed: 0,passenger_count,trip_distance,total_amount,year
year,-0.021602,0.00114,0.007657,1.0
total_amount,-0.000136,0.004331,1.0,0.007657
trip_distance,0.008974,1.0,0.004331,0.00114
passenger_count,1.0,0.008974,-0.000136,-0.021602


## EXERCISE 31. Tourist spending per country

### Toy Dataset

In [44]:
df = pd.DataFrame([
    {'product_id':23, 'name':'computer',
    'wholesale_price': 500,
    'retail_price':1000, 'sales':100,
    'department':'electronics'},
    {'product_id':96, 'name':'Python Workout',
    'wholesale_price': 35,
    'retail_price':75, 'sales':1000,
    'department':'books'},
    {'product_id':97, 'name':'Pandas Workout',
    'wholesale_price': 35,
    'retail_price':75, 'sales':500,
    'department':'books'},
    {'product_id':15, 'name':'banana',
    'wholesale_price': 0.5,
    'retail_price':1, 'sales':200,
    'department':'food'},
    {'product_id':87, 'name':'sandwich',
    'wholesale_price': 3,
    'retail_price':5, 'sales':300,
    'department': 'food'},
])

In [45]:
df

Unnamed: 0,product_id,name,wholesale_price,retail_price,sales,department
0,23,computer,500.0,1000,100,electronics
1,96,Python Workout,35.0,75,1000,books
2,97,Pandas Workout,35.0,75,500,books
3,15,banana,0.5,1,200,food
4,87,sandwich,3.0,5,300,food


In [None]:
products_df = pd. DataFrame([
    {'product_id':23, 'name':'computer',
    'wholesale_price': 500,
    'retail_price':1000,
    'department':'electronics'},
    {'product_id':96, 'name':'Python Workout',
    'wholesale_price': 35,
    'retail_price':75, 'department':'books'},
    {'product_id':97, 'name':'Pandas Workout',
    'wholesale_price': 35,
    'retail_price':75, 'department':'books'},
    {'product_id':15, 'name':'banana',
    'wholesale_price': 0.5,
    'retail_price':1, 'department':'food'},
    {'product_id':87, 'name':'sandwich',
    'wholesale_price': 3,
    'retail_price':5, 'department': 'food'},
])

In [47]:
sales_df = pd.DataFrame([
    {'product_id': 23, 'date':'2021-August-10',
    'quantity':1},
    {'product_id': 96, 'date':'2021-August-10',
    'quantity':5},
    {'product_id': 15, 'date':'2021-August-10',
    'quantity':3},
    {'product_id': 87, 'date':'2021-August-10',
    'quantity':2},
    {'product_id': 15, 'date':'2021-August-11',
    'quantity':1},
    {'product_id': 96, 'date':'2021-August-11',
    'quantity':1},
    {'product_id': 23, 'date':'2021-August-11',
    'quantity':2},
    {'product_id': 87, 'date':'2021-August-12',
    'quantity':2},
    {'product_id': 97, 'date':'2021-August-12',
    'quantity':6},
    {'product_id': 97, 'date':'2021-August-12',
    'quantity':1},
    {'product_id': 87, 'date':'2021-August-13',
    'quantity':2},
    {'product_id': 23, 'date':'2021-August-13',
    'quantity':1},
    {'product_id': 15, 'date':'2021-August-14',
    'quantity':2}
])

In [48]:
products_df

Unnamed: 0,product_id,name,wholesale_price,retail_price,department
0,23,computer,500.0,1000,electronics
1,96,Python Workout,35.0,75,books
2,97,Pandas Workout,35.0,75,books
3,15,banana,0.5,1,food
4,87,sandwich,3.0,5,food


In [49]:
sales_df

Unnamed: 0,product_id,date,quantity
0,23,2021-August-10,1
1,96,2021-August-10,5
2,15,2021-August-10,3
3,87,2021-August-10,2
4,15,2021-August-11,1
5,96,2021-August-11,1
6,23,2021-August-11,2
7,87,2021-August-12,2
8,97,2021-August-12,6
9,97,2021-August-12,1


In [50]:
products_df = products_df.set_index('product_id')
sales_df = sales_df.set_index('product_id')

# Join the two dataframes on product_id
df_joined = products_df.join(sales_df)

In [51]:
df_joined

Unnamed: 0_level_0,name,wholesale_price,retail_price,department,date,quantity
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
23,computer,500.0,1000,electronics,2021-August-10,1
23,computer,500.0,1000,electronics,2021-August-11,2
23,computer,500.0,1000,electronics,2021-August-13,1
96,Python Workout,35.0,75,books,2021-August-10,5
96,Python Workout,35.0,75,books,2021-August-11,1
97,Pandas Workout,35.0,75,books,2021-August-12,6
97,Pandas Workout,35.0,75,books,2021-August-12,1
15,banana,0.5,1,food,2021-August-10,3
15,banana,0.5,1,food,2021-August-11,1
15,banana,0.5,1,food,2021-August-14,2


### 31.1 Main exercise

#### 01 Loading data

In [90]:
file_name_tourism = 'data/oecd_tourism.csv'
use_cols_tourism=['LOCATION', 'SUBJECT', 'TIME', 'Value']
df_tourism = pd.read_csv(file_name_tourism, usecols=use_cols_tourism)

file_name_oecd = 'data/oecd_locations.csv'
names_oecd = ['LOCATION', 'NAME']
index_col_oecd='LOCATION'
df_oecd = pd.read_csv(file_name_oecd, names=names_oecd)


In [53]:
df_tourism.head()

Unnamed: 0,LOCATION,SUBJECT,TIME,Value
0,AUS,INT_REC,2008,31159.8
1,AUS,INT_REC,2009,29980.7
2,AUS,INT_REC,2010,35165.5
3,AUS,INT_REC,2011,38710.1
4,AUS,INT_REC,2012,38003.7


In [74]:
df_tourism.shape, df_tourism['LOCATION'].nunique()

((1234, 4), 54)

#### 02 Debugging

In [None]:
# ========== DEBUGGING ==========
df_tourism_group = df_tourism.groupby('LOCATION')
df_tourism_group_list = list(df_tourism_group)
country, df_country = df_tourism_group_list[0]
len(df_tourism_group_list), type(df_tourism_group_list[0]), len(df_tourism_group_list[0]), \
    type(df_tourism_group_list[0][0]), type(df_tourism_group_list[0][1]), \
    country, df_country.head()

(54,
 tuple,
 2,
 str,
 pandas.core.frame.DataFrame,
 'AUS',
   LOCATION  SUBJECT  TIME    Value
 0      AUS  INT_REC  2008  31159.8
 1      AUS  INT_REC  2009  29980.7
 2      AUS  INT_REC  2010  35165.5
 3      AUS  INT_REC  2011  38710.1
 4      AUS  INT_REC  2012  38003.7)

In [None]:
df_tourism_group_value = df_tourism.groupby('LOCATION')['Value']
df_tourism_group_value_list = list(df_tourism_group_value)
country_value, df_country_value = df_tourism_group_value_list[0]
df_country_value.head()
# ========== DEBUGGING ==========

0    31159.8
1    29980.7
2    35165.5
3    38710.1
4    38003.7
Name: Value, dtype: float64

In [85]:
df_tourism['SUBJECT'].unique()

array(['INT_REC', 'INT-EXP'], dtype=object)

In [83]:
# Set format for the output
pd.options.display.float_format = '${:,.0f}'.format

#### 03 Five countries that received the greatest amount of tourist dollars

In [87]:
# Find the five countries that received the greatest amount of tourist dollars, on
# average, across years in the data set
mask_received = df_tourism['SUBJECT'] == 'INT_REC'
df_tourism_grouped = (
    df_tourism.loc[mask_received, :]
        .groupby('LOCATION')['Value']
        .mean()
        .sort_values(ascending=False)
)

df_tourism_grouped.head(5)

LOCATION
USA   $201,614
ESP    $69,656
FRA    $65,063
DEU    $53,409
GBR    $51,752
Name: Value, dtype: float64

#### 04 Join these two data frames together into a new one

In [88]:
df_tourism.head()

Unnamed: 0,LOCATION,SUBJECT,TIME,Value
0,AUS,INT_REC,2008,"$31,160"
1,AUS,INT_REC,2009,"$29,981"
2,AUS,INT_REC,2010,"$35,166"
3,AUS,INT_REC,2011,"$38,710"
4,AUS,INT_REC,2012,"$38,004"


In [91]:
df_oecd.head()

Unnamed: 0,LOCATION,NAME
0,AUS,Australia
1,AUT,Austria
2,BEL,Belgium
3,CAN,Canada
4,DNK,Denmark


In [92]:
df_tourism['LOCATION'].unique()

array(['AUS', 'AUT', 'BEL', 'CAN', 'CZE', 'DNK', 'FIN', 'FRA', 'DEU',
       'GRC', 'HUN', 'ISL', 'IRL', 'ITA', 'JPN', 'KOR', 'LUX', 'MEX',
       'NLD', 'NZL', 'NOR', 'POL', 'PRT', 'SVK', 'ESP', 'SWE', 'CHE',
       'TUR', 'GBR', 'USA', 'BRA', 'BGR', 'CHL', 'COL', 'CRI', 'HRV',
       'EGY', 'EST', 'IND', 'IDN', 'ISR', 'LVA', 'LTU', 'MLT', 'MAR',
       'PER', 'PHL', 'ROU', 'RUS', 'SVN', 'ZAF', 'CHN', 'KAZ', 'SRB'],
      dtype=object)

In [93]:
df_oecd['LOCATION'].unique()

array(['AUS', 'AUT', 'BEL', 'CAN', 'DNK', 'FIN', 'FRA', 'DEU', 'HUN',
       'ITA', 'JPN', 'KOR', 'GBR', 'USA', 'BRA', 'ISR'], dtype=object)

In [94]:
# Merge the two dataframes on LOCATION with an inner method
df_tourism_merged = pd.merge(df_tourism, df_oecd, on='LOCATION', how='inner')

In [98]:
df_tourism_merged.shape, df_tourism.shape, df_tourism_merged['LOCATION'].unique()

((364, 5),
 (1234, 4),
 array(['AUS', 'AUT', 'BEL', 'CAN', 'DNK', 'FIN', 'FRA', 'DEU', 'HUN',
        'ITA', 'JPN', 'KOR', 'GBR', 'USA', 'BRA', 'ISR'], dtype=object))

#### 05 Five countries that received the greatest amount of tourist dollars (2)

In [100]:
df_tourism_merged.head()

Unnamed: 0,LOCATION,SUBJECT,TIME,Value,NAME
0,AUS,INT_REC,2008,"$31,160",Australia
1,AUS,INT_REC,2009,"$29,981",Australia
2,AUS,INT_REC,2010,"$35,166",Australia
3,AUS,INT_REC,2011,"$38,710",Australia
4,AUS,INT_REC,2012,"$38,004",Australia


In [101]:
# Rearrange the columns to have NAME after LOCATION
df_tourism_merged = df_tourism_merged[['LOCATION', 'NAME', 'SUBJECT', 'TIME', 'Value']]

df_tourism_merged.head()

Unnamed: 0,LOCATION,NAME,SUBJECT,TIME,Value
0,AUS,Australia,INT_REC,2008,"$31,160"
1,AUS,Australia,INT_REC,2009,"$29,981"
2,AUS,Australia,INT_REC,2010,"$35,166"
3,AUS,Australia,INT_REC,2011,"$38,710"
4,AUS,Australia,INT_REC,2012,"$38,004"


In [99]:
# Five countries that received the greatest amount of tourist dollars with country names
df_tourism_merged_grouped = (
    df_tourism_merged.loc[mask_received, :]
        .groupby('NAME')['Value']
        .mean()
        .sort_values(ascending=False)
        .reset_index()
)

df_tourism_merged_grouped.head(5)

Unnamed: 0,NAME,Value
0,United States,"$182,918"
1,France,"$65,063"
2,United Kingdom,"$55,196"
3,Germany,"$53,409"
4,Italy,"$44,930"


### 31.2 Beyond 1

In [102]:
df_tourism.head()

Unnamed: 0,LOCATION,SUBJECT,TIME,Value
0,AUS,INT_REC,2008,"$31,160"
1,AUS,INT_REC,2009,"$29,981"
2,AUS,INT_REC,2010,"$35,166"
3,AUS,INT_REC,2011,"$38,710"
4,AUS,INT_REC,2012,"$38,004"


In [104]:
# Get the mean tourism income per year rather than by country
df_tourism_merged_grouped_year = (
    df_tourism_merged.loc[mask_received, :]
        .groupby('TIME')['Value']
        .mean()
        .sort_values(ascending=False)
        .reset_index()
)

df_tourism_merged_grouped_year

Unnamed: 0,TIME,Value
0,2019,"$53,772"
1,2016,"$46,554"
2,2018,"$41,593"
3,2017,"$39,086"
4,2015,"$38,969"
5,2014,"$38,072"
6,2013,"$37,996"
7,2012,"$35,629"
8,2011,"$34,300"
9,2008,"$31,757"


### 31.3 Beyond 2

In [105]:
df_tourism_merged.shape

(364, 5)

In [106]:
df_tourism_merged2 = pd.merge(df_tourism, df_oecd, on='LOCATION', how='right')
df_tourism_merged3 = pd.merge(df_oecd, df_tourism, on='LOCATION', how='left')

df_tourism_merged2.shape, df_tourism_merged3.shape

((364, 5), (364, 5))