In [None]:
import pandas as pd
import gc 
import warnings
import numpy as np
from datetime import timedelta
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import folium
from folium.plugins import FastMarkerCluster
from sklearn.cluster import KMeans
import geopandas as gpd
import seaborn as sns
from bokeh.plotting import figure, show
from datetime import timedelta
from bokeh.tile_providers import get_provider, Vendors
from bokeh.io import save, reset_output, output_notebook
from pandas.plotting import parallel_coordinates
from math import log
import datetime as dt
from folium.plugins import HeatMap
%matplotlib inline

## Important: this notebook creates a lot of plots, so you will need to run it again to see all of the graphs.

# 1 Income 2019 datasets

Assumptions made here are:
 1. We use the data of the current year to evaluate the income level of that year.
 2. The median income CSV for 2019 was created manually from information retrieved from the following website:
https://www.census.gov/quickfacts/fact/table/newyorkcountymanhattanboroughnewyork,bronxcountybronxboroughnewyork,queenscountyqueensboroughnewyork,kingscountybrooklynboroughnewyork,richmondcountystatenislandboroughnewyork,newyorkcitynewyork/HSG010219

In [None]:
# Load the 2019 borough-level median income table and the taxi-zone lookup table.
income_data_path = r'F:\Ads_projects\project1\mast30034_2021_s2_project_1-junzhin\data\income_2019.csv'
taxi_zone_lookup_path = r'F:\Ads_projects\project1\mast30034_2021_s2_project_1-junzhin\data\taxi_zone_lookup.csv'
taxi_zone_lookup_df = pd.read_csv(taxi_zone_lookup_path)

# Rows without a median income are removed BEFORE the integer cast, because
# astype(int) raises on NaN. When no value is missing the result is unchanged.
income_df = pd.read_csv(income_data_path).dropna(subset=['Median Income'])
income_df = income_df.assign(**{'Median Income': income_df['Median Income'].astype(int)})

# LocationID 264 and 265 are excluded (presumably the placeholder/unknown zones -
# TODO confirm). .copy() makes the filtered table independent, so the column
# assignment below does not write into a slice of the original frame.
taxi_zone_lookup_df = taxi_zone_lookup_df.loc[~taxi_zone_lookup_df['LocationID'].isin([264, 265]), :].copy()
taxi_zone_lookup_df['Zone'] = taxi_zone_lookup_df['Zone'].str.lower()

# Income rows that are complete in every column.
income_2019 = income_df.dropna()

## Visualisation of income level for different regions

In [None]:
# Read the taxi-zone polygons and normalise the two columns used as join keys.
taxi_zone_gpd = gpd.GeoDataFrame(
    gpd.read_file(r"F:\Ads_projects\project1\mast30034_2021_s2_project_1-junzhin\data\Taxi Zones.geojson")
)
taxi_zone_gpd = taxi_zone_gpd.assign(
    location_id=taxi_zone_gpd["location_id"].astype(int),
    zone=taxi_zone_gpd["zone"].str.lower(),
)
# GeoJSON text holding only the zone id and its shape; the folium maps below
# match rows against it through key_on="properties.location_id".
geoJSON = taxi_zone_gpd[['location_id', 'geometry']].to_json()

In [None]:
# Column names, dtypes and non-null counts of the taxi-zone geometry table.
taxi_zone_gpd.info()

In [None]:
# Column names, dtypes and non-null counts of the taxi-zone lookup table.
taxi_zone_lookup_df.info()

In [None]:
# Attach the borough-level median income to every taxi zone of that borough
# (inner join on the shared 'Borough' column).
Income_taxi_zone_merging_2019df = taxi_zone_lookup_df.merge(income_2019, on='Borough')
Income_taxi_zone_merging_2019df.info()


In [None]:
# Choropleth of the 2019 median income, coloured per taxi zone.
m = folium.Map(location=[40.66, -73.94], tiles="Stamen Terrain", zoom_start=10)
# Unstyled layer containing every taxi-zone outline.
m.add_child(folium.Choropleth(geo_data=geoJSON, name='choropleth'))
folium.Choropleth(
    geo_data=geoJSON,
    data=Income_taxi_zone_merging_2019df,
    columns=["LocationID", "Median Income"],
    key_on="properties.location_id",
    fill_color="BuPu",
    fill_opacity=0.7,
    line_opacity=0.5,
    legend_name="Median Income($)",
    reset=True,
).add_to(m)
# Raw string: the backslashes of the Windows path must not be parsed as
# (invalid) escape sequences.
m.save(r'F:\Ads_projects\project1\mast30034_2021_s2_project_1-junzhin\plots\Income_2019_ChoroplethMap.html')
m


In [None]:
# Bar chart of the 2019 median income, one bar per borough.
X = income_2019["Borough"].tolist()
Y = income_2019["Median Income"]
plt.title(r'Barplot of 2019 Median Income by Boroughs')
plt.ylabel('Median Income')
plt.xlabel("Boroughs")
plt.bar(X, Y)

# 2 yellow taxi trips 2019 datasets

In [None]:
# Load the first-stage preprocessed 2019 yellow-taxi trips.
yellow_2019_file_path = r"F:\Ads_projects\project1\mast30034_2021_s2_project_1-junzhin\data\2019\yellow_taxi_2019\yellow_2019_01_04_preprocessed.feather"
yellow_2019 = pd.read_feather(yellow_2019_file_path)

In [None]:
# Express the trip duration as a number of seconds by dividing by a one-second timedelta.
one_second = timedelta(seconds=1)
yellow_2019['travel_duration_in_seconds'] = yellow_2019['travel_duration'].div(one_second)
# 'level_0' and 'index' are dropped here (presumably left over from earlier
# reset_index calls - TODO confirm).
yellow_2019 = yellow_2019.drop(columns=['level_0', 'index'])
yellow_2019.info()
yellow_2019.head()

## Create the common grouping for the later analysis

In [None]:
# Column groups reused by later analysis cells.
# Pick-up / drop-off taxi-zone id columns.
Locaiton_info = ["PULocationID", "DOLocationID"]
# Time-related columns. The first entry used to start with a stray TAB character
# ("\tpick_date_time"), which matches no column of the trip table.
trip_time = ["pick_date_time", "drop_date_time", 'travel_duration']

In [None]:
# Engineered per-trip features.
new_features = [
    "average_speed",
    "average_fare",
    "tip_fare_r",
]
# Monetary columns of a trip.
amounts = [
    "fare_amount",
    "tip_amount",
    "total_amount",
]

## 2.1 Analysis of Distributions of features

In [None]:
# Numeric columns summarised in the distribution analysis.
lst2 = (
    ["trip_distance", "fare_amount", "tip_amount", "travel_duration_in_seconds"]
    + ["average_speed", "average_fare", "tip_fare_r"]
)


In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns in lst2.
yellow_2019[lst2].describe()

In [None]:
# Derive calendar columns from the pick-up timestamp.
pickup_dt = yellow_2019['pick_date_time'].dt
yellow_2019['year'] = pickup_dt.year
yellow_2019['month'] = pickup_dt.month
yellow_2019['date'] = pickup_dt.date
yellow_2019['day'] = pickup_dt.day
yellow_2019['time'] = pickup_dt.time
# ISO week number. `Series.dt.week` is deprecated and was removed in pandas 2.0;
# `isocalendar().week` is its replacement (note: it returns a UInt32 column).
yellow_2019['week'] = pickup_dt.isocalendar().week



In [None]:
# Human-readable labels for weekday numbers (0 = Monday) and month numbers.
dw_mapping = dict(enumerate(
    ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
))
# NOTE(review): 'Apri' looks like a typo for 'Apr'; it is kept as-is because
# other cells may match on the exact label. Months outside 1-4 map to NaN.
month_mapping = dict(zip(range(1, 5), ['Jan', 'Feb', 'Mar', 'Apri']))

pickup_times = yellow_2019['pick_date_time']
yellow_2019['day_of_week'] = pickup_times.dt.weekday.map(dw_mapping)
yellow_2019['month_in_words'] = pickup_times.dt.month.map(month_mapping)
# Trip duration as whole seconds (the fractional part is truncated by the integer cast).
duration_seconds = yellow_2019['travel_duration'] / timedelta(seconds=1)
yellow_2019['travel_duration_in_seconds'] = duration_seconds.astype(int)

In [None]:
# Checkpoint: write the enriched 2019 trips to disk, release the in-memory
# frame, then read the same file back.
yellow_2019_checkpoint_2_path = r"F:\Ads_projects\project1\mast30034_2021_s2_project_1-junzhin\data\2019\yellow_taxi_2019\yellow_2019_01_04_preprocessed_2.feather"
yellow_2019.to_feather(yellow_2019_checkpoint_2_path)
del yellow_2019
yellow_2019 = pd.read_feather(yellow_2019_checkpoint_2_path)

## Visualize and explore the relationships for different features 

In [None]:
# Side-by-side histograms of the tip, fare and total amounts (shared y axis).
fig, axs = plt.subplots(1, 3, sharey=True, tight_layout=True, figsize=(15, 5))
# The trips cover January-April 2019 (the title previously said 2019/05).
fig.suptitle("histograms of tip, fare and total amounts in 2019/01 - 2019/04")
# (column, panel title, number of bins, plotted value range, colour)
histogram_panels = [
    ('tip_amount', "Tip amount", 70, (0, 20), 'red'),
    ('fare_amount', "Fare amount", 40, (0, 60), 'blue'),
    ('total_amount', "Total amount", 80, (0, 80), 'black'),
]
for ax, (column, panel_title, n_bins, value_range, colour) in zip(axs, histogram_panels):
    ax.hist(yellow_2019[column], bins=n_bins, range=value_range, color=colour)
    ax.set_title(panel_title)
    # The first panel used to carry the junk label "a,o"; all panels now share one label.
    ax.set(xlabel="amount", ylabel="Frequency(10^6)")
plt.show()

In [None]:
# Histograms of the natural log of the tip, fare and total amounts (shared y axis).
fig, axs = plt.subplots(1, 3, sharey=True, tight_layout=True, figsize=(15, 5))
# The trips cover January-April 2019 (the title previously said 2019/05).
fig.suptitle("histograms of tip, fare and total amounts in 2019/01 - 2019/04 after log transformation")
# (column, panel title, number of bins, plotted log range, colour)
log_histogram_panels = [
    ('tip_amount', "Tip amount", 40, (-1, 3), 'red'),
    ('fare_amount', "Fare amount", 20, (1.0, 4), 'blue'),
    ('total_amount', "Total amount", 50, (0, 5), 'black'),
]
for ax, (column, panel_title, n_bins, log_range, colour) in zip(axs, log_histogram_panels):
    values = yellow_2019[column]
    # math.log raises "math domain error" on 0 (e.g. a trip without a tip) and
    # runs row by row; np.log on the strictly positive values is vectorised and
    # yields the same histogram inside the plotted range.
    ax.hist(np.log(values[values > 0]), bins=n_bins, range=log_range, color=colour)
    ax.set_title(panel_title)
    # The first panel used to carry the junk label "a,o".
    ax.set(xlabel="log(amount)", ylabel="Frequency(10^6)")
plt.show()

In [None]:
# Number of pick-ups per calendar date, grouped into 30 bins.
plot = plt.hist(yellow_2019['date'], bins=30)
plt.title(r'Histogram of Pick up times')
plt.ylabel('Frequency')
plt.xlabel('Date')
plt.xticks(rotation=45)

In [None]:
# Box plots of the fare, tip and total amounts.
# The tick rotation is passed to pandas (`rot`) because pandas rewrites the tick
# labels while drawing, which discarded the rotation previously set beforehand.
boxplot = yellow_2019.boxplot(column=amounts, rot=45)
# Labels are applied after drawing so they cannot be reset by the plotting call.
plt.xlabel('Types of amount components')
plt.ylabel('Distribution')
plt.title(r'Boxplots of amount')
plt.show()

In [None]:
# Box plot of the trip distance.
# `rot` is given to pandas because it rewrites the tick labels while drawing.
boxplot = yellow_2019.boxplot(column='trip_distance', rot=45)
plt.xlabel('Trip_distances')
plt.ylabel('Distribution')
# The title used to be the copy-pasted "Boxplots of amount".
plt.title(r'Boxplot of trip distance')
plt.show()

In [None]:
# Histogram of the trip distance, restricted to the 0-20 range.
# Title and axis labels used to be copy-pasted from the amount box plot.
plt.hist(yellow_2019["trip_distance"], bins=100, range=(0, 20))
plt.xlabel('Trip distance')
plt.ylabel('Frequency')
plt.title(r'Histogram of trip distance')
plt.xticks(rotation=45)
plt.show()

## Create checkpoints for the dataset

In [None]:
# Add the lookup columns for each pick-up zone (inner join), then discard the
# lookup columns that are not needed. Trips whose pick-up zone is missing from
# the lookup table are dropped by the inner join.
new = (
    yellow_2019
    .merge(taxi_zone_lookup_df, left_on='PULocationID', right_on='LocationID')
    .drop(columns=['Zone', 'service_zone', 'LocationID'])
)
yellow_2019 = new

## More plots of Pick-up related to time

In [None]:
# Bar chart of the number of pick-ups per weekday, most frequent first.
plt.title(r'Histogram of 2019 pick_up by days of week:')
plt.ylabel('Frequency')
plt.xlabel('day of week')
weekday_counts = yellow_2019['day_of_week'].value_counts()
weekday_counts.plot(kind='bar', rot=20)

In [None]:
# Bar chart of the natural log of the pick-up count per borough.
borough_counts = yellow_2019['Borough'].value_counts()
np.log(borough_counts).plot(kind='bar', rot=0)
# Labels are set after drawing; the first of the two ylabel calls was dead code
# because the second one overwrote it.
plt.xlabel('Borough')
plt.ylabel('Frequency(log(x)):')
plt.title(r'Barplot of 2019 pick_up  number by Borough after log transformation')
# Raw string: the backslashes of the Windows path must not be parsed as escapes.
plt.savefig(r'F:\Ads_projects\project1\mast30034_2021_s2_project_1-junzhin\plots\Barplot_of_pick_up_by_Borough.png')

In [None]:
# Raw (untransformed) number of pick-ups per borough.
yellow_2019['Borough'].value_counts()

In [None]:
# Distribution of pick-ups over the hour of the day (24 bins plus density curve).
# The hour comes straight from the timestamp with a vectorised accessor instead
# of a Python-level apply over the derived 'time' column.
pickup_hour = yellow_2019['pick_date_time'].dt.hour
# `label` gives plt.legend() an entry to show (the legend used to be empty).
# NOTE(review): `m` is also the name used for the folium maps in this notebook.
m = sns.distplot(pickup_hour, bins=24, color='crimson', label='Pick-up hour')
# Labels are set after drawing because distplot labels the x axis with the
# series name. Only one y label is kept: the second call used to overwrite the
# first, and the plotted quantity is a normalised density.
plt.xlabel('Day time')
plt.ylabel("Relative Frequency")
plt.title(r'Barplot of 2019 Jan to April pick_up counts by day time')
plt.legend(loc="upper right")
# Raw string: the backslashes of the Windows path must not be parsed as escapes.
plt.savefig(r'F:\Ads_projects\project1\mast30034_2021_s2_project_1-junzhin\plots\Barplot_of_yellow_2019_by_datatime.png')

In [None]:
# Raw numeric trip columns considered for outlier removal ...
removing_list = [
    "trip_distance",
    "fare_amount",
    "tip_amount",
    "total_amount",
    "travel_duration_in_seconds",
]
# ... followed by the engineered features.
lst = [*removing_list, 'average_speed', 'average_fare', 'tip_fare_r']

In [None]:
# First and third quartile of every numeric column in `lst`, and their
# difference (the inter-quartile range).
quartiles_2019 = yellow_2019[lst].quantile([0.25, 0.75])
Q1 = quartiles_2019.loc[0.25]
Q3 = quartiles_2019.loc[0.75]
Interquantile_range = Q3 - Q1

In [None]:
# Show the inter-quartile range of each numeric column.
Interquantile_range

## We are trying to be conservative here: values below Q1 $- 6\times$IQR or above Q3 $+ 6\times$IQR are removed as outliers

### --> This implies that we will remove the trips whose payments correspond to the JFK RateCodeID.
### In this study, the airport will not be the main focus.

In [None]:
# Keep only the trips whose numeric features ALL lie strictly inside
# (Q1 - multipler * IQR, Q3 + multipler * IQR).
multipler = 6
outlier_columns = [
    "trip_distance", "fare_amount", "tip_amount", "total_amount",
    "average_speed", "average_fare", "tip_fare_r", "travel_duration_in_seconds",
]
# The lower fare bound used to be hard-coded as 3 * IQR; every column now uses
# the same multiplier, as stated in the rule above.
lower_bound = Q1[outlier_columns] - multipler * Interquantile_range[outlier_columns]
upper_bound = Q3[outlier_columns] + multipler * Interquantile_range[outlier_columns]
candidate_values = yellow_2019[outlier_columns]
# Column-wise comparison against the per-column bounds; a row survives only if
# every column passes (a NaN never passes, as in the explicit comparisons).
within_bounds = (
    candidate_values.gt(lower_bound, axis='columns')
    & candidate_values.lt(upper_bound, axis='columns')
).all(axis=1)
yellow_2019_new = yellow_2019.loc[within_bounds, :]
del yellow_2019
yellow_2019 = yellow_2019_new

In [None]:
# Checkpoint the filtered 2019 trips. reset_index() restores a default index
# (the old one is kept as an 'index' column) before writing, then the in-memory
# frame is released.
yellow_2019.reset_index(drop=False).to_feather(
    r"F:\Ads_projects\project1\mast30034_2021_s2_project_1-junzhin\data\2019\yellow_taxi_2019\yellow_2019_01_04_preprocessed_3.feather"
)
del yellow_2019


In [None]:
# Reload the outlier-filtered 2019 trips from the third checkpoint file.
yellow_2019 = pd.read_feather(r'F:\Ads_projects\project1\mast30034_2021_s2_project_1-junzhin\data\2019\yellow_taxi_2019\yellow_2019_01_04_preprocessed_3.feather')

# 3.1 Yellow taxi trips 2020 dataset: apply the same preprocessing as above

In [None]:
# Load the first-stage preprocessed 2020 yellow-taxi trips.
yellow_2020_file_path = r"F:\Ads_projects\project1\mast30034_2021_s2_project_1-junzhin\data\2020\yellow_taxi_2020\yellow_2020_01_04_preprocessed.feather"
yellow_2020 = pd.read_feather(yellow_2020_file_path)


In [None]:
# Express the trip duration as a number of seconds by dividing by a one-second timedelta.
one_second = timedelta(seconds=1)
yellow_2020['travel_duration_in_seconds'] = yellow_2020['travel_duration'].div(one_second)
yellow_2020 = yellow_2020.drop(columns=['level_0', 'index'])
yellow_2020.info()
yellow_2020.head()

In [None]:
# Derive calendar columns from the pick-up timestamp (same steps as for 2019).
pickup_dt_2020 = yellow_2020['pick_date_time'].dt
yellow_2020['year'] = pickup_dt_2020.year
yellow_2020['month'] = pickup_dt_2020.month
yellow_2020['date'] = pickup_dt_2020.date
yellow_2020['day'] = pickup_dt_2020.day
yellow_2020['time'] = pickup_dt_2020.time
# ISO week number. `Series.dt.week` is deprecated and was removed in pandas 2.0;
# `isocalendar().week` is its replacement (note: it returns a UInt32 column).
yellow_2020['week'] = pickup_dt_2020.isocalendar().week

# Weekday / month labels via the mappings defined for the 2019 data.
yellow_2020['day_of_week'] = pickup_dt_2020.weekday.map(dw_mapping)
yellow_2020['month_in_words'] = pickup_dt_2020.month.map(month_mapping)
# Trip duration as whole seconds (the fractional part is truncated by the integer cast).
yellow_2020['travel_duration_in_seconds'] = (yellow_2020['travel_duration'] / timedelta(seconds=1)).astype(int)

In [None]:
# Checkpoint: write the enriched 2020 trips to disk, release the in-memory
# frame, then read the same file back.
yellow_2020_checkpoint_2_path = r"F:\Ads_projects\project1\mast30034_2021_s2_project_1-junzhin\data\2020\yellow_taxi_2020\yellow_2020_01_04_preprocessed_2.feather"
yellow_2020.to_feather(yellow_2020_checkpoint_2_path)
del yellow_2020
yellow_2020 = pd.read_feather(yellow_2020_checkpoint_2_path)

In [None]:
# Add the lookup columns for each pick-up zone (inner join), then discard the
# lookup columns that are not needed.
new = (
    yellow_2020
    .merge(taxi_zone_lookup_df, left_on='PULocationID', right_on='LocationID')
    .drop(columns=['Zone', 'service_zone', 'LocationID'])
)
yellow_2020 = new

In [None]:
# Quartiles and inter-quartile range of the numeric columns, now for 2020.
# Q1, Q3 and Interquantile_range are rebound from their 2019 values.
quartiles_2020 = yellow_2020[lst].quantile([0.25, 0.75])
Q1 = quartiles_2020.loc[0.25]
Q3 = quartiles_2020.loc[0.75]
Interquantile_range = Q3 - Q1

In [None]:
# Show the inter-quartile range of each numeric column for the 2020 trips.
Interquantile_range

In [None]:
# Keep only the 2020 trips whose numeric features ALL lie strictly inside
# (Q1 - multipler * IQR, Q3 + multipler * IQR).
multipler = 6
outlier_columns = [
    "trip_distance", "fare_amount", "tip_amount", "total_amount",
    "average_speed", "average_fare", "tip_fare_r", "travel_duration_in_seconds",
]
# The lower fare bound used to be hard-coded as 3 * IQR; every column now uses
# the same multiplier.
lower_bound = Q1[outlier_columns] - multipler * Interquantile_range[outlier_columns]
upper_bound = Q3[outlier_columns] + multipler * Interquantile_range[outlier_columns]
candidate_values = yellow_2020[outlier_columns]
# A row survives only if every column passes (a NaN never passes).
within_bounds = (
    candidate_values.gt(lower_bound, axis='columns')
    & candidate_values.lt(upper_bound, axis='columns')
).all(axis=1)
yellow_2020_new = yellow_2020.loc[within_bounds, :]
del yellow_2020
yellow_2020 = yellow_2020_new

# Save the preprocessed dataset

In [None]:
# Checkpoint the filtered 2020 trips. reset_index() restores a default index
# (the old one is kept as an 'index' column) before writing, then the in-memory
# frame is released.
yellow_2020.reset_index(drop=False).to_feather(
    r"F:\Ads_projects\project1\mast30034_2021_s2_project_1-junzhin\data\2020\yellow_taxi_2020\yellow_2020_01_04_preprocessed_3.feather"
)
del yellow_2020

In [None]:
# Reload the outlier-filtered 2020 trips from the third checkpoint file.
yellow_2020 = pd.read_feather(r'F:\Ads_projects\project1\mast30034_2021_s2_project_1-junzhin\data\2020\yellow_taxi_2020\yellow_2020_01_04_preprocessed_3.feather')

-----------------------------------------------------------------------

## Calculate the correlation between the numerical features

In [None]:
# Pairwise correlation matrix (pandas default method) of the numeric 2019 columns.
lst = [*removing_list, 'average_speed', 'average_fare', 'tip_fare_r']
numeric_2019 = yellow_2019[lst]
result = numeric_2019.corr()

In [None]:
# Annotated heat map of the correlation matrix.
result_matrix = np.array(result)
heatmap = sns.heatmap(result_matrix, annot=True, yticklabels=lst, xticklabels=lst)
# Rotate the tick labels BEFORE saving, so the file on disk matches the figure
# shown in the notebook (the save used to happen first).
plt.xticks(rotation=300)
# Raw string: the backslashes of the Windows path must not be parsed as escapes.
plt.savefig(r"F:\Ads_projects\project1\mast30034_2021_s2_project_1-junzhin\plots\heatmap of numeric features.png")
plt.show()

 ### The relationships between the amounts are easy to see, and it is no surprise that they are strongly correlated with each other

In [None]:
# Scatter plot of the average speed against the trip distance.
# The title used to be copy-pasted ("amount against travel distance ... 2019/05")
# and did not describe the plotted columns or the January-April data.
plt.title("Scatter plot of average speed against trip distance 2019/01 - 2019/04")
plt.scatter(yellow_2019['trip_distance'], yellow_2019['average_speed'], alpha=0.5)
plt.xlabel('Trip distance')
plt.ylabel("average_speed")
plt.show()

In [None]:
# Scatter plot of the travel duration against the average fare.
# The title used to be copy-pasted ("amount against travel distance ... 2019/05")
# and did not describe the plotted columns or the January-April data.
plt.title("Scatter plot of travel duration against average fare 2019/01 - 2019/04")
plt.scatter(yellow_2019['average_fare'], yellow_2019['travel_duration_in_seconds'], alpha=0.5)
plt.xlabel('Average_fare')
plt.ylabel("travel_duration_in_seconds")
plt.show()

In [None]:
# Scatter plot of the average fare against the trip distance.
distance_values = yellow_2019['trip_distance']
average_fare_values = yellow_2019['average_fare']
plt.scatter(distance_values, average_fare_values, alpha=0.7)
plt.ylabel("Average fare")
plt.xlabel('Trip distance')
plt.title("The scatter plot of average_fare against trip distance")
plt.show()

In [None]:

# Scatter plot of the travel duration against the trip distance.
plt.scatter(yellow_2019['trip_distance'], yellow_2019['travel_duration_in_seconds'], alpha=0.5)
# Title and axis labels were missing, so the figure could not be read on its own.
plt.title("Scatter plot of travel duration against trip distance")
plt.xlabel('Trip distance')
plt.ylabel("travel_duration_in_seconds")
plt.show()

In [None]:
# Tip, fare and total amount against the trip distance (shared y axis).
fig, axs = plt.subplots(1, 3, sharey=True, tight_layout=True, figsize=(15, 5))
# The trips cover January-April 2019 (the title previously said 2019/05).
fig.suptitle("Scatter plots of amount against travel distance 2019/01 - 2019/04")
amount_panels = [
    ('tip_amount', "Tip amount"),
    ('fare_amount', "Fare amount"),
    ('total_amount', "Total amount"),
]
for ax, (column, panel_title) in zip(axs, amount_panels):
    ax.scatter(yellow_2019['trip_distance'], yellow_2019[column], alpha=0.5)
    ax.set_title(panel_title)
    ax.set(xlabel="distance", ylabel="amount")
plt.show()


In [None]:
# Tip, fare and total amount against the travel duration (shared y axis).
fig, axs = plt.subplots(1, 3, sharey=True, tight_layout=True, figsize=(15, 5))
# The trips cover January-April 2019 (the title previously said 2019/05).
fig.suptitle("Scatter plots of amount against travel time in  2019/01 - 2019/04")
amount_panels = [
    ('tip_amount', "Tip amount"),
    ('fare_amount', "Fare amount"),
    ('total_amount', "Total amount"),
]
for ax, (column, panel_title) in zip(axs, amount_panels):
    ax.scatter(yellow_2019['travel_duration_in_seconds'], yellow_2019[column], alpha=0.5)
    ax.set_title(panel_title)
    ax.set(xlabel="travel duration in second", ylabel="amount")
plt.show()


In [None]:
# Tip, fare and total amount against the average speed (shared y axis).
fig, axs = plt.subplots(1, 3, sharey=True, tight_layout=True, figsize=(15, 5))
# The trips cover January-April 2019 (the title previously said 2019/05).
fig.suptitle("Scatter plots of amount against average speed 2019/01 - 2019/04")
amount_panels = [
    ('tip_amount', "Tip amount"),
    ('fare_amount', "Fare amount"),
    ('total_amount', "Total amount"),
]
for ax, (column, panel_title) in zip(axs, amount_panels):
    ax.scatter(yellow_2019['average_speed'], yellow_2019[column], alpha=0.5)
    ax.set_title(panel_title)
    ax.set(xlabel="speed(mile/hr)", ylabel="amount")
plt.show()


# 2.2 Geo-spatial Visualization of Yellow trip Pick_ups 

In [None]:
# Column names, dtypes and non-null counts of the filtered 2019 trip table.
yellow_2019.info()

In [None]:
# Method 1: pick-up frequency per zone through value_counts().
# NOTE(review): the next cell rebinds by_locaitonID_count_df, so this result
# only serves as a cross-check.
count_df = yellow_2019["PULocationID"].value_counts()
data = dict(PULocationID=count_df.index, count=count_df)
by_locaitonID_count_df = pd.DataFrame(data)
by_locaitonID_count_df

In [None]:
# Method 2: pick-up frequency per zone through groupby/count, returned as a
# two-column table (PULocationID, count) with a default index.
df = yellow_2019[['PULocationID']].groupby('PULocationID')["PULocationID"].count().to_frame()
df["count"] = df["PULocationID"]
pick_up_count_df = df.drop(columns="PULocationID")
by_locaitonID_count_df = pick_up_count_df.reset_index()
by_locaitonID_count_df

In [None]:
# Mean of every numeric column per pick-up zone, with PULocationID kept as a column.
average_df = (
    yellow_2019[['PULocationID', *lst]]
    .groupby("PULocationID")
    .mean()
    .reset_index()
)
average_df.head(10)

In [None]:
# Combine the per-zone averages with the per-zone pick-up counts
# (inner join on PULocationID).
yellow_2019_loc_df = average_df.merge(by_locaitonID_count_df, on='PULocationID')
yellow_2019_loc_df

## Data Visualization based on pick_up locations

In [None]:
# Choropleth of the pick-up count per taxi zone (2019/01 - 2019/04).
m = folium.Map(location=[40.66, -73.94], tiles="Stamen Terrain", zoom_start=10)
folium.Choropleth(
    geo_data=geoJSON,
    data=yellow_2019_loc_df,
    columns=["PULocationID", "count"],
    key_on="properties.location_id",
    fill_color="BuPu",
    fill_opacity=0.7,
    line_opacity=0.5,
    legend_name="Pick_up frequency",
    reset=True,
).add_to(m)
# Raw string: the backslashes of the Windows path must not be parsed as escapes.
m.save(r'F:\Ads_projects\project1\mast30034_2021_s2_project_1-junzhin\plots\yellow_pick_up_2019_ChoroplethMap.html')
m

In [None]:
# Choropleth of the average total amount per pick-up zone (2019/01 - 2019/04).
m = folium.Map(location=[40.66, -73.94], tiles="Stamen Terrain", zoom_start=10)
folium.Choropleth(
    geo_data=geoJSON,
    data=yellow_2019_loc_df,
    columns=["PULocationID", "total_amount"],
    key_on="properties.location_id",
    fill_color="BuPu",
    fill_opacity=0.7,
    line_opacity=0.5,
    legend_name="Average Total amount($)",
    reset=True,
).add_to(m)
# Raw string: the backslashes of the Windows path must not be parsed as escapes.
m.save(r'F:\Ads_projects\project1\mast30034_2021_s2_project_1-junzhin\plots\yellow_aver_total_amount_2019_ChoroplethMap.html')
m

In [None]:
# Choropleth of the average trip distance per pick-up zone (2019/01 - 2019/04).
m = folium.Map(location=[40.66, -73.94], tiles="Stamen Terrain", zoom_start=10)
folium.Choropleth(
    geo_data=geoJSON,
    data=yellow_2019_loc_df,
    columns=["PULocationID", "trip_distance"],
    key_on="properties.location_id",
    fill_color="BuPu",
    fill_opacity=0.7,
    line_opacity=0.5,
    legend_name="Average distance(mile)",
    reset=True,
).add_to(m)
# Raw string: the backslashes of the Windows path must not be parsed as escapes.
m.save(r'F:\Ads_projects\project1\mast30034_2021_s2_project_1-junzhin\plots\yellow_aver_trip_distance_2019_ChoroplethMap.html')
m

In [None]:
# Choropleth of the average tip amount per pick-up zone (2019/01 - 2019/04).
m = folium.Map(location=[40.66, -73.94], tiles="Stamen Terrain", zoom_start=10)
folium.Choropleth(
    geo_data=geoJSON,
    data=yellow_2019_loc_df,
    columns=["PULocationID", "tip_amount"],
    key_on="properties.location_id",
    fill_color="BuPu",
    fill_opacity=0.7,
    line_opacity=0.5,
    legend_name="average tip amount($)",
    reset=True,
).add_to(m)
# Raw string: the backslashes of the Windows path must not be parsed as escapes.
m.save(r'F:\Ads_projects\project1\mast30034_2021_s2_project_1-junzhin\plots\yellow_average_tip_amout_2019_ChoroplethMap.html')
m

In [None]:
# Choropleth of the average tip/fare ratio per pick-up zone (2019/01 - 2019/04).
m = folium.Map(location=[40.66, -73.94], tiles="Stamen Terrain", zoom_start=10)
folium.Choropleth(
    geo_data=geoJSON,
    data=yellow_2019_loc_df,
    columns=["PULocationID", "tip_fare_r"],
    key_on="properties.location_id",
    fill_color="BuPu",
    fill_opacity=0.7,
    line_opacity=0.5,
    # A ratio has no unit; the legend used to read "tip/fara ratio($)".
    legend_name="tip/fare ratio",
    reset=True,
).add_to(m)
# Raw string: the backslashes of the Windows path must not be parsed as escapes.
m.save(r'F:\Ads_projects\project1\mast30034_2021_s2_project_1-junzhin\plots\yellow_tip_fare_ratio_2019_ChoroplethMap.html')
m

In [None]:

# Choropleth of the average travel time per pick-up zone (2019/01 - 2019/04).
m = folium.Map(location=[40.66, -73.94], tiles="Stamen Terrain", zoom_start=10)
folium.Choropleth(
    geo_data=geoJSON,
    data=yellow_2019_loc_df,
    columns=["PULocationID", "travel_duration_in_seconds"],
    key_on="properties.location_id",
    fill_color="BuPu",
    fill_opacity=0.7,
    line_opacity=0.5,
    # The value is a duration in seconds; the legend used to end with "($)".
    legend_name="average travel time (seconds)",
    reset=True,
).add_to(m)
# Raw string: the backslashes of the Windows path must not be parsed as escapes.
m.save(r'F:\Ads_projects\project1\mast30034_2021_s2_project_1-junzhin\plots\yellow_trave_time_2019_ChoroplethMap.html')
m

In [None]:
# Choropleth of the average fare per pick-up zone (2019/01 - 2019/04),
# drawn on folium's default base tiles.
m = folium.Map(location=[40.66, -73.94], zoom_start=10)
folium.Choropleth(
    geo_data=geoJSON,
    data=yellow_2019_loc_df,
    columns=["PULocationID", "average_fare"],
    key_on="properties.location_id",
    fill_color="BuPu",
    fill_opacity=0.7,
    line_opacity=0.5,
    legend_name="average fare per mile ($)",
    reset=True,
).add_to(m)
# Raw string: the backslashes of the Windows path must not be parsed as escapes.
m.save(r'F:\Ads_projects\project1\mast30034_2021_s2_project_1-junzhin\plots\yellow_average_fare_2019_ChoroplethMap.html')
m

## Data Visualization based on Drop_off locations

In [None]:
# Drop-off frequency per zone: zone id next to its number of drop-offs.
count_df = yellow_2019["DOLocationID"].value_counts()
data = dict(LocationID=count_df.index, Count=count_df)
by_dflocaitonID_count_df = pd.DataFrame(data)
by_dflocaitonID_count_df

In [None]:
# Choropleth of the drop-off count per taxi zone (2019/01 - 2019/04).
m = folium.Map(location=[40.66, -73.94], tiles="Stamen Terrain", zoom_start=10)
folium.Choropleth(
    geo_data=geoJSON,
    data=by_dflocaitonID_count_df,
    columns=["LocationID", "Count"],
    key_on="properties.location_id",
    fill_color="BuPu",
    fill_opacity=0.7,
    line_opacity=0.5,
    legend_name=" Drop off Count",
    reset=True,
).add_to(m)
# Raw string: the backslashes of the Windows path must not be parsed as escapes.
m.save(r'F:\Ads_projects\project1\mast30034_2021_s2_project_1-junzhin\plots\yellow_drop_off_2019_ChoroplethMap.html')
m

## 2.3 Geo-spatial Visualization and Analysis of interaction between the demand of yellow_taxi(number of pick_up), Shooting incidents, income groups in different regions and city districts

In [None]:
# Load the 2019 shooting incidents.
# The path was a placeholder-less f-string whose backslashes were parsed as
# (invalid) escape sequences; a raw string yields the same path safely.
shooting_2019 = pd.read_csv(r"F:\Ads_projects\project1\mast30034_2021_s2_project_1-junzhin\data\Crime_and_protest\shooting_2019.csv")
shooting_2019.head(10)

In [None]:
# Cluster the shooting coordinates into three groups; the seed is fixed so the
# cluster labels are reproducible.
clustering_2019 = shooting_2019[["Latitude", "Longitude"]].to_numpy()
kmeans = KMeans(n_clusters=3, random_state=0)
kmeans.fit(clustering_2019)

In [None]:
# Pick-up count choropleth overlaid with one marker per 2019 shooting,
# coloured by the K-means cluster of the shooting.
m = folium.Map(location=[40.66, -73.94], zoom_start=10)
folium.Choropleth(
    geo_data=geoJSON,
    data=by_locaitonID_count_df,
    columns=["PULocationID", "count"],
    key_on="properties.location_id",
    fill_color="BuPu",
    fill_opacity=0.7,
    line_opacity=0.5,
    # The layer shows pick-up counts; the legend used to read "Median Income($)".
    legend_name="Pick_up frequency",
    reset=True,
).add_to(m)
# A copy-pasted save to 'yellow_pick_up_2019_ChoroplethMap.html' was removed
# here: it overwrote the pick-up map written by an earlier cell.

# Marker colour per cluster label.
cluster_colours = {0: "blue", 1: "red", 2: "green"}
# Iterate over EVERY shooting; the loop used to start at index 1 and silently
# skipped the first incident.
for (latitude, longitude), cluster_label in zip(clustering_2019, kmeans.labels_):
    marker_colour = cluster_colours.get(int(cluster_label))
    if marker_colour is None:
        # Labels outside 0-2 got no marker before either.
        continue
    folium.Marker(
        location=[latitude, longitude],
        popup="Shooting Event",
        icon=folium.Icon(color=marker_colour, icon="info-sign"),
    ).add_to(m)

# Clicking on the map shows the latitude/longitude of the clicked point.
m.add_child(folium.LatLngPopup())
# Raw string: the backslashes of the Windows path must not be parsed as escapes.
m.save(r'F:\Ads_projects\project1\mast30034_2021_s2_project_1-junzhin\plots\shooting_2019_clustering_combined_pick_up_ChoroplethMap.html')
m

In [None]:
# Pick-up count choropleth overlaid with clustered markers of the 2019 shootings.
m = folium.Map(location=[40.66, -73.94], zoom_start=10)
folium.Choropleth(
    geo_data=geoJSON,
    data=by_locaitonID_count_df,
    columns=["PULocationID", "count"],
    key_on="properties.location_id",
    fill_color="BuPu",
    fill_opacity=0.9,
    line_opacity=0.9,
    # A count has no currency unit; the legend used to end with "($)".
    legend_name="Pick up count",
    reset=True,
).add_to(m)
m.add_child(FastMarkerCluster(data=shooting_2019[['Latitude', 'Longitude']].values))

# Clicking on the map shows the latitude/longitude of the clicked point.
m.add_child(folium.LatLngPopup())
# Raw string: the backslashes of the Windows path must not be parsed as escapes.
m.save(r'F:\Ads_projects\project1\mast30034_2021_s2_project_1-junzhin\plots\shooting_2019_clustering_foliumChoroplethMap.html')
m

In [None]:
# Show the borough-level median income table.
income_df

##  The relationship between the gun violence and income level in various boroughs

In [None]:
# Borough boundary polygons.
borough_df = gpd.read_file(r'F:\Ads_projects\project1\mast30034_2021_s2_project_1-junzhin\data\Borough Boundaries.geojson')
# NOTE(review): this rebinds geoJSON, which the taxi-zone cells of this file bind
# to the zone shapes; re-running those map cells after this line will use the
# borough shapes instead.
geoJSON =  borough_df.to_json()

In [None]:
# Show the borough boundary table.
borough_df

In [None]:
# Show the median income table again, next to the borough names it is joined on.
income_df

In [None]:
# Median income per borough with the 2019 shootings drawn on top, both as
# clustered markers and as a heat map.
m = folium.Map(location=[40.66, -73.94], tiles="Stamen Terrain", zoom_start=10)
# Unstyled layer containing every borough outline.
m.add_child(folium.Choropleth(geo_data=geoJSON, name='choropleth'))
folium.Choropleth(
    geo_data=geoJSON,
    data=income_df,
    columns=["Borough", "Median Income"],
    key_on="properties.boro_name",
    fill_color="BuPu",
    fill_opacity=0.9,
    line_opacity=0.9,
    legend_name="Median Income($)",
    reset=True,
).add_to(m)
shooting_coordinates = shooting_2019[['Latitude', 'Longitude']].values
m.add_child(FastMarkerCluster(data=shooting_coordinates))
m.add_child(HeatMap(shooting_coordinates, radius=10))
# Raw string: the backslashes of the Windows path must not be parsed as escapes.
m.save(r'F:\Ads_projects\project1\mast30034_2021_s2_project_1-junzhin\plots\Income_2019_and_shooting_ChoroplethMap.html')
m


In [None]:
# Load the city zoning districts.
# The path was a placeholder-less f-string whose backslashes were parsed as
# (invalid) escape sequences; a raw string yields the same path safely.
city_zone = gpd.read_file(r'F:\Ads_projects\project1\mast30034_2021_s2_project_1-junzhin\data\city_zone.geojson')
city_zone.info()

In [None]:
# Zoning districts whose code contains "C", a digit and one more character.
is_commercial = city_zone["ZONEDIST"].str.contains("C[0-9].")
city_zone_commerical = city_zone[is_commercial]
city_zone_commerical.head(10)

In [None]:
# Median income per borough with the commercial zoning districts outlined on top.
geoJSON2 = city_zone_commerical[['ZONEDIST', 'geometry']].to_json()
m = folium.Map(location=[40.66, -73.94], tiles="Stamen Terrain", zoom_start=10)
folium.Choropleth(
    geo_data=geoJSON,
    data=income_df,
    columns=["Borough", "Median Income"],
    key_on="properties.boro_name",
    fill_color="RdPu",
    fill_opacity=0.5,
    line_opacity=0.7,
    legend_name="Median Income($)",
    reset=True,
).add_to(m)
m.add_child(folium.Choropleth(geo_data=geoJSON2, name='choropleth'))

# Raw string: the backslashes of the Windows path must not be parsed as escapes.
m.save(r'F:\Ads_projects\project1\mast30034_2021_s2_project_1-junzhin\plots\shooting_2019_income_level_with_Czones_foliumChoroplethMap.html')
m

In [None]:
# Low-density residential districts (R0-R5), sorted by district code.
# The negative look-ahead stops "R1" from matching inside "R10": the plain
# pattern "R[0-5]" also selected the R10 districts, which the high-density
# selection below already covers.
city_zone_residentialL = city_zone[city_zone["ZONEDIST"].str.contains(r"R[0-5](?![0-9])")].sort_values("ZONEDIST", ascending=True)
city_zone_residentialL

In [None]:
# Median income per borough with the low-density residential districts outlined on top.
geoJSON2 = city_zone_residentialL[['ZONEDIST', 'geometry']].to_json()
m = folium.Map(location=[40.66, -73.94], tiles="Stamen Terrain", zoom_start=10)
folium.Choropleth(
    geo_data=geoJSON,
    data=income_df,
    columns=["Borough", "Median Income"],
    key_on="properties.boro_name",
    fill_color="RdPu",
    fill_opacity=0.5,
    line_opacity=0.7,
    legend_name="Median Income($)",
    reset=True,
).add_to(m)
m.add_child(folium.Choropleth(geo_data=geoJSON2, name='choropleth'))

# This map used to be saved under the commercial-zones file name ("..._Czones_...")
# and overwrote that map; it now gets its own file (raw-string path).
m.save(r'F:\Ads_projects\project1\mast30034_2021_s2_project_1-junzhin\plots\shooting_2019_income_level_with_low_density_Rzones_foliumChoroplethMap.html')
m

In [None]:
# High-density residential districts: the R6 districts sorted by code, followed
# by the R7, R8, R9 and R10 districts in their original order.
# DataFrame.append was removed in pandas 2.0; pd.concat builds the same table in one step.
high_density_parts = [
    city_zone[city_zone["ZONEDIST"].str.contains("R6")].sort_values("ZONEDIST", ascending=True)
]
for zone_pattern in ("R7", "R8", "R9", "R10"):
    high_density_parts.append(city_zone[city_zone["ZONEDIST"].str.contains(zone_pattern)])
city_zone_residentialH = pd.concat(high_density_parts)
city_zone_residentialH

In [None]:
# Median income per borough with the high-density residential districts outlined on top.
geoJSON2 = city_zone_residentialH[['ZONEDIST', 'geometry']].to_json()
m = folium.Map(location=[40.66, -73.94], tiles="Stamen Terrain", zoom_start=10)
folium.Choropleth(
    geo_data=geoJSON,
    data=income_df,
    columns=["Borough", "Median Income"],
    key_on="properties.boro_name",
    fill_color="RdPu",
    fill_opacity=0.5,
    line_opacity=0.7,
    legend_name="Median Income($)",
    reset=True,
).add_to(m)
m.add_child(folium.Choropleth(geo_data=geoJSON2, name='choropleth'))

# This map used to be saved under the commercial-zones file name ("..._Czones_...")
# and overwrote that map; it now gets its own file (raw-string path).
m.save(r'F:\Ads_projects\project1\mast30034_2021_s2_project_1-junzhin\plots\shooting_2019_income_level_with_high_density_Rzones_foliumChoroplethMap.html')
m

In [None]:
# Zoning districts whose code contains an "M", sorted by district code.
is_manufacturing = city_zone["ZONEDIST"].str.contains("M")
city_zone_manufacturing = city_zone[is_manufacturing].sort_values(by="ZONEDIST", ascending=True)
city_zone_manufacturing

In [None]:
# Median income per borough with the manufacturing districts outlined on top.
geoJSON2 = city_zone_manufacturing[['ZONEDIST', 'geometry']].to_json()
m = folium.Map(location=[40.66, -73.94], tiles="Stamen Terrain", zoom_start=10)
folium.Choropleth(
    geo_data=geoJSON,
    data=income_df,
    columns=["Borough", "Median Income"],
    key_on="properties.boro_name",
    fill_color="RdPu",
    fill_opacity=0.5,
    line_opacity=0.7,
    legend_name="Median Income($)",
    reset=True,
).add_to(m)
m.add_child(folium.Choropleth(geo_data=geoJSON2, name='choropleth'))

# This map used to be saved under the commercial-zones file name ("..._Czones_...")
# and overwrote that map; it now gets its own file (raw-string path).
m.save(r'F:\Ads_projects\project1\mast30034_2021_s2_project_1-junzhin\plots\shooting_2019_income_level_with_Mzones_foliumChoroplethMap.html')
m

In [None]:
# Show the taxi-zone lookup table before joining it to the aggregated data.
taxi_zone_lookup_df

In [None]:
# Attach the lookup columns to the per-zone aggregates, then drop the duplicated key.
# NOTE(review): this cell overwrites its own input, so running it twice merges
# the lookup columns a second time.
yellow_2019_loc_df = (
    yellow_2019_loc_df
    .merge(taxi_zone_lookup_df, left_on="PULocationID", right_on="LocationID")
    .drop(columns="LocationID")
)
yellow_2019_loc_df

In [None]:

# Save the per-zone aggregated table as CSV; the relative path resolves against
# the notebook's working directory.
yellow_2019_loc_df.to_csv(r"yellow_2019_loc_df_aggregated.csv")

In [None]:
# Second name for the same DataFrame object (no copy is made).
yellow_2019_loc_df_all = yellow_2019_loc_df

In [None]:
# Preview the first rows of the trip-level 2019 table.
yellow_2019.head()

In [None]:
from sklearn import preprocessing

# Extract the columns listed in `lst` as a NumPy array and create the
# min-max scaler that the next cell fits on it.
x = yellow_2019_loc_df_all.loc[:, lst].to_numpy()
min_max_scaler = preprocessing.MinMaxScaler()

In [None]:
# Fit the scaler on `x` and rescale it column by column, then wrap the result
# in a DataFrame labelled with the column names in `lst`. The new frame has a
# fresh 0..n-1 index.
x_scaled = min_max_scaler.fit_transform(x)
yellow_2019_loc_df_all_copy = pd.DataFrame(data=x_scaled, columns=lst)
yellow_2019_loc_df_all_copy

In [None]:
# Parallel-coordinates plot of the scaled columns, saved as PNG.
fig, ax = plt.subplots(figsize=(15, 6))
# Draw on the axes created above explicitly instead of relying on pyplot's
# "current axes" state.
parallel_coordinates(yellow_2019_loc_df_all_copy, 'tip_amount', ax=ax)
# 'tip_amount' is used as the class column, so the generated legend is keyed
# on its values and is not useful here; remove it.
ax.legend_.remove()
plt.xticks(rotation=350)
plt.title("The Parallel Coordinates of normalised numeric attributes for aggregated data by location IDs")
# Raw string: the Windows path contains backslash pairs that are not valid
# escape sequences.
plt.savefig(r"F:\Ads_projects\project1\mast30034_2021_s2_project_1-junzhin\plots\Parallel_coordinates_nuer_attribs.png")
plt.show()

# Exploring the relationship between shootings and the number of pick-ups within each taxi zone

In [None]:
# Load the 2019 shooting events. Raw string: the Windows path contains
# backslash pairs that are not valid escape sequences, which a plain string
# literal only tolerates with a warning.
selected_df_2019_shooting_events_df = pd.read_csv(r"F:\Ads_projects\project1\mast30034_2021_s2_project_1-junzhin\data\Crime_and_protest\shooting_2019.csv")

In [None]:
# Keep a single row per (datetime, PRECINCT) combination; the first
# occurrence of each combination is the one retained.
selected_df_2019_shooting_events_df = selected_df_2019_shooting_events_df.drop_duplicates(
    subset=['datetime', 'PRECINCT']
)

In [None]:
# Preview the first five de-duplicated shooting events.
selected_df_2019_shooting_events_df.head(5)

## Feature engineering for time features and aggregation of the trip data

In [None]:
# Pickup count per taxi zone and the average demand per day / week / month.
#
# The divisors describe the length of the trip-data window. 120 days and
# 17 weeks both correspond to FOUR calendar months (January-April 2019 is
# 31 + 28 + 31 + 30 = 120 days), so the monthly average must divide by 4.
# The previous hard-coded divisor of 3 overstated it by a third.
# NOTE(review): assumes yellow_2019 covers January-April 2019 - confirm
# against the cell that loads the trip data. The "three_month" names are kept
# only because other cells may reference them.
day_length_of_three_month = 120
week_of_three_month = 17
month_length_of_period = 4

# One row per PULocationID with the number of trips picked up there.
agregated_df = (
    yellow_2019.groupby("PULocationID")
    .size()
    .reset_index(name="count")
)
agregated_df['average_trip per day'] = agregated_df['count'] / day_length_of_three_month
agregated_df['average_trip per month'] = agregated_df['count'] / month_length_of_period
agregated_df['average_trip per week'] = agregated_df['count'] / week_of_three_month

# Check whether each shooting location falls inside a taxi zone and join the matches for further analysis

In [None]:
from shapely.geometry import Point
from shapely.geometry.polygon import Polygon

# Give the events a fresh 0..n-1 index (drop_duplicates left gaps) so the
# zone-matching loop can address rows by range(len(...)). The previous index
# is kept as an "index" column, which a later cell drops.
# NOTE: running this cell more than once adds further index columns.
selected_df_2019_shooting_events_df = selected_df_2019_shooting_events_df.reset_index()

In [None]:
# For every shooting event, find the taxi zone polygon that contains its
# (Longitude, Latitude) point and record that zone's objectid.
#
# An event that lies outside every zone gets NaN. Previously nothing was
# appended in that case, which made the list shorter than the events frame
# and shifted every later ID onto the wrong event when the list was attached
# by position.
locaitons_id = []
# Pair each zone id with its shape once instead of looking both up by label
# on every inner iteration.
zone_ids_and_shapes = list(zip(taxi_zone_gpd["objectid"], taxi_zone_gpd["geometry"]))
for lon, lat in zip(
    selected_df_2019_shooting_events_df['Longitude'],
    selected_df_2019_shooting_events_df['Latitude'],
):
    point = Point(lon, lat)
    for zone_id, zone_shape in zone_ids_and_shapes:
        if zone_shape.contains(point):
            locaitons_id.append(zone_id)
            break
    else:
        # No zone contains this point: keep the position with a placeholder.
        locaitons_id.append(np.nan)

In [None]:
# Attach the matched taxi zone ID to each shooting event as an integer column.
#
# The IDs are joined by position, so the list must have exactly one entry per
# event. Fail loudly if it does not, instead of silently pairing IDs with the
# wrong events.
if len(locaitons_id) != len(selected_df_2019_shooting_events_df):
    raise ValueError(
        "locaitons_id has %d entries but there are %d shooting events; "
        "at least one event was not matched to a taxi zone"
        % (len(locaitons_id), len(selected_df_2019_shooting_events_df))
    )

data = {"taxi_location_ID": locaitons_id}
df = pd.DataFrame(data)
selected_df_2019_shooting_events_df = (
    pd.concat([selected_df_2019_shooting_events_df, df], axis=1)
    # Events without a zone (missing ID) cannot be joined to trip data; drop
    # them so the integer cast below cannot fail on NaN.
    .dropna(subset=['taxi_location_ID'])
    .astype({'taxi_location_ID': int})
)

In [None]:
# Join the per-zone demand averages to the shooting events that fall in the
# same zone, then discard the helper columns that are no longer needed.
shooting_helper_columns = ["taxi_location_ID", "Unnamed: 0", "index", "Lon_Lat", "PRECINCT"]
agregated_with_shooting_selected_df = pd.merge(
    agregated_df,
    selected_df_2019_shooting_events_df,
    left_on="PULocationID",
    right_on="taxi_location_ID",
).drop(columns=shooting_helper_columns)

# Attach those zone/shooting rows to every trip picked up in the same zone.
# A zone with several shootings yields one copy of each trip per shooting.
agregated_with_shooting_selected_and_yellow_2019_df = pd.merge(
    yellow_2019,
    agregated_with_shooting_selected_df,
    on="PULocationID",
)

### Example 1: the impact of a shooting event on yellow taxi demand in location ID 41
![image-2.png](attachment:image-2.png)

In [None]:
# Show the demand averages and shooting events recorded for pickup zone 41.
agregated_with_shooting_selected_df[agregated_with_shooting_selected_df['PULocationID'] == 41]

# Choose specific locations to explore the connection between shooting incidents and their impact on taxi demand

### Example 1: the impact of a shooting event on yellow taxi demand in location ID 41


In [None]:

# Trips picked up in zone 41 between 2019-02-01 and 2019-05-01.
# The trip table holds one copy of each trip per shooting in the zone;
# pinning a single shooting timestamp presumably keeps exactly one copy.
trips_41 = agregated_with_shooting_selected_and_yellow_2019_df
in_zone_41 = trips_41["PULocationID"] == 41
in_window_41 = (trips_41['pick_date_time'] >= '2019-02-01') & (trips_41['pick_date_time'] <= '2019-05-01')
for_shooting_41 = trips_41["datetime"] == '2019-04-24 01:44:00+00:00'
example1 = trips_41.loc[in_zone_41 & in_window_41 & for_shooting_41, :]

In [None]:
# Distribution of zone-41 pickups over the calendar, with the shooting dates
# (red) and the week boundaries (green) marked.
# Shootings in location ID 41: 2019/2/9, 2019/3/18, 2019/4/24
y = sns.displot(example1['date'], kde=True)

# Horizontal reference levels.
# NOTE(review): 415.625 is presumably the zone's 'average_trip per day';
# the meaning of 270 is not recorded in the notebook - confirm both.
plt.axhline(y = 415.625, color = 'blue', linestyle = '-')
plt.axhline(y = 270, color = 'black', linestyle = '--')

# Shooting dates.
for shooting_day in (dt.datetime(2019,3,18), dt.datetime(2019,4,24), dt.datetime(2019,2,9)):
    plt.axvline(shooting_day, color= 'red')

# Week boundaries: the 13 Mondays from 2019-02-04 to 2019-04-29. They are
# drawn after the red lines, so a boundary on a shooting date shows green.
first_week_start = dt.datetime(2019,2,4)
for week_number in range(13):
    plt.axvline(first_week_start + timedelta(weeks=week_number), color= 'green')

plt.title("Histogram of daily pick_up over time")
plt.xticks(rotation=85)
y.fig.set_figwidth(15)
y.fig.set_figheight(6)
# Raw string: the Windows path contains backslash pairs that are not valid
# escape sequences.
plt.savefig(r"F:\Ads_projects\project1\mast30034_2021_s2_project_1-junzhin\plots\Barplot_of_pick_ups_exp1.png")


In [None]:
# Histogram (5 bins, with KDE) of the number of pickups per day in zone 41;
# value_counts() on 'date' yields the number of trip rows for each day.
y = sns.displot(example1['date'].value_counts(),bins= 5, kde=True)
y.fig.set_figwidth(8)
y.fig.set_figheight(6)

plt.title("Histogram of the daily pick_up in location Id of 41")
# Raw string: the Windows path contains backslash pairs that are not valid
# escape sequences.
plt.savefig(r"F:\Ads_projects\project1\mast30034_2021_s2_project_1-junzhin\plots\Histogram_of the daily pick_up in_location_Id_of_41.png")

### Example 2: the impact of a shooting event on yellow taxi demand in location ID 42

In [None]:
# Show the demand averages and shooting events recorded for pickup zone 42.
agregated_with_shooting_selected_df[agregated_with_shooting_selected_df['PULocationID'] == 42]

In [None]:
# Trips picked up in zone 42 between 2019-02-01 and 2019-05-01.
# The trip table holds one copy of each trip per shooting in the zone;
# pinning a single shooting timestamp presumably keeps exactly one copy.
trips_42 = agregated_with_shooting_selected_and_yellow_2019_df
in_zone_42 = trips_42["PULocationID"] == 42
in_window_42 = (trips_42['pick_date_time'] >= '2019-02-01') & (trips_42['pick_date_time'] <= '2019-05-01')
for_shooting_42 = trips_42["datetime"] == '2019-03-16 00:50:00+00:00'
example2 = trips_42.loc[in_zone_42 & in_window_42 & for_shooting_42, :]

In [None]:
# Distribution of zone-42 pickups over the calendar, with the shooting dates
# (red) and the week boundaries (green) marked.
# Shootings in location ID 42:
#   2019/2/4, 2019/2/23, 2019/3/16, 2019/4/7, 2019/4/11, 2019/4/13

# Per-row 'average_trip per day' values for zone 42. Not used by the plot
# below, which hard-codes its reference level; kept because other cells may
# read `value`.
value = agregated_with_shooting_selected_df[agregated_with_shooting_selected_df["PULocationID"]  == 42]['average_trip per day'].values
y = sns.displot(example2['date'], kde=True)

# Horizontal reference levels.
# NOTE(review): 110.375 is presumably the zone's 'average_trip per day';
# the meaning of 80 is not recorded in the notebook - confirm both.
plt.axhline(y = 110.375, color = 'blue', linestyle = '-')
plt.axhline(y = 80, color = 'black', linestyle = '--')

# Shooting dates.
shooting_days_42 = (
    dt.datetime(2019,2,4),
    dt.datetime(2019,2,23),
    dt.datetime(2019,3,16),
    dt.datetime(2019,4,7),
    dt.datetime(2019,4,11),
    dt.datetime(2019,4,13),
)
for shooting_day in shooting_days_42:
    plt.axvline(shooting_day, color= 'red')

# Week boundaries: the 13 Mondays from 2019-02-04 to 2019-04-29. They are
# drawn after the red lines, so a boundary on a shooting date shows green.
first_week_start = dt.datetime(2019,2,4)
for week_number in range(13):
    plt.axvline(first_week_start + timedelta(weeks=week_number), color= 'green')

plt.title("Histogram of daily pick_up over time")
plt.xticks(rotation=85)
y.fig.set_figwidth(15)
y.fig.set_figheight(6)
# Raw string: the Windows path contains backslash pairs that are not valid
# escape sequences.
plt.savefig(r"F:\Ads_projects\project1\mast30034_2021_s2_project_1-junzhin\plots\Barplot_of_pick_ups.png")
plt.show()


In [None]:
# Histogram (10 bins, with KDE) of the number of pickups per day in zone 42;
# value_counts() on 'date' yields the number of trip rows for each day.
y = sns.displot(example2['date'].value_counts(),bins= 10, kde=True)
y.fig.set_figwidth(8)
y.fig.set_figheight(6)
plt.title("Histogram of the daily pick_up in location Id of 42")
# Raw string: the Windows path contains backslash pairs that are not valid
# escape sequences.
plt.savefig(r"F:\Ads_projects\project1\mast30034_2021_s2_project_1-junzhin\plots\Histogram_of the daily pick_up in_location_Id_of_42.png")

### Example 3: the impact of a shooting event on yellow taxi demand in location ID 75


In [None]:
# Show the demand averages and shooting events recorded for pickup zone 75.
agregated_with_shooting_selected_df[agregated_with_shooting_selected_df['PULocationID'] == 75]

In [None]:
# Trips picked up in zone 75 between 2019-01-10 and 2019-05-01.
# The trip table holds one copy of each trip per shooting in the zone;
# pinning a single shooting timestamp presumably keeps exactly one copy.
trips_75 = agregated_with_shooting_selected_and_yellow_2019_df
in_zone_75 = trips_75["PULocationID"] == 75
in_window_75 = (trips_75['pick_date_time'] >= '2019-01-10') & (trips_75['pick_date_time'] <= '2019-05-01')
for_shooting_75 = trips_75["datetime"] == '2019-02-02 19:40:00+00:00'
example3 = trips_75.loc[in_zone_75 & in_window_75 & for_shooting_75, :]

In [None]:
# Distribution of zone-75 pickups over the calendar, with the shooting dates
# (red) and the week boundaries (green) marked.
# Shootings in location ID 75: 2019/1/19, 2019/2/2, 2019/4/18
y = sns.displot(example3['date'], kde=True)

# Horizontal reference levels.
# NOTE(review): 866.866667 is presumably the zone's 'average_trip per day';
# the meaning of 555 is not recorded in the notebook - confirm both.
plt.axhline(y = 866.866667, color = 'blue', linestyle = '-')
plt.axhline(y = 555, color = 'black', linestyle = '--')

# Shooting dates.
for shooting_day in (dt.datetime(2019,1,19), dt.datetime(2019,2,2), dt.datetime(2019,4,18)):
    plt.axvline(shooting_day, color= 'red')

# Week boundaries: the 16 Mondays from 2019-01-14 to 2019-04-29. The
# copy-pasted version drew 2019-02-04 twice; each boundary is now drawn once,
# which renders the same picture.
first_week_start = dt.datetime(2019,1,14)
for week_number in range(16):
    plt.axvline(first_week_start + timedelta(weeks=week_number), color= 'green')

plt.title("Changes in daily pick_up over time on locationID 75")
plt.xticks(rotation=85)
y.fig.set_figwidth(15)
y.fig.set_figheight(6)
# Raw string: the Windows path contains backslash pairs that are not valid
# escape sequences.
plt.savefig(r"F:\Ads_projects\project1\mast30034_2021_s2_project_1-junzhin\plots\Barplot_of_pick_ups_exp3.png")


In [None]:
# Histogram (10 bins, with KDE) of the number of pickups per day in zone 75;
# value_counts() on 'date' yields the number of trip rows for each day.
y = sns.displot(example3['date'].value_counts(),bins= 10, kde=True)
y.fig.set_figwidth(8)
y.fig.set_figheight(6)
plt.title("Histogram of the daily pick_up in location Id of 75")
# Raw string: the Windows path contains backslash pairs that are not valid
# escape sequences.
plt.savefig(r"F:\Ads_projects\project1\mast30034_2021_s2_project_1-junzhin\plots\Histogram_of the daily pick_up in_location_Id_of_75.png")