In [40]:
import pandas as pd
import datetime
import numpy as np
import matplotlib.pylab as plt
from dataprep.eda import *
import os

In [41]:
# Link folder https://drive.google.com/drive/folders/1AD3DevmNPZSQ1dzFL1rE9gM0zSU1kyYy?usp=sharing
# In your Google Drive: Share with me in the left taskbar --> Right mouse button to 'DL' folder --> Add shortcut to Drive --> My Drive 
# Tutorial: https://stackoverflow.com/questions/54351852/accessing-shared-with-me-with-colab

# Yellow Data of 2021 
base_path = 'Raw_data'

In [42]:
# getting the name of all files.
# link to source data files https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page

categories = []
for filename in os.listdir(base_path): 
  categories.append(filename)
print(categories)   

['yellow_tripdata_2021-01.parquet', 'yellow_tripdata_2021-02.parquet', 'yellow_tripdata_2021-03.parquet', 'yellow_tripdata_2021-04.parquet', 'yellow_tripdata_2021-05.parquet', 'yellow_tripdata_2021-07.parquet', 'yellow_tripdata_2021-08.parquet', 'yellow_tripdata_2021-09.parquet', 'yellow_tripdata_2021-10.parquet', 'yellow_tripdata_2021-11.parquet', 'yellow_tripdata_2021-12.parquet']


In [43]:

# for i in range (1,6):
#   file_path = base_path + '/' + 'yellow_tripdata_2021-%02d.parquet' % i 
#   # Read data file
#   df = pd.read_parquet(file_path)
#   df.tail(10)


In [44]:
# There are 19 columns and check NaN values
def check_NaN(df):
  display(df.info())
  print('-'*20)
  display(df.isnull().sum())


## Problem: 
Find the demand (pick up times) given a specific time and region. 

# I. Data Cleaning:


## 1. Keeping all categories that are necessary:
- tpep_pickup_datetime
- tpep_dropoff_datetime
- passenger_count
- trip_distance
- PULocationID
- DOLocationID


In [45]:
def getting_valuable_columns_and_fill_NaN(df):
  df = df[['tpep_pickup_datetime','tpep_dropoff_datetime', 'passenger_count', 'trip_distance', 'PULocationID', 'DOLocationID']]
  # filling NaN in passenger_count column to 0
  df = df.fillna(0) 
  return df

## 2. Cleaning noisy data and adding One-hot coding vector 

### 2.1 EDA for 2021 Jan - May

1. Jan:
- Fri: Jan 01: New Year's Day
- Mon: Jan 18: Martin Luther King Jr. Day

2. Feb:
- Fri: Feb 12: Lincoln's Birthday
- Mon: Feb 15: Presidents' Day

3. May:
- Mon: May 31: Memorial Day


Source: https://www1.nyc.gov/assets/opa/downloads/pdf/2021-list-of-holidays.pdf



#### 2.2 Removing Outlier trips using trip_duration, trip_speed
a. trip_duration
- Less than or equal to 0 trip_duration
- According to Taxi and Limousine Commission (TLC) regulations:
      The maximum allowed trip duration in a 24 hour interval is 12 hours. 
      --> trip_duration <= 12 hours == 720 minutes

b. trip_speed
- Maximum allowed speed is 65mph

c. trip distance
- 0.0 trip distance
- trip distance > 30.0 (less than 100 data points)




#### 2.3 Removing Outlier trips using passenger_count
- NaN passenger_count
- According to the TLC regulations 5 is the maximum number of passengers allowed in a yellow taxi
  - Toddlers and children below age 7 are allowed to sit on laps
  - Maybe it takes into account the toddlers and children but the number data of 7 ,8 and 9 are ridiculous.

In [46]:
def noisy_data_add_features(df, data_month):
  isHoliday = []
  DayofWeek = []
  noisydata = []
  trip_duration = []
  trip_speed = []
  holiday = [datetime.date(2021,1,1), datetime.date(2021,1,18), datetime.date(2021,2,12), datetime.date(2021,2,15), datetime.date(2021,5,31)]
  pickup_hour = []
  pickup_month = []

  for i in range(df.shape[0]):
    if df.iat[i,0].year != 2021 or df.iat[i,1].year != 2021 or df.iat[i,0].month != data_month or df.iat[i,1].month != data_month:
      noisydata.append(i)
      continue

    if df.iat[i,2] == 0.0 or df.iat[i,3] == 0.0:  # Find data with 0 number of passengers or 0.0 trip_distance
      noisydata.append(i)
      continue

    if df.iat[i,3] > 30.0: # Drop pickup with trip distance > 30.0 (less than 100 data points)
      noisydata.append(i)
      continue
    
    if df.iat[i,2] > 6.0:   # Drop pickup with more than 6 passengers (maximum capacity)
      noisydata.append(i)
      continue
    
    #   According to Taxi and Limousine Commission (TLC) regulations:
      # The maximum allowed trip duration in a 24 hour interval is 12 hours. 
      # --> trip_duration <= 12 hours == 720 minutes
    PUtime = (df.iat[i,0]).value/10e8 # in seconds
    DOtime = (df.iat[i,1]).value/10e8
    duration = (DOtime - PUtime)/60.0 # in minutes
    if duration <= 0.0 or duration > (12 * 60.0) or duration > 70.0:
      noisydata.append(i)
      continue

    # Checking allowed speed 65mph
    speed = df.iat[i,3] / (duration / 60.0)
    if speed <= 0.0 or speed > 65.0:
      noisydata.append(i)
      continue

    trip_duration.append(duration)
    trip_speed.append(speed)
    # Adding one hot coding vector related to pickup hour, month
    pickup_hour.append(df.iat[i,0].hour)
    pickup_month.append(df.iat[i,0].month)

    # Adding one hot coding vector related to weekdays, weekends
    # Monday : 0 ~ Sunday: 6
    DayofWeek.append((df.iat[i,0]).day_of_week)

    # Adding one hot coding vector related to Holiday in Jan
    if df.iat[i,0] in holiday:
      isHoliday.append(1)
    else:
      isHoliday.append(0)


  print("Length of noisy data:", len(noisydata)) 
  # dropping noisy data and add more features to dataframe
  df = df.drop(noisydata)
  df = df.reset_index()
  df = df.drop('index', axis = 1)
  df.insert(3, "trip_duration", trip_duration, True)
  df.insert(5, "trip_speed", trip_speed, True)
  df['DayofWeek'] = DayofWeek
  df['isHoliday'] = isHoliday
  df['pickup_hour'] = pickup_hour
  df['pickup_month'] = pickup_month

  return df

  

 
 


In [47]:
# print(len(noisydata)) 
# print(len(isHoliday))
# print(len(DayofWeek))

In [48]:
# df = df.drop(noisydata)
# df = df.reset_index()
# df = df.drop('index', axis = 1)
# df.insert(3, "trip_duration", trip_duration, True)
# df.insert(5, "trip_speed", trip_speed, True)
# df['DayofWeek'] = DayofWeek
# df['isHoliday'] = isHoliday
# df['pickup_hour'] = pickup_hour
# df['pickup_month'] = pickup_month


# II. Preprocessing data files

In [49]:
for i in range (1,6):
  print("Reading yellow_tripdata_2021-%02d.parquet' file." % i)
  file_path = base_path + '/' + 'yellow_tripdata_2021-%02d.parquet' % i 
  # Read data file
  df = pd.read_parquet(file_path)
  df.tail(10)
  print("Check all catergories of the data and NaN values.")
  check_NaN(df)
  df = getting_valuable_columns_and_fill_NaN(df)
  df
  df = noisy_data_add_features(df, i)
  df
  if i == 1:
    whole_df = df
    
  whole_df = pd.concat([whole_df, df])
  df.to_parquet('Preprocessed_data/eda_yellow_tripdata_2021-%02d.parquet' % i)
  plot(df)

Reading yellow_tripdata_2021-01.parquet' file.
Check all catergories of the data and NaN values.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1369769 entries, 0 to 1369768
Data columns (total 19 columns):
 #   Column                 Non-Null Count    Dtype         
---  ------                 --------------    -----         
 0   VendorID               1369769 non-null  int64         
 1   tpep_pickup_datetime   1369769 non-null  datetime64[ns]
 2   tpep_dropoff_datetime  1369769 non-null  datetime64[ns]
 3   passenger_count        1271417 non-null  float64       
 4   trip_distance          1369769 non-null  float64       
 5   RatecodeID             1271417 non-null  float64       
 6   store_and_fwd_flag     1271417 non-null  object        
 7   PULocationID           1369769 non-null  int64         
 8   DOLocationID           1369769 non-null  int64         
 9   payment_type           1369769 non-null  int64         
 10  fare_amount            1369769 non-null  float64     

None

--------------------


VendorID                       0
tpep_pickup_datetime           0
tpep_dropoff_datetime          0
passenger_count            98352
trip_distance                  0
RatecodeID                 98352
store_and_fwd_flag         98352
PULocationID                   0
DOLocationID                   0
payment_type                   0
fare_amount                    0
extra                          0
mta_tax                        0
tip_amount                     0
tolls_amount                   0
improvement_surcharge          0
total_amount                   0
congestion_surcharge       98352
airport_fee              1369764
dtype: int64

  if df.iat[i,0] in holiday:


Length of noisy data: 148453


  0%|          | 0/474 [00:00<?, ?it/s]

Reading yellow_tripdata_2021-02.parquet' file.
Check all catergories of the data and NaN values.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1371709 entries, 0 to 1371708
Data columns (total 19 columns):
 #   Column                 Non-Null Count    Dtype         
---  ------                 --------------    -----         
 0   VendorID               1371709 non-null  int64         
 1   tpep_pickup_datetime   1371709 non-null  datetime64[ns]
 2   tpep_dropoff_datetime  1371709 non-null  datetime64[ns]
 3   passenger_count        1273463 non-null  float64       
 4   trip_distance          1371709 non-null  float64       
 5   RatecodeID             1273463 non-null  float64       
 6   store_and_fwd_flag     1273463 non-null  object        
 7   PULocationID           1371709 non-null  int64         
 8   DOLocationID           1371709 non-null  int64         
 9   payment_type           1371709 non-null  int64         
 10  fare_amount            1371709 non-null  float64     

None

--------------------


VendorID                       0
tpep_pickup_datetime           0
tpep_dropoff_datetime          0
passenger_count            98246
trip_distance                  0
RatecodeID                 98246
store_and_fwd_flag         98246
PULocationID                   0
DOLocationID                   0
payment_type                   0
fare_amount                    0
extra                          0
mta_tax                        0
tip_amount                     0
tolls_amount                   0
improvement_surcharge          0
total_amount                   0
congestion_surcharge       98246
airport_fee              1371708
dtype: int64

  if df.iat[i,0] in holiday:


Length of noisy data: 146806


  0%|          | 0/474 [00:00<?, ?it/s]

Reading yellow_tripdata_2021-03.parquet' file.
Check all catergories of the data and NaN values.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1925152 entries, 0 to 1925151
Data columns (total 19 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int64         
 1   tpep_pickup_datetime   datetime64[ns]
 2   tpep_dropoff_datetime  datetime64[ns]
 3   passenger_count        float64       
 4   trip_distance          float64       
 5   RatecodeID             float64       
 6   store_and_fwd_flag     object        
 7   PULocationID           int64         
 8   DOLocationID           int64         
 9   payment_type           int64         
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amoun

None

--------------------


VendorID                       0
tpep_pickup_datetime           0
tpep_dropoff_datetime          0
passenger_count           127920
trip_distance                  0
RatecodeID                127920
store_and_fwd_flag        127920
PULocationID                   0
DOLocationID                   0
payment_type                   0
fare_amount                    0
extra                          0
mta_tax                        0
tip_amount                     0
tolls_amount                   0
improvement_surcharge          0
total_amount                   0
congestion_surcharge      127920
airport_fee              1745645
dtype: int64

  if df.iat[i,0] in holiday:


Length of noisy data: 197122


  0%|          | 0/584 [00:00<?, ?it/s]

Reading yellow_tripdata_2021-04.parquet' file.
Check all catergories of the data and NaN values.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2171187 entries, 0 to 2171186
Data columns (total 19 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int64         
 1   tpep_pickup_datetime   datetime64[ns]
 2   tpep_dropoff_datetime  datetime64[ns]
 3   passenger_count        float64       
 4   trip_distance          float64       
 5   RatecodeID             float64       
 6   store_and_fwd_flag     object        
 7   PULocationID           int64         
 8   DOLocationID           int64         
 9   payment_type           int64         
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amoun

None

--------------------


VendorID                      0
tpep_pickup_datetime          0
tpep_dropoff_datetime         0
passenger_count          128020
trip_distance                 0
RatecodeID               128020
store_and_fwd_flag       128020
PULocationID                  0
DOLocationID                  0
payment_type                  0
fare_amount                   0
extra                         0
mta_tax                       0
tip_amount                    0
tolls_amount                  0
improvement_surcharge         0
total_amount                  0
congestion_surcharge     128020
airport_fee              128096
dtype: int64

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/mnt/d/kurone/UNIST/5thSemester/Deep_Learning/venv/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3433, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_1363/2431663123.py", line 11, in <module>
    df = noisy_data_add_features(df, i)
  File "/tmp/ipykernel_1363/4292361570.py", line 16, in noisy_data_add_features
    if df.iat[i,2] == 0.0 or df.iat[i,3] == 0.0:  # Find data with 0 number of passengers or 0.0 trip_distance
  File "/mnt/d/kurone/UNIST/5thSemester/Deep_Learning/venv/lib/python3.8/site-packages/pandas/core/indexing.py", line 2382, in __getitem__
    return self.obj._get_value(*key, takeable=self._takeable)
  File "/mnt/d/kurone/UNIST/5thSemester/Deep_Learning/venv/lib/python3.8/site-packages/pandas/core/frame.py", line 3911, in _get_value
    series = self._ixs(col, axis=1)
  File "/mnt/d/kurone/UNIST/5thSemester/Deep_Learning/venv/lib/python3.8/site-packages/pandas/core

# III. Visualizing data

## 1. Analyze distributions 

In [None]:
plot(df)


Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/mnt/d/kurone/UNIST/5thSemester/Deep_Learning/venv/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3433, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_1363/3248155569.py", line 1, in <module>
    plot(df)
NameError: name 'df' is not defined

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/mnt/d/kurone/UNIST/5thSemester/Deep_Learning/venv/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 2052, in showtraceback
    stb = self.InteractiveTB.structured_traceback(
  File "/mnt/d/kurone/UNIST/5thSemester/Deep_Learning/venv/lib/python3.8/site-packages/IPython/core/ultratb.py", line 1112, in structured_traceback
    return FormattedTB.structured_traceback(
  File "/mnt/d/kurone/UNIST/5thSemester/Deep_Learning/venv/lib/python3.8/site-packages/IPython/core/ultratb.py", line 1006, in structured_traceback
    re

In [None]:
plot(whole_df)

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/mnt/d/kurone/UNIST/5thSemester/Deep_Learning/venv/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3433, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_1363/501204991.py", line 1, in <module>
    plot(whole_df)
NameError: name 'whole_df' is not defined

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/mnt/d/kurone/UNIST/5thSemester/Deep_Learning/venv/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 2052, in showtraceback
    stb = self.InteractiveTB.structured_traceback(
  File "/mnt/d/kurone/UNIST/5thSemester/Deep_Learning/venv/lib/python3.8/site-packages/IPython/core/ultratb.py", line 1112, in structured_traceback
    return FormattedTB.structured_traceback(
  File "/mnt/d/kurone/UNIST/5thSemester/Deep_Learning/venv/lib/python3.8/site-packages/IPython/core/ultratb.py", line 1006, in structured_trace

In [None]:
print("Analyzing Dropoff Location Data\n")
plot(whole_df, 'DOLocationID')


Analyzing Dropoff Location Data

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/mnt/d/kurone/UNIST/5thSemester/Deep_Learning/venv/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3433, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_1363/3628766836.py", line 2, in <module>
    plot(whole_df, 'DOLocationID')
NameError: name 'whole_df' is not defined

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/mnt/d/kurone/UNIST/5thSemester/Deep_Learning/venv/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 2052, in showtraceback
    stb = self.InteractiveTB.structured_traceback(
  File "/mnt/d/kurone/UNIST/5thSemester/Deep_Learning/venv/lib/python3.8/site-packages/IPython/core/ultratb.py", line 1112, in structured_traceback
    return FormattedTB.structured_traceback(
  File "/mnt/d/kurone/UNIST/5thSemester/Deep_Learning/venv/lib/python3.8/site-packages/IPython/core/ultratb.py", line 1006, in

In [None]:
plot(whole_df, 'PULocationID')


Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/mnt/d/kurone/UNIST/5thSemester/Deep_Learning/venv/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3433, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_1363/727462609.py", line 1, in <module>
    plot(whole_df, 'PULocationID')
NameError: name 'whole_df' is not defined

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/mnt/d/kurone/UNIST/5thSemester/Deep_Learning/venv/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 2052, in showtraceback
    stb = self.InteractiveTB.structured_traceback(
  File "/mnt/d/kurone/UNIST/5thSemester/Deep_Learning/venv/lib/python3.8/site-packages/IPython/core/ultratb.py", line 1112, in structured_traceback
    return FormattedTB.structured_traceback(
  File "/mnt/d/kurone/UNIST/5thSemester/Deep_Learning/venv/lib/python3.8/site-packages/IPython/core/ultratb.py", line 1006, in 

In [None]:
print("Analyzing correlation between pickup hour and day of week\n")
plot(whole_df,'DayofWeek','pickup_hour')

Analyzing correlation between pickup hour and day of week

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/mnt/d/kurone/UNIST/5thSemester/Deep_Learning/venv/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3433, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_1363/1647285140.py", line 2, in <module>
    plot(whole_df,'DayofWeek','pickup_hour')
NameError: name 'whole_df' is not defined

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/mnt/d/kurone/UNIST/5thSemester/Deep_Learning/venv/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 2052, in showtraceback
    stb = self.InteractiveTB.structured_traceback(
  File "/mnt/d/kurone/UNIST/5thSemester/Deep_Learning/venv/lib/python3.8/site-packages/IPython/core/ultratb.py", line 1112, in structured_traceback
    return FormattedTB.structured_traceback(
  File "/mnt/d/kurone/UNIST/5thSemester/Deep_Learning/venv/lib/python3.8/site-packages/IPython/core/ultratb.py", lin

In [None]:
print("Analyzing correlation between pickup hour and day of week\n")
plot(whole_df,'pickup_month','pickup_hour')

Analyzing correlation between pickup hour and day of week

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/mnt/d/kurone/UNIST/5thSemester/Deep_Learning/venv/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3433, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_1363/3999914158.py", line 2, in <module>
    plot(whole_df,'pickup_month','pickup_hour')
NameError: name 'whole_df' is not defined

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/mnt/d/kurone/UNIST/5thSemester/Deep_Learning/venv/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 2052, in showtraceback
    stb = self.InteractiveTB.structured_traceback(
  File "/mnt/d/kurone/UNIST/5thSemester/Deep_Learning/venv/lib/python3.8/site-packages/IPython/core/ultratb.py", line 1112, in structured_traceback
    return FormattedTB.structured_traceback(
  File "/mnt/d/kurone/UNIST/5thSemester/Deep_Learning/venv/lib/python3.8/site-packages/IPython/core/ultratb.py", 

In [None]:
# Pickups per days


morning_pickup = []
afternoon_pickup = []
evening_pickup = []
midnight_pickup = []
for i in range(0,7):
    morning_pickup.append(whole_df[(whole_df['DayofWeek'] == i) & (whole_df['pickup_hour'] >= 6) & (whole_df['pickup_hour'] < 12)].shape[0])
    afternoon_pickup.append(whole_df[(whole_df['DayofWeek'] == i) & (whole_df['pickup_hour'] >= 12) & (whole_df['pickup_hour'] < 18)].shape[0])
    evening_pickup.append(whole_df[(whole_df['DayofWeek'] == i) & (whole_df['pickup_hour'] >= 18) & (whole_df['pickup_hour'] <= 23)].shape[0])
    midnight_pickup.append(whole_df[(whole_df['DayofWeek'] == i) & (whole_df['pickup_hour'] >= 0) & (whole_df['pickup_hour'] < 6)].shape[0])

pickup_per_day = pd. DataFrame({'morning':morning_pickup, 'afternoon':afternoon_pickup, 'evening':evening_pickup, 'midnight':midnight_pickup })
pickup_per_day.plot(kind='bar', stacked=True,figsize=(4.5, 4.5))
plt.xlabel('Days of the week')
plt.ylabel('Number of pickups')
plt.tick_params(axis='both', which='minor')
plt.plot()

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/mnt/d/kurone/UNIST/5thSemester/Deep_Learning/venv/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3433, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_1363/2704727351.py", line 9, in <module>
    morning_pickup.append(whole_df[(whole_df['DayofWeek'] == i) & (whole_df['pickup_hour'] >= 6) & (whole_df['pickup_hour'] < 12)].shape[0])
NameError: name 'whole_df' is not defined

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/mnt/d/kurone/UNIST/5thSemester/Deep_Learning/venv/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 2052, in showtraceback
    stb = self.InteractiveTB.structured_traceback(
  File "/mnt/d/kurone/UNIST/5thSemester/Deep_Learning/venv/lib/python3.8/site-packages/IPython/core/ultratb.py", line 1112, in structured_traceback
    return FormattedTB.structured_traceback(
  File "/mnt/d/kuron

In [None]:
# pickup_in_day
hour_pickups = []
temp = []
for i in range(0,7):
    for j in range(0,24):
        temp.append(whole_df[(whole_df['DayofWeek'] == i) & (whole_df['pickup_hour'] == j)].shape[0])
    hour_pickups.append(temp)
    temp = []
colors = ['xkcd:blue','xkcd:orange','xkcd:brown','xkcd:coral','xkcd:magenta','xkcd:green','xkcd:fuchsia']
days = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']

plt.figure(figsize=(8,4))
hours_lis = [s for s in range(0,24)]
for k in range(0,7):
    plt.plot(hours_lis,hour_pickups[k],colors[k],label = days[k])
    plt.plot(hours_lis,hour_pickups[k], 'ro',  markersize=2)

plt.xticks([s for s in range(0,24)])
plt.xlabel('Hours of a day')
plt.ylabel('Number of pickups')
plt.title('Pickups for every hour')
plt.legend()
plt.grid(True)
plt.show()

hour_pickup_month = []
for j in range(0,24):
    hour_pickup_month.append(whole_df[whole_df['pickup_hour'] == j].shape[0])

plt.figure(figsize=(8,4))
hours_lis = [s for s in range(0,24)]
plt.plot(hours_lis,hour_pickup_month,'xkcd:magenta',label = 'average pickups per hour')
plt.plot(hours_lis,hour_pickup_month, 'ro',  markersize=2)

plt.xticks([s for s in range(0,24)])
plt.xlabel('Hours of a day')
plt.ylabel('Number of pickups')
plt.title('Pickups for every hour for whole of month')
plt.legend()
plt.grid(True)
plt.show()

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/mnt/d/kurone/UNIST/5thSemester/Deep_Learning/venv/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3433, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_1363/281236983.py", line 6, in <module>
    temp.append(whole_df[(whole_df['DayofWeek'] == i) & (whole_df['pickup_hour'] == j)].shape[0])
NameError: name 'whole_df' is not defined

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/mnt/d/kurone/UNIST/5thSemester/Deep_Learning/venv/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 2052, in showtraceback
    stb = self.InteractiveTB.structured_traceback(
  File "/mnt/d/kurone/UNIST/5thSemester/Deep_Learning/venv/lib/python3.8/site-packages/IPython/core/ultratb.py", line 1112, in structured_traceback
    return FormattedTB.structured_traceback(
  File "/mnt/d/kurone/UNIST/5thSemester/Deep_Learning/venv/lib/p

In [None]:
create_report(whole_df)


Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/mnt/d/kurone/UNIST/5thSemester/Deep_Learning/venv/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3433, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_1363/719329431.py", line 1, in <module>
    create_report(whole_df)
NameError: name 'whole_df' is not defined

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/mnt/d/kurone/UNIST/5thSemester/Deep_Learning/venv/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 2052, in showtraceback
    stb = self.InteractiveTB.structured_traceback(
  File "/mnt/d/kurone/UNIST/5thSemester/Deep_Learning/venv/lib/python3.8/site-packages/IPython/core/ultratb.py", line 1112, in structured_traceback
    return FormattedTB.structured_traceback(
  File "/mnt/d/kurone/UNIST/5thSemester/Deep_Learning/venv/lib/python3.8/site-packages/IPython/core/ultratb.py", line 1006, in structu