# Parse Month and Day for Near North

In [None]:
%matplotlib inline
import seaborn as sns
import math
import pandas as pd
import numpy as np
import scipy as sci
from scipy import stats
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from dateutil.parser import parse

## Read the files and keep only the columns of interest

In [3]:
# Turn off pandas' chained-assignment check for the rest of the session.
pd.set_option('mode.chained_assignment', None)

# Read the data from each month in 2016 and save them into monthly dataframes called jan, feb, mar,...
jan = pd.read_csv('chicago_taxi_trips_2016_01.csv')
feb = pd.read_csv('chicago_taxi_trips_2016_02.csv')
mar = pd.read_csv('chicago_taxi_trips_2016_03.csv')
apr = pd.read_csv('chicago_taxi_trips_2016_04.csv')
may = pd.read_csv('chicago_taxi_trips_2016_05.csv')
june = pd.read_csv('chicago_taxi_trips_2016_06.csv')
july = pd.read_csv('chicago_taxi_trips_2016_07.csv')
aug = pd.read_csv('chicago_taxi_trips_2016_08.csv')
sept = pd.read_csv('chicago_taxi_trips_2016_09.csv')
octo = pd.read_csv('chicago_taxi_trips_2016_10.csv')
nov = pd.read_csv('chicago_taxi_trips_2016_11.csv')
dec = pd.read_csv('chicago_taxi_trips_2016_12.csv')

# Columns that are not needed for the pickup-time analysis and are removed from every month.
unused_columns = [
    'fare', 'trip_miles', 'dropoff_community_area', 'trip_seconds',
    'pickup_latitude', 'pickup_longitude', 'taxi_id', 'trip_end_timestamp',
    'dropoff_latitude', 'dropoff_longitude', 'tips', 'pickup_census_tract',
    'dropoff_census_tract', 'tolls', 'extras', 'trip_total', 'payment_type',
    'company',
]

# Keep only the Near North pickups of each month, without the unused columns.
top = []
for mo in (jan, feb, mar, apr, may, june, july, aug, sept, octo, nov, dec):
    # 8: Near North, 32: Loop, 28: Near West.
    # isin() never matches NaN, so rows with a missing pickup area are
    # discarded here too and no separate notnull() filter is needed.
    mo = mo[mo['pickup_community_area'].isin([8.0])]
    mo = mo.drop(columns=unused_columns)
    top.append(mo)

# Stack the twelve monthly frames into one dataframe with a fresh 0..n-1 index.
top3_16 = pd.concat(top, axis=0, join='outer', ignore_index=True)
# Name the axis through `columns=`: the positional form drop(label, 1) was
# removed in pandas 2.0 and raises a TypeError there.
top3_16 = top3_16.drop(columns='Unnamed: 0')

top3_16.info()
top3_16.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5724889 entries, 0 to 5724888
Data columns (total 2 columns):
pickup_community_area    float64
trip_start_timestamp     object
dtypes: float64(1), object(1)
memory usage: 87.4+ MB


Unnamed: 0,pickup_community_area,trip_start_timestamp
0,8.0,1/13/16 13:30
1,8.0,1/15/16 23:30
2,8.0,1/15/16 18:15
3,8.0,1/15/16 2:45
4,8.0,1/23/16 17:00


## Parse 'time' from trip_start_timestamp column

In [4]:
# Build the 'time' column (HH:MM:SS strings) from the pickup timestamps.
# - The whole column is parsed with a single pd.to_datetime call instead of
#   calling dateutil from a Python lambda once per row.
# - The format is spelled out as '%H:%M:%S' because '%X' means "the locale's
#   time representation", so its output could change with the machine's locale.
top3_16['time'] = pd.to_datetime(top3_16['trip_start_timestamp']).dt.strftime('%H:%M:%S')
top3_16.head()

Unnamed: 0,pickup_community_area,trip_start_timestamp,time
0,8.0,1/13/16 13:30,13:30:00
1,8.0,1/15/16 23:30,23:30:00
2,8.0,1/15/16 18:15,18:15:00
3,8.0,1/15/16 2:45,02:45:00
4,8.0,1/23/16 17:00,17:00:00


## Save the dataframe into a .csv file

### A separate file will read this new file and parse the 'hour' data from its time column.

In [5]:
# Write the dataframe (including the new 'time' column) to disk without the
# index, then read the file back in and preview the first rows.
time_csv_path = 'time_series_pickup_time_near_north.csv'
top3_16.to_csv(time_csv_path, encoding='utf-8', index=False)
time_near_north = pd.read_csv(time_csv_path)
time_near_north.head()

Unnamed: 0,pickup_community_area,trip_start_timestamp,time
0,8.0,1/13/16 13:30,13:30:00
1,8.0,1/15/16 23:30,23:30:00
2,8.0,1/15/16 18:15,18:15:00
3,8.0,1/15/16 2:45,02:45:00
4,8.0,1/23/16 17:00,17:00:00


## Parse 'month' data from trip_start_timestamp

In [35]:
# Build the 'month' column as zero-padded strings ('01'..'12').
# The timestamps are parsed with one pd.to_datetime call rather than by
# invoking dateutil from a Python lambda on every row.
top3_16['month'] = pd.to_datetime(top3_16['trip_start_timestamp']).dt.strftime('%m')
top3_16.head()

Unnamed: 0,pickup_community_area,trip_start_timestamp,month
0,8.0,1/13/16 13:30,1
1,8.0,1/15/16 23:30,1
2,8.0,1/15/16 18:15,1
3,8.0,1/15/16 2:45,1
4,8.0,1/23/16 17:00,1


## Parse 'day' data from trip_start_timestamp 

In [None]:
# Build the 'day' column as zero-padded day-of-month strings ('01'..'31').
# The timestamps are parsed with one pd.to_datetime call rather than by
# invoking dateutil from a Python lambda on every row.
top3_16['day'] = pd.to_datetime(top3_16['trip_start_timestamp']).dt.strftime('%d')
top3_16.head()

## Remove the unnecessary column 

In [37]:
# The raw timestamp is no longer needed once month and day have been extracted.
# `columns=` replaces the positional axis argument drop(label, 1), which was
# removed in pandas 2.0.
top3_16 = top3_16.drop(columns='trip_start_timestamp')
# When the notebook is run top to bottom, the 'time' column created earlier is
# still attached at this point; remove it if present so the reduced frame holds
# only pickup_community_area, month and day.
top3_16 = top3_16.drop(columns='time', errors='ignore')
top3_16.info()
top3_16.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5724889 entries, 0 to 5724888
Data columns (total 3 columns):
pickup_community_area    float64
month                    object
day                      object
dtypes: float64(1), object(2)
memory usage: 131.0+ MB


Unnamed: 0,pickup_community_area,month,day
0,8.0,1,13
1,8.0,1,15
2,8.0,1,15
3,8.0,1,15
4,8.0,1,23


## Save the reduced dataframe into a .csv file 

### This new file will be read to plot a time series frequency graph in a different file. 

In [38]:
# Write the reduced dataframe to disk without the index, then read the file
# back in and preview the first rows.
freq_csv_path = 'time_series_pickup_freq_near_north.csv'
top3_16.to_csv(freq_csv_path, encoding='utf-8', index=False)
time_series_freq_near_north = pd.read_csv(freq_csv_path)
time_series_freq_near_north.head()

Unnamed: 0,pickup_community_area,month,day
0,8.0,1,13
1,8.0,1,15
2,8.0,1,15
3,8.0,1,15
4,8.0,1,23
