# Parse Month and Day for Near West

In [21]:
%matplotlib inline
import seaborn as sns
import math
import pandas as pd
import numpy as np
import scipy as sci
from scipy import stats
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from dateutil.parser import parse

## Read files and pull data only from the columns of interest

In [22]:
pd.set_option('mode.chained_assignment', None)

# Only these two columns are used by the rest of this notebook, so they are the
# only ones read from disk (instead of loading every column and dropping 18 of them).
PICKUP_COLUMNS = ['pickup_community_area', 'trip_start_timestamp']
# Community area codes -- 8: Near North, 32: Loop, 28: Near West
NEAR_WEST_AREA = 28.0

# Read the 2016 files one month at a time and keep only the Near West pickups.
# Each monthly frame is filtered as soon as it is read, so the twelve full-size
# raw frames are never held in memory together.
top_pickup = []
for month_number in range(1, 13):
    mo = pd.read_csv(
        'chicago_taxi_trips_2016_{:02d}.csv'.format(month_number),
        usecols=PICKUP_COLUMNS,
    )
    # isin() never matches NaN, so rows with a missing community area are
    # removed by this same filter; no separate notnull() pass is required.
    mo = mo[mo['pickup_community_area'].isin([NEAR_WEST_AREA])]
    top_pickup.append(mo)

# Stack the monthly slices into one frame with a fresh 0..n-1 index.
near_west_pickup16 = pd.concat(top_pickup, axis=0, join='outer', ignore_index=True)

near_west_pickup16.info()
near_west_pickup16.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1526493 entries, 0 to 1526492
Data columns (total 2 columns):
pickup_community_area    1526493 non-null float64
trip_start_timestamp     1526493 non-null object
dtypes: float64(1), object(1)
memory usage: 23.3+ MB


Unnamed: 0,pickup_community_area,trip_start_timestamp
0,28.0,1/19/16 8:45
1,28.0,1/29/16 17:45
2,28.0,1/4/16 9:00
3,28.0,1/24/16 10:15
4,28.0,1/11/16 13:00


## Parse 'time' from trip_start_timestamp column 

In [23]:
# Convert the whole column in one vectorized call instead of running
# dateutil's parser once per row.
# The format matches the timestamps shown above (e.g. '1/19/16 8:45');
# to_datetime raises if any row deviates from it.
pickup_datetimes = pd.to_datetime(
    near_west_pickup16['trip_start_timestamp'], format='%m/%d/%y %H:%M'
)
# '%H:%M:%S' is used rather than '%X' because '%X' is locale-dependent.
near_west_pickup16['time'] = pickup_datetimes.dt.strftime('%H:%M:%S')
near_west_pickup16.head()

Unnamed: 0,pickup_community_area,trip_start_timestamp,time
0,28.0,1/19/16 8:45,08:45:00
1,28.0,1/29/16 17:45,17:45:00
2,28.0,1/4/16 9:00,09:00:00
3,28.0,1/24/16 10:15,10:15:00
4,28.0,1/11/16 13:00,13:00:00


## Save the dataframe into a .csv file

### This new file will be read to parse 'hour' data from the time column in a different file.

In [24]:
# DataFrame.to_csv returns None when it is given a path, so its result is not
# bound to a name; the path is kept in one place for both the write and the read.
time_csv_path = 'time_series_pickup_time_near_west.csv'
near_west_pickup16.to_csv(time_csv_path, encoding='utf-8', index=False)

# Read the file back to show what was actually written to disk.
time_near_west = pd.read_csv(time_csv_path)
time_near_west.head()

Unnamed: 0,pickup_community_area,trip_start_timestamp,time
0,28.0,1/19/16 8:45,08:45:00
1,28.0,1/29/16 17:45,17:45:00
2,28.0,1/4/16 9:00,09:00:00
3,28.0,1/24/16 10:15,10:15:00
4,28.0,1/11/16 13:00,13:00:00


## Parse 'month' data from trip_start_timestamp

In [25]:
# Vectorized conversion instead of a per-row dateutil parse.
# The format matches the timestamps shown above (e.g. '1/19/16 8:45');
# to_datetime raises if any row deviates from it.
# strftime('%m') keeps the month as a zero-padded string, as before.
near_west_pickup16['month'] = (
    pd.to_datetime(near_west_pickup16['trip_start_timestamp'], format='%m/%d/%y %H:%M')
    .dt.strftime('%m')
)
near_west_pickup16.head()

Unnamed: 0,pickup_community_area,trip_start_timestamp,time,month
0,28.0,1/19/16 8:45,08:45:00,1
1,28.0,1/29/16 17:45,17:45:00,1
2,28.0,1/4/16 9:00,09:00:00,1
3,28.0,1/24/16 10:15,10:15:00,1
4,28.0,1/11/16 13:00,13:00:00,1


## Parse 'day' data from trip_start_timestamp

In [26]:
# Vectorized conversion instead of a per-row dateutil parse.
# The format matches the timestamps shown above (e.g. '1/19/16 8:45');
# to_datetime raises if any row deviates from it.
# strftime('%d') keeps the day as a zero-padded string, as before.
near_west_pickup16['day'] = (
    pd.to_datetime(near_west_pickup16['trip_start_timestamp'], format='%m/%d/%y %H:%M')
    .dt.strftime('%d')
)
near_west_pickup16.head()

Unnamed: 0,pickup_community_area,trip_start_timestamp,time,month,day
0,28.0,1/19/16 8:45,08:45:00,1,19
1,28.0,1/29/16 17:45,17:45:00,1,29
2,28.0,1/4/16 9:00,09:00:00,1,4
3,28.0,1/24/16 10:15,10:15:00,1,24
4,28.0,1/11/16 13:00,13:00:00,1,11


## Remove the unnecessary column

In [27]:
# The raw timestamp is no longer needed once time/month/day have been extracted.
# `columns=` is used because DataFrame.drop no longer accepts `axis` positionally
# in pandas 2.0 and later.
near_west_pickup16 = near_west_pickup16.drop(columns='trip_start_timestamp')
near_west_pickup16.info()
near_west_pickup16.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1526493 entries, 0 to 1526492
Data columns (total 4 columns):
pickup_community_area    1526493 non-null float64
time                     1526493 non-null object
month                    1526493 non-null object
day                      1526493 non-null object
dtypes: float64(1), object(3)
memory usage: 46.6+ MB


Unnamed: 0,pickup_community_area,time,month,day
0,28.0,08:45:00,1,19
1,28.0,17:45:00,1,29
2,28.0,09:00:00,1,4
3,28.0,10:15:00,1,24
4,28.0,13:00:00,1,11


## Save the reduced dataframe into a .csv file 

### This new file will be read to plot a time series frequency graph in a different file.

In [28]:
# DataFrame.to_csv returns None when it is given a path, so its result is not
# bound to a name; the path is kept in one place for both the write and the read.
freq_csv_path = 'time_series_pickup_freq_near_west.csv'
near_west_pickup16.to_csv(freq_csv_path, encoding='utf-8', index=False)

# Read the file back to show what was actually written to disk.
time_series_freq_near_west = pd.read_csv(freq_csv_path)
time_series_freq_near_west.head()

Unnamed: 0,pickup_community_area,time,month,day
0,28.0,08:45:00,1,19
1,28.0,17:45:00,1,29
2,28.0,09:00:00,1,4
3,28.0,10:15:00,1,24
4,28.0,13:00:00,1,11
