### Average Trip Duration Based on Age and Gender

#### Create a New CSV With the Average Trip Duration for Riders of the Same Gender and Birth Year in a Given Year and Month

In [None]:
import csv
from csv import reader
from collections import Counter, defaultdict
import re
import glob

path="/Users/hildavargas/Desktop/Springboard_Mini_Projects/Capstone_Project/CitiBike_Data/"

# One entry per input file; each entry is a list of
# (gender, birth year, year, month, average trip duration) tuples.
avg_trip_all=[]

for file in glob.glob(path+"*.csv"):

    # Standalone numbers found in the file path; the first two are written
    # to the 'Year' and 'Month' output columns respectively.
    num=re.findall(r'\b\d+\b', file)

    # key=(gender, year of birth, year, month) -> list of trip durations
    # (still strings, exactly as read from the CSV)
    tripdict=defaultdict(list)

    # Use a context manager so every input file is closed once it is read
    with open(file, "r") as source:
        csv_x=reader(source, delimiter=",")

        # Skip header (the default keeps a completely empty file from raising)
        next(csv_x, None)
        for row in csv_x:
            # Column 14 = gender, 13 = birth year, 0 = trip duration
            tripdict[(row[14], row[13], num[0], num[1])].append(row[0])

    tripavg={}
    for keys, values in tripdict.items():
        # Convert all trip durations from strings to integers
        values=list(map(int, values))
        # Same keys as before, value = average trip duration rounded to 2 decimals
        tripavg[keys]=round(sum(values)/len(values), 2)

    # Flatten ((gender, birth year, year, month), average) into one 5-tuple per row
    finaltripavg=[(a, b, c, d, e) for ((a, b, c, d), e) in tripavg.items()]

    avg_trip_all.append(finaltripavg)

# Write the combined result once, after every input file has been processed,
# instead of rewriting the whole output on each loop iteration.  Nothing is
# written when no input file was found, so an existing output is left alone.
if avg_trip_all:
    # newline="" is what the csv module expects for files handed to csv.writer
    with open("/Users/hildavargas/Desktop/Springboard_Mini_Projects/Capstone_Project/CB_real_metrics/TripDuration.csv", "w", newline="") as out:
        csv_o=csv.writer(out, delimiter=',')
        fieldnames=['Gender','Birth Year', 'Year', 'Month', 'Average Trip Duration(s)']
        csv_o.writerow(fieldnames)

        for row in avg_trip_all:
            csv_o.writerows(row)
 

#### Convert CSV to DataFrame and Clean Data

In [5]:
import pandas as pd
import numpy as np

# Load the per-group averages; the literal text \N is read as a missing value
tripdurationdf=pd.read_csv("/Users/hildavargas/Desktop/Springboard_Mini_Projects/Capstone_Project/CB_real_metrics/TripDuration.csv", na_values=[r'\N'])

# Replace the numeric Gender codes with letters: 0 -> 'U', 1 -> 'M', 2 -> 'F'
tripdurationdf.Gender=tripdurationdf.Gender.replace({0: 'U', 1: 'M', 2: 'F'})

# NOTE: rows containing nulls are NOT dropped here; they stay in the DataFrame.

# Create column for Average Trip Duration in Minutes (seconds / 60, 2 decimals)
tripdurationdf['Average Trip Duration (min)']=(tripdurationdf['Average Trip Duration(s)']/60).round(2)

In [6]:
# Preview the first rows of tripdurationdf, including the new minutes column
tripdurationdf.head()

Unnamed: 0,Gender,Birth Year,Year,Month,Average Trip Duration(s),Average Trip Duration (min)
0,F,1932.0,2013,7,716.85,11.95
1,M,1967.0,2013,7,813.83,13.56
2,F,1966.0,2013,7,939.41,15.66
3,F,1979.0,2013,7,997.33,16.62
4,M,1953.0,2013,7,877.18,14.62


#### Create DataFrame with Average Trip Duration (min) for Specific Age Ranges

In [7]:
# Find the average trip duration in minutes for age ranges
import numpy as np

# Work on a copy so tripdurationdf itself is left untouched
age_ranges=tripdurationdf.copy()

# Bin edges for the birth years: 1880, 1890, ..., 2000
bins=list(range(1880, 2010, 10))

# Add age ranges column to dataframe (the binned birth year), then drop the
# raw birth year so only the bin remains
age_ranges['Age Ranges']=pd.cut(age_ranges['Birth Year'], precision=0, bins=bins)
del age_ranges['Birth Year']

# Find the average trip duration for each age range
# NOTE(review): the steps below assume pivot_table yields one value per
# (Age Ranges, Year, Month, Gender) combination, so that reset_index leaves
# exactly five columns to rename - TODO confirm against the installed pandas.
pivot=pd.pivot_table(age_ranges, values='Average Trip Duration (min)', columns=['Age Ranges', 'Year', 'Month', 'Gender'], aggfunc=np.mean)
# Missing averages are shown as 0
pivot=pivot.fillna(0)
pivot=pd.DataFrame(pivot)
pivot=pivot.reset_index()
pivot.columns=['Age Ranges', 'Year', 'Month', 'Gender','Average Trip Duration (min)']
pivot.head()

Unnamed: 0,Age Ranges,Year,Month,Gender,Average Trip Duration (min)
0,"(1880, 1890]",2013,1,F,0.0
1,"(1880, 1890]",2013,1,M,0.0
2,"(1880, 1890]",2013,1,U,0.0
3,"(1880, 1890]",2013,2,F,0.0
4,"(1880, 1890]",2013,2,M,0.0


In [9]:
# Change age ranges to intervals that will be easier to understand in a plot
unique_ranges=pivot['Age Ranges'].unique()
from collections import defaultdict
# Maps each original bin label, e.g. "(1880, 1890]", to a label like "1881-1890"
x=defaultdict(str)

for line in unique_ranges:

    # Parse the two bounds instead of slicing fixed character positions.
    # str() is applied first so that a plain string label and an interval
    # object that prints as "(1880, 1890]" are handled the same way.
    lower, upper=str(line).strip('()[]').split(',')
    # The bins exclude their lower edge, so the first year covered is lower + 1
    first_year=int(float(lower))+1
    last_year=int(float(upper))
    x[line]=str(first_year)+'-'+str(last_year)

# Update dataframe to reflect new intervals
pivot['Age Range']=pivot['Age Ranges'].map(x)
pivot.head()

Unnamed: 0,Age Ranges,Year,Month,Gender,Average Trip Duration (min),Age Range
0,"(1880, 1890]",2013,1,F,0.0,1881-1890
1,"(1880, 1890]",2013,1,M,0.0,1881-1890
2,"(1880, 1890]",2013,1,U,0.0,1881-1890
3,"(1880, 1890]",2013,2,F,0.0,1881-1890
4,"(1880, 1890]",2013,2,M,0.0,1881-1890


#### Plot in Bokeh

In [10]:
# Preview the first rows of pivot before plotting
pivot.head()

Unnamed: 0,Age Ranges,Year,Month,Gender,Average Trip Duration (min),Age Range
0,"(1880, 1890]",2013,1,F,0.0,1881-1890
1,"(1880, 1890]",2013,1,M,0.0,1881-1890
2,"(1880, 1890]",2013,1,U,0.0,1881-1890
3,"(1880, 1890]",2013,2,F,0.0,1881-1890
4,"(1880, 1890]",2013,2,M,0.0,1881-1890


In [11]:
import bokeh
from bokeh.plotting import *
from bokeh.models import PrintfTickFormatter

# Categorical x-axis labels, one per decade bin: '1881-1890' ... '1991-2000'
labels=['%d-%d' % (start+1, start+10) for start in range(1880, 2000, 10)]

p=figure(x_range=labels, width=800, height=800)

# Title text and styling
p.title.text='Age vs. Average Trip Duration by Gender'
p.title.text_font_size='14pt'
p.title.align='center'

# Axis captions
p.xaxis.axis_label='Age Range'
p.yaxis.axis_label='Average Trip Duration (min)'

# Tilt the x tick labels by 45 degrees
p.xaxis.major_label_orientation=np.pi/4

# Styling shared by both axes
for axis in (p.xaxis, p.yaxis):
    axis.major_label_text_font_size='11.5pt'
    axis.axis_label_standoff=20
    axis.axis_label_text_font_size='13pt'

# Split the pivot rows by gender
gender_series=pivot['Gender']
men=pivot[gender_series == 'M']
women=pivot[gender_series == 'F']

# Only the rows for men are drawn here; `women` is prepared but not plotted in this cell
p.vbar(x=men['Age Range'], bottom=0, top=men['Average Trip Duration (min)'], width=0.3)

In [12]:
# Configure Bokeh to render its output inline in the notebook
output_notebook()

In [13]:
# Display the bar chart figure built above
show(p)

In [14]:
from bokeh.io import output_notebook, show, reset_output
from bokeh.layouts import widgetbox
from bokeh.models.widgets import MultiSelect
from ipywidgets import interact

# Clear any previous Bokeh output settings, then target the notebook again
reset_output()
output_notebook()

# Each option is a (value, label) pair; value and label are the same text here
month_names=["January", "February", "March", "April", "May", "June",
             "July", "August", "September", "October", "November", "December"]

Month=MultiSelect(title="Months:",
                  options=[(name, name) for name in month_names])

Year=MultiSelect(title="Years:",
                 options=[(str(year), str(year)) for year in range(2013, 2017)])

# Show each selector in its own widget box
show(widgetbox(Month))

show(widgetbox(Year))
