### Average Trip Duration Based on Age and Gender

#### Create one CSV with the average trip duration for people of the same gender and age from each of the 40+ Citi Bike data files

In [None]:
import csv
from csv import reader
from collections import Counter, defaultdict
import re
import glob

# Directory holding the monthly Citi Bike trip CSV files, and the file the
# combined averages are written to.
path="/Users/hildavargas/Desktop/Springboard_Mini_Projects/Capstone_Project/CitiBike_Data/"
output_path="/Users/hildavargas/Desktop/Springboard_Mini_Projects/Capstone_Project/CB_real_metrics/TripDuration.csv"


def average_trip_durations(rows, year, month):
    """Average the trip duration per (gender, birth year) for one data file.

    Parameters:
        rows: iterable of CSV data rows (header already skipped). Column 0 is
            read as the trip duration, column 13 as the birth year and
            column 14 as the gender.
        year, month: values attached to every output row; they are written
            under the 'Year' and 'Month' headers of the output file.

    Returns:
        A list of (gender, birth year, year, month, average duration) tuples,
        one per distinct (gender, birth year) pair, in order of first
        appearance. The average is rounded to 2 decimals.

    Raises:
        ValueError: if a trip duration cannot be converted to an integer.
    """
    # Keep a running [total, count] per key instead of storing every duration.
    totals = defaultdict(lambda: [0, 0])
    for row in rows:
        bucket = totals[(row[14], row[13], year, month)]
        bucket[0] += int(row[0])
        bucket[1] += 1

    return [key + (round(total / count, 2),) for key, (total, count) in totals.items()]


# One list of averaged rows per input file.
avg_trip_all = []

# Sorted so that the output row order does not depend on directory listing order.
for file in sorted(glob.glob(path + "*.csv")):
    # The standalone numbers in the file path are used as (year, month).
    num = re.findall(r'\b\d+\b', file)
    if len(num) < 2:
        raise ValueError("Cannot determine year and month from file name: " + file)

    # The context manager closes each input file as soon as it has been read.
    with open(file, "r", newline="") as source:
        csv_x = reader(source, delimiter=",")
        # Skip header (tolerating a completely empty file)
        next(csv_x, None)
        avg_trip_all.append(average_trip_durations(csv_x, num[0], num[1]))

# Write the combined result once, after all files are processed, rather than
# rewriting the whole output for every input file.
if avg_trip_all:
    with open(output_path, "w", newline="") as out:
        csv_o = csv.writer(out, delimiter=',')
        fieldnames = ['Gender', 'Birth Year', 'Year', 'Month', 'Average Trip Duration(s)']
        csv_o.writerow(fieldnames)

        for file_rows in avg_trip_all:
            csv_o.writerows(file_rows)
 

#### Data Cleaning: Drop missing birth years and extreme outliers (< 1920)

In [149]:
import pandas as pd
import numpy as np

# Load the per-file averages; the literal string \N is read as a missing value
tripdurationdf = pd.read_csv("/Users/hildavargas/Desktop/Springboard_Mini_Projects/Capstone_Project/CB_real_metrics/TripDuration.csv", na_values=[r'\N'])

# Replace Gender w/ 'Unknown, Male, Female' vs. '0, 1, 2' (one pass, one mapping)
tripdurationdf['Gender'] = tripdurationdf['Gender'].replace({0: 'Unknown', 1: 'Male', 2: 'Female'})

# Replace birth years < 1920 with nulls (see notebook "Identifying Outliers in Age Range" for reference).
# Series.mask is used instead of the removed .ix indexer.
tripdurationdf['Birth Year'] = tripdurationdf['Birth Year'].mask(tripdurationdf['Birth Year'] < 1920)

# Create column for Average Trip Duration in Minutes
tripdurationdf['Average Trip Duration (min)'] = (tripdurationdf['Average Trip Duration(s)'] / 60).round(2)

# Drop rows with nulls (missing birth years and those < 1920)
tripdurationdf = tripdurationdf.dropna()
tripdurationdf.head()

Unnamed: 0,Gender,Birth Year,Year,Month,Average Trip Duration(s),Average Trip Duration (min)
0,Female,1932.0,2013,7,716.85,11.95
1,Male,1967.0,2013,7,813.83,13.56
2,Female,1966.0,2013,7,939.41,15.66
3,Female,1979.0,2013,7,997.33,16.62
4,Male,1953.0,2013,7,877.18,14.62


#### Create DataFrame with Average Trip Duration (min) for Specific Age Ranges

In [150]:
# Find the average trip duration in minutes for age ranges
import numpy as np

age_ranges = tripdurationdf.copy()

# Sorted unique birth years; kept for interactive inspection only, not used below
x = np.sort(tripdurationdf['Birth Year'].unique())

# Bin edges for the age ranges: 1915, 1920, ..., 2010 (5-year steps)
bins = list(range(1915, 2015, 5))

# Add age ranges column to dataframe; birth years outside the bins become NaN
age_ranges['Age Ranges'] = pd.cut(age_ranges['Birth Year'], precision=0, bins=bins)
del age_ranges['Birth Year']

# Find the average trip duration for each (age range, year, month, gender).
# An explicit groupby yields one row per combination regardless of pandas
# version; combinations with no data have a NaN mean and are dropped.
pivot = (
    age_ranges
    .groupby(['Age Ranges', 'Year', 'Month', 'Gender'])['Average Trip Duration (min)']
    .mean()
    .dropna()
    .reset_index()
)
pivot.head()

Unnamed: 0,Age Ranges,Year,Month,Gender,Average Trip Duration (min)
0,"(1915, 1920]",2013,7,Male,13.46
1,"(1915, 1920]",2013,8,Male,7.69
2,"(1915, 1920]",2013,9,Male,16.46
3,"(1915, 1920]",2013,11,Male,24.85
4,"(1915, 1920]",2014,1,Male,20.82


In [160]:
# Change age ranges to intervals that will be easier to understand in a plot
from collections import defaultdict

unique_ranges = pivot['Age Ranges'].unique()

# Maps each bin label, e.g. "(1915, 1920]", to an inclusive "1916-1920" string
x = defaultdict(str)

for line in unique_ranges:
    # Parse both bounds from the label text rather than slicing fixed character
    # positions, so this works for string labels and Interval labels alike.
    lower, upper = (int(float(part)) for part in str(line).strip('()[]').split(','))
    # The lower bound of the bin is exclusive, so the first birth year is lower + 1
    x[line] = '{}-{}'.format(lower + 1, upper)

pivot['Age Range'] = pivot['Age Ranges'].map(x)
pivot.head()

Unnamed: 0,Age Ranges,Year,Month,Gender,Average Trip Duration (min),Age Range
0,"(1915, 1920]",2013,7,Male,13.46,1916-1920
1,"(1915, 1920]",2013,8,Male,7.69,1916-1920
2,"(1915, 1920]",2013,9,Male,16.46,1916-1920
3,"(1915, 1920]",2013,11,Male,24.85,1916-1920
4,"(1915, 1920]",2014,1,Male,20.82,1916-1920


#### Create an interactive point plot with ipywidgets and Seaborn to compare the average trip duration per age group

In [181]:
# Average trip duration per (year, month, age range), with gender ignored
age_ranges2 = age_ranges.copy()

# Drop the columns that are not needed here by name instead of by position,
# so a change in column order cannot silently remove the wrong data
age_ranges2 = age_ranges2.drop(['Gender', 'Average Trip Duration(s)'], axis=1)

# Swap the raw bin label for its readable "first-last" birth year string
age_ranges2['Age Range'] = age_ranges2['Age Ranges'].map(x)
del age_ranges2['Age Ranges']

# NOTE: this is an unweighted mean of the per-row averages in age_ranges
age_ranges2 = age_ranges2.groupby(['Year', 'Month', 'Age Range']).mean().reset_index()

age_ranges2.head()

Unnamed: 0,Year,Month,Age Range,Average Trip Duration (min)
0,2013,7,1916-1920,13.46
1,2013,7,1921-1925,8.67
2,2013,7,1926-1930,16.73
3,2013,7,1931-1935,16.273333
4,2013,7,1936-1940,21.074


In [188]:
from ipywidgets import interact, Layout
from ipywidgets.widgets import SelectMultiple
import seaborn as sns
import matplotlib.pyplot as plt
import textwrap

# Month names in calendar order; this list is also the option list for the widget
months=['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']

# Lookup from month number (1-12) to month name, used to add a readable column
months_name={number: name for number, name in enumerate(months, start=1)}

# Work on a copy so the month-name column does not leak into age_ranges2
df=age_ranges2.copy()
df['Month Name']=df["Month"].map(months_name)

# Distinct years present in the data, ascending, for the widget
years=sorted(df['Year'].unique().tolist())

# Define function for interactive widget
def make_plot(Months, Years):
    """Draw a point plot of average trip duration per age range.

    The module-level ``df`` is filtered to the selected month names and
    years, sorted by 'Age Range', and plotted with seaborn.

    Parameters:
        Months: iterable of month names, matched against df['Month Name'].
        Years: iterable of years, matched against df['Year'].

    Returns:
        None. When the selection matches no rows (or seaborn rejects the
        data with a ValueError) a notice is printed instead of a plot.
    """
    text="This dataset spans from July 2013 (Citi Bike's opening to the public) to December 2016. Please make another selection and try again." 

    selected_months=list(Months)
    selected_years=list(Years)
    gooddata=df.loc[df['Month Name'].isin(selected_months) & df['Year'].isin(selected_years)].sort_values(by='Age Range')

    # Nothing to draw: tell the user explicitly instead of relying on an exception
    if gooddata.empty:
        print(textwrap.fill(text, 100))
        return

    sns.set_style("whitegrid")
    plt.rcParams['figure.figsize']=(10,8)
    try:
        # x/y are passed by keyword: newer seaborn releases reject them positionally
        plot=sns.pointplot(x='Age Range', y='Average Trip Duration (min)', data=gooddata, ci=None, color='#af2b30')
    except ValueError:
        print(textwrap.fill(text, 100))
        return

    title=plt.title("Average Trip Duration per Age Range", fontsize=16, fontweight="bold")
    title.set_position([.5, 1.05])
    plt.ylabel('Average Trip Duration (min)')
    for item in plot.get_xticklabels():
        item.set_rotation(40)
    # Set the tick label size through the Axes API rather than per-tick attributes
    plot.tick_params(axis='both', which='major', labelsize=12)
    plot.yaxis.label.set_size(14)
    plot.xaxis.label.set_size(14)
    plot.xaxis.labelpad = 20
    plot.yaxis.labelpad = 20
    plt.show()

    
# Multi-select widgets for the filters, pre-set to July 2013
month_selector=SelectMultiple(options=months, value=['July'])
year_selector=SelectMultiple(options=years, value=[2013])

# Re-draw the plot whenever either selection changes
interact(make_plot, Months=month_selector, Years=year_selector)

<function __main__.make_plot>

#### Create an interactive bar plot with ipywidgets and Seaborn to compare the average trip duration per age group, grouped by gender

In [189]:
from ipywidgets import interact, Layout
from ipywidgets.widgets import SelectMultiple
import seaborn as sns
import matplotlib.pyplot as plt
import textwrap

# Month names in calendar order; this list is also the option list for the widget
months=['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']

# Lookup from month number (1-12) to month name, used to add a readable column
months_name=dict(zip(range(1, 13), months))

# Work on a copy so the month-name column does not leak into pivot
df=pivot.copy()
df['Month Name']=df["Month"].map(months_name)

# Distinct years present in the data, ascending, for the widget
years=sorted(df['Year'].unique().tolist())

# Define function for interactive widget
def make_plot2(Months, Years):
    """Draw a bar plot of average trip duration per age range, split by gender.

    The module-level ``df`` is filtered to the selected month names and
    years, sorted by 'Age Range', and plotted with one bar per gender.

    Parameters:
        Months: iterable of month names, matched against df['Month Name'].
        Years: iterable of years, matched against df['Year'].

    Returns:
        None. When the selection matches no rows (or seaborn rejects the
        data with a ValueError) a notice is printed instead of a plot.
    """
    text="This dataset spans from July 2013 (Citi Bike's opening to the public) to December 2016. Please make another selection and try again." 

    selected_months=list(Months)
    selected_years=list(Years)
    gooddata=df.loc[df['Month Name'].isin(selected_months) & df['Year'].isin(selected_years)].sort_values(by='Age Range')

    # Nothing to draw: tell the user explicitly instead of relying on an exception
    if gooddata.empty:
        print(textwrap.fill(text, 100))
        return

    sns.set_style("whitegrid")
    plt.rcParams['figure.figsize']=(10,8)
    # Fixed colour per gender so the bars look the same across selections
    pal={'Female': "#f9a65a", 'Male': "#9e66ab", 'Unknown':"#af2b30"}
    try:
        # x/y are passed by keyword: newer seaborn releases reject them positionally
        plot=sns.barplot(x='Age Range', y='Average Trip Duration (min)', hue='Gender', data=gooddata, ci=None, palette=pal)
    except ValueError:
        print(textwrap.fill(text, 100))
        return

    title=plt.title("Average Trip Duration by Age Range, Grouped by Gender", fontsize=16, fontweight="bold")
    title.set_position([.5, 1.05])
    plt.ylabel('Average Trip Duration (min)')
    plt.legend(loc=1)
    for item in plot.get_xticklabels():
        item.set_rotation(40)
    # Set the tick label size through the Axes API rather than per-tick attributes
    plot.tick_params(axis='both', which='major', labelsize=12)
    plot.yaxis.label.set_size(14)
    plot.xaxis.label.set_size(14)
    plot.xaxis.labelpad = 20
    plot.yaxis.labelpad = 20
    plt.show()

    
# Multi-select widgets for the filters, pre-set to July 2013
month_selector=SelectMultiple(options=months, value=['July'])
year_selector=SelectMultiple(options=years, value=[2013])

# Re-draw the plot whenever either selection changes
interact(make_plot2, Months=month_selector, Years=year_selector)

<function __main__.make_plot2>