In [None]:
## import all necessary packages and functions.
import csv # read and write csv files
from datetime import datetime # operations to parse dates
from pprint import pprint # use to print data structures like dictionaries in
                          # a nicer way than the base print function.


In [None]:
def print_first_point(filename):
    """
    Print and return the first data row (the row right after the header) of
    a csv file.

    The city label is the part of filename before its first '-', with any
    leading directory components removed. The return value is the tuple
    (city, first_trip), where first_trip is the row mapping produced by
    csv.DictReader.
    """
    # keep what precedes the first '-', then drop everything up to the last '/'
    prefix = filename.partition('-')[0]
    city = prefix.rpartition('/')[2]
    print('\nCity: {}'.format(city))

    # the DictReader consumes the header itself, so the first next() call
    # already yields the first trip
    with open(filename, 'r') as f_in:
        first_trip = next(csv.DictReader(f_in))

    pprint(first_trip)

    # hand back both values so later cells can test against them
    return city, first_trip

# raw trip file for each city
data_files = [
    './data/NYC-CitiBike-2016.csv',
    './data/Chicago-Divvy-2016.csv',
    './data/Washington-CapitalBikeshare-2016.csv',
]

# print_first_point returns (city, first_trip) pairs, which is exactly the
# shape dict() needs to build the city -> first trip lookup
example_trips = dict(map(print_first_point, data_files))

In [None]:
def duration_in_mins(datum, city):
    """
    Takes as input a dictionary containing info about a single trip (datum) and
    its origin city (city) and returns the trip duration in units of minutes.

    Washington durations are read from the 'Duration (ms)' column and are in
    milliseconds; Chicago and NYC durations are read from 'tripduration' and
    are in seconds.

    Raises ValueError if city is not 'NYC', 'Chicago' or 'Washington'.
    """
    if city in ('NYC', 'Chicago'):
        # convert seconds to minutes
        return int(datum['tripduration']) / 60

    if city == 'Washington':
        # convert milliseconds to minutes
        return int(datum['Duration (ms)']) / 60000

    # fail with a clear message instead of an unbound local variable
    raise ValueError('Unsupported city: {!r}'.format(city))


# Expected durations (in minutes) of the first trip of each city; the loop is
# silent when every value is within .001 of the computed one.
tests = {
    'NYC': 13.9833,
    'Chicago': 15.4333,
    'Washington': 7.1231,
}

for city, expected in tests.items():
    assert abs(duration_in_mins(example_trips[city], city) - expected) < .001

In [None]:
from datetime import datetime
import calendar

def time_of_trip(datum, city):
    """
    Takes as input a dictionary containing info about a single trip (datum) and
    its origin city (city) and returns the month, hour, and day of the week in
    which the trip was made, as the tuple (month, hour, day_of_week).

    NYC start times include seconds, while Washington and Chicago do not.
    Washington keeps the start time in the 'Start date' column; the other two
    cities use 'starttime'.

    Raises ValueError if city is not 'NYC', 'Chicago' or 'Washington', or if
    the start time does not match the format expected for that city.
    """
    # pick the column and the timestamp layout for the city
    if city == 'NYC':
        raw_start, layout = datum['starttime'], '%m/%d/%Y %H:%M:%S'
    elif city == 'Chicago':
        raw_start, layout = datum['starttime'], '%m/%d/%Y %H:%M'
    elif city == 'Washington':
        raw_start, layout = datum['Start date'], '%m/%d/%Y %H:%M'
    else:
        # fail with a clear message instead of an unbound local variable
        raise ValueError('Unsupported city: {!r}'.format(city))

    start = datetime.strptime(raw_start, layout)

    # weekday() is 0 for Monday, which is also the first entry of day_name
    day_of_week = calendar.day_name[start.weekday()]

    return (start.month, start.hour, day_of_week)


# Expected (month, hour, day_of_week) of the first trip of each city; the loop
# is silent when every tuple matches.
tests = {
    'NYC': (1, 0, 'Friday'),
    'Chicago': (3, 23, 'Thursday'),
    'Washington': (3, 22, 'Thursday'),
}

for city, expected in tests.items():
    assert time_of_trip(example_trips[city], city) == expected

In [None]:
def type_of_user(datum, city):
    """
    Takes as input a dictionary containing info about a single trip (datum) and
    its origin city (city) and returns the type of system user that made the
    trip.

    Chicago and NYC values are returned as found in the 'usertype' column.
    Washington values come from 'Member Type'; 'Registered' is renamed to
    'Subscriber' and 'Casual' to 'Customer', any other value is returned
    unchanged.

    Raises ValueError if city is not 'NYC', 'Chicago' or 'Washington'.
    """
    if city in ('NYC', 'Chicago'):
        return datum['usertype']

    if city == 'Washington':
        member_type = datum['Member Type']
        # translate Washington's category names to the ones the other cities use
        renamed = {'Registered': 'Subscriber', 'Casual': 'Customer'}
        return renamed.get(member_type, member_type)

    # fail with a clear message instead of an unbound local variable
    raise ValueError('Unsupported city: {!r}'.format(city))


# Expected user type of the first trip of each city. `example_trips` holds the
# first trip printed from each original data file; nothing is printed when all
# of the assertions pass.
tests = {
    'NYC': 'Customer',
    'Chicago': 'Subscriber',
    'Washington': 'Subscriber',
}

for city, expected in tests.items():
    assert type_of_user(example_trips[city], city) == expected

In [None]:
def condense_data(in_file, out_file, city):
    """
    This function takes full data from the specified input file
    and writes the condensed data to a specified output file. The city
    argument determines how the input file will be parsed.

    Every input row is reduced to the columns 'duration', 'month', 'hour',
    'day_of_week' and 'user_type', computed with duration_in_mins(),
    time_of_trip() and type_of_user().

    HINT: See the cell below to see how the arguments are structured!
    """
    # newline='' is what the csv module asks for on files it reads or writes,
    # so that line endings are handled by the csv reader/writer alone
    with open(out_file, 'w', newline='') as f_out, \
            open(in_file, 'r', newline='') as f_in:
        # set up csv DictWriter object - writer requires column names for the
        # first row as the "fieldnames" argument
        out_colnames = ['duration', 'month', 'hour', 'day_of_week', 'user_type']
        trip_writer = csv.DictWriter(f_out, fieldnames=out_colnames)
        trip_writer.writeheader()

        trip_reader = csv.DictReader(f_in)

        # collect data from and process each row
        for row in trip_reader:
            duration = duration_in_mins(row, city)
            month, hour, day_of_week = time_of_trip(row, city)
            user_type = type_of_user(row, city)

            # the keys must match out_colnames exactly: the header writer is
            # reused for every row, so a misspelled key is rejected by
            # DictWriter instead of silently landing in the wrong column
            trip_writer.writerow({
                'duration': duration,
                'month': month,
                'hour': hour,
                'day_of_week': day_of_week,
                'user_type': user_type,
            })
            

In [None]:
# Condense the raw file of every city, then print the first row of each
# condensed file as a check.
city_info = {
    'Washington': {
        'in_file': './data/Washington-CapitalBikeshare-2016.csv',
        'out_file': './data/Washington-2016-Summary.csv',
    },
    'Chicago': {
        'in_file': './data/Chicago-Divvy-2016.csv',
        'out_file': './data/Chicago-2016-Summary.csv',
    },
    'NYC': {
        'in_file': './data/NYC-CitiBike-2016.csv',
        'out_file': './data/NYC-2016-Summary.csv',
    },
}

for city in city_info:
    paths = city_info[city]
    condense_data(paths['in_file'], paths['out_file'], city)
    print_first_point(paths['out_file'])

In [None]:
def number_of_trips(filename):
    """
    This function reads in a file with trip data and reports the total number
    of trips together with the proportion made by subscribers and the
    proportion made by customers.

    Returns the tuple (n_total, p_subscribers, p_customers) with both
    proportions rounded to 2 decimals. Every row whose 'user_type' is not
    'Subscriber' is counted as a customer trip. A file without any trip rows
    yields (0, 0.0, 0.0).
    """
    n_total = 0
    n_subscribers = 0

    with open(filename, 'r') as f_in:
        # tally up ride types
        for row in csv.DictReader(f_in):
            n_total += 1
            if row['user_type'] == 'Subscriber':
                n_subscribers += 1

    # no trips at all: the proportions are undefined, report zeros rather
    # than dividing by zero
    if n_total == 0:
        return (0, 0.0, 0.0)

    n_customers = n_total - n_subscribers
    p_subscribers = n_subscribers / n_total
    p_customers = n_customers / n_total

    return (n_total, round(p_subscribers, 2), round(p_customers, 2))

In [None]:
# condensed files to analyse; print the total trip count, subscriber
# proportion and customer proportion for each
data_files = [
    './data/Washington-2016-Summary.csv',
    './data/Chicago-2016-Summary.csv',
    './data/NYC-2016-Summary.csv',
]

for summary_file in data_files:
    print(number_of_trips(summary_file))

In [None]:
## Use this and additional cells to answer Question 4b.                 ##
##                                                                      ##
## HINT: The csv module reads in all of the data as strings, including  ##
## numeric values. You will need a function to convert the strings      ##
## into an appropriate numeric type before you aggregate data.          ##
## TIP: For the Bay Area example, the average trip length is 14 minutes ##
## and 3.5% of trips are longer than 30 minutes.                        ##

def trip_length(filename):
    """
    Read a file with trip data and report the average trip length and the
    percentage of trips longer than 30 minutes.

    Returns the tuple (average, percentage): the average 'duration' rounded
    to 2 decimals, and the percentage as a string ending in '%', also rounded
    to 2 decimals.
    """
    minutes_so_far = 0
    n_long = 0
    n_trips = 0

    with open(filename, 'r') as f_in:
        # enumerate from 1 so n_trips ends up as the number of rows read
        for n_trips, row in enumerate(csv.DictReader(f_in), start=1):
            # csv values arrive as strings; convert once per row
            minutes = float(row['duration'])
            minutes_so_far += minutes
            if minutes > 30:
                n_long += 1

    mean_minutes = minutes_so_far / n_trips
    long_share = n_long / n_trips

    return (round(mean_minutes, 2), str(round(long_share * 100, 2)) + '%')

In [None]:
# average trip length and share of trips longer than 30 minutes, per file
for summary_file in data_files:
    print(trip_length(summary_file))

In [None]:
## Use this and additional cells to answer Question 4c. If you have    ##
## not done so yet, consider revising some of your previous code to    ##
## make use of functions for reusability.                              ##
##                                                                     ##
## TIP: For the Bay Area example data, you should find the average     ##
## Subscriber trip duration to be 9.5 minutes and the average Customer ##
## trip duration to be 54.6 minutes. Do the other cities have this     ##
## level of difference?  ##

def membership_trip_duration(filename):
    """
    Read a file with trip data and report the average trip duration of
    customers and of subscribers.

    Returns the tuple (customer_average, subscriber_average), each rounded to
    2 decimals. Rows whose 'user_type' is not 'Subscriber' count as customer
    trips.
    """
    sub_trips = 0
    sub_minutes = 0
    cust_trips = 0
    cust_minutes = 0

    with open(filename, 'r') as f_in:
        for row in csv.DictReader(f_in):
            is_subscriber = row['user_type'] == 'Subscriber'
            minutes = float(row['duration'])
            if not is_subscriber:
                cust_trips += 1
                cust_minutes += minutes
                continue
            sub_trips += 1
            sub_minutes += minutes

    customer_mean = cust_minutes / cust_trips
    subscriber_mean = sub_minutes / sub_trips

    return (round(customer_mean, 2), round(subscriber_mean, 2))

In [None]:
# average trip duration of customers and of subscribers, per file
for summary_file in data_files:
    print(membership_trip_duration(summary_file))

In [None]:
# load library
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# this is a 'magic word' that allows for plots to be displayed
# inline with the notebook. If you want to know more, see:
# http://ipython.readthedocs.io/en/stable/interactive/magics.html
%matplotlib inline 

# Example histogram; the durations below are a sample taken from the bay area
# data.
data = [
    7.65, 8.92, 7.42, 5.50, 16.17, 4.20, 8.98, 9.62, 11.48, 14.33,
    19.02, 21.53, 3.90, 7.97, 2.62, 2.67, 3.08, 14.40, 12.90, 7.83,
    25.12, 8.30, 4.93, 12.43, 10.60, 6.17, 10.88, 4.78, 15.15, 3.53,
    9.43, 13.32, 11.72, 9.85, 5.22, 15.10, 3.95, 3.17, 8.78, 1.88,
    4.55, 12.68, 12.38, 9.78, 7.63, 6.45, 17.38, 11.90, 11.52, 8.63,
]

plt.hist(data, bins=15)
plt.title('Distribution of Trip Durations')
plt.xlabel('Duration (m)')
plt.show()

In [None]:
## Use this and additional cells to collect all of the trip times as a list ##
## and then use pyplot functions to generate a histogram of trip times.     ##

def collect_trip_durations(filename):
    """
    Read a file with trip data and return its trip durations as lists.

    Returns the tuple (customer, subscriber, all): the durations of customer
    trips, the durations of subscriber trips, and the customer list followed
    by the subscriber list. Rows whose 'user_type' is not 'Subscriber' count
    as customer trips.
    """
    # durations keyed by "is this a subscriber trip?"
    durations = {True: [], False: []}

    with open(filename, 'r') as f_in:
        for row in csv.DictReader(f_in):
            is_subscriber = row['user_type'] == 'Subscriber'
            durations[is_subscriber].append(float(row['duration']))

    customer_minutes = durations[False]
    subscriber_minutes = durations[True]

    return customer_minutes, subscriber_minutes, customer_minutes + subscriber_minutes

In [None]:
# one histogram of all trip durations per city; cities is listed in the same
# order as data_files
cities = ['Washington','Chicago','NYC']

for idx, summary_file in enumerate(data_files):
    # only the combined list (third return value) is plotted here
    all_durations = collect_trip_durations(summary_file)[2]
    plt.hist(all_durations, bins=750)
    plt.title(f'Distribution of Trip Durations for {cities[idx]}')
    plt.xlabel('Duration (m)')
    plt.show()

In [None]:
## Use this and additional cells to answer Question 5. ##

#create bins with spacing of 5 minute intervals
def bin_spacing(data):
    """
    Return the number of whole 5-wide bins that fit between the smallest
    value in data (truncated to an int) and the hard cap of 75.
    """
    lowest = int(min(data))
    # len(range(...)) is never negative, so it is 0 when lowest is 75 or more
    span = len(range(lowest, 75))
    return span // 5
    
    
# Side-by-side histograms of subscriber and customer trip durations for each
# city, limited to trips of at most 75 minutes.
for city_name, data_file in zip(cities, data_files):
    customer, subscriber, total = collect_trip_durations(data_file)

    plt.figure(figsize=(16, 5))
    #plot subscriber histogram
    plt.subplot(1, 2, 1)
    # the bin count is derived from the series being plotted, so that it
    # matches the range passed to hist()
    plt.hist(subscriber, bins=bin_spacing(subscriber), range=(min(subscriber), 75))
    plt.title(f'Distribution of Subscriber Trip Durations for {city_name}')
    plt.xlabel('Duration (m)')

    #plot customer histogram
    plt.subplot(1, 2, 2)
    plt.hist(customer, bins=bin_spacing(customer), range=(min(customer), 75))
    plt.title(f'Distribution of Customer Trip Durations for {city_name}')
    plt.xlabel('Duration (m)')
    plt.show()

In [None]:
from collections import defaultdict

def monthly_ridership(filename):
    """
    Read a file with trip data and count the trips of each month separately
    for subscribers and for customers.

    Returns the tuple (subscriber_data, customer_data); each is a
    defaultdict(int) mapping the month (as an int) to a trip count. Rows
    whose 'user_type' is not 'Customer' are counted as subscriber trips.
    """
    subscriber_data = defaultdict(int)
    customer_data = defaultdict(int)

    with open(filename, 'r') as f_in:
        for row in csv.DictReader(f_in):
            # choose the tally that this row belongs to, then bump its month
            if row['user_type'] == 'Customer':
                tally = customer_data
            else:
                tally = subscriber_data
            tally[int(row['month'])] += 1

    return subscriber_data, customer_data

In [None]:
#get number of trips based on number of ride durations recorded
def monthly_ride_xy(data):
    """Return the (key, value) pairs of data as a list sorted in ascending order."""
    return sorted(data.items())

# For each city, plot monthly subscriber trips, monthly customer trips and
# the subscriber-to-customer ratio side by side.
for city_name, data_file in zip(cities, data_files):
    subscriber_trips, customer_trips = monthly_ridership(data_file)

    sub_xy = monthly_ride_xy(subscriber_trips)
    sub_x = [month for month, _ in sub_xy]
    sub_y = [count for _, count in sub_xy]

    cus_xy = monthly_ride_xy(customer_trips)
    cus_x = [month for month, _ in cus_xy]
    cus_y = [count for _, count in cus_xy]

    # pair the counts by month rather than by list position, so that a month
    # missing from one group cannot shift the ratios onto the wrong month;
    # only months present in both groups have a ratio
    ratio_months = [month for month in sub_x if month in customer_trips]
    subscriber_customer_ratio = [subscriber_trips[month] / customer_trips[month]
                                 for month in ratio_months]

    plt.figure(figsize=(16, 4))
    #plot monthly subscriber ridership
    plt.subplot(1, 3, 1)
    plt.bar(sub_x,sub_y,color='#F99583')
    plt.title(f'{city_name} monthly subscriber ridership')
    plt.xlabel('Month (1 - Jan,... 12 - Dec)')
    plt.xticks(range(1,13,1))

    #plot monthly customer ridership
    plt.subplot(1, 3, 2)
    plt.bar(cus_x,cus_y)
    plt.title(f'{city_name} monthly customer ridership')
    plt.xlabel('Month (1 - Jan,... 12 - Dec)')
    plt.xticks(range(1,13,1))

    #plot monthly subscriber to customer ratio
    plt.subplot(1, 3, 3)
    plt.bar(ratio_months,subscriber_customer_ratio)
    plt.title(f'{city_name} monthly subscriber to customer ratio')
    plt.xlabel('Month (1 - Jan,... 12 - Dec)')
    plt.xticks(range(1,13,1))
    plt.show()

**Question 7**: Putting the bike share data aside, think of a topic or field of interest where you would like to be able to apply the techniques of data science. What would you like to be able to learn from your chosen subject?

**Answer**: 

I would like to be able to apply data science to the field of environmental studies. While there is much existing research on global topics such as global warming, carbon dioxide levels, etc., I believe that there is a lack of analysis on more local topics, which could potentially spread awareness within local communities and incentivize them to take action. For example, many people feel worried about global warming, but because it affects the whole world rather than only their own community, a bystander effect occurs wherein individuals and local communities remain passive and unmotivated towards initiatives to protect the environment.

If data science were used to estimate the level of pollution and predict the impact of sustainability efforts at the local level, it would serve as a bigger wake-up call to individuals regarding the damage that their local communities are causing, and thus make them feel greater ownership and responsibility regarding initiatives to protect the environment. It would also provide more concrete goals that can be achieved in terms of sustainability efforts.

Data science should be used to investigate quantitative goals that can be achieved by the local community to hold people responsible for their environmental actions, and to make progress towards an observable metric.

**Question 6**: Continue the investigation by exploring another question that could be answered by the data available. Document the question you want to explore below. Your investigation should involve at least two variables and should compare at least two groups. You should also use at least one visualization as part of your explorations.

**Answer**: 

<b>Exploratory question: </b>How does ridership differ by month or season? Which month / season has the highest ridership? Does the ratio of Subscriber trips to Customer trips change depending on the month or season?

<b>Analysis:</b> From the graphs below, we see that the monthly ridership distributions for subscribers and customers, and the monthly subscriber to customer ridership ratio, are roughly similar across all 3 cities. As such, the analysis will be carried out in a generalized manner, which is assumed to be sufficient for providing a high-level analysis of monthly ridership.

The <b>monthly subscriber ridership</b> appears to peak around June to October, with relatively balanced tails on either side. The lowest subscriber ridership count is around December to February. The tails are rather heavy, which indicates that the difference in ridership between months is not too stark for subscribers.

The <b>monthly customer ridership</b> appears to peak around July to September, with relatively balanced tails on either side. The lowest customer ridership count is around November to February. The tails are much lighter compared to the subscriber ridership distribution, which indicates that the difference in ridership between months is greater for customers than for subscribers.

The <b>monthly subscriber to customer ridership ratio</b> peaks at December to February, and hits relatively low points throughout March to September, with the lowest points being around June to August. This indicates that the number of subscribers using the bike-sharing service around December to February is much higher compared to the number of customers, whereas the numbers of subscribers and customers using the bike-sharing service around March to September are relatively similar.

<b>Recommendations:</b> From this analysis, we suggest that bike-sharing companies increase the number of bikes available around June to October to account for the higher number of riders. Bikes can be brought in for maintenance during December to February since ridership is relatively low then. As it appears that relatively few customers use the bike-sharing service from December to February, further research can be conducted to investigate the reasons for low customer ridership and high subscriber ridership during these months. If there is a correlation between low customer usage and high subscriber usage, this could potentially indicate that there are not enough bikes available to fit the cumulative subscriber and customer demand during the peak months.