### The 10 Most Popular Citi Bike Stations

In [17]:
# Create CSV with 10 most popular stations in each data file
import csv
from csv import reader
from csv import writer
from collections import Counter
import glob
import collections
import re

path="/Users/hildavargas/Desktop/Springboard_Mini_Projects/Capstone_Project/CitiBike_Data/"
output_path="/Users/hildavargas/Desktop/Springboard_Mini_Projects/Capstone_Project/CB_real_metrics/10MostPopularStations.csv"

# Column names of the output CSV, in the order the values are written
fieldnames=['start station id','start station name', 'start station latitude', 'start station longitude', 'year', 'month', 'number of rides']


def top_stations(rows, year, month, limit=10):
    """Return the `limit` most frequent start stations among `rows`.

    Parameters
    ----------
    rows : iterable of sequences
        Parsed CSV rows. Columns 3-6 are read as the start station id, name,
        latitude and longitude (this is how they are labelled in `fieldnames`).
    year, month :
        Values attached unchanged to every station tuple.
    limit : int
        Maximum number of stations to return.

    Returns
    -------
    list of tuple
        (id, name, latitude, longitude, year, month, number of rides),
        ordered from most to least rides.
    """
    counter=collections.Counter(
        (row[3], row[4], row[5], row[6], year, month)
        for row in rows
        # Skip blank/short lines, which would raise IndexError, and the header
        # line, which would otherwise be counted as a station.
        # NOTE(review): assumes a header, when present, labels column 3
        # 'start station id' (case-insensitive) - confirm against the data files.
        if len(row) > 6 and row[3].strip().lower() != fieldnames[0]
    )
    return [station + (rides,) for station, rides in counter.most_common(limit)]


# One list of top-station tuples per input file
most_common_all=[]

for file in glob.glob(path+"*.csv"):

    # Year and month are the first two standalone numbers found in the file path.
    # NOTE(review): a path with fewer than two such numbers raises IndexError,
    # and digits in a directory name would be picked up first - confirm the file naming.
    num=re.findall(r'\b\d+\b', file)

    # Use a context manager so the input file is closed after it has been counted
    with open(file, "r", newline="") as trip_file:
        most_common=top_stations(reader(trip_file, delimiter=","), num[0], num[1])

    most_common_all.append(most_common)

# Write the output once, after every file has been processed, instead of rewriting
# the whole file on each loop iteration. Nothing is written when no input files
# were found, which matches the original behaviour.
if most_common_all:
    # newline="" is what the csv module requires for files passed to csv.writer
    with open(output_path, "w", newline="") as out:
        csv_o=csv.writer(out, delimiter=',')

        csv_o.writerow(fieldnames)

        for file_rows in most_common_all:
            csv_o.writerows(file_rows)

In [1]:
# Create a dataframe from a copy of the CSV created above that has been uploaded to S3

import pandas as pd
from boto.s3.connection import S3Connection
from io import BytesIO

# Connect anonymously (anon=True, empty credentials) and fetch the uploaded CSV
conn = S3Connection('', '', anon=True)
bucket = conn.get_bucket('hildavarcapstone')
key= bucket.get_key('10MostPopularStations.csv')

# Fail with a clear message instead of an AttributeError on None further down
if key is None:
    raise FileNotFoundError("'10MostPopularStations.csv' was not found in bucket 'hildavarcapstone'")

# Wrap the downloaded bytes in a file-like object so pandas can parse them
blob=BytesIO(key.get_contents_as_string())
mostcommondf=pd.read_csv(blob)

# Make sure the id, date and count columns are numeric before they are sorted and summed
numeric_columns=['start station id', 'year', 'month', 'number of rides']
mostcommondf[numeric_columns] = mostcommondf[numeric_columns].apply(pd.to_numeric)
mostcommondf.head()

Unnamed: 0,start station id,start station name,start station latitude,start station longitude,year,month,number of rides
0,459,W 20 St & 11 Ave,40.746745,-74.007756,2013,7,8049
1,497,E 17 St & Broadway,40.73705,-73.990093,2013,7,7814
2,426,West St & Chambers St,40.717548,-74.013221,2013,7,7387
3,519,Pershing Square N,40.751884,-73.977702,2013,7,7290
4,285,Broadway & E 14 St,40.734546,-73.990741,2013,7,7167


#### Data Cleaning: Handling duplicate stations

Some stations appear more than once because their coordinates (or names) changed slightly over time, so the same
station is treated as several different ones. If I keep them as they are, two markers will show for one station.
To fix this, create a new dataframe with one row per unique station (identified by its start station id) that keeps
only its most recent coordinates and name. Convert this dataframe to a dictionary for each column and use these
dictionaries to replace the current values in the original dataframe.

In [2]:
from collections import defaultdict

# Build a lookup table with one row per station id, holding the station's most
# recent name and coordinates.
coord_df=mostcommondf.copy()
coord_df['start station id'] = pd.to_numeric(coord_df['start station id'])

# The ride counts are not needed for the lookup; drop the column by name rather
# than by position so a change in column order cannot remove the wrong one.
coord_df=coord_df.drop('number of rides', axis=1)

# Sort by month as well as year: sorting on year alone leaves the order within a
# year arbitrary, so keep='last' would not reliably pick the most recent row.
coord_df=coord_df.sort_values(['start station id', 'year', 'month'])
coord_df=coord_df.drop_duplicates('start station id', keep='last')

# Station id -> latest value, one dictionary per column to replace
latest_by_id=coord_df.set_index('start station id')
name_dict=latest_by_id['start station name'].to_dict()
lat_dict=latest_by_id['start station latitude'].to_dict()
long_dict=latest_by_id['start station longitude'].to_dict()

# Overwrite name and coordinates in the original dataframe so every row of a
# given station id carries identical values
mostcommondf['start station name']=mostcommondf['start station id'].map(name_dict)
mostcommondf['start station latitude']=mostcommondf['start station id'].map(lat_dict)
mostcommondf['start station longitude']=mostcommondf['start station id'].map(long_dict)

mostcommondf.head()

Unnamed: 0,start station id,start station name,start station latitude,start station longitude,year,month,number of rides
0,459,W 20 St & 11 Ave,40.746745,-74.007756,2013,7,8049
1,497,E 17 St & Broadway,40.73705,-73.990093,2013,7,7814
2,426,West St & Chambers St,40.717548,-74.013221,2013,7,7387
3,519,Pershing Square North,40.751873,-73.977706,2013,7,7290
4,285,Broadway & E 14 St,40.734546,-73.990741,2013,7,7167


#### Create an interactive map for the 10 most popular start stations with Folium

In [6]:
from ipywidgets import interact, Layout
from ipywidgets.widgets import SelectMultiple
import folium
from IPython import display

# Month names in calendar order; used as the widget options and to build the lookup below
months=['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']

# Lookup from month number (1-12) to month name
months_name={number: name for number, name in enumerate(months, start=1)}

# Work on a copy of mostcommondf with an extra column holding the month name
df=mostcommondf.copy()
df['Month Name']=df["month"].map(months_name)

# Distinct years present in the data, in ascending order, for the widget
years=sorted(df['year'].unique().tolist())

# Define function for interactive widget
def make_map(Months, Years):
    """Build a Folium map marking the busiest start stations for a selection.

    Rows of the module-level dataframe `df` are filtered to the chosen months
    and years, the rides are summed per station, and the ten stations with the
    most rides are drawn as markers.

    Parameters
    ----------
    Months : iterable of str
        Month names to include; matched against the 'Month Name' column.
    Years : iterable
        Years to include; matched against the 'year' column.

    Returns
    -------
    folium.Map
        Map with one marker per selected station; the marker popup is the
        station name. An empty selection yields a map without markers.
    """
    selected=df.loc[df['Month Name'].isin(list(Months)) & df['year'].isin(list(Years))]

    # Sum only the ride counts per station. Summing every column would also add
    # up the year and month numbers and the month names, which is meaningless.
    # A single month/year goes through the same path: each station then keeps
    # its own count and at most ten stations remain.
    station_columns=['start station id', 'start station name', 'start station latitude', 'start station longitude']
    busiest=(
        selected.groupby(station_columns)['number of rides']
        .sum()
        .reset_index()
        .sort_values('number of rides', ascending=False)
        .head(10)
    )

    nyc_map=folium.Map(location=[40.756, -73.982], zoom_start=11.9, min_zoom=10, tiles='cartodbdark_matter')

    # Address the columns by name instead of by tuple position so the markers
    # do not depend on the column order of the dataframe
    station_rows=zip(busiest['start station name'], busiest['start station latitude'], busiest['start station longitude'])
    for station_name, latitude, longitude in station_rows:
        folium.RegularPolygonMarker([latitude, longitude], popup=station_name, radius=4).add_to(nyc_map)
    return nyc_map

# Multi-select widgets for the months and years to show; the map is redrawn
# by make_map whenever the selection changes.
month_selector=SelectMultiple(options=months, value=['July'])
year_selector=SelectMultiple(options=years, value=[2013])

# The trailing semicolon keeps the returned function object from being echoed
# as cell output below the widget.
interact(make_map, Months=month_selector, Years=year_selector);

<function __main__.make_map>

In [2]:
# NOTE(review): scratch cell; it only prints a fixed string and nothing else
# in the visible notebook reads from it. Consider removing it.
scratch_message='tsk tsk'
print(scratch_message)

tsk tsk
