In [4]:
import numpy as np
import pandas 
import statsmodels.api as sm
import datetime as dt
from ggplot import *
import matplotlib.pyplot as plt

In [5]:
# Load the turnstile + weather CSV into the working dataframe
turnstile_weather = pandas.read_csv(
    '~/Documents/ds_nanodegree/intro_to_ds_downloads/turnstile_weather_v2.csv'
)

# Parse the 'datetime' column and keep the day of the month next to it
turnstile_weather['DT'] = pandas.to_datetime(turnstile_weather['datetime'])
turnstile_weather['day'] = turnstile_weather['DT'].dt.day

# longWE is 1 for days 28-30 (the long weekend - in this case, Memorial Day), else 0
turnstile_weather['longWE'] = np.where(
    turnstile_weather['day'].isin([28, 29, 30]),
    1,
    0,
)

# TODO: top 10 stations for highest avg hourly entries (not implemented in this cell)


# Rows with more than 30000 hourly entries; the cell displays their rain flag
outliers = turnstile_weather.loc[turnstile_weather['ENTRIESn_hourly'] > 30000]
outliers.rain

1230     1
1306     1
1355     0
2529     1
2565     0
2571     0
2577     0
2601     1
2613     1
2619     1
2643     0
2649     0
2655     0
2893     0
2911     0
2947     0
2995     1
3025     0
11586    1
11622    0
11634    0
Name: rain, dtype: int64

In [None]:
# outliers.ENTRIESn.describe(0)

In [16]:
# Split the observations on the rain flag, then report each group's size
# and its median ENTRIESn_hourly.
rainy = turnstile_weather[turnstile_weather.rain==1]
non_rainy = turnstile_weather[turnstile_weather.rain==0]
n_rainy = len(rainy)
n_non_rainy = len(non_rainy)

# print(...) with a single argument emits the same text under Python 2's
# print statement and Python 3's print function, so the cell runs on both.
print('n for rain: ' + str(n_rainy))
print('n for non-rain: ' + str(n_non_rainy))
print(np.median(rainy['ENTRIESn_hourly']))
print(np.median(non_rainy['ENTRIESn_hourly']))

n for rain: 9585
n for non-rain: 33064
939.0
893.0


In [5]:
#rainy vs non-rainy histogram

#set font sizes
# 'sans-serif' is one of matplotlib's generic font families; the previous
# value 'normal' is not a family name, so matplotlib could not resolve it.
font = {'family' : 'sans-serif',
        'weight' : 'bold',
        'size'   : 10}

# Applies the font settings to matplotlib's global rc configuration
plt.rc('font', **font)

#matplotlib
# Both histograms are drawn on the same figure; the rainy one is drawn second
# with alpha=0.8 so the non-rainy bars underneath remain visible.
plt.figure()
turnstile_weather[turnstile_weather.rain==0].ENTRIESn_hourly.hist(bins=70, label='Non-rainy', color='coral')
turnstile_weather[turnstile_weather.rain==1].ENTRIESn_hourly.hist(bins=70, label = 'Rainy', color='lightskyblue', alpha=0.8)
plt.xlabel('ENTRIESn_hourly') 
plt.ylabel('Frequency')
plt.title('Hourly Entries on Rainy & Non-Rainy Days')
plt.legend(loc='upper right')
plt.show()


#ggplot histogram: there's a bug with binwidth
# The ggplot version is disabled by wrapping it in a string literal. As the
# last expression of the cell, that string is echoed as the cell's output.
'''p = ggplot(turnstile_weather, aes(x='ENTRIESn_hourly', fill='rain'))
rainHist = p + geom_histogram() + \
ggtitle("Distribution of Hourly Entries for Rainy & Non-rainy Days") + \
xlab("Hourly Entries") + ylab("Frequency")

rainHist'''

'p = ggplot(turnstile_weather, aes(x=\'ENTRIESn_hourly\', fill=\'rain\'))\nrainHist = p + geom_histogram() + ggtitle("Distribution of Hourly Entries for Rainy & Non-rainy Days") + xlab("Hourly Entries") + ylab("Frequency")\n\nrainHist'

In [91]:
# Scatterplots of hourly entries against weather variables

# Rain: precipitation on x, hourly entries on y, points coloured by precipitation
scatter_rain = (
    ggplot(turnstile_weather, aes(x='precipi', y='ENTRIESn_hourly', color='precipi'))
    + geom_point()
    + ylim(0, 35000)
    + scale_colour_gradient2(low="lightblue", high="darkblue")
    + xlab("Precipitation (inches)")
    + ylab("Hourly Entries")
    + ggtitle("Does Rain impact Entries?")
)

scatter_rain

#wind

#scatter_wind = ggplot(turnstile_weather, aes(x='wspdi', y='ENTRIESn_hourly', color='wspdi')) +\
    #geom_point() + ylim(0,35000) +\
    #scale_colour_gradient2(low="lightpink", high="crimson") +\
    #xlab("Wind speed") + ylab("Hourly Entries") + ggtitle("Does Wind Speed impact Entries?")


#scatter_wind

<ggplot: (320070761)>

In [None]:
# Temperature: 'tempi' on x, hourly entries on y, points coloured by temperature
scatter_tempi = (
    ggplot(turnstile_weather, aes(x='tempi', y='ENTRIESn_hourly', color='tempi'))
    + geom_point()
    + ylim(0, 35000)
    + scale_colour_gradient2(low="lightblue", high="coral")
    + xlab("Temp (F)")
    + ylab("Hourly Entries")
    + ggtitle("Does Temperature impact Entries?")
)

scatter_tempi

In [None]:
# Weekly traffic pattern: average hourly entries per day of week, rain vs no rain

# Derive a weekday number from the DATEn strings (parsed as month-day-2-digit-year).
# Note: the grouping below uses the 'day_week' column, not this derived 'weekday'.
newdates = pandas.to_datetime(turnstile_weather['DATEn'], format="%m-%d-%y")
turnstile_weather['newdates'] = newdates
turnstile_weather['weekday'] = turnstile_weather['newdates'].dt.dayofweek


# Abbreviated day names for the x-axis; the leading empty label presumably
# pads the first tick position - TODO confirm against the rendered plot.
label_list = ['', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']


# Column means per 'day_week' value, computed separately for rainy and dry rows
#by_weekday = turnstile_weather.groupby('weekday', as_index = False).mean()
wdRain = (
    turnstile_weather[turnstile_weather.rain == 1]
    .groupby('day_week', as_index=False)
    .mean()
)
wdNoRain = (
    turnstile_weather[turnstile_weather.rain == 0]
    .groupby('day_week', as_index=False)
    .mean()
)

# Bars come from the rainy means; the point layer is given the no-rain means
plot = (
    ggplot(wdRain, aes('day_week', 'ENTRIESn_hourly', width=.75))
    + geom_bar(stat='identity', fill='steelblue', alpha=0.5)
    + xlim(-.5, 6.5)
    + ylim(0, 3000)
    + ggtitle("Average Hourly Entries by Weekday")
    + scale_x_discrete(labels=label_list)
    + xlab("Day of Week: blue bars = rain; black dot = no rain")
    + ylab("Average hourly entries")
    + geom_point(wdNoRain)
)

plot


In [88]:
# Average hourly entries per hour of day, rain vs no rain

# Column means per 'hour' value, computed separately for rainy and dry rows
hourRain = (
    turnstile_weather[turnstile_weather.rain == 1]
    .groupby('hour', as_index=False)
    .mean()
)
hourNoRain = (
    turnstile_weather[turnstile_weather.rain == 0]
    .groupby('hour', as_index=False)
    .mean()
)

# Bars come from the rainy means; the point layer is given the no-rain means
hplot = (
    ggplot(hourRain, aes('hour', 'ENTRIESn_hourly', width=.5))
    + geom_bar(stat='identity', fill='steelblue', alpha=0.5)
    + ggtitle("Average Hourly Entries by Time of Day")
    + xlim(-.5, 24.5)
    + ylim(0, 4000)
    + xlab("Hour of Day: blue bars = rain; black dot = no rain")
    + ylab("Average hourly entries")
    + geom_point(hourNoRain)
)

hplot



<ggplot: (325084105)>

In [18]:
# Experiment: step plot of the rainy weekday means (combine both series on one plot?)
# Relies on wdRain from the weekday cell; wdRain was grouped by 'day_week', so its
# 'weekday' column holds the per-group mean of the weekday numbers.
label_list2 = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun', '']

wdplot = (
    ggplot(wdRain, aes('weekday', 'ENTRIESn_hourly'))
    + xlim(0, 7)
    + geom_step(color='blue')
    + scale_x_discrete(labels=label_list2)
    # + geom_step(wdNoRain, aes('weekday', 'ENTRIESn_hourly'))
)
wdplot

<ggplot: (280944261)>

In [None]:
# Box plot of ENTRIESn_hourly grouped by the rain flag.
box = turnstile_weather.boxplot(column = 'ENTRIESn_hourly', by = 'rain', return_type='dict', grid=True)
# Restrict the y-axis to -5000..10000; the wider range is kept below for reference.
plt.ylim(-5000,10000)
#plt.ylim(-5000,35000)
# plt.show() renders the current figure; it does not take the boxplot result as
# an argument (its only parameter is the keyword `block`).
plt.show()

In [17]:
# Summary statistics of hourly entries for rows where rain == 0
turnstile_weather.loc[turnstile_weather['rain'] == 0, 'ENTRIESn_hourly'].describe()

count    33064.000000
mean      1845.539439
std       2878.770848
min          0.000000
25%        269.000000
50%        893.000000
75%       2197.000000
max      32814.000000
Name: ENTRIESn_hourly, dtype: float64

In [10]:
# Summary statistics of hourly entries for rows where rain == 1
turnstile_weather.loc[turnstile_weather['rain'] == 1, 'ENTRIESn_hourly'].describe()

count     9585.000000
mean      2028.196035
std       3189.433373
min          0.000000
25%        295.000000
50%        939.000000
75%       2424.000000
max      32289.000000
Name: ENTRIESn_hourly, dtype: float64