In [1]:
### LESSON 3: NumpPy and Pandas for 2D Data ###

In [2]:
# Questions to answer using New York Subway Data:
#     1. How does temperature affect ridership?
#     2. How rain during the morning commute hours (8-10am) affect the number of commuters?
#     3. At what point does the weather really make a difference? Is there a temp or an amount of rainfall at
#       which we can expect ridership to increase or decrease?
#     4. What hours have the most riders? Least riders?
#     5. What stations are busiest? Slowest?

In [3]:
import numpy as np

In [16]:
# Subway ridership for 5 stations on 10 different days
ridership = np.array([
    [   0,    0,    2,    5,    0],
    [1478, 3877, 3674, 2328, 2539],
    [1613, 4088, 3991, 6461, 2691],
    [1560, 3392, 3826, 4787, 2613],
    [1608, 4802, 3932, 4477, 2705],
    [1576, 3933, 3909, 4979, 2685],
    [  95,  229,  255,  496,  201],
    [   2,    0,    1,   27,    0],
    [1438, 3785, 3589, 4174, 2215],
    [1342, 4043, 4009, 4665, 3033]
])


In [7]:
# Accessing elements (row, column)
print ridership[1, 3]
# 2328
print ridership[1:3, 3:5]
# [2328, 2539],
# [6461, 2691]
print ridership[1, :]
# [1478, 3877, 3674, 2328, 2539]

2328
[[2328 2539]
 [6461 2691]]
[1478 3877 3674 2328 2539]


In [10]:
# Vectorized operations on rows or columns
print ridership[0, :] + ridership[1, :]
# [1478 3877 3676 2333 2539]
print ridership[:, 0] + ridership[:, 1]
# [0 5355 5701 4952 6430 5509 324 2 5223 5385]

[1478 3877 3676 2333 2539]
[   0 5355 5701 4952 6410 5509  324    2 5223 5385]


In [11]:
# Vectorized operations on entire arrays
a = np.array([[1,2,3], [4,5,6], [7,8,9]])
b = np.array([[1,1,1], [2,2,2], [3,3,3]])

print a + b
# [2,3,4], [6,7,8], [10,11,12]

[[ 2  3  4]
 [ 6  7  8]
 [10 11 12]]


In [27]:
def mean_riders_for_max_station(ridership):
    '''
    Fill in this function to find the station with the maximum riders on the
    first day, then return the mean riders per day for that station. Also
    return the mean ridership overall for comparsion.
    
    Hint: NumPy's argmax() function might be useful:
    http://docs.scipy.org/doc/numpy/reference/generated/numpy.argmax.html
    '''
    max_ridership_loc = ridership[0, :].argmax()
    overall_mean = ridership.mean()
    mean_for_max = ridership[:, max_ridership_loc].mean()
    
    return (overall_mean, mean_for_max)

In [28]:
mean_riders_for_max_station(ridership)

(2342.5999999999999, 3239.9000000000001)

In [29]:
# Axis argument... axis=0, means calculate down the column;` axis=1, means calculate across the row

In [31]:
a = np.array([
    [1,2,3],
    [4,5,6],
    [7,8,9]
])

print a.sum()
print a.sum(axis=0)
print a.sum(axis=1)

45
[12 15 18]
[ 6 15 24]


In [33]:
ridership = np.array([
    [   0,    0,    2,    5,    0],
    [1478, 3877, 3674, 2328, 2539],
    [1613, 4088, 3991, 6461, 2691],
    [1560, 3392, 3826, 4787, 2613],
    [1608, 4802, 3932, 4477, 2705],
    [1576, 3933, 3909, 4979, 2685],
    [  95,  229,  255,  496,  201],
    [   2,    0,    1,   27,    0],
    [1438, 3785, 3589, 4174, 2215],
    [1342, 4043, 4009, 4665, 3033]
])

In [36]:
def min_and_max_riders_per_day(ridership):
    '''
    Fill in this function. First, for each subway station, calculate the
    mean ridership per day. Then, out of all the subway stations, return the
    maximum and minimum of these values. That is, find the maximum
    mean-ridership-per-day and the minimum mean-ridership-per-day for any
    subway station.
    '''
    mean_riderships = ridership.mean(axis=0)
    print mean_riderships
    max_daily_ridership = mean_riderships.max()
    min_daily_ridership = mean_riderships.min()
    
    return (max_daily_ridership, min_daily_ridership)

In [37]:
min_and_max_riders_per_day(ridership)

[ 1071.2  2814.9  2718.8  3239.9  1868.2]


(3239.9000000000001, 1071.2)

In [38]:
# Accessing elements:
# use .loc, .iloc to find within the dataframe. if you use .values it will return a 2D NumPy array.

In [39]:
import pandas as pd

In [40]:
# Create the data frame:
# data as a 2D array, index gives row names, by date for this frame, columns gives names to each ridership station.

ridership_df = pd.DataFrame(
    data=[[   0,    0,    2,    5,    0],
          [1478, 3877, 3674, 2328, 2539],
          [1613, 4088, 3991, 6461, 2691],
          [1560, 3392, 3826, 4787, 2613],
          [1608, 4802, 3932, 4477, 2705],
          [1576, 3933, 3909, 4979, 2685],
          [  95,  229,  255,  496,  201],
          [   2,    0,    1,   27,    0],
          [1438, 3785, 3589, 4174, 2215],
          [1342, 4043, 4009, 4665, 3033]],
    index=['05-01-11', '05-02-11', '05-03-11', '05-04-11', '05-05-11',
           '05-06-11', '05-07-11', '05-08-11', '05-09-11', '05-10-11'],
    columns=['R003', 'R004', 'R005', 'R006', 'R007']
)

In [42]:
ridership_df
# ooo pretty 😍

Unnamed: 0,R003,R004,R005,R006,R007
05-01-11,0,0,2,5,0
05-02-11,1478,3877,3674,2328,2539
05-03-11,1613,4088,3991,6461,2691
05-04-11,1560,3392,3826,4787,2613
05-05-11,1608,4802,3932,4477,2705
05-06-11,1576,3933,3909,4979,2685
05-07-11,95,229,255,496,201
05-08-11,2,0,1,27,0
05-09-11,1438,3785,3589,4174,2215
05-10-11,1342,4043,4009,4665,3033


In [44]:
# DataFrame creation

# create out of a dict mapping column names to values
df_1 = pd.DataFrame({'A': [0,1,2], 'B': [3,4,5]})
print df_1

#create using a list of lists of a NumPy 2D array
df_2 = pd.DataFrame([[0,1,2], [3,4,5]], columns=['A','B','C'])
print df_2

# note that the rows are automatically given numbers (0,1,2, etc)

   A  B
0  0  3
1  1  4
2  2  5
   A  B  C
0  0  1  2
1  3  4  5


In [59]:
# Accessing elements

# use .iloc when locating by the given pandas index
print ridership_df.iloc[0]

print ridership_df.iloc[1,3]
# 2328  --- row first, then column (like alg x, y)

R003    0
R004    0
R005    2
R006    5
R007    0
Name: 05-01-11, dtype: int64
2328


In [51]:
# use .loc to locate by row name
print ridership_df.loc['05-05-11']

R003    1608
R004    4802
R005    3932
R006    4477
R007    2705
Name: 05-05-11, dtype: int64


In [52]:
# use normal indexing brackets to locate columns by name
print ridership_df['R003']

05-01-11       0
05-02-11    1478
05-03-11    1613
05-04-11    1560
05-05-11    1608
05-06-11    1576
05-07-11      95
05-08-11       2
05-09-11    1438
05-10-11    1342
Name: R003, dtype: int64


In [61]:
# Access multiple rows (.iloc to access by row)

print ridership_df.iloc[1:4]

          R003  R004  R005  R006  R007
05-02-11  1478  3877  3674  2328  2539
05-03-11  1613  4088  3991  6461  2691
05-04-11  1560  3392  3826  4787  2613


In [63]:
# Access multiple columns (using usual indexing brackets)

print ridership_df[['R003','R005']]

          R003  R005
05-01-11     0     2
05-02-11  1478  3674
05-03-11  1613  3991
05-04-11  1560  3826
05-05-11  1608  3932
05-06-11  1576  3909
05-07-11    95   255
05-08-11     2     1
05-09-11  1438  3589
05-10-11  1342  4009


In [70]:
# Using Pandas axis

df = pd.DataFrame({'A': [0,1,2], 'B': [3,4,5]})
print df
print df.sum()
# sums each element (A, B)
print df.sum(axis=1)
# sums rows (0+3, 1+4, 2+5)
print df.values.sum()
# return a Numpy 2D array and sum all the elements= 15

   A  B
0  0  3
1  1  4
2  2  5
A     3
B    12
dtype: int64
0    3
1    5
2    7
dtype: int64
15


In [86]:
def mean_riders_for_max_station_df(ridership):
    '''
    Fill in this function to find the station with the maximum riders on the
    first day, then return the mean riders per day for that station. Also
    return the mean ridership overall for comparsion.
    
    This is the same as a previous exercise, but this time the
    input is a Pandas DataFrame rather than a 2D NumPy array.
    '''
    max_day = ridership.iloc[0].argmax()
    overall_mean = ridership.values.mean()
    mean_for_max = ridership[max_day].mean()
    
    return (overall_mean, mean_for_max)

In [87]:
mean_riders_for_max_station_df(ridership_df)

(2342.5999999999999, 3239.9)

In [88]:
# taking means:
# if you take ridership.mean(), it returns means for each individual row
# adding ridership.values.mean() will return the mean for the entire data set

In [89]:
# DataFrames are a great data structure to represent CSV's: 
# They provide a separate data type to each column

# read_csv = built in function that reads CSV's into Pandas DFs

In [91]:
subway_df = pd.read_csv('./nyc_subway_weather.csv')

In [92]:
subway_df.head()
# prints first 5 rows of the DF as a sampling of the info

Unnamed: 0,UNIT,DATEn,TIMEn,ENTRIESn,EXITSn,ENTRIESn_hourly,EXITSn_hourly,datetime,hour,day_week,...,pressurei,rain,tempi,wspdi,meanprecipi,meanpressurei,meantempi,meanwspdi,weather_lat,weather_lon
0,R003,05-01-11,00:00:00,4388333,2911002,0.0,0.0,2011-05-01 00:00:00,0,6,...,30.22,0,55.9,3.5,0.0,30.258,55.98,7.86,40.700348,-73.887177
1,R003,05-01-11,04:00:00,4388333,2911002,0.0,0.0,2011-05-01 04:00:00,4,6,...,30.25,0,52.0,3.5,0.0,30.258,55.98,7.86,40.700348,-73.887177
2,R003,05-01-11,12:00:00,4388333,2911002,0.0,0.0,2011-05-01 12:00:00,12,6,...,30.28,0,62.1,6.9,0.0,30.258,55.98,7.86,40.700348,-73.887177
3,R003,05-01-11,16:00:00,4388333,2911002,0.0,0.0,2011-05-01 16:00:00,16,6,...,30.26,0,57.9,15.0,0.0,30.258,55.98,7.86,40.700348,-73.887177
4,R003,05-01-11,20:00:00,4388333,2911002,0.0,0.0,2011-05-01 20:00:00,20,6,...,30.28,0,52.0,10.4,0.0,30.258,55.98,7.86,40.700348,-73.887177


In [94]:
subway_df.describe()
# gives some stats about the df as a whole

Unnamed: 0,ENTRIESn,EXITSn,ENTRIESn_hourly,EXITSn_hourly,hour,day_week,weekday,latitude,longitude,fog,...,pressurei,rain,tempi,wspdi,meanprecipi,meanpressurei,meantempi,meanwspdi,weather_lat,weather_lon
count,42649.0,42649.0,42649.0,42649.0,42649.0,42649.0,42649.0,42649.0,42649.0,42649.0,...,42649.0,42649.0,42649.0,42649.0,42649.0,42649.0,42649.0,42649.0,42649.0,42649.0
mean,28124860.0,19869930.0,1886.589955,1361.487866,10.046754,2.905719,0.714436,40.724647,-73.940364,0.009824,...,29.971096,0.224741,63.10378,6.927872,0.004618,29.971096,63.10378,6.927872,40.728555,-73.938693
std,30436070.0,20289860.0,2952.385585,2183.845409,6.938928,2.079231,0.451688,0.07165,0.059713,0.098631,...,0.137942,0.417417,8.455597,4.510178,0.016344,0.131158,6.939011,3.179832,0.06542,0.059582
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.576152,-74.073622,0.0,...,29.55,0.0,46.9,0.0,0.0,29.59,49.4,0.0,40.600204,-74.01487
25%,10397620.0,7613712.0,274.0,237.0,4.0,1.0,0.0,40.677107,-73.987342,0.0,...,29.89,0.0,57.0,4.6,0.0,29.913333,58.283333,4.816667,40.688591,-73.98513
50%,18183890.0,13316090.0,905.0,664.0,12.0,3.0,1.0,40.717241,-73.953459,0.0,...,29.96,0.0,61.0,6.9,0.0,29.958,60.95,6.166667,40.72057,-73.94915
75%,32630490.0,23937710.0,2255.0,1537.0,16.0,5.0,1.0,40.759123,-73.907733,0.0,...,30.06,0.0,69.1,9.2,0.0,30.06,67.466667,8.85,40.755226,-73.912033
max,235774600.0,149378200.0,32814.0,34828.0,20.0,6.0,1.0,40.889185,-73.755383,1.0,...,30.32,1.0,86.0,23.0,0.1575,30.293333,79.8,17.083333,40.862064,-73.694176


In [95]:
# Correlation Coefficient: Pearson's r

In [215]:
entries = subway_df['ENTRIESn_hourly']
cum_entries = subway_df['ENTRIESn']
rain = subway_df['meanprecipi']
temp = subway_df['meantempi']

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: meanprecipi, dtype: float64


In [237]:
def correlation(x, y):
    '''
    Fill in this function to compute the correlation between the two
    input variables. Each input is either a NumPy array or a Pandas
    Series.
    
    correlation = average of (x in standard units) times (y in standard units)
    
    Remember to pass the argument "ddof=0" to the Pandas std() function!
    '''
    x_standard = (x - x.mean()) / x.std(ddof=0)
    y_standard = (y - y.mean()) / y.std(ddof=0)
    
    xy_standard_units = x_standard * y_standard
    correlation = xy_standard_units.mean()
    
    return correlation

In [239]:
print correlation(entries, rain)

0.0356485157722


In [240]:
print correlation(entries, temp)

-0.0266933483216


In [241]:
print correlation(rain, temp)

-0.229034323408


In [242]:
print correlation(entries, cum_entries)

0.585895470766


In [None]:
# Pandas Axis Names