In [3]:
pip install openpyxl

Collecting openpyxl
  Using cached openpyxl-3.0.7-py2.py3-none-any.whl (243 kB)
Collecting et-xmlfile
  Using cached et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.0.7
Note: you may need to restart the kernel to use updated packages.


In [2]:
# This notebook reads a downloaded Excel workbook (.xlsx).
# Import the required libraries.

import pandas as pd
import numpy as np

In [57]:
def colManip(colVal):
    """Parse a multi-line ``key=value`` string into a dictionary.

    Used with apply on a pandas column whose cells hold one ``key=value``
    pair per line (lines separated by a newline character).

    Inputs: colVal(str), or a non-string such as NaN for an empty cell

    Return: colVal unchanged when it is not a string (e.g. NaN)
            dict otherwise, mapping the text before the first '=' on each
            line to the text after it; a line without '=' maps to itself
    """
    # Only strings can be parsed. Checking the type directly replaces the former
    # np.isnan probe wrapped in a bare ``except``, which hid unrelated errors and
    # silently returned None for non-NaN numeric cells.
    if not isinstance(colVal, str):
        return colVal

    parsed = {}
    for line in colVal.split('\n'):
        # Split on the first '=' only, so a value that itself contains '=' stays whole.
        key, sep, value = line.partition('=')
        parsed[key] = value if sep else line
    return parsed
    
def encoding(colVal):
    """Placeholder for encoding column values via apply on a pd.dataframe.

    Inputs: colVal(str) - currently ignored
    Returns: None - no encoding logic is implemented yet
    """
    return

In [58]:
# Load the 2017 San Francisco transportation survey workbook.
# Only the 'Data Dictionary' sheet (its first four columns) is imported into DataDictionary.
DataDictionary = pd.read_excel('~/SanFranciscoBikeShare/TDS_2017_Data_WEBPAGE.xlsx', sheet_name='Data Dictionary',
              engine='openpyxl', usecols=range(4))

# Use pd.concat to pick the rows that meet our criteria and join them together, in the order listed.
# The criteria were selected in Excel. The original highlighted sheet is available in the data folder.
# na=False treats blank cells as "no match" (same effect as the former `== True` comparison).
DataDictionary = pd.concat([DataDictionary[DataDictionary['Description'].str.contains('Bay Area', na=False)],
                            DataDictionary[DataDictionary['Field'].str.contains('BIKE', na=False)],
                            DataDictionary[DataDictionary['Description'].str.contains('Zip code', na=False)],
                            DataDictionary[DataDictionary['Description'].str.contains('Code', na=False)]])

# A row that satisfies more than one criterion is concatenated once per match; keep only its
# first occurrence. The index still holds the original sheet row labels, so it identifies repeats.
# .copy() makes the result an independent frame before the column assignment below.
DataDictionary = DataDictionary[~DataDictionary.index.duplicated(keep='first')].copy()

# Changes the Codes type from string to dictionary that can be used in processing the survey data.
DataDictionary.Codes = DataDictionary.Codes.apply(colManip)

In [59]:
# Display the filtered data dictionary; Codes now holds dicts, or NaN where no codes were listed.
DataDictionary

Unnamed: 0,Field,Field Type,Description,Codes
1,Q2,Integer,In which Bay Area county do you live?,"{'1': 'Alameda', '2': 'Contra Costa', '3': 'Ma..."
3,Trips,Integer,Total trips taken- SF residents: Sum of Q4Tot+...,
54,BIKE,Integer,Total of all Bicycle (9) entries for Q4aM to Q5hM,
126,BIKE1,Integer,Total of all Bicycle (9) entries for Day 1 (Q7...,
138,BIKE2,Integer,Total of all Bicycle (9) entries for Day 2 (Q1...,
180,Zone,Integer,(San Francisco Residents) San Francisco Zone (...,"{'1': 'One', '2': 'Two', '3': 'Three', '4': 'F..."
165,Q25A-SF,Integer,(San Francisco Residents) What is your home ZI...,"{'1': '94102 ', '2': '94103 ', '3': '94104 ', ..."


In [18]:
# The Field names from DataDictionary (with 'Zone' rewritten as 'ZONE') select the columns
# to read from the actual survey.
inputList = (
    DataDictionary['Field']
    .str.replace('Zone', 'ZONE')
    .tolist()
)

# Read the 'Data' sheet from the same Excel workbook used previously. This is the survey data
# restricted to the scope of columns selected in the DataDictionary.
Survey2017 = pd.read_excel(
    '~/SanFranciscoBikeShare/TDS_2017_Data_WEBPAGE.xlsx',
    sheet_name='Data',
    engine='openpyxl',
    usecols=inputList,
)

# Preview of the survey before manipulation
Survey2017.head()

Unnamed: 0,Q2,Trips,BIKE,BIKE1,BIKE2,Q25A-SF,ZONE
0,1,19,,0.0,0.0,,
1,9,4,0.0,,,1.0,2.0
2,9,9,0.0,,,8.0,1.0
3,7,33,,0.0,0.0,,
4,9,7,0.0,,,2.0,1.0


In [61]:
# Summary statistics for each survey column; the count row excludes missing values.
Survey2017.describe()

Unnamed: 0,Q2,Trips,BIKE,BIKE1,BIKE2,Q25A-SF,ZONE
count,804.0,804.0,401.0,403.0,403.0,401.0,401.0
mean,6.284826,15.095771,0.122195,0.049628,0.007444,14.164589,2.947631
std,3.097377,26.272393,0.66523,0.996271,0.149441,7.814592,1.640046
min,1.0,1.0,0.0,0.0,0.0,1.0,1.0
25%,3.0,4.0,0.0,0.0,0.0,8.0,2.0
50%,8.0,6.0,0.0,0.0,0.0,14.0,3.0
75%,9.0,11.0,0.0,0.0,0.0,19.0,4.0
max,9.0,330.0,8.0,20.0,3.0,30.0,9.0


In [63]:
# How many rows does the survey dataframe have?
len(Survey2017)

804

In [62]:
# How many people are in this survey from which county? Are there any people who are out of town?
# count() reports the number of non-null entries in each remaining column per Q2 code,
# so a column that is empty for a given code shows 0.
Survey2017.groupby('Q2').count()

Unnamed: 0_level_0,Trips,BIKE,BIKE1,BIKE2,Q25A-SF,ZONE
Q2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,87,0,87,87,0,0
2,83,0,83,83,0,0
3,49,0,49,49,0,0
4,6,0,6,6,0,0
5,101,0,101,101,0,0
6,43,0,43,43,0,0
7,15,0,15,15,0,0
8,19,0,19,19,0,0
9,401,401,0,0,401,401


In [65]:
# How many people are in this survey for each home ZIP code value (Q25A-SF)?
# count() reports the number of non-null entries in each remaining column per Q25A-SF code.
Survey2017.groupby('Q25A-SF').count()

Unnamed: 0_level_0,Q2,Trips,BIKE,BIKE1,BIKE2,ZONE
Q25A-SF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1.0,23,23,23,0,0,23
2.0,17,17,17,0,0,17
4.0,6,6,6,0,0,6
5.0,8,8,8,0,0,8
6.0,9,9,9,0,0,9
7.0,19,19,19,0,0,19
8.0,32,32,32,0,0,32
9.0,1,1,1,0,0,1
10.0,28,28,28,0,0,28
11.0,19,19,19,0,0,19
