In [2]:
import pandas as pd
import glob


# Load CSV

We'll start our journey by loading CSV (Comma Separated Values) Files. 

These might look like this:
```
Title, Date, Sales
The Matrix, 2020-01-04, 44553
Contagion, 2020-04-14, 120000
Idiocracy, 2016-11-06, 303421
```


### Headers

CSV Files sometimes include headers or column names to make it easier to understand what data you're looking at.

We'll get started by reading Street Sweeping Route data from the City of Los Angeles.

https://data.lacity.org/A-Livable-and-Sustainable-City/Posted-Street-Sweeping-Routes/krk7-ayq2

In [36]:
# Read the street sweeping CSV with default settings; the file's first row
# supplies the column names.
street_df = pd.read_csv('street_sweeping.csv')

In [20]:
# Display the frame as the cell's result; the rendered output elides the
# middle rows.
street_df

Unnamed: 0,Route No,Council District,Time Start,Time End,Boundaries
0,* 17P100,2,10:00 AM,12:00 PM,Chandler Bl. to Ventura Fwy / Colfax Av. to La...
1,*17P100,2,10:00 AM,12:00 PM,Chandler Bl. to Ventura Fwy / Colfax Av. to La...
2,10P136 Th,10,8:00 AM,10:00 AM,La Brea Ave to Arlington Ave- Santa Monica Fwy...
3,10P136 W,10,8:00 AM,10:00 AM,La Brea Ave to Arlington Ave -Santa Monica Fwy...
4,10P137 M,10,9:00 AM,11:00 AM,Adams Blvd to Jefferson Blvd - La Cienega Blvd...
...,...,...,...,...,...
859,9P345 Tu,5,12:00 PM,3:00 PM,"Washington - Venice, Fairfax - Hauser"
860,9P346 M,5,11:00 AM,2:00 PM,"Gutherie - Saturn, Fairfax, Crescent Heights"
861,9P346 Tu,5,11:00 AM,2:00 PM,"Gutherie - Saturn, Fairfax, Crescent Heights"
862,9P347 M,10,8:00 AM,10:00 AM,"Pico - Washington, Hauser - La Brea"


In [13]:
# Summarize every column (the output reports count, unique, top and freq).
street_df.describe()

Unnamed: 0,Route No,Council District,Time Start,Time End,Boundaries
count,864,864,864,864,864
unique,864,33,12,15,456
top,17P407 Th,8,8:00 AM,10:00 AM,Rinaldi st. to San Fernando Mission bl./Balboa...
freq,1,119,286,242,4


In [15]:
# List the column labels that were read from the header row.
street_df.columns

Index(['Route No', 'Council District', 'Time Start', 'Time End', 'Boundaries'], dtype='object')

In [18]:
# Select two columns by label; every row is kept.
street_df[['Route No', 'Boundaries']]

Unnamed: 0,Route No,Boundaries
0,* 17P100,Chandler Bl. to Ventura Fwy / Colfax Av. to La...
1,*17P100,Chandler Bl. to Ventura Fwy / Colfax Av. to La...
2,10P136 Th,La Brea Ave to Arlington Ave- Santa Monica Fwy...
3,10P136 W,La Brea Ave to Arlington Ave -Santa Monica Fwy...
4,10P137 M,Adams Blvd to Jefferson Blvd - La Cienega Blvd...
...,...,...
859,9P345 Tu,"Washington - Venice, Fairfax - Hauser"
860,9P346 M,"Gutherie - Saturn, Fairfax, Crescent Heights"
861,9P346 Tu,"Gutherie - Saturn, Fairfax, Crescent Heights"
862,9P347 M,"Pico - Washington, Hauser - La Brea"


### No Headers
Now let's assume we received a file without a header.

For this we'll use an altered version of the Los Angeles Animal Services Intake data file.

https://data.lacity.org/A-Well-Run-City/Animal-Services-Intake-Data/8cmr-fbcu

In [37]:
# Read the header-less file with default settings on purpose: the first data
# row ends up being used as the column names.
animal_serv_df = pd.read_csv('animal_services.csv')

In [38]:
# Preview the first rows to inspect how the file was parsed.
animal_serv_df.head()

Unnamed: 0,W VALLEY,A0041356,11/11/2011,OWNER SUR,DEAD,DOG,MASTIFF,AMERICAN STAFF,MIX
0,N CENTRA,A0163185,08/07/2011,OWNER SUR,ALIVE,DOG,SPITZ,CHOW CHOW,
1,E VALLEY,A0163432,12/18/2011,OWNER SUR,DEAD,DOG,,TERRIER X,MIX
2,S LA,A0164458,01/06/2013,OWNER SUR,DEAD,DOG,MASTIFF,AMERICAN STAFF,MIX
3,W LA,A0166070,09/03/2011,OWNER SUR,DEAD,DOG,SPITZ,AKITA,MIX
4,S LA,A0168340,01/12/2011,OWNER SUR,ALIVE,DOG,SHEPHERD,GERM SHEPHERD,


Well, that didn't go as we planned. We need to tell python that we don't have a header and set the column names ourselves.

In [2]:
# Column labels for the header-less file, listed in file order.
col_names = [
    'Shelter',
    'Animal ID#',
    'Intake Date',
    'Intake Type',
    'Intake Condition',
    'Animal Type',
    'Group',
    'Breed 1',
    'Breed 2',
]

# header=None keeps the first line as data; names supplies the labels.
animal_serv_df = pd.read_csv('animal_services.csv', names=col_names, header=None)

In [7]:
# Preview the first rows to confirm the supplied column names were applied.
animal_serv_df.head()

Unnamed: 0,Shelter X,Animal ID#,Intake Date,Intake Type,Intake Condition,Animal Type,Group,Breed 1,Breed 2
0,W VALLEY,A0041356,11/11/2011,OWNER SUR,DEAD,DOG,MASTIFF,AMERICAN STAFF,MIX
1,N CENTRA,A0163185,08/07/2011,OWNER SUR,ALIVE,DOG,SPITZ,CHOW CHOW,
2,E VALLEY,A0163432,12/18/2011,OWNER SUR,DEAD,DOG,,TERRIER X,MIX
3,S LA,A0164458,01/06/2013,OWNER SUR,DEAD,DOG,MASTIFF,AMERICAN STAFF,MIX
4,W LA,A0166070,09/03/2011,OWNER SUR,DEAD,DOG,SPITZ,AKITA,MIX


#### Changing Column Names
We can also change column names after reading in a csv.

In [8]:
# snake_case replacements, matched by position to the existing columns.
new_col_names = [
    'shelter_area', 'animal_id', 'intake_date',
    'intake_type', 'intake_condition', 'animal_type',
    'animal_group', 'breed_info', 'breed_info_aux',
]

# Assigning to .columns relabels the frame without touching the data.
animal_serv_df.columns = new_col_names

animal_serv_df.head()

Unnamed: 0,shelter_area,animal_id,intake_date,intake_type,intake_condition,animal_type,animal_group,breed_info,breed_info_aux
0,W VALLEY,A0041356,11/11/2011,OWNER SUR,DEAD,DOG,MASTIFF,AMERICAN STAFF,MIX
1,N CENTRA,A0163185,08/07/2011,OWNER SUR,ALIVE,DOG,SPITZ,CHOW CHOW,
2,E VALLEY,A0163432,12/18/2011,OWNER SUR,DEAD,DOG,,TERRIER X,MIX
3,S LA,A0164458,01/06/2013,OWNER SUR,DEAD,DOG,MASTIFF,AMERICAN STAFF,MIX
4,W LA,A0166070,09/03/2011,OWNER SUR,DEAD,DOG,SPITZ,AKITA,MIX


### Non-Comma Separated

Even though they are called 'Comma Separated', the fields may be separated by other values as well. 

Now let's read a CSV that doesn't use commas. We'll first attempt to read it without changing anything and see what happens. 

In [9]:
# Deliberately read the pipe-delimited file with the default comma separator
# to show the result: each line lands in one single column.
parking_df = pd.read_csv('parking_meter.csv')
parking_df.head()

Unnamed: 0,SpaceID|EventTime_UTC|OccupancyState
0,CB4405|05/19/2019 07:04:53 PM|VACANT
1,SV113|10/05/2019 03:56:34 PM|UNKNOWN
2,CC967C|02/12/2019 05:05:58 PM|OCCUPIED
3,CB479|10/28/2019 03:29:34 PM|OCCUPIED
4,CB4221|11/19/2019 06:18:52 PM|VACANT


As you can see, pandas doesn't know how to split up the data into columns. Because it found no commas, it treated each entire line as a single column value. 
Let's try again by specifying a *delimiter*.

In [11]:
# The fields are pipe-separated, so name the separator explicitly.
parking_df = pd.read_csv('parking_meter.csv', sep='|')
parking_df.head()

Unnamed: 0,SpaceID,EventTime_UTC,OccupancyState
0,CB4405,05/19/2019 07:04:53 PM,VACANT
1,SV113,10/05/2019 03:56:34 PM,UNKNOWN
2,CC967C,02/12/2019 05:05:58 PM,OCCUPIED
3,CB479,10/28/2019 03:29:34 PM,OCCUPIED
4,CB4221,11/19/2019 06:18:52 PM,VACANT


### Handling Blanks

In [9]:
# Same pipe-separated layout, but some EventTime_UTC fields are blank in the
# file; show the whole (small) frame so those rows are visible.
parking_missing_df = pd.read_csv('parking_meter_missing.csv', sep='|')
parking_missing_df

Unnamed: 0,SpaceID,EventTime_UTC,OccupancyState
0,CB4405,05/19/2019 07:04:53 PM,VACANT
1,SV113,10/05/2019 03:56:34 PM,UNKNOWN
2,CC967C,02/12/2019 05:05:58 PM,OCCUPIED
3,CB479,10/28/2019 03:29:34 PM,OCCUPIED
4,CB4221,11/19/2019 06:18:52 PM,VACANT
5,CB462,11/18/2019 04:00:40 PM,OCCUPIED
6,CB2050,11/04/2019 04:42:02 PM,OCCUPIED
7,CB4142,,OCCUPIED
8,CB1985,,VACANT
9,SV61,05/11/2019 12:02:32 PM,UNKNOWN


### Marking Data as NA/NaN

Let's mark the 'UNKNOWN' values as NaN.

In [13]:
# Treat the literal string 'UNKNOWN' as a missing value while parsing.
parking_missing_df = pd.read_csv(
    'parking_meter_missing.csv',
    sep='|',
    na_values=['UNKNOWN'],
)
parking_missing_df.head()

Unnamed: 0,SpaceID,EventTime_UTC,OccupancyState
0,CB4405,05/19/2019 07:04:53 PM,VACANT
1,SV113,10/05/2019 03:56:34 PM,
2,CC967C,02/12/2019 05:05:58 PM,OCCUPIED
3,CB479,10/28/2019 03:29:34 PM,OCCUPIED
4,CB4221,11/19/2019 06:18:52 PM,VACANT


# Load Excel

We can load Excel files in the same way. Excel workbooks can have multiple sheets, which we will have to deal with.

### Single Sheet
We'll start by reading an excel workbook with a single sheet.

In [17]:
# Read one worksheet, selected by name, from the workbook into a DataFrame.
lax_parking_df = pd.read_excel('lax_parking.xlsx', sheet_name = 'lax_parking')
lax_parking_df.head()

Unnamed: 0,Key_Value,LotDescription,ParkingID,ParkingName,TotalParkingSpaces,Occupied,FreeSpaces,FullCapacity,Color,DataExportDateTime,Long,Lat
0,c2001,LOT 2B Occupancy Input,P2B,P-2B,490,239,251,49,Green,2020-06-17 19:20:01,-118.405227,33.944422
1,c3000,LOT 3 Occupancy Input,P3,P-3,905,163,742,18,Green,2020-06-17 19:20:01,-118.407223,33.944306
2,c5000,Lot 5 Occupancy,P5,P-5,673,177,496,26,Green,2020-06-17 19:22:01,-118.405185,33.943273
3,c6000,Lot 6 Occupancy,P6,P-6,899,368,531,41,Green,2020-06-17 19:22:01,-118.403672,33.943255
4,c7000,Lot 7 Occupancy Input,P7,P-7,1705,391,1314,23,Green,2020-06-17 19:20:01,-118.399853,33.943851


### Multisheet Workbook

Now we'll work with a workbook with multiple sheets.

Let's see what sheets are in this workbook.

In [21]:
# Open the workbook as an ExcelFile so its sheet names can be inspected.
# NOTE(review): the handle is not closed in this cell; a later cell passes
# `xlsx` to pd.read_excel, so it has to stay open until then.
xlsx = pd.ExcelFile('events_and_streets.xlsx')
print(xlsx.sheet_names)

['street_names', 'city_events']


#### Load a Single Sheet
First we'll read in a single sheet.



In [25]:
# Parse a single worksheet from the already-open workbook handle.
city_events_df = xlsx.parse(sheet_name='city_events')
city_events_df.head()

Unnamed: 0,Id,Title,Event Start Date,Event End Date,Event type,Event Department,Information Website,Event Location Name,Event Location,Event Reference URL,...,Contact Phone,Contact E-mail,Description,tags,Fee Required,Event Cost,Event Main Image,Ages,Event Audience,Event File
0,303256,Board of Public Works Agenda,2018-04-20 10:00:00,2018-04-20 10:00:00,City Government,Board of Public Works,http://ens.lacity.org/bpw/agendas/bpwagendas86...,,,http://calendar.lacity.org/event/board-public-...,...,,,,,0.0,,,,,
1,281301,Board of Public Works Agenda,2018-03-07 10:00:00,2018-03-07 10:00:00,City Government,Board of Public Works,http://ens.lacity.org/bpw/agendas/bpwagendas86...,,,http://calendar.lacity.org/event/board-public-...,...,,,,,0.0,,,,,
2,358141,Office of Community Beautification - LAFH Comm...,2018-08-17 10:00:00,2018-08-17 11:00:00,Culture & Community,Office of Community Beautification,OCB (http://dpw.lacity.org/office-community-be...,LA Family Housing,"7639 7653 Day St\nLos Angeles, CA 91042\n(34.2...",https://calendar.lacity.org/event/office-commu...,...,213-978-0227,leslie.shim@lacity.org,,,0.0,,,All,,
3,343491,Board of Public Works Agenda,2018-07-16 10:00:00,2018-07-16 10:00:00,City Government,Board of Public Works,http://ens.lacity.org/bpw/agendas/bpwagendas86...,,,http://calendar.lacity.org/event/board-public-...,...,,,,,0.0,,,,,
4,239091,44th Annual Dia De Los Muertos Celebration,2017-11-04 17:00:00,2017-11-04 23:00:00,Arts,Cultural Affairs,http://culturela.org/event/self-help-graphics-...,Felicitas and Gonzalo Mendez High School,"1200 Plaza\nDel Sol Los Angeles, CA 90033",https://calendar.lacity.org/event/44th-annual-...,...,323-881-6444,INFO@SELFHELPGRAPHICS.COM,<p>This year's Dia De Los Muertos celebration ...,,0.0,,https://calendar.lacity.org/sites/g/files/wph7...,"Adult, Young Adult",,


#### Load All Sheets - into separate dataframes in a dictionary

In [29]:
# sheet_name=None asks read_excel for every worksheet at once; it returns a
# dict mapping each sheet name to its DataFrame, so no manual loop over
# ExcelFile.sheet_names is needed and the file is opened and closed for us.
data = pd.read_excel('events_and_streets.xlsx', sheet_name=None)

data.keys()

dict_keys(['street_names', 'city_events'])

In [30]:
# Each dictionary value is a DataFrame; preview the 'street_names' sheet.
data['street_names'].head()

Unnamed: 0,Street ID,Street Name,Street Suffix,Street Suffix Direction,Official Street Name,Thomas Brothers Map,Street Type
0,2013,UNION,AVE,,UNION AVENUE,634D2,PUBLIC STREET NAMES
1,2523,SPRING,ST,,SPRING STREET,634H2,PUBLIC STREET NAMES
2,2221,TEXAS,AVE,,TEXAS AVENUE,631J5,PUBLIC STREET NAMES
3,8199,CARMONA,AVE,,CARMONA AVENUE,633C4,PUBLIC STREET NAMES
4,3268,RELIANCE,ST,,RELIANCE STREET,502D7,PUBLIC STREET NAMES


#### Load all sheets into the same dataframe

*You are given the annual data for Hexacorp, with each quarter in a separate sheet. All sheets have the same columns, and you want to load them all into the same dataframe. You also want to add a new field called 'quarter'. (Luckily for us, the sheet names are the quarters.)*

In [38]:
# Read each quarter's sheet and tag its rows with the sheet name.
with pd.ExcelFile('hexacorp_2019.xlsx') as xl:
    data = {
        sheet: pd.read_excel(xl, sheet_name=sheet).assign(quarter=sheet)
        for sheet in xl.sheet_names
    }

# Concatenating a dict stacks the frames and keeps the dict keys (the sheet
# names) as the outer level of the resulting index.
hex_2019_df = pd.concat(data)
hex_2019_df.shape

(365, 4)

In [39]:
# First rows of the combined frame (these come from the first sheet).
hex_2019_df.head()

Unnamed: 0,Unnamed: 1,tx_date,tx_type,amount,quarter
Q1,0,2019-01-01,1,86.333333,Q1
Q1,1,2019-01-02,2,86.333333,Q1
Q1,2,2019-01-03,2,86.333333,Q1
Q1,3,2019-01-04,2,86.333333,Q1
Q1,4,2019-01-05,2,107.333333,Q1


In [40]:
# Last rows of the combined frame (these come from the last sheet).
hex_2019_df.tail()

Unnamed: 0,Unnamed: 1,tx_date,tx_type,amount,quarter
Q4,87,2019-12-27,1,1850.333333,Q4
Q4,88,2019-12-28,1,1871.333333,Q4
Q4,89,2019-12-29,1,1892.333333,Q4
Q4,90,2019-12-30,1,1892.333333,Q4
Q4,91,2019-12-31,1,1892.333333,Q4


### Load All the Things

Now we'll load every sheet from all the Excel workbooks in a given folder.

In [12]:
all_data = {}

# glob.glob returns matches in arbitrary, OS-dependent order. Sort the paths
# so the files are read, and their rows stacked, in the same order on every
# run and platform.
for file in sorted(glob.glob("hexacorp*.xlsx")):
    print('Reading file: {file}'.format(file = file))
    with pd.ExcelFile(file) as xl:
        for sheet in xl.sheet_names:
            sheet_df = pd.read_excel(xl, sheet_name = sheet)
            # Record provenance so rows can be traced after concatenation.
            sheet_df['source_file'] = file
            sheet_df['quarter'] = sheet
            # Key by file + sheet so sheets with the same name in different
            # workbooks do not overwrite each other.
            all_data[file+sheet] = sheet_df

# ignore_index=True discards the dict keys and renumbers rows from 0.
hex_all_df = pd.concat(all_data, ignore_index=True)
hex_all_df.shape

Reading file: hexacorp_2017.xlsx
Reading file: hexacorp_2018.xlsx
Reading file: hexacorp_2019.xlsx


(1095, 5)

In [13]:
# First rows of the combined frame, including the added provenance columns.
hex_all_df.head()

Unnamed: 0,tx_date,tx_type,amount,source_file,quarter
0,2017-01-01,1,86.333333,hexacorp_2017.xlsx,Q1
1,2017-01-02,2,86.333333,hexacorp_2017.xlsx,Q1
2,2017-01-03,2,86.333333,hexacorp_2017.xlsx,Q1
3,2017-01-04,2,86.333333,hexacorp_2017.xlsx,Q1
4,2017-01-05,2,107.333333,hexacorp_2017.xlsx,Q1


In [14]:
# Last rows of the combined frame; the index runs continuously across files.
hex_all_df.tail()

Unnamed: 0,tx_date,tx_type,amount,source_file,quarter
1090,2019-12-27,1,1850.333333,hexacorp_2019.xlsx,Q4
1091,2019-12-28,1,1871.333333,hexacorp_2019.xlsx,Q4
1092,2019-12-29,1,1892.333333,hexacorp_2019.xlsx,Q4
1093,2019-12-30,1,1892.333333,hexacorp_2019.xlsx,Q4
1094,2019-12-31,1,1892.333333,hexacorp_2019.xlsx,Q4
