### Modifying and merging data sets
One of the most powerful ways to use data sets is by merging them. This notebook goes over the following things:
- preparing data for merging by cleaning it
- concatenating data sets 
- merging data based on a common column

In [1]:
import pandas as pd

### Concatenation 
Concatenation is the process of combining data sets that all have the same column headers. Think of it as a way to combine thousands of rows of data by stacking one data set on top of another.

In [2]:
# The charge receipts are split across three CSV files; read each one into
# its own DataFrame so they can be concatenated afterwards.
charge_files = (
    '../data/SH Charge Receipts - 01.csv',
    '../data/SH Charge Receipts - 02.csv',
    '../data/SH Charge Receipts - 03.csv',
)
charges_01, charges_02, charges_03 = map(pd.read_csv, charge_files)

In [3]:
# Row count of each file, printed on one line separated by spaces
print(*map(len, (charges_01, charges_02, charges_03)))

64999 64999 40024


In [4]:
# Preview the first rows of the first file to see its columns
charges_01.head()

Unnamed: 0,CHARGE_FILING_DATE,CP_SEX,CP_NATIONAL_ORIGIN,CP_DOB,HISPANIC_CP,CP_RACE_STRING,R_NAICS_CODE,R_NAICS_DESCRIPTION,R_NUMBER_OF_EMPLOYEES,R_TYPE
0,Oct/01/1995,Female,Other National Origin - Obsolete,Oct/21/1969,,B,311612.0,Meat Processed from Carcasses,201 - 500 Employees,Private Employer
1,Oct/02/1995,Female,Other National Origin - Obsolete,Jan/01/2001,,O,541990.0,"All Other Professional, Scientific, and Techni...",15 - 100 Employees,Private Employer
2,Oct/02/1995,Female,Other National Origin - Obsolete,Jul/30/1960,,W,722110.0,Full-Service Restaurants,15 - 100 Employees,Private Employer
3,Oct/02/1995,Male,Other National Origin - Obsolete,Jun/02/1957,,B,422990.0,Other Miscellaneous Nondurable Goods Wholesalers,15 - 100 Employees,Private Employer
4,Oct/02/1995,Female,Other National Origin - Obsolete,Apr/15/1959,,W,523999.0,Miscellaneous Financial Investment Activities,501+ Employees,Private Employer


In [5]:
# Stack the three files into one DataFrame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it each
# file's own 0-based row labels are kept, so the combined frame would contain
# repeated index labels.
all_charges = pd.concat(
    [charges_01, charges_02, charges_03],
    ignore_index=True,
)
# Should equal the sum of the three individual row counts
print(len(all_charges))

170022


In [6]:
# Preview the first rows of the combined data set
all_charges.head()

Unnamed: 0,CHARGE_FILING_DATE,CP_SEX,CP_NATIONAL_ORIGIN,CP_DOB,HISPANIC_CP,CP_RACE_STRING,R_NAICS_CODE,R_NAICS_DESCRIPTION,R_NUMBER_OF_EMPLOYEES,R_TYPE
0,Oct/01/1995,Female,Other National Origin - Obsolete,Oct/21/1969,,B,311612.0,Meat Processed from Carcasses,201 - 500 Employees,Private Employer
1,Oct/02/1995,Female,Other National Origin - Obsolete,Jan/01/2001,,O,541990.0,"All Other Professional, Scientific, and Techni...",15 - 100 Employees,Private Employer
2,Oct/02/1995,Female,Other National Origin - Obsolete,Jul/30/1960,,W,722110.0,Full-Service Restaurants,15 - 100 Employees,Private Employer
3,Oct/02/1995,Male,Other National Origin - Obsolete,Jun/02/1957,,B,422990.0,Other Miscellaneous Nondurable Goods Wholesalers,15 - 100 Employees,Private Employer
4,Oct/02/1995,Female,Other National Origin - Obsolete,Apr/15/1959,,W,523999.0,Miscellaneous Financial Investment Activities,501+ Employees,Private Employer


In [7]:
# Check the data type of every column in the combined data set
all_charges.dtypes

CHARGE_FILING_DATE        object
CP_SEX                    object
CP_NATIONAL_ORIGIN        object
CP_DOB                    object
HISPANIC_CP               object
CP_RACE_STRING            object
R_NAICS_CODE             float64
R_NAICS_DESCRIPTION       object
R_NUMBER_OF_EMPLOYEES     object
R_TYPE                    object
dtype: object

### Finding values that can facilitate merging
- load the data set you want to merge with your other data set
- modify your original data set to make sure you have a common column to merge on
- merge your data sets!

In [8]:
# Load the economic data CSV that will be merged with the charges,
# then preview its first rows
economic_data_path = '../data/bls_sector_metrics.csv'
economic_data = pd.read_csv(economic_data_path)
economic_data.head()

Unnamed: 0,naics_sector,naics_sector_rollup,naics_supersector,naics_sector_name,avg_hrly_earnings,total_employment,women_percentage,source
0,11,11,,"Agriculture, Forestry, Fishing and Hunting",14.33,416.6,0.224,"oes,cps"
1,21,21,10.0,"Mining, Quarrying, and Oil and Gas Extraction",32.95,610.2,0.151754,ces
2,22,22,40.0,Utilities,38.35,555.9,0.232776,ces
3,23,23,20.0,Construction,25.3,6687.38,0.125019,"oes,ces_supersector"
4,31,31,30.0,Manufacturing,24.39,12337.52,0.274534,"oes,ces_supersector"


In [9]:
# Number of rows in the economic data set
len(economic_data)

24

In [10]:
# Check the data type of every column in the economic data set
economic_data.dtypes

naics_sector             int64
naics_sector_rollup      int64
naics_supersector      float64
naics_sector_name       object
avg_hrly_earnings      float64
total_employment       float64
women_percentage       float64
source                  object
dtype: object

In [11]:
# Store the sector code as text rather than an integer so it can be matched
# against a text sector code in the charges data.
economic_data = economic_data.astype({'naics_sector': str})

In [12]:
# Confirm that naics_sector is no longer an integer column
economic_data.dtypes

naics_sector            object
naics_sector_rollup      int64
naics_supersector      float64
naics_sector_name       object
avg_hrly_earnings      float64
total_employment       float64
women_percentage       float64
source                  object
dtype: object

In [13]:
# Convert the NAICS code to text so its leading characters can be sliced off.
# NOTE(review): the column is float64 beforehand, so the text keeps the
# decimal part (e.g. '311612.0') and any missing codes may turn into the
# literal text 'nan' — confirm this is acceptable for later steps.
all_charges = all_charges.astype({'R_NAICS_CODE': str})

In [14]:
# Confirm that R_NAICS_CODE is no longer a float column
all_charges.dtypes

CHARGE_FILING_DATE       object
CP_SEX                   object
CP_NATIONAL_ORIGIN       object
CP_DOB                   object
HISPANIC_CP              object
CP_RACE_STRING           object
R_NAICS_CODE             object
R_NAICS_DESCRIPTION      object
R_NUMBER_OF_EMPLOYEES    object
R_TYPE                   object
dtype: object

In [15]:
# Build the merge key: the first two characters of the NAICS code are the
# sector code, which is what economic_data['naics_sector'] holds.
# `.str[:2]` is vectorized and passes missing values through, whereas a
# per-row Python lambda raises on any value that is not a string.
sector_prefix = all_charges['R_NAICS_CODE'].str[:2]
# A code that was missing before the cast to text may appear as 'nan', whose
# prefix 'na' is not a real sector; leave the sector missing for those rows.
all_charges['naics_sector'] = sector_prefix.mask(sector_prefix == 'na')

In [16]:
# Preview the data set with the new naics_sector column
all_charges.head()

Unnamed: 0,CHARGE_FILING_DATE,CP_SEX,CP_NATIONAL_ORIGIN,CP_DOB,HISPANIC_CP,CP_RACE_STRING,R_NAICS_CODE,R_NAICS_DESCRIPTION,R_NUMBER_OF_EMPLOYEES,R_TYPE,naics_sector
0,Oct/01/1995,Female,Other National Origin - Obsolete,Oct/21/1969,,B,311612.0,Meat Processed from Carcasses,201 - 500 Employees,Private Employer,31
1,Oct/02/1995,Female,Other National Origin - Obsolete,Jan/01/2001,,O,541990.0,"All Other Professional, Scientific, and Techni...",15 - 100 Employees,Private Employer,54
2,Oct/02/1995,Female,Other National Origin - Obsolete,Jul/30/1960,,W,722110.0,Full-Service Restaurants,15 - 100 Employees,Private Employer,72
3,Oct/02/1995,Male,Other National Origin - Obsolete,Jun/02/1957,,B,422990.0,Other Miscellaneous Nondurable Goods Wholesalers,15 - 100 Employees,Private Employer,42
4,Oct/02/1995,Female,Other National Origin - Obsolete,Apr/15/1959,,W,523999.0,Miscellaneous Financial Investment Activities,501+ Employees,Private Employer,52


In [17]:
# Attach the economic columns to every charge using the shared
# 'naics_sector' column as the key.
# how='inner' keeps only the charges whose sector also appears in
# economic_data. validate='many_to_one' makes pandas raise a MergeError if a
# sector code is repeated in economic_data, which would otherwise silently
# duplicate charge rows in the result.
all_charges_economic_data = pd.merge(
    all_charges,
    economic_data,
    on='naics_sector',
    how='inner',
    validate='many_to_one',
)

In [18]:
# Preview the merged result: charge columns followed by the economic columns
all_charges_economic_data.head()

Unnamed: 0,CHARGE_FILING_DATE,CP_SEX,CP_NATIONAL_ORIGIN,CP_DOB,HISPANIC_CP,CP_RACE_STRING,R_NAICS_CODE,R_NAICS_DESCRIPTION,R_NUMBER_OF_EMPLOYEES,R_TYPE,naics_sector,naics_sector_rollup,naics_supersector,naics_sector_name,avg_hrly_earnings,total_employment,women_percentage,source
0,Oct/01/1995,Female,Other National Origin - Obsolete,Oct/21/1969,,B,311612.0,Meat Processed from Carcasses,201 - 500 Employees,Private Employer,31,31,30.0,Manufacturing,24.39,12337.52,0.274534,"oes,ces_supersector"
1,Oct/02/1995,Female,Other National Origin - Obsolete,Dec/20/1946,,W,314999.0,All Other Miscellaneous Textile Product Mills,15 - 100 Employees,Private Employer,31,31,30.0,Manufacturing,24.39,12337.52,0.274534,"oes,ces_supersector"
2,Oct/06/1995,Female,Other National Origin - Obsolete,Aug/31/1974,,B,311612.0,Meat Processed from Carcasses,501+ Employees,Private Employer,31,31,30.0,Manufacturing,24.39,12337.52,0.274534,"oes,ces_supersector"
3,Oct/11/1995,Female,Hispanic - Obsolete,Aug/08/1963,,W,311999.0,All Other Miscellaneous Food Manufacturing,15 - 100 Employees,Private Employer,31,31,30.0,Manufacturing,24.39,12337.52,0.274534,"oes,ces_supersector"
4,Oct/16/1995,Female,Other National Origin - Obsolete,Jan/01/1901,,O,316219.0,Other Footwear Manufacturing,501+ Employees,Private Employer,31,31,30.0,Manufacturing,24.39,12337.52,0.274534,"oes,ces_supersector"
