# ETL with EEOC
# Khrizel Solano and Kelli Okuji Wilson

# Extract 

In [72]:
#pip install mysqlclient
import fnmatch
import pandas as pd
import numpy as np
from sqlalchemy import create_engine

In [36]:
# Store CSV into DataFrame

In [37]:
csv_file = "./Resources/year16_nac2.csv"
eeoc_nac2_df = pd.read_csv(csv_file, sep=';').dropna(how='all')
eeoc_nac2_df.sample(5)
#eeoc_db.year16_state_nac2.csv

Unnamed: 0,NAC2_code,TOTAL_UNITS,TOTAL10,WHM1,WHM2,WHM3,WHM4,WHM5,WHM6,WHM7,...,TOMRF7,TOMRF8,TOMRF9,TOMRF1_2,NAC2_Label,i,SUMCOUNT,MISSCOUNT,SMALLEST,INDEX
22,81,5700,879176,11322,38777,17266,17515,15542,26124,21644,...,317,959.0,5847,708,Other Services (except Public Administration),141,0,0,,
3,23,9254,1602496,26632,83888,37250,21458,24350,393797,91062,...,88,196.0,52,263,Construction,141,0,0,,
17,56,13794,2957730,22985,107911,30442,39115,86451,56912,69100,...,1273,3211.0,7901,1515,Administrative and Support and Waste Managemen...,141,0,0,,
10,48,8711,1945593,12818,123420,15229,21793,37797,136907,344677,...,1946,793.0,2547,485,Transportation and Warehousing,141,0,0,,
11,49,3597,780628,2697,13061,2643,4782,10863,14902,120860,...,566,1365.0,29,333,Transportation and Warehousing,141,5,3,,


In [38]:
eeoc_kaggle_csv_file = "./Resources/Reveal_EEO1_for_2016.csv"
eeoc_kaggle_df = pd.read_csv(eeoc_kaggle_csv_file).dropna(how='all')
eeoc_kaggle_df.sample(5)
#Reveal_EEO1_for_2016.csv

Unnamed: 0,company,year,race,gender,job_category,count
4336,Sanmina,2016,American_Indian_Alaskan_Native,male,Administrative support,0
2529,MobileIron,2016,Hispanic_or_Latino,male,Service workers,1
1137,Facebook,2016,Native_Hawaiian_or_Pacific_Islander,male,Service workers,1
3748,Uber,2016,American_Indian_Alaskan_Native,female,Sales workers,0
243,Adobe,2016,Asian,male,Technicians,0


## Transform

* Transform the EEOC table from wide to narrow so it's easier to join with the Kaggle data set

* Create data dictionaries for race, gender, and job category in EEOC data set to match the Kaggle data set values

* Clean up the values in the job category in the Kaggle data set so it matches with the EEOC data set values

* Drop nulls and total/subtotal lines in Kaggle data set

* Create new tables by joining the EEOC and Kaggle data sets



## Write-up
Our final table does not include a join/merge of the two transformed datasets from Kaggle and the EEOC. After transforming the datasets, we encountered an obstacle in joining the datasets on a common identifer. The reason that we encountered this issue is due to the fact that the Kaggle data set was already aggregated by company. The EEOC data was parsed out to create sub-categories of the aggregated data set by gender, job category, and race. However, we could not join on aggregated data.

If we were to correct the issue going forward, we would identify a dataset that reported individual instances such as by employee and his/her individual characteristics (gender, job category, and race) so we could more easily manipulate the data through aggreggation and join it to more broad data set such as comparing to state or national measures.

We currently have appended to the manipulated Kaggle dataset the California, Tech sector aggregated data in order to make a tentaive comparison of how Silicon Valley tech employment compares to the California state tech sector measures.

In [39]:
# Create new data sets with select columns

In [40]:
eeoc_nac2_All_df = eeoc_nac2_df[['NAC2_code','NAC2_Label','WHM1','WHM2','WHM3','WHM4','WHM5','WHM6','WHM7','WHM8','WHM9','WHF1','WHF2','WHF3','WHF4','WHF5','WHF6','WHF7','WHF8','WHF9','BLKM1','BLKM2','BLKM3','BLKM4','BLKM5','BLKM6','BLKM7','BLKM8','BLKM9','BLKF1','BLKF2','BLKF3','BLKF4','BLKF5','BLKF6','BLKF7','BLKF8','BLKF9','HISPM1','HISPM2','HISPM3','HISPM4','HISPM5','HISPM6','HISPM7','HISPM8','HISPM9','HISPF1','HISPF2','HISPF3','HISPF4','HISPF5','HISPF6','HISPF7','HISPF8','HISPF9','ASIANM1','ASIANM2','ASIANM3','ASIANM4','ASIANM5','ASIANM6','ASIANM7','ASIANM8','ASIANM9','ASIANF1','ASIANF2','ASIANF3','ASIANF4','ASIANF5','ASIANF6','ASIANF7','ASIANF8','ASIANF9','AIANM1','AIANM2','AIANM3','AIANM4','AIANM5','AIANM6','AIANM7','AIANM8','AIANM9','AIANF1','AIANF2','AIANF3','AIANF4','AIANF5','AIANF6','AIANF7','AIANF8','AIANF9','NHOPIM1','NHOPIM2','NHOPIM3','NHOPIM4','NHOPIM5','NHOPIM6','NHOPIM7','NHOPIM8','NHOPIM9','NHOPIF1','NHOPIF2','NHOPIF3','NHOPIF4','NHOPIF5','NHOPIF6','NHOPIF7','NHOPIF8','NHOPIF9','TOMRM1','TOMRM2','TOMRM3','TOMRM4','TOMRM5','TOMRM6','TOMRM7','TOMRM8','TOMRM9','TOMRF1','TOMRF2','TOMRF3','TOMRF4','TOMRF5','TOMRF6','TOMRF7','TOMRF8','TOMRF9']]
eeoc_nac2_All_df.head()

Unnamed: 0,NAC2_code,NAC2_Label,WHM1,WHM2,WHM3,WHM4,WHM5,WHM6,WHM7,WHM8,...,TOMRM9,TOMRF1,TOMRF2,TOMRF3,TOMRF4,TOMRF5,TOMRF6,TOMRF7,TOMRF8,TOMRF9
0,11,"Agriculture, Forestry, Fishing and Hunting",3311,7254,2369,4740,2561,5636,12178,15711,...,71,6,58,26,35.0,100,5,68,871.0,35
1,21,"Mining, Quarrying, and Oil and Gas Extraction",5039,37459,14647,2088,3999,58589,43730,15476,...,12,6,322,71,10.0,214,13,32,11.0,8
2,22,Utilities,5883,64738,25510,1645,9568,87506,21627,6961,...,45,13,600,63,22.0,682,28,20,9.0,4
3,23,Construction,26632,83888,37250,21458,24350,393797,91062,114587,...,173,43,529,101,251.0,1436,274,88,196.0,52
4,31,Manufacturing,15623,39393,18702,39607,17628,83772,174938,80811,...,731,37,621,213,462.0,816,128,1300,1630.0,755


In [41]:
# Convert wide to narrow table using melt

In [42]:
eeoc_nac2_All_unpivot_df=pd.melt(eeoc_nac2_All_df, id_vars=['NAC2_code','NAC2_Label'], var_name="EEOC_Code", value_name="count").dropna(how='all')
eeoc_nac2_All_unpivot_df.sample(5)

Unnamed: 0,NAC2_code,NAC2_Label,EEOC_Code,count
2268,51,Information,NHOPIM5,385.0
1310,53,Real Estate and Rental and Leasing,ASIANM1,320.0
1375,42,Wholesale Trade,ASIANM4,4701.0
2612,71,"Arts, Entertainment, and Recreation",TOMRM1,105.0
2497,21,"Mining, Quarrying, and Oil and Gas Extraction",NHOPIF6,3.0


In [43]:
#Transform EEOC data

In [44]:
eeoc_nac2_All_unpivot_df['gender'] = eeoc_nac2_All_unpivot_df['EEOC_Code'].str[-2].replace({'M' : 'Male', 'F' : 'Female'})
eeoc_nac2_All_unpivot_df.sample(5)

Unnamed: 0,NAC2_code,NAC2_Label,EEOC_Code,count,gender
2770,48,Transportation and Warehousing,TOMRM8,2958.0,Male
977,56,Administrative and Support and Waste Managemen...,HISPM5,27952.0,Male
602,22,Utilities,BLKM8,1512.0,Male
1604,71,"Arts, Entertainment, and Recreation",ASIANF4,1854.0,Female
2610,61,Educational Services,TOMRM1,80.0,Male


In [45]:
eeoc_nac2_All_unpivot_df['job_category'] = eeoc_nac2_All_unpivot_df['EEOC_Code'].str[-1].replace({'1' : 'Senior OFF AND MGRS', '2' : 'PROF', '3' : 'TECH', '4' : 'SALE', '5' : 'CLERICALS', '6' : 'CRAFT', '7' : 'OPERS','8' : 'LABORS', '9' : 'Service'}).dropna(how='all')
eeoc_nac2_All_unpivot_df.sample(5)

Unnamed: 0,NAC2_code,NAC2_Label,EEOC_Code,count,gender,job_category
1876,31,Manufacturing,AIANM7,2001.0,Male,OPERS
1822,81,Other Services (except Public Administration),AIANM4,97.0,Male,SALE
896,44,Retail Trade,HISPM2,5328.0,Male,PROF
655,42,Wholesale Trade,BLKF1,176.0,Female,Senior OFF AND MGRS
1157,32,Manufacturing,HISPF4,3421.0,Female,SALE


In [46]:
eeoc_nac2_All_unpivot_df['race'] = eeoc_nac2_All_unpivot_df['EEOC_Code'].str[:-2].replace({'WH' : 'White', 'BLK' : 'Black', 'HISP': 'Hispanic_or_Latino','ASIAN' : 'Asian','AIAN' : 'American_Indian_Alaskan_Native','NHOPI' : 'Native_Hawaiian_or_Pacific_Islander','TOMR' : 'Two_or_more_races'}).dropna(how='all')
eeoc_nac2_All_unpivot_df.sample(5)

Unnamed: 0,NAC2_code,NAC2_Label,EEOC_Code,count,gender,job_category,race
2538,61,Educational Services,NHOPIF7,10.0,Female,OPERS,Native_Hawaiian_or_Pacific_Islander
2376,11,"Agriculture, Forestry, Fishing and Hunting",NHOPIF1,,Female,Senior OFF AND MGRS,Native_Hawaiian_or_Pacific_Islander
1598,53,Real Estate and Rental and Leasing,ASIANF4,542.0,Female,SALE,Asian
2298,61,Educational Services,NHOPIM6,19.0,Male,CRAFT,Native_Hawaiian_or_Pacific_Islander
508,31,Manufacturing,BLKM4,5085.0,Male,SALE,Black


In [47]:
#Transform Kaggle data

In [48]:
eeoc_kaggle_df.sample(5)

Unnamed: 0,company,year,race,gender,job_category,count
2848,Nvidia,2016,American_Indian_Alaskan_Native,female,Sales workers,0
2706,Nvidia,2016,Hispanic_or_Latino,male,Craft workers,0
517,Airbnb,2016,Two_or_more_races,female,Managers,3
2564,MobileIron,2016,Black_or_African_American,male,laborers and helpers,0
3910,View,2016,Native_Hawaiian_or_Pacific_Islander,female,Totals,0


In [49]:
eeoc_kaggle_df['gender'] = eeoc_kaggle_df['gender'].replace({'male' : 'Male', 'female' : 'Female'}).dropna(how='all')
eeoc_kaggle_df.sample(5)

Unnamed: 0,company,year,race,gender,job_category,count
906,eBay,2016,Hispanic_or_Latino,Male,Craft workers,0
4382,Sanmina,2016,Black_or_African_American,Male,Technicians,38
3816,View,2016,Black_or_African_American,Male,Executives,0
1947,Intel,2016,American_Indian_Alaskan_Native,Female,Technicians,24
2977,Pinterest,2016,White,Female,Managers,9


In [50]:
eeoc_kaggle_df['job_category'] = eeoc_kaggle_df['job_category'].replace({'Executives' : 'Senior OFF AND MGRS', 'Managers' : 'Senior OFF AND MGRS', 'Professionals' : 'PROF', 'Technicians' : 'TECH', 'Sales workers' : 'SALE', 'Administrative support' : 'CLERICALS', 'Craft workers' : 'CRAFT', 'operatives' : 'OPERS','laborers and helpers' : 'LABORS', 'Service workers' : 'Service'}).dropna(how=all)
eeoc_kaggle_df.sample(5)

Unnamed: 0,company,year,race,gender,job_category,count
2503,Lyft,2016,Two_or_more_races,Female,OPERS,0
1085,Facebook,2016,Hispanic_or_Latino,Male,CLERICALS,1
896,Cisco,2016,Overall_totals,,LABORS,0
3700,Uber,2016,White,Female,SALE,1
1194,Facebook,2016,Black_or_African_American,Female,CRAFT,0


In [51]:
eeoc_kaggle_df_clean = eeoc_kaggle_df[(eeoc_kaggle_df["job_category"]!= "Previous_totals") & (eeoc_kaggle_df["job_category"]!= "Overall_totals")&(eeoc_kaggle_df["race"]!= "Previous_totals") & (eeoc_kaggle_df["race"]!= "Overall_totals")]
eeoc_kaggle_df_clean.sample(5)

Unnamed: 0,company,year,race,gender,job_category,count
1015,eBay,2016,Black_or_African_American,Female,OPERS,0
64,23andMe,2016,Asian,Male,SALE,0
440,Airbnb,2016,American_Indian_Alaskan_Native,Male,LABORS,0
3331,Square,2016,Two_or_more_races,Male,OPERS,0
2906,Pinterest,2016,White,Male,PROF,197


In [52]:
#join these 2 data sets: EEOC and Kaggle

In [53]:
eeoc_nac2_All_unpivot_df.sample(5)

Unnamed: 0,NAC2_code,NAC2_Label,EEOC_Code,count,gender,job_category,race
2417,56,Administrative and Support and Waste Managemen...,NHOPIF2,728.0,Female,PROF,Native_Hawaiian_or_Pacific_Islander
495,54,"Professional, Scientific, and Technical Services",BLKM3,15270.0,Male,TECH,Black
603,23,Construction,BLKM8,28767.0,Male,LABORS,Black
1868,71,"Arts, Entertainment, and Recreation",AIANM6,192.0,Male,CRAFT,American_Indian_Alaskan_Native
203,49,Transportation and Warehousing,WHM9,3023.0,Male,Service,White


In [54]:
eeoc_kaggle_df_clean.sample(5)

Unnamed: 0,company,year,race,gender,job_category,count
549,Apple,2016,Hispanic_or_Latino,Male,Service,132
3981,PayPal,2016,Hispanic_or_Latino,Female,Service,0
1327,Google,2016,Asian,Male,OPERS,0
3379,Square,2016,Asian,Female,OPERS,0
144,23andMe,2016,American_Indian_Alaskan_Native,Female,Senior OFF AND MGRS,0


In [55]:
eeoc_female=eeoc_nac2_All_unpivot_df[(eeoc_nac2_All_unpivot_df['NAC2_Label']  =='Professional, Scientific, and Technical Services') & (eeoc_nac2_All_unpivot_df['gender']  =='Female' )] 
eeoc_female.sample(5)

Unnamed: 0,NAC2_code,NAC2_Label,EEOC_Code,count,gender,job_category,race
759,54,"Professional, Scientific, and Technical Services",BLKF5,73360.0,Female,CLERICALS,Black
1647,54,"Professional, Scientific, and Technical Services",ASIANF6,241.0,Female,CRAFT,Asian
1215,54,"Professional, Scientific, and Technical Services",HISPF6,779.0,Female,CRAFT,Hispanic_or_Latino
2031,54,"Professional, Scientific, and Technical Services",AIANF4,254.0,Female,SALE,American_Indian_Alaskan_Native
1239,54,"Professional, Scientific, and Technical Services",HISPF7,4236.0,Female,OPERS,Hispanic_or_Latino


In [56]:
kaggle_female=eeoc_kaggle_df_clean[eeoc_kaggle_df_clean['gender']  =='Female']
kaggle_female.sample(5)

Unnamed: 0,company,year,race,gender,job_category,count
4320,Sanmina,2016,American_Indian_Alaskan_Native,Female,Senior OFF AND MGRS,0
1718,HPE,2016,White,Female,PROF,7869
3224,Salesforce,2016,Two_or_more_races,Female,LABORS,0
4119,PayPal,2016,Two_or_more_races,Female,TECH,0
1050,eBay,2016,American_Indian_Alaskan_Native,Female,CRAFT,0


In [57]:
eeoc_female_total=eeoc_female['count'].astype(int).sum()
eeoc_female_total

#.astype(str).astype(int).sum()

1340758

In [58]:
kaggle_female['CA count']=eeoc_female_total
kaggle_female.sample(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,company,year,race,gender,job_category,count,CA count
1009,eBay,2016,Black_or_African_American,Female,Senior OFF AND MGRS,16,1340758
3012,Pinterest,2016,Asian,Female,Senior OFF AND MGRS,0,1340758
1925,Intel,2016,Native_Hawaiian_or_Pacific_Islander,Female,CLERICALS,0,1340758
2845,Nvidia,2016,American_Indian_Alaskan_Native,Female,Senior OFF AND MGRS,0,1340758
3160,Salesforce,2016,White,Female,SALE,753,1340758


In [59]:
#connect to local database

In [76]:
rds_connection_string = "root:jtf@nfF1749@127.0.0.1/EEOC_db" #hide this
engine = create_engine(f'mysql://{rds_connection_string}')


In [77]:
#create tables
engine.table_names()

['charges_state_2016_final',
 'distributions_data_2016',
 'eeoc_asian',
 'eeoc_female',
 'eeoc_male',
 'eeoc_manager',
 'eeoc_professional',
 'eeoc_white',
 'reveal_eeo1_for_2016',
 'tech_sector_diversity_demographics_2016',
 'year16_cbsa',
 'year16_cbsa_nac2',
 'year16_cbsa_nac3',
 'year16_nac2',
 'year16_nac3',
 'year16_state',
 'year16_state_nac2',
 'year16_state_nac3']

In [78]:
kaggle_female.to_sql(name='eeoc_female', con=engine, if_exists='append', index=False)
# kaggle_female.to_sql(name='eeoc_db.EEOC_Female', con=engine, if_exists='append', index=False)