In [121]:
# IMPORT LIBRARIES
import pandas as pd
import numpy as np

# FOR WEBSCRAPING 
import requests
import requests_ftp
import requests_cache
import lxml
import re
import urllib
from bs4 import BeautifulSoup
from collections import Counter

# PLOTTING
import seaborn as sns
from matplotlib import pyplot as plt
plt.style.use('ggplot')
%matplotlib inline

# CACHE
requests_cache.install_cache('urban_cache')

In [2]:
# WEBSCRAPING FAIRYGODBOSS
url_req = requests.get("https://fairygodboss.com/maternity-leave-resource-center")
html = url_req.text
soup = BeautifulSoup(html, 'lxml')

In [6]:
company_info_all = []
for i in range(80): # we know that "end" page on the site is 79
    base_url = 'https://fairygodboss.com/maternity-leave-resource-center'
    page = base_url + '?page=' + str(i)
    url_req = requests.get(page)
    html = url_req.text
    soup = BeautifulSoup(html, 'lxml')
    company_info = soup.select('a[href*="/company-overview/"]') #after inspecting element, this is where our info is 
    company_info_all.append(company_info)
#company_info_all[1]

In [89]:
# CREATE EMPTY LISTS FOR DATAFRAME: we need company name, industry, and 2 types of leave for each type of parent 
name = []
industry = []
mat_leave_pd = []
mat_leave_unp = []
pat_leave_pd = []
pat_leave_unp = []
for i in range(len(company_info_all)):
    company_info = company_info_all[i] #extract element from list
    for i in range(len(company_info)):
        s = company_info[i] #extract element from sublist to get relevant information
        #print s
        
        for i in range(len(s)):
            
            #s.find_all('li') has 6 different elements that we want, so we can use the indices 0:5 to find them
            name.append(s.find_all('li')[0].text.strip("\n"))
            industry.append(s.find_all('li')[1].text.split(' \n')[1].strip())
            mat_leave_pd.append(s.find_all('li')[2].text.split(' \n')[1].strip())
            mat_leave_unp.append(s.find_all('li')[3].text.split(' \n')[1].strip())
            pat_leave_pd.append(s.find_all('li')[4].text.split(' \n')[1].strip())
            pat_leave_unp.append(s.find_all('li')[5].text.split(' \n')[1].strip())

# CREATE DATAFRAME
df = pd.DataFrame({"name":name, "industry":industry, "maternity leave paid":mat_leave_pd, "maternity leave unpaid":mat_leave_unp, "paternity leave paid":pat_leave_pd, "paternity leave unpaid":pat_leave_unp})
df.dropna()

# DROP DUPLICATES, KEEPING ONLY ONE OF EACH COMPANY
df.drop_duplicates(subset = ['name'], keep = 'first', inplace = True)
df['sector'] = 

Unnamed: 0,industry,maternity leave paid,maternity leave unpaid,name,paternity leave paid,paternity leave unpaid
0,Technology: Consumer Internet,52,0,Netflix,52,0
13,Philanthropy,52,,Bill and Melinda Gates Foundation,52,2
26,Government: Federal,39,13,Army (British),2,
39,Technology: Consumer Internet,32,0,"Automattic, Inc.",,
52,Technology: Consumer Internet,26,,Etsy,26,
65,Technology: Consumer Internet,26,,Spotify,26,
78,Technology: Software,26,6,Adobe Systems,16,
91,Retail: Online,26,,AO.com,,
104,Technology: Software,26,,Organizer Inc.,,
117,Technology: Payments,26,,Worldpay,,


In [90]:
#df #we could poentially take the mean of each paid matleave and patleave and bargraph it according to each industry. hell yeah
df['industry'].value_counts()

Law Firm                                       88
Technology: Software                           81
Educational Services: College & Universi...    77
Technology: Consumer Internet                  77
Healthcare: Hospitals & Clinics                61
Technology: B2B Tech Services                  57
Finance: Diversified                           51
Retail: Shoes, Accessories and Apparel         44
Advertising                                    41
Pharmaceutical                                 38
Consulting Services                            36
Finance: Asset Management                      33
Technology: Manufacturing                      25
Hospitality: Restaurants                       24
Healthcare: Medical Devices                    23
Media: Diversified                             20
Consumer Packaged Goods: Packaged Foods        20
Telecommunications                             19
Natural Resources: Oil & Gas                   19
Insurance: Property & Casualty                 19


`df` is the dataframe I want to work with. The relevant information given by this dataset is industry, company name, and maternity and paternity leave (paid and unpaid for both). For the purposes of our analysis, we are going to focus on paid parental leave and see how this plays a role in social trends. 

First, as a summary, we want to determine how many types of industries there are in our dataframe and how many companies there are per industry. We can do this by just counting the unique values. A couple issues with this are that there are no duplicate industries at all, from a pandas perspective. When we look at it visually, we see that there are "groups" of industries such a "Technology: Security" and "Technology: Software", etc. First, we were going to create a column called `df['sector']` with industry names up till the ":" (using regex), but we decided it would be a better idea to just make the `sector` column by extracting the first word of every industry string. 

Since the Pew Research institute published studies primarily on Politics and Business, we have identified 4 sectors we want to look at: Technology, Government, Business, and Nonprofit. We chose the government and business sectors so we could see if it will shed some quantitative shining light on Pew's findings, technology because it has the largest sample of companies but is also an extremely relevant and growing industry. We are also interested in looking at the nonprofit sector. 

Now that we have our dataframe with "sector", we will create a new dataframe that only has the 5 sectors that we want and call it `sectordf`. 

In [135]:
# GET ALL ROWS FOR WHICH THE INDUSTRY STARTS WITH ONE OF OUR 4 CRITERIA
sectordf = df.loc[df.industry.str.startswith(("Technology", "Nonprofit", "Business", "Government")), :]
sectordf.apply(lambda x: pd.to_numeric(x, errors = 'ignore'))

Unnamed: 0,industry,maternity leave paid,maternity leave unpaid,name,paternity leave paid,paternity leave unpaid
0,Technology: Consumer Internet,52,0,Netflix,52,0
26,Government: Federal,39,13,Army (British),2,
39,Technology: Consumer Internet,32,0,"Automattic, Inc.",,
52,Technology: Consumer Internet,26,,Etsy,26,
65,Technology: Consumer Internet,26,,Spotify,26,
78,Technology: Software,26,6,Adobe Systems,16,
104,Technology: Software,26,,Organizer Inc.,,
117,Technology: Payments,26,,Worldpay,,
130,Technology: Gaming,26,12,Zynga,12,
143,Technology: Consumer Internet,24,6,eBay Inc.,12,


For now, we're interested in making a bar plot of maternity leave for the technology industry. Let's see how to do that. 

In [138]:
pd.to_numeric(sectordf.loc['maternity leave paid'], errors='coerce')

KeyError: 'the label [maternity leave paid] is not in the [index]'

In [129]:
#Distributions
#dists=sns.boxplot("Avg Price per Lb", "Type", data = df3)
#plt.show(dists)

#include a column for "sector" 
#strip the colons 
sectordf['maternity leave paid']
#techs = sns.boxplot("maternity leave paid", "industry", data = sectordf)
#plt.show(techs)

AttributeError: 'Series' object has no attribute 'ntype'

In [140]:
1/float(1+3)

0.25