In [None]:
import json
import pandas as pd
import csv
from src import common

In [None]:
# Load the three Fortune input files. The encoding is passed explicitly so the
# result does not depend on the platform's default locale encoding (JSON text
# is UTF-8 by specification).
with open('../data/fortune/f500_ranking_2020.json', 'r', encoding='utf-8') as s:
    rankings = json.load(s)

with open('../data/fortune/f500_company_sectors.json', 'r', encoding='utf-8') as s:
    sectors = json.load(s)

with open('../data/fortune/f500_reputation_2020.json', 'r', encoding='utf-8') as s:
    reputations = json.load(s)

# Flatten the rankings mapping (firm name -> attribute dict) into parallel
# column lists, one entry per firm, in the mapping's iteration order.
companies = list(rankings.keys())
urls = [attrs['Website'] for attrs in rankings.values()]
ranks = [attrs['Rank'] for attrs in rankings.values()]
industries = [attrs['Industry'] for attrs in rankings.values()]

# One row per ranked firm.
company_columns = {
    'firm': companies,
    'url': urls,
    'ranking': ranks,
    'industry': industries,
}
dfcompany = pd.DataFrame(company_columns)

# Flatten the nested sectors mapping (sector -> {rank: firm}) into one
# (sector, firm, rank) triple per listed firm.
sector_triples = [
    (sector_name, firm_name, rank_key)
    for sector_name, members in sectors.items()
    for rank_key, firm_name in members.items()
]

companies = [triple[1] for triple in sector_triples]
ranks = [triple[2] for triple in sector_triples]
secs = [triple[0] for triple in sector_triples]

# One row per (sector, firm) pair; 'ranking' still holds the raw JSON keys.
sector_columns = {
    'sector': secs,
    'firm': companies,
    'ranking': ranks,
}
dfsector = pd.DataFrame(sector_columns)

# Reduce the reputations mapping (firm name -> attribute dict) to the firm
# name and its 'Overall Score'.
companies = list(reputations.keys())
scores = [attrs['Overall Score'] for attrs in reputations.values()]

reputation_columns = {
    'firm': companies,
    'reputation_score': scores,
}
dfreputation = pd.DataFrame(reputation_columns)

In [None]:
# Enrich the company table with sector, reputation score, an inclusion flag,
# a rank bucket label and a firm hash.

# The sector ranks come from JSON object keys (always strings), so both sides
# are cast to int before they are used as a join key.
dfcompany['ranking'] = dfcompany['ranking'].astype(int)
dfsector['ranking'] = dfsector['ranking'].astype(int)

# Drop every column this cell derives so that re-running the cell starts from
# the base table instead of producing suffixed duplicates such as
# reputation_score_x / reputation_score_y. On a first run this is a no-op.
derived_columns = ['sector', 'reputation_score', 'include', 'ranklabel', 'firmhash']
dfcompany = dfcompany.drop(columns=derived_columns, errors='ignore')

# Join keys are spelled out: sector is attached by ranking, reputation by firm.
# Left joins keep firms that have no sector or no reputation score (NaN).
dfcompany = dfcompany.merge(dfsector[['ranking', 'sector']], on='ranking', how='left')
dfcompany = dfcompany.merge(dfreputation, on='firm', how='left')

sectors_to_include = [
    'Business Services',
    'Financials',
    'Energy',
    'Retailing',
    'Technology',
    'Media',
    'Health Care',
    'Transportation',
    'Industrials',
    'Household Products',
    'Telecommunications'
]

# True only for firms whose sector is in the list above; firms without a
# sector (NaN after the left join) end up False.
dfcompany['include'] = dfcompany['sector'].isin(sectors_to_include)

# Rank buckets. Thresholds are applied from widest to narrowest so each later
# assignment overrides the previous one; anything above 500 keeps the default.
dfcompany['ranklabel'] = '501-1000'
for upper_bound, label in [(500, '301-500'), (300, '201-300'), (200, '1-200')]:
    dfcompany.loc[dfcompany['ranking'] <= upper_bound, 'ranklabel'] = label

# Keep only the first row per ranking.
# TODO: check which rows are dropped here and why the duplicates occur.
dfcompany = dfcompany.drop_duplicates(subset=['ranking'], keep='first')

# common.__hash is a project helper whose implementation is not visible here;
# it is applied to each firm name.
dfcompany['firmhash'] = [common.__hash(f) for f in dfcompany['firm'].values]

In [None]:
# Columns, in output order, selected from dfcompany when the firm sample is
# written to CSV.
output_columns = [
    'firmhash',
    'firm',
    'url',
    'ranking',
    'industry',
    'sector',
    'reputation_score',
    'include',
    'ranklabel',
]

In [None]:
# Export switch: stays off so that running the notebook does not overwrite the
# stored firm sample. Set to True to regenerate the CSV.
WRITE_SAMPLE_CSV = False

if WRITE_SAMPLE_CSV:
    dfcompany[output_columns].to_csv(
        '../data/fortune/f500_firm_sample.csv',
        index=False,
        quoting=csv.QUOTE_NONNUMERIC,
        quotechar='"',
    )

## Check sample counts (top-300 firms in included sectors with a reputation score)

In [None]:
# Number of firms eligible for the sample: ranked in the top 300, in an
# included sector, and with a reputation score available.
len(
    dfcompany[
        (dfcompany['ranking'] <= 300)
        & dfcompany['include']
        & dfcompany['reputation_score'].notnull()
    ]
)

In [None]:
# Eligible firms (top 300, included sector, reputation score present),
# counted per sector. The result is a one-column frame indexed by sector.
samplecount_per_industry = (
    dfcompany
    .loc[
        lambda x: (x['ranking'] <= 300)
        & x['include']
        & x['reputation_score'].notnull()
    ]
    .groupby(['sector'])[['firm']]
    .count()
)

In [None]:
# Total number of firms across all sectors in the per-sector count table.
samplecount_per_industry['firm'].sum()

In [None]:
# Per-sector firm counts, largest sector first.
samplecount_per_industry.sort_values(by='firm', ascending=False)