In [None]:
import json
import pandas as pd
import csv
from src import common

In [None]:
def _read_json(path):
    """Return the parsed contents of the JSON file at `path`."""
    with open(path, 'r') as handle:
        return json.load(handle)


# The four JSON inputs consumed by the cells below.
rankings = _read_json('../data/fortune/f500_ranking_2020.json')
sectors = _read_json('../data/fortune/f500_company_sectors.json')
reputations2020 = _read_json('../data/fortune/f500_reputation_2020.json')
reputations2019 = _read_json('../data/fortune/f500_reputation_2019.json')

_DIGITS = '0123456789'

# Fields of each ranking entry that hold formatted figures, in the order they
# are stored in every `metrics` row.
metric_fields = ['Revenues ($M)', 'Profits ($M)', 'Market Value ($M)', 'Employees']


def _parse_figure(raw):
    """Turn a formatted figure such as '$1,234.5' or '-$98.76' into a number.

    Every non-digit character (currency symbol, thousands separator, spaces)
    is ignored. The text before the first '.' is the integer part; the text
    between the first and the second '.' (if present) is the fractional part
    and is scaled by its own number of digits, so '.5' adds 0.5 and '.56'
    adds 0.56. A '-' or U+2212 minus sign ahead of the first digit makes the
    result negative.

    Returns an int when there are no fractional digits, a float otherwise,
    and None when `raw` is not a string or its integer part has no digits.
    """
    if not isinstance(raw, str):
        return None
    pieces = raw.split('.')
    whole = ''.join(ch for ch in pieces[0] if ch in _DIGITS)
    if not whole:
        return None
    fraction = ''
    if len(pieces) > 1:
        fraction = ''.join(ch for ch in pieces[1] if ch in _DIGITS)
    value = int(whole)
    if fraction:
        # Scale by the actual number of fractional digits (not a fixed 10).
        value += int(fraction) / 10 ** len(fraction)
    # Only a minus sign that precedes the number counts as a sign, so a
    # hyphen later in the text does not flip the value.
    first_digit_at = next(i for i, ch in enumerate(pieces[0]) if ch in _DIGITS)
    sign_region = pieces[0][:first_digit_at]
    if value and ('-' in sign_region or '\u2212' in sign_region):
        value = -value
    return value


# One entry per firm in `rankings`, collected as parallel lists.
companies = []
urls = []
ranks = []
industries = []
metrics = []
for firm_name, entry in rankings.items():
    companies.append(firm_name)
    urls.append(entry['Website'])
    ranks.append(entry['Rank'])
    industries.append(entry['Industry'])
    # .get(): a missing metric field becomes None instead of aborting the load.
    metrics.append([_parse_figure(entry.get(field)) for field in metric_fields])

dfcompany = pd.DataFrame({
    'firm': companies,
    'url': urls,
    'ranking': ranks,
    'industry': industries,
    'revenue_in_millions': [row[0] for row in metrics],
    'profits_in_millions': [row[1] for row in metrics],
    'market_value_in_millions': [row[2] for row in metrics],
    'n_employees': [row[3] for row in metrics],
})


# Flatten the nested mapping {sector: {ranking: firm}} into one
# (sector, firm, ranking) triple per firm, in the mapping's own order.
sector_rows = [
    (sector_name, firm_name, rank_key)
    for sector_name, members in sectors.items()
    for rank_key, firm_name in members.items()
]
secs = [row[0] for row in sector_rows]
companies = [row[1] for row in sector_rows]
ranks = [row[2] for row in sector_rows]

dfsector = pd.DataFrame({'sector': secs, 'firm': companies, 'ranking': ranks})

def _reputation_frame(reputations, score_column):
    """Build a two-column frame (firm, <score_column>) from a reputation mapping.

    Entries whose value is empty (len 0) are skipped; for every other entry
    the 'Overall Score' field is copied unchanged.
    """
    rated = [
        (firm, entry['Overall Score'])
        for firm, entry in reputations.items()
        if len(entry) > 0
    ]
    return pd.DataFrame({
        'firm': [firm for firm, _ in rated],
        score_column: [score for _, score in rated],
    })


dfreputation2020 = _reputation_frame(reputations2020, 'reputation_score_2020')
dfreputation2019 = _reputation_frame(reputations2019, 'reputation_score_2019')

# Left join: every firm with a 2020 score is kept; firms scored only in 2019 are not.
dfreputation = dfreputation2020.merge(dfreputation2019, how='left', on='firm')

In [None]:
# Cast both ranking columns to int before they are used as the join key.
for frame in (dfcompany, dfsector):
    frame['ranking'] = frame['ranking'].astype(int)

# Attach the sector (no `on=` is given, so pandas joins on the columns the two
# frames have in common) and then the reputation scores, matched on firm name.
sector_by_rank = dfsector[['ranking', 'sector']]
dfcompany = dfcompany.merge(sector_by_rank, how='left').merge(dfreputation, on='firm', how='left')

sectors_to_include = [
    'Business Services', 'Financials', 'Energy', 'Retailing', 'Technology', 'Media',
    'Health Care', 'Transportation', 'Industrials', 'Household Products', 'Telecommunications',
]

# True for firms whose sector is listed above; firms without a sector get False.
dfcompany['include'] = dfcompany['sector'].isin(sectors_to_include)

# Bucket firms by rank. Thresholds go from widest to narrowest so that each
# narrower band overwrites the label written by the wider one.
dfcompany['ranklabel'] = '501-1000'
for upper_bound, label in [(500, '301-500'), (300, '201-300'), (200, '1-200')]:
    dfcompany.loc[dfcompany['ranking'] <= upper_bound, 'ranklabel'] = label

# Keep one row per ranking (the first). TODO: check which rows this removes.
dfcompany = dfcompany.drop_duplicates(subset=['ranking'], keep='first')

# Final sample: included sector and a reputation score in both 2020 and 2019.
scored_both_years = dfcompany['reputation_score_2020'].notnull() & dfcompany['reputation_score_2019'].notnull()
dfcompany['final_sample'] = dfcompany['include'].eq(True) & scored_both_years

dfcompany['firmhash'] = list(map(common.__hash, dfcompany['firm'].values))

In [None]:
# Display the column names currently present in dfcompany.
dfcompany.columns

In [None]:
# Column groups shared by the two export layouts.
_identity_columns = ['firmhash', 'firm', 'url', 'ranking', 'industry', 'sector']
_figure_columns = [
    'revenue_in_millions',
    'profits_in_millions',
    'market_value_in_millions',
    'n_employees',
    'reputation_score_2020',
    'reputation_score_2019',
]
_flag_columns = ['include', 'ranklabel']

# Sample files: identifying columns plus the flags.
output_columns_sample = _identity_columns + _flag_columns
# Full file: the same, with the financial and reputation figures in between.
output_columns_all = _identity_columns + _figure_columns + _flag_columns

In [None]:
if True:  # always-on guard; presumably switched to False by hand to skip the export
    # Options shared by all three files: no index column, quote every non-numeric field.
    csv_kwargs = {'index': False, 'quoting': csv.QUOTE_NONNUMERIC, 'quotechar': '"'}

    # Only the rows flagged final_sample, sample columns.
    final_rows = dfcompany[dfcompany['final_sample']]
    final_rows[output_columns_sample].to_csv('../data/fortune/f500_final_firm_sample.csv', **csv_kwargs)
    # Every row, sample columns.
    dfcompany[output_columns_sample].to_csv('../data/fortune/f500_firm_sample.csv', **csv_kwargs)
    # Every row, all output columns.
    dfcompany[output_columns_all].to_csv('../data/fortune/f500_full_firm_data.csv', **csv_kwargs)

## check numbers

In [None]:
# Firms ranked 300 or better, in an included sector, with a reputation score for both years.
ranked_top300 = dfcompany['ranking'] <= 300
in_included_sector = dfcompany['include'].eq(True)
has_both_scores = dfcompany['reputation_score_2020'].notnull() & dfcompany['reputation_score_2019'].notnull()
initial_sample = dfcompany.loc[ranked_top300 & in_included_sector & has_both_scores]

In [None]:
# Number of firms in the initial (rank <= 300) sample.
len(initial_sample)

In [None]:
# Same criteria as the initial sample but without the rank cut-off:
# included sector and a reputation score in both 2020 and 2019.
ideal_in_sector = dfcompany['include'].eq(True)
ideal_has_scores = dfcompany['reputation_score_2020'].notnull() & dfcompany['reputation_score_2019'].notnull()
ideal_sample = dfcompany.loc[ideal_in_sector & ideal_has_scores]

In [None]:
# Number of firms in the ideal sample (no rank cut-off).
len(ideal_sample)

In [None]:
# Firms in the ideal sample whose name does not appear in the initial sample.
initial_firm_names = initial_sample['firm'].tolist()
already_sampled = ideal_sample['firm'].isin(initial_firm_names)
additional_sample = ideal_sample.loc[~already_sampled]

In [None]:
if True:  # always-on guard; presumably switched to False by hand to skip the export
    # Write the additional firms with the sample columns; non-numeric fields are quoted.
    additional_export = additional_sample[output_columns_sample]
    additional_export.to_csv(
        '../data/fortune/f500_firm_sample_additional.csv',
        index=False,
        quoting=csv.QUOTE_NONNUMERIC,
        quotechar='"',
    )

In [None]:
# Number of firms in the ideal sample that are not in the initial sample.
len(additional_sample)

In [None]:
# Count firms per sector among those ranked 300 or better, in an included
# sector, with a reputation score for both years.
count_rank_ok = dfcompany['ranking'] <= 300
count_sector_ok = dfcompany['include'].eq(True)
count_scores_ok = dfcompany['reputation_score_2020'].notnull() & dfcompany['reputation_score_2019'].notnull()
counted_firms = dfcompany.loc[count_rank_ok & count_sector_ok & count_scores_ok]
samplecount_per_industry = counted_firms.groupby(['sector'])[['firm']].count()

In [None]:
# Total number of firms counted across all sectors in samplecount_per_industry.
samplecount_per_industry['firm'].sum()

In [None]:
# Per-sector firm counts, largest first.
samplecount_per_industry.sort_values('firm', ascending=False)