In [1]:
import pandas as pd
import numpy as np
import json
import re

In [2]:
with open('../data/stock_info.json', 'r') as f:
    # Data is a string containing a json dictionary of key:dictionary pairs.
    # We can convert this to pandas by taking the key as index for each row and the value (dictionary) as columns.
    # 'index' : dict like {index -> {column -> value}}
    df = pd.read_json(f, orient='index')


In [3]:
df.head()

Unnamed: 0,zip,sector,fullTimeEmployees,longBusinessSummary,city,phone,state,country,companyOfficers,website,...,logo_url,address2,fax,underlyingSymbol,underlyingExchangeSymbol,headSymbol,uuid,err,address3,averageMaturity
SHW,44115-1075,Basic Materials,61031.0,"The Sherwin-Williams Company develops, manufac...",Cleveland,216 566 2000,OH,United States,[],http://www.sherwin-williams.com,...,https://logo.clearbit.com/sherwin-williams.com,,,,,,,,,
AMZN,98109-5210,Consumer Cyclical,1271000.0,"Amazon.com, Inc. engages in the retail sale of...",Seattle,206-266-1000,WA,United States,[],http://www.amazon.com,...,https://logo.clearbit.com/amazon.com,,,,,,,,,
AXDX,85714,Healthcare,224.0,"Accelerate Diagnostics, Inc., an in vitro diag...",Tucson,520 365 3100,AZ,United States,[],http://acceleratediagnostics.com,...,https://logo.clearbit.com/acceleratediagnostic...,Suite 470 4th Floor,,,,,,,,
,60606,Financial Services,,Nuveen New York Quality Municipal Income Fund ...,Chicago,312-917-7700,IL,United States,[],http://www.nuveen.com/CEF/Product/Overview.asp...,...,https://logo.clearbit.com/nuveen.com,,312-917-6912,,,,,,,
HD,30339,Consumer Cyclical,504800.0,"The Home Depot, Inc. operates as a home improv...",Atlanta,770-433-8211,GA,United States,[],http://www.homedepot.com,...,https://logo.clearbit.com/homedepot.com,,,,,,,,,


In [4]:
df.describe()

Unnamed: 0,fullTimeEmployees,maxAge,previousClose,regularMarketOpen,twoHundredDayAverage,trailingAnnualDividendYield,payoutRatio,volume24Hr,regularMarketDayHigh,navPrice,...,lastCapGain,shortPercentOfFloat,sharesShortPriorMonth,impliedSharesOutstanding,fiveYearAverageReturn,regularMarketPrice,underlyingSymbol,underlyingExchangeSymbol,headSymbol,averageMaturity
count,3665.0,5586.0,5576.0,5534.0,5559.0,2223.0,3727.0,0.0,5536.0,1274.0,...,40.0,3532.0,4031.0,3.0,958.0,5579.0,0.0,0.0,0.0,2.0
mean,15084.94,1.0,100.557457,99.787922,79.516873,0.443861,0.544138,,101.008485,64.387684,...,0.0,0.288163,4945351.0,425604000.0,0.085052,100.12282,,,,37.5
std,59062.62,0.0,2817.200412,2827.784592,1654.428089,7.870622,6.741663,,2827.355384,56.737929,...,0.0,13.886621,10174200.0,405103400.0,0.155193,2816.348353,,,,6.363961
min,1.0,1.0,0.0,0.0,0.000295,0.0,0.0,,0.0,2.96,...,0.0,0.0,0.0,78090000.0,-0.8478,0.0,,,,33.0
25%,314.0,1.0,13.485,13.35,12.274445,0.008105,0.0,,13.7,29.2,...,0.0,0.0164,485703.0,203139500.0,0.03915,13.5,,,,35.25
50%,1925.0,1.0,31.335,30.825,28.307384,0.020204,0.0,,31.6275,48.575,...,0.0,0.0333,1862540.0,328189000.0,0.09855,31.235,,,,37.5
75%,9000.0,1.0,67.615,66.505,60.347603,0.03943,0.38535,,67.68125,78.24,...,0.0,0.0718,5151061.0,599361000.0,0.157225,66.99,,,,39.75
max,2300000.0,1.0,210000.0,210000.0,122805.76,320.41013,399.84012,,210000.0,497.73,...,0.0,825.3376,174798900.0,870533000.0,0.8842,210000.0,,,,42.0


In [5]:
# List all the sectors
df['sector'].unique()

array(['Basic Materials', 'Consumer Cyclical', 'Healthcare',
       'Financial Services', 'Technology', 'Communication Services',
       'Utilities', 'Industrials', 'Consumer Defensive', 'Energy',
       'Real Estate', '', 'Financial', nan], dtype=object)

In [6]:
df['longBusinessSummary']

SHW         The Sherwin-Williams Company develops, manufac...
AMZN        Amazon.com, Inc. engages in the retail sale of...
AXDX        Accelerate Diagnostics, Inc., an in vitro diag...
NaN         Nuveen New York Quality Municipal Income Fund ...
HD          The Home Depot, Inc. operates as a home improv...
                                  ...                        
SBE/WS                                                    NaN
TDW/WS/A                                                  NaN
TDW/WS/B                                                  NaN
TUQ1                                                      NaN
VST/WS/A                                                  NaN
Name: longBusinessSummary, Length: 7145, dtype: object

In [7]:
len(df['longBusinessSummary'].unique())

5431

In [8]:
df['longBusinessSummary'].to_excel('../data/business_summaries.xlsx')

In [9]:
def match_func(text):
    text = str(text)
    m = re.search(r'was founded in (\d{4}).*|was incorporated in (\d{4}).*', text)
    if m:
        year = m.group(1) if m.group(1) else m.group(2)
        return year
    m = re.search(r'was formed on [A-Za-z]+ \d+, (\d{4}).*', text)
    if m:
        return m.group(1)
    return None

# Many texts contain this exact string:
# The company was founded in XXXX and is headquartered in YYYY, ZZZZ.
df['year_of_foundation']=df['longBusinessSummary'].apply(match_func)
df.head(20)

Unnamed: 0,zip,sector,fullTimeEmployees,longBusinessSummary,city,phone,state,country,companyOfficers,website,...,address2,fax,underlyingSymbol,underlyingExchangeSymbol,headSymbol,uuid,err,address3,averageMaturity,year_of_foundation
SHW,44115-1075,Basic Materials,61031.0,"The Sherwin-Williams Company develops, manufac...",Cleveland,216 566 2000,OH,United States,[],http://www.sherwin-williams.com,...,,,,,,,,,,1866
AMZN,98109-5210,Consumer Cyclical,1271000.0,"Amazon.com, Inc. engages in the retail sale of...",Seattle,206-266-1000,WA,United States,[],http://www.amazon.com,...,,,,,,,,,,1994
AXDX,85714,Healthcare,224.0,"Accelerate Diagnostics, Inc., an in vitro diag...",Tucson,520 365 3100,AZ,United States,[],http://acceleratediagnostics.com,...,Suite 470 4th Floor,,,,,,,,,1982
,60606,Financial Services,,Nuveen New York Quality Municipal Income Fund ...,Chicago,312-917-7700,IL,United States,[],http://www.nuveen.com/CEF/Product/Overview.asp...,...,,312-917-6912,,,,,,,,1999
HD,30339,Consumer Cyclical,504800.0,"The Home Depot, Inc. operates as a home improv...",Atlanta,770-433-8211,GA,United States,[],http://www.homedepot.com,...,,,,,,,,,,1978
PYPL,95131,Financial Services,26500.0,"PayPal Holdings, Inc. operates as a technology...",San Jose,408-967-1000,CA,United States,[],http://www.paypal.com,...,,,,,,,,,,1998
AAPL,95014,Technology,100000.0,"Apple Inc. designs, manufactures, and markets ...",Cupertino,408-996-1010,CA,United States,[],http://www.apple.com,...,,,,,,,,,,1977
MSFT,98052-6399,Technology,163000.0,"Microsoft Corporation develops, licenses, and ...",Redmond,425 882 8080,WA,United States,[],http://www.microsoft.com,...,,,,,,,,,,1975
TJX,01701,Consumer Cyclical,320000.0,"The TJX Companies, Inc., together with its sub...",Framingham,508 390 1000,MA,United States,[],http://www.tjx.com,...,,,,,,,,,,1956
DIS,91521,Communication Services,175000.0,"The Walt Disney Company, together with its sub...",Burbank,818-560-1000,CA,United States,[],http://www.thewaltdisneycompany.com,...,,,,,,,,,,1923


In [10]:
df['year_of_foundation'].isna().sum()

3379

In [11]:
df.to_excel('../data/investees_info.xlsx')