In [1]:
import os
import re
from glob import glob

import numpy as np
import pandas as pd

# Root folder of the SEC 10-K datasets, located under the current user's home
# directory. expanduser("~") is used instead of os.environ["HOME"] so the
# notebook does not fail with a KeyError where HOME is not defined
# (e.g. Windows); when HOME is set the resulting path is the same.
DATA_DIR = os.path.join(os.path.expanduser("~"), "Datasets",
                        "Financial", "SEC_10K")

# Sub-folder holding the CSV files this notebook reads and writes.
ML_DATA_DIR = os.path.join(DATA_DIR, "ML_Dataset")

# Read Data

In [2]:
# Load the ticker-profile dataset; CIK is read as text rather than parsed as a number.
sec10k = pd.read_csv(
    os.path.join(ML_DATA_DIR, "SEC_10K_CIK_TickerProfile.csv"),
    dtype={"CIK": "str"},
)

# Show the frame's dimensions first, then a preview of its leading rows.
display(sec10k.shape, sec10k.head())

(17095, 11)

Unnamed: 0,ID_CIK_YEAR,CIK,YEAR,CLEANED_ITEM_1A,CLEANED_ITEM_7,CLEANED_ITEM_7A,SP_500,SYMBOL,COMPANY,COUNTRY,INDUSTRY
0,ID_0000008858_2019,8858,2019,principal risks inherent business affected fac...,condition results operations dollars millions ...,risk exposure market risk includes fluctuating...,0,AVT,AVNET INC. COMMON STOCK,UNITED STATES,ELECTRONIC COMPONENTS
1,ID_0000067347_2021,67347,2021,risks inherent business risks uncertainties de...,condition results operations dollars millions ...,exposure market risk includes fluctuating inte...,0,MOD,MODINE MANUFACTURING COMPANY COMMON STOCK,UNITED STATES,AUTO PARTS:O.E.M.
2,ID_0000030697_2018,30697,2018,principal risks inherent business affected fac...,condition results operations dollars millions ...,risk exposure market risk includes fluctuating...,0,WEN,WENDY'S COMPANY (THE) COMMON STOCK,UNITED STATES,RESTAURANTS
3,ID_0000072444_2017,72444,2017,principal risks inherent business affected fac...,condition results operations dollars millions ...,risk dollars millions exposure market risk inc...,0,VXRT,VAXART INC COMMON STOCK,UNITED STATES,BIOTECHNOLOGY: BIOLOGICAL PRODUCTS (NO DIAGNOS...
4,ID_0000065596_2020,65596,2020,risks inherent businesswe affected factors adv...,condition results operations dollars millions ...,exposure market risk includes fluctuating inte...,0,SIEB,SIEBERT FINANCIAL CORP. COMMON STOCK,UNITED STATES,INVESTMENT BANKERS/BROKERS/SERVICE


In [4]:
# Tally how many rows carry each INDUSTRY label (most frequent first) and
# expose the result as a two-column table: INDUSTRY, FREQUENCY.
cat_df = (
    sec10k["INDUSTRY"]
    .value_counts()
    .rename_axis("INDUSTRY")
    .reset_index(name="FREQUENCY")
)
cat_df

Unnamed: 0,INDUSTRY,FREQUENCY
0,MAJOR BANKS,1130
1,BIOTECHNOLOGY: PHARMACEUTICAL PREPARATIONS,840
2,REAL ESTATE INVESTMENT TRUSTS,645
3,INDUSTRIAL MACHINERY/COMPONENTS,425
4,EDP SERVICES,420
...,...,...
142,PRECISION INSTRUMENTS,5
143,DURABLE GOODS,5
144,SERVICES-MISC. AMUSEMENT & RECREATION,5
145,DIVERSIFIED ELECTRONIC PRODUCTS,5


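Consolidate fine-grained INDUSTRY labels into broader categories (e.g. FINANCIAL, BIOTECHNOLOGY, UTILITIES). Note: removing UNSPECIFIED from Country and Industry, but only where SP_500 = 0, is not performed by the function below.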

In [5]:
# Ordered (regex pattern, replacement label) pairs used to consolidate
# fine-grained INDUSTRY values into broader categories.
# Order matters: each rule is matched against the column as left by the rules
# before it, so a value claimed by an earlier rule is no longer available to a
# later one. For example "REAL ESTATE INVESTMENT TRUSTS" contains INVESTMENT
# and therefore becomes FINANCIAL before the REAL ESTATE rule is evaluated.
# NOTE(review): confirm that REITs are meant to land in FINANCIAL.
INDUSTRY_CONSOLIDATION_RULES = (
    # Banks, Finance, Savings, Investment into a single category
    ("BANK|FINANCE|SAVINGS|INVESTMENT", "FINANCIAL"),
    # BIOTECHNOLOGY sub-industries into a single category
    ("BIOTECHNOLOGY", "BIOTECHNOLOGY"),
    # Power, Utilities, Water, Electricity into UTILITIES
    ("POWER|UTILITIES|WATER|ELECTRICITY", "UTILITIES"),
    # Oil, Oilfield, Coal into ENERGY
    ("OIL|OILFIELD|COAL", "ENERGY"),
    # Home, Homebuilding, Estate, Property into REAL ESTATE
    ("HOME|HOMEBUILDING|ESTATE|PROPERTY", "REAL ESTATE"),
    # Retail, Stores into RETAIL
    ("RETAIL|STORES", "RETAIL"),
    # Freight sub-industries into FREIGHT
    ("FREIGHT", "FREIGHT"),
    # Food, Beverage into FOOD AND BEVERAGE
    ("FOOD|BEVERAGE", "FOOD AND BEVERAGE"),
    # Railroads, Transportation into TRANSPORTATION
    ("RAILROADS|TRANSPORTATION", "TRANSPORTATION"),
)


def improve_data(input_df, rules=None, verbose=True):
    """Return a copy of ``input_df`` with INDUSTRY labels consolidated.

    Every row whose INDUSTRY value contains a rule's regex pattern
    (case-sensitive substring search) has that value replaced by the rule's
    label. Rules are applied in order. Rows with a missing INDUSTRY never
    match and are left untouched. No rows or columns are added or removed;
    only the INDUSTRY column's values change.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame containing a string-valued ``INDUSTRY`` column. It is not
        modified.
    rules : iterable of (str, str), optional
        Ordered ``(regex pattern, replacement label)`` pairs. Defaults to
        ``INDUSTRY_CONSOLIDATION_RULES``.
    verbose : bool, default True
        When true, display the frame's shape before and after the
        consolidation and its first 10 rows afterwards.

    Returns
    -------
    pandas.DataFrame
        The consolidated copy.

    Raises
    ------
    KeyError
        If ``input_df`` has no ``INDUSTRY`` column.
    """
    if "INDUSTRY" not in input_df.columns:
        raise KeyError("input_df must contain an 'INDUSTRY' column")

    active_rules = INDUSTRY_CONSOLIDATION_RULES if rules is None else rules

    # Work on a copy so the caller's frame stays intact.
    sec10k_df = input_df.copy()
    if verbose:
        display(sec10k_df.shape)

    for pattern, label in active_rules:
        # na=False makes rows with a missing INDUSTRY an explicit non-match.
        matches = sec10k_df["INDUSTRY"].str.contains(pattern, na=False)
        sec10k_df["INDUSTRY"] = np.where(matches, label, sec10k_df["INDUSTRY"])

    if verbose:
        display(sec10k_df.shape)
        display(sec10k_df.head(10))
    return sec10k_df


# Improve Main Data

In [6]:
# Write to CSV
# Consolidate the INDUSTRY labels of the main dataset, then save the result
# as CSV without writing the row index.
sec10k_impv = improve_data(sec10k)
sec10k_impv.to_csv(
    os.path.join(ML_DATA_DIR, "SEC_10K_CIK_TickerProfile_Improved.csv"),
    index=False,
)

(17095, 11)

(17095, 11)

(17095, 11)

(17095, 11)

(17095, 11)

(17095, 11)

(17095, 11)

(17095, 11)

(17095, 11)

Unnamed: 0,ID_CIK_YEAR,CIK,YEAR,CLEANED_ITEM_1A,CLEANED_ITEM_7,CLEANED_ITEM_7A,SP_500,SYMBOL,COMPANY,COUNTRY,INDUSTRY
0,ID_0000008858_2019,8858,2019,principal risks inherent business affected fac...,condition results operations dollars millions ...,risk exposure market risk includes fluctuating...,0,AVT,AVNET INC. COMMON STOCK,UNITED STATES,ELECTRONIC COMPONENTS
1,ID_0000067347_2021,67347,2021,risks inherent business risks uncertainties de...,condition results operations dollars millions ...,exposure market risk includes fluctuating inte...,0,MOD,MODINE MANUFACTURING COMPANY COMMON STOCK,UNITED STATES,AUTO PARTS:O.E.M.
2,ID_0000030697_2018,30697,2018,principal risks inherent business affected fac...,condition results operations dollars millions ...,risk exposure market risk includes fluctuating...,0,WEN,WENDY'S COMPANY (THE) COMMON STOCK,UNITED STATES,RESTAURANTS
3,ID_0000072444_2017,72444,2017,principal risks inherent business affected fac...,condition results operations dollars millions ...,risk dollars millions exposure market risk inc...,0,VXRT,VAXART INC COMMON STOCK,UNITED STATES,BIOTECHNOLOGY
4,ID_0000065596_2020,65596,2020,risks inherent businesswe affected factors adv...,condition results operations dollars millions ...,exposure market risk includes fluctuating inte...,0,SIEB,SIEBERT FINANCIAL CORP. COMMON STOCK,UNITED STATES,FINANCIAL
5,ID_0000072162_2020,72162,2020,risks inherent businesswe affected factors adv...,condition results operations dollars millions ...,exposure market risk includes fluctuating inte...,0,NL,NL INDUSTRIES INC. COMMON STOCK,UNITED STATES,DIVERSIFIED COMMERCIAL SERVICES
6,ID_0000069488_2020,69488,2020,risks inherent businesswe affected factors adv...,condition results operations dollars millions ...,exposure market risk includes fluctuating inte...,0,MYE,MYERS INDUSTRIES INC. COMMON STOCK,UNITED STATES,AUTO PARTS:O.E.M.
7,ID_0000059440_2018,59440,2018,principal risks inherent business affected fac...,condition results operations dollars millions ...,risk exposure market risk includes fluctuating...,0,VGR,VECTOR GROUP LTD. COMMON STOCK,UNITED STATES,FARMING/SEEDS/MILLING
8,ID_0000033533_2020,33533,2020,risks inherent businesswe affected factors adv...,condition results operations dollars millions ...,exposure market risk includes fluctuating inte...,0,,,,
9,ID_0000005513_2021,5513,2021,risks inherent business risks uncertainties de...,condition results operations dollars millions ...,exposure market risk includes fluctuating inte...,0,UNM,UNUM GROUP COMMON STOCK,UNITED STATES,ACCIDENT &HEALTH INSURANCE


In [7]:
# Frequency of each consolidated INDUSTRY label; rows with a missing INDUSTRY
# are left out of the tally (the default, spelled out here).
sec10k_impv["INDUSTRY"].value_counts(dropna=True)

FINANCIAL                                         2475
BIOTECHNOLOGY                                     1285
ENERGY                                             570
REAL ESTATE                                        465
INDUSTRIAL MACHINERY/COMPONENTS                    425
                                                  ... 
EDP PERIPHERALS                                      5
SERVICES-MISC. AMUSEMENT & RECREATION                5
TOOLS/HARDWARE                                       5
MOTOR VEHICLES                                       5
COMPUTER SOFTWARE: PROGRAMMING DATA PROCESSING       5
Name: INDUSTRY, Length: 108, dtype: int64

In [8]:
# Number of rows per SP_500 flag value; missing flags are left out of the
# tally (the default, spelled out here).
sec10k_impv["SP_500"].value_counts(dropna=True)

0    14415
1     2680
Name: SP_500, dtype: int64

# Improve Candidates Dataset

In [None]:
# Load the candidates dataset; CIK is read as text rather than parsed as a number.
candidates = pd.read_csv(
    os.path.join(ML_DATA_DIR, "SEC_10K_CIK_TickerProfile_Candidates.csv"),
    dtype={"CIK": "str"},
)

# Apply the same INDUSTRY consolidation used for the main dataset.
candidates_impv = improve_data(candidates)

# Save the consolidated candidates as CSV without writing the row index.
candidates_impv.to_csv(
    os.path.join(ML_DATA_DIR, "SEC_10K_CIK_TickerProfile_Candidates_Improved.csv"),
    index=False,
)