In [1]:
import pandas as pd
import re
from bs4 import BeautifulSoup
import subprocess
from os import listdir
from os.path import isfile, join
import nltk
import warnings
warnings.filterwarnings('ignore')

#### This notebook extracts the two variables below from 10-K reports.

1. V6 = Changes of management (time of the changes) – 10-K / 10-Q report

2. V9 = Merger and acquisition – 10-K / 10-Q report

In [2]:
# SEC Central Index Key of the company whose filings are analysed.
CIK = '1373835'

# Folder that holds one sub-folder per CIK; the 10-K filings of a company
# live under "<root><CIK>/10-K/".
FILINGS_ROOT = '/pylon5/tr5pi7p/suli2020/rebranding/10K/'
directory = FILINGS_ROOT + CIK + '/10-K/'
directory

'/pylon5/tr5pi7p/suli2020/rebranding/10K/1373835/10-K/'

In [3]:
#%cd /pylon5/tr5pi7p/suli2020/rebranding/10K/1373835/10-K

In [4]:
def read_file(filename, base_dir=None):
    """
    Read one file and return its whole content.

    input
        - filename: name of the file, relative to the base directory
        - base_dir: directory that contains the file; when omitted, the
          notebook-level `directory` variable is used
    output: text [string]
    """
    if base_dir is None:
        # Fall back to the notebook-wide filings folder.
        base_dir = directory
    # `with` guarantees the handle is closed even if reading fails; `join`
    # works whether or not the base directory ends with a path separator.
    with open(join(base_dir, filename), "r") as f:
        return f.read()

def read_directory(directory):
    """
    Read every regular file directly under a directory.

    input: directory
    output: text from all files under the directory [list], in the order
            returned by listdir; sub-directories are skipped
    """
    texts = []
    for name in listdir(directory):
        # Build the path from the *argument*, so the files that are listed
        # are the files that are read.
        path = join(directory, name)
        if isfile(path):
            with open(path, "r") as f:
                texts.append(f.read())
    return texts

def find_word_freq(text, word):
    """
    Count the non-overlapping matches of `word` in `text`.

    input
        - text: text to search
        - word: the word we are interested in; it is handed to the `re`
          module as a pattern, so regex metacharacters keep their meaning
    output: number of times [int]
    """
    occurrences = 0
    for _ in re.finditer(word, text):
        occurrences += 1
    return occurrences

def find_word_sentence(text, word):
    """
    Collect the sentences of `text` that contain `word`.

    input
        - text: text to search; it is cut into sentences at every "."
        - word: the word we are interested in (plain substring test)
    output: the matching sentences, stripped of surrounding whitespace [list]
    """
    return [chunk.strip() for chunk in text.split(".") if word in chunk]

def find_table(soup, word):
    """
    summary: pick the management table out of a parsed filing
    parameters:
        soup [beautiful soup object]
        word [string] -- accepted but not used; the search phrase is fixed
    return:
        the first <table> whose text contains "Chief Executive Officer"
        [beautiful soup tag], or None when no table qualifies
    """
    candidates = (t for t in soup.find_all("table") if "Chief Executive Officer" in t.text)
    return next(candidates, None)

def _normalize_cells(cells):
    """Trim every cell, then drop non-breaking spaces and newlines inside it."""
    cleaned = [c.strip().strip("\xa0") for c in cells]
    cleaned = [re.sub(r'\xa0', '', c) for c in cleaned]
    return [re.sub(r'\n', '', c) for c in cleaned]


def _record_officer(dic, words):
    """Append words[0] (person name) to the list stored under words[2] (position)."""
    dic.setdefault(words[2], []).append(words[0])


def parse_name_from_table(table):
    """
    summary: main function for turning a management table into a
             position -> names mapping
    parameters:
        - table: beautiful soup tag of the table, or None when no table
          was found
    return:
        - a dictionary of position and person name if extract successful
        - "unable to extract" if unsuccessful (including when table is None)
    """
    # find_table yields None when no table matched; report that the same way
    # as any other failed extraction instead of raising AttributeError.
    if table is None:
        return "unable to extract"

    dic = {}
    rows = table.find_all('tr')

    # Pass 1: treat each row as "<text><digits><text>" after removing
    # parenthesised remarks; the text before the digits is stored under the
    # text after them (presumably name / age / position -- verify on data).
    for r in rows:
        text = re.sub(r'\([^)]*\)', '', r.text)
        match = re.match(r"([^0-9]+)([0-9]+)([^0-9]+)", text, re.I)
        if match:
            _record_officer(dic, _normalize_cells(match.groups()))
    if len(dic) > 2:
        return dic

    # Pass 2: entries found by pass 1 stay in dic and are added to.
    if len(table.find_all('td')) != 0:
        # Cell-based layout: keep rows that have exactly three non-empty cells.
        for row in rows:
            cells = [td.text.strip() for td in row.find_all('td')]
            cells = [c for c in cells if c]
            if len(cells) == 3:
                _record_officer(dic, _normalize_cells(cells))
    else:
        # No <td> cells: read the raw markup line by line, skip lines that
        # start with a tag, and split columns on runs of two spaces.
        for line in str(table).split('\n'):
            if not line.startswith("<"):
                cells = _normalize_cells([w for w in line.split("  ") if w])
                if len(cells) == 4:
                    _record_officer(dic, cells)
    if len(dic) > 3:
        return dic
    return "unable to extract"

def merger_acquisition(text):
    """
    Count the merger / acquisition related words in a text.

    input: text [string]
    output: number of whitespace-separated tokens of the lower-cased text
            whose Snowball (English) stem equals the stem of "merger",
            "merge", "acquire" or "acquisition" [int]

    Tokens come from a plain split on whitespace, so punctuation attached
    to a word stays part of the token that gets stemmed.
    """
    stemmer = nltk.stem.SnowballStemmer('english')
    target_stems = {stemmer.stem(term) for term in ("merger", "merge", "acquire", "acquisition")}
    return sum(1 for token in text.lower().split() if stemmer.stem(token) in target_stems)

In [5]:
# Raw text of every filing stored under `directory`, one string per file.
texts = read_directory(directory=directory)

In [6]:
# V9 = Merger and acquisition
# One count per filing, in the same order as `texts`.
merge_freq = [merger_acquisition(filing_text) for filing_text in texts]

In [7]:
# V6 = Changes of management (time of the changes)

# Name the parser explicitly: without it Beautiful Soup picks whichever
# parser happens to be installed, so the same filing can parse differently
# on another machine. "html.parser" ships with Python; switch to "lxml" if
# it is installed and speed matters.
HTML_PARSER = "html.parser"

# One entry per filing, in the same order as `texts`: either a
# position -> names dictionary or the string "unable to extract".
management = []
for text in texts:
    soup = BeautifulSoup(text, HTML_PARSER)
    selected_table = find_table(soup, "table")
    if selected_table is None:
        # No table mentions a Chief Executive Officer in this filing; record
        # the failure instead of passing None on to the table parser.
        management.append("unable to extract")
        continue
    management.append(parse_name_from_table(selected_table))

In [8]:
# Map each filing year to the management roster extracted from that filing.

# List regular files only, exactly as read_directory does, so that position i
# here refers to the same filing as management[i].
allFile = [f for f in listdir(directory) if isfile(join(directory, f))]
if len(allFile) != len(management):
    raise ValueError(
        "found %d filings but %d management entries; re-run the cells above"
        % (len(allFile), len(management))
    )

# Two-digit years at or above the pivot are read as 19YY, all others as 20YY.
# The previous rule sent every year starting with 2-9 to the 1900s, which
# turned a 2020 filing into 1920.
CENTURY_PIVOT = 90

yearToMgmt = {}
for fileName, roster in zip(allFile, management):
    # assumes EDGAR-style names "<filer id>-<YY>-<sequence>..." -- TODO confirm
    yy = int(fileName.split('-')[1])
    year = 1900 + yy if yy >= CENTURY_PIVOT else 2000 + yy
    # When two filings share a year, the one listed later wins.
    yearToMgmt[year] = roster
yearToMgmt = dict(sorted(yearToMgmt.items()))

In [9]:
# Build the final Year / CIK / V6 / V9 table.

# A filing is flagged as merger/acquisition related (V9 = 1) when its stem
# count exceeds this threshold.
MERGER_COUNT_THRESHOLD = 500
OUTPUT_COLUMNS = ['Year', 'CIK', 'V6', 'V9']

years = list(yearToMgmt.keys())
records = []
for i, year in enumerate(years):
    # NOTE(review): merge_freq is ordered like the file listing, while `years`
    # is sorted; this lookup is only right if the listing is already in year
    # order with one filing per year -- confirm, or key merge_freq by year.
    merged = 1 if merge_freq[i] > MERGER_COUNT_THRESHOLD else 0

    if i == 0:
        # Nothing to compare the first year against.
        changed = 0
    else:
        prevMgmt = yearToMgmt[years[i - 1]]
        currMgmt = yearToMgmt[year]
        if isinstance(prevMgmt, dict) and isinstance(currMgmt, dict):
            # Compare as position -> set of names, so order and duplicates
            # are ignored and a position that exists in only one of the two
            # years (in either direction) counts as a change.
            prev_roster = {position: set(names) for position, names in prevMgmt.items()}
            curr_roster = {position: set(names) for position, names in currMgmt.items()}
            changed = 0 if prev_roster == curr_roster else 1
        else:
            # At least one roster is "unable to extract": the change cannot
            # be determined, so leave V6 missing rather than guess.
            changed = None
    records.append([year, CIK, changed, merged])

# Build the frame once; DataFrame.append no longer exists in pandas >= 2.0.
csv = pd.DataFrame(records, columns=OUTPUT_COLUMNS)
csv
        

Unnamed: 0,Year,CIK,V6,V9
0,2007,1373835,0,0
0,2008,1373835,1,0
0,2009,1373835,1,0
0,2010,1373835,0,0
0,2011,1373835,1,0
0,2012,1373835,1,0
0,2013,1373835,0,0
0,2014,1373835,1,0
0,2015,1373835,0,0
0,2016,1373835,0,0
