In [1]:
import pandas as pd
import requests
import re
import time

In [2]:
# Base URL that the relative filing paths (SECFNAME column) are appended to.
URL_PREFIX = "https://www.sec.gov/Archives/"
# The three patterns below are written for use with re.IGNORECASE | re.DOTALL |
# re.MULTILINE, which is how the get_* functions apply them: "." must span
# newlines and "^" must match at the start of any line.
#
# MD&A: from an "item <digit>. management's discussion and analysis" heading,
# lazily, through the next line that starts with "item <digit>".
mda_re = r"item[^a-zA-Z\n]*\d\s*\.\s*management\'s discussion and analysis.*?^\s*item[^a-zA-Z\n]*\d\s*\.*"
# Quantitative and Qualitative Disclosures about Market Risk: the item number may
# carry one trailing letter (e.g. "7a"); the match ends at the next line-start "item <digit>".
qqd_re = r"item[^a-zA-Z\n]*\d[a-z]?\.?\s*Quantitative and Qualitative Disclosures about " \
        r"Market Risk.*?^\s*item\s*\d\s*" 
# Risk Factors: same shape as qqd_re with a different heading text.
rf_re = r"item[^a-zA-Z\n]*\d[a-z]?\.?\s*Risk Factors.*?^\s*item\s*\d\s*"

In [3]:
# Load the list of filings; the SECFNAME column holds each filing's path
# relative to URL_PREFIX.
df = pd.read_excel('cik_list.xlsx')
# Preview the first rows only.
df.head()

Unnamed: 0,CIK,CONAME,FYRMO,FDATE,FORM,SECFNAME
0,3662,SUNBEAM CORP/FL/,199803,1998-03-06,10-K405,edgar/data/3662/0000950170-98-000413.txt
1,3662,SUNBEAM CORP/FL/,199805,1998-05-15,10-Q,edgar/data/3662/0000950170-98-001001.txt
2,3662,SUNBEAM CORP/FL/,199808,1998-08-13,NT 10-Q,edgar/data/3662/0000950172-98-000783.txt
3,3662,SUNBEAM CORP/FL/,199811,1998-11-12,10-K/A,edgar/data/3662/0000950170-98-002145.txt
4,3662,SUNBEAM CORP/FL/,199811,1998-11-16,NT 10-Q,edgar/data/3662/0000950172-98-001203.txt


In [4]:
# NOTE(review): this pauses once when the cell runs, before any request is
# sent; it does not space out the per-row downloads triggered by .apply below.
time.sleep(5)


def get_mda(url, timeout=30):
    """Download one filing and extract its MD&A section(s) as lower-cased text.

    Parameters
    ----------
    url : str
        Filing path relative to ``URL_PREFIX`` (a ``SECFNAME`` value).
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the whole ``.apply``.

    Returns
    -------
    str or int
        Every ``mda_re`` match concatenated into one string, or ``0`` when the
        filing contains no match.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status. Without this check an
        error page would be searched like a filing and recorded as ``0``.
    """
    # URL_PREFIX already ends with "/"; normalise both sides so the request
    # path does not contain "//".
    full_url = f"{URL_PREFIX.rstrip('/')}/{str(url).lstrip('/')}"
    response = requests.get(full_url, timeout=timeout)
    response.raise_for_status()

    source = response.text.lower()
    mda = re.findall(mda_re, source, re.IGNORECASE | re.DOTALL | re.MULTILINE)
    if mda:
        return ''.join(mda)
    return 0


df['mda_text'] = df.SECFNAME.apply(get_mda)

In [5]:
def get_qqd(url, timeout=30):
    """Download one filing and extract its market-risk disclosure section(s).

    Parameters
    ----------
    url : str
        Filing path relative to ``URL_PREFIX`` (a ``SECFNAME`` value).
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the whole ``.apply``.

    Returns
    -------
    str or int
        Every ``qqd_re`` match concatenated into one string (original letter
        case is kept), or ``0`` when the filing contains no match.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status. Without this check an
        error page would be searched like a filing and recorded as ``0``.
    """
    # URL_PREFIX already ends with "/"; normalise both sides so the request
    # path does not contain "//".
    full_url = f"{URL_PREFIX.rstrip('/')}/{str(url).lstrip('/')}"
    response = requests.get(full_url, timeout=timeout)
    response.raise_for_status()

    source = response.text
    qqd = re.findall(qqd_re, source, re.IGNORECASE | re.DOTALL | re.MULTILINE)
    if qqd:
        return ''.join(qqd)
    return 0


df['qqd_text'] = df.SECFNAME.apply(get_qqd)

In [6]:
def get_rf(url, timeout=30):
    """Download one filing and extract its Risk Factors section(s).

    Parameters
    ----------
    url : str
        Filing path relative to ``URL_PREFIX`` (a ``SECFNAME`` value).
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the whole ``.apply``.

    Returns
    -------
    str or int
        Every ``rf_re`` match concatenated into one string (original letter
        case is kept), or ``0`` when the filing contains no match.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status. Without this check an
        error page would be searched like a filing and recorded as ``0``.
    """
    # URL_PREFIX already ends with "/"; normalise both sides so the request
    # path does not contain "//".
    full_url = f"{URL_PREFIX.rstrip('/')}/{str(url).lstrip('/')}"
    response = requests.get(full_url, timeout=timeout)
    response.raise_for_status()

    source = response.text
    rf = re.findall(rf_re, source, re.IGNORECASE | re.DOTALL | re.MULTILINE)
    if rf:
        return ''.join(rf)
    return 0


df['rf_text'] = df.SECFNAME.apply(get_rf)

In [7]:
# Raise pandas' row display limit so the bare `df` below is not truncated.
pd.set_option('display.max_rows', 152)
# Show the frame with the three extracted text columns.
df

Unnamed: 0,CIK,CONAME,FYRMO,FDATE,FORM,SECFNAME,mda_text,qqd_text,rf_text
0,3662,SUNBEAM CORP/FL/,199803,1998-03-06,10-K405,edgar/data/3662/0000950170-98-000413.txt,0,0,0
1,3662,SUNBEAM CORP/FL/,199805,1998-05-15,10-Q,edgar/data/3662/0000950170-98-001001.txt,item 2. management's discussion and analysis ...,0,0
2,3662,SUNBEAM CORP/FL/,199808,1998-08-13,NT 10-Q,edgar/data/3662/0000950172-98-000783.txt,0,0,0
3,3662,SUNBEAM CORP/FL/,199811,1998-11-12,10-K/A,edgar/data/3662/0000950170-98-002145.txt,item 7. management's discussion and analysis ...,0,0
4,3662,SUNBEAM CORP/FL/,199811,1998-11-16,NT 10-Q,edgar/data/3662/0000950172-98-001203.txt,0,0,0
5,3662,SUNBEAM CORP/FL/,199811,1998-11-25,10-Q/A,edgar/data/3662/0000950170-98-002278.txt,item 2. management's discussion and analysis ...,0,0
6,3662,SUNBEAM CORP/FL/,199812,1998-12-22,10-Q,edgar/data/3662/0000950170-98-002401.txt,0,0,0
7,3662,SUNBEAM CORP/FL/,199812,1998-12-22,10-Q,edgar/data/3662/0000950170-98-002402.txt,0,0,0
8,3662,SUNBEAM CORP/FL/,199903,1999-03-31,NT 10-K,edgar/data/3662/0000950172-99-000362.txt,0,0,0
9,3662,SUNBEAM CORP/FL/,199905,1999-05-11,10-K,edgar/data/3662/0000950170-99-000775.txt,item 7. management's discussion and analysis ...,0,0


# Section 1.1: Positive score, negative score, polarity score

In [8]:
# Load the stop-word list: one word per line, compared in lower case.
stopwordsfile = 'stopwords.txt'
with open(stopwordsfile, 'r') as stop_words:
    stopwords = stop_words.read().lower()
# Drop blank entries (such as the empty string left by a trailing newline)
# instead of blindly removing the last element, which would discard a real
# stop word whenever the file does not end with a newline.
stopwordlist = [word for word in (line.strip() for line in stopwords.split('\n')) if word]

In [9]:
from nltk.tokenize import RegexpTokenizer, sent_tokenize

def tokenizer(text):
    """Split ``text`` into lower-case word tokens with stop words removed.

    Parameters
    ----------
    text : object
        Anything convertible with ``str()``; it is lower-cased before
        tokenizing.

    Returns
    -------
    list of str
        Runs of word characters, in document order, excluding every token
        found in ``stopwordlist``.
    """
    lowered = str(text).lower()
    # Named differently from the enclosing function so the local does not
    # shadow it.
    word_tokenizer = RegexpTokenizer(r'\w+')
    # A set gives O(1) membership tests; scanning the list for every token
    # made this O(tokens * stop words) on long filings.
    stop_set = set(stopwordlist)
    return [token for token in word_tokenizer.tokenize(lowered) if token not in stop_set]

In [10]:
# Load the positive-word dictionary: the file is read whole, lower-cased and
# split on newlines, so each line becomes one entry of positivewordlist.
# A file ending in a newline leaves a trailing '' entry; tokenizer() only
# emits non-empty tokens, so that entry can never match.
posfile = 'positive-words.txt'
with open(posfile,'r') as poswordfile:
    positivewords=poswordfile.read().lower()
positivewordlist=positivewords.split('\n')

In [12]:
# Calculating positive score 
def positive_score(text):
    """Return how many tokens of ``text`` appear in ``positivewordlist``.

    ``text`` is tokenized with ``tokenizer`` (lower-cased, stop words removed);
    repeated words are counted every time they occur.
    """
    return sum(1 for token in tokenizer(text) if token in positivewordlist)

In [13]:
# Load the negative-word dictionary: the file is read whole, lower-cased and
# split on newlines, so each line becomes one entry of negativewordlist.
# A file ending in a newline leaves a trailing '' entry; tokenizer() only
# emits non-empty tokens, so that entry can never match.
negFile = 'negative-words.txt'
with open(negFile ,'r') as negwordfile:
    negativeword = negwordfile.read().lower()
negativewordlist = negativeword.split('\n')

In [14]:
# Calculating Negative score
def negative_score(text):
    """Return how many tokens of ``text`` appear in ``negativewordlist``.

    The result is a non-negative count: every occurrence of a negative word
    adds one.
    """
    hits = [token for token in tokenizer(text) if token in negativewordlist]
    return len(hits)

In [15]:
def polarity_score(positivescore, negativescore):
    """Return (positive - negative) / (positive + negative + 0.000001).

    The small constant in the denominator keeps the division defined when both
    scores are zero.
    """
    difference = positivescore - negativescore
    total = positivescore + negativescore
    return difference / (total + 0.000001)

# Section 2: Average Sentence Length, percentage of complex words, fog index

In [16]:
# Calculating Average sentence length 

def average_sentence_length(text):
    """Return the rounded number of tokens per sentence in ``text``.

    Sentences come from ``sent_tokenize`` and words from ``tokenizer`` (so stop
    words are not counted), both applied to the lower-cased text. Returns 0
    when no sentence is found.
    """
    lowered = str(text).lower()
    sentence_count = len(sent_tokenize(lowered))
    word_count = len(tokenizer(lowered))

    if sentence_count == 0:
        return 0
    return round(word_count / sentence_count)

In [17]:
# Calculating percentage of complex words

def percentage_complex_word(text):
    """Return the share of tokens in ``text`` that count as complex words.

    A token is complex when it contains more than two vowels (a, e, i, o, u);
    tokens ending in "es" or "ed" are never counted as complex. The result is
    a fraction of all tokens, or 0 when there are no tokens.
    """
    words = tokenizer(text)
    if len(words) == 0:
        return 0

    complex_total = sum(
        1
        for word in words
        if not word.endswith(('es', 'ed'))
        and sum(1 for letter in word if letter in 'aeiou') > 2
    )
    return complex_total / len(words)

In [18]:
# calculating Fog Index 

def fog_index(averagesentencelength, percentagecomplexword):
    """Return 0.4 times the sum of the two arguments.

    ``percentagecomplexword`` is added exactly as passed; no rescaling is
    applied here.
    """
    combined = averagesentencelength + percentagecomplexword
    return 0.4 * combined

# Section 3: Complex word count


In [19]:
# Counting complex words

def complex_word_count(text):
    """Return the number of complex tokens in ``text``.

    A token is complex when it contains more than two vowels (a, e, i, o, u);
    tokens ending in "es" or "ed" are skipped entirely.
    """
    vowel_set = set('aeiou')
    total = 0
    for token in tokenizer(text):
        if token.endswith(('es', 'ed')):
            continue
        vowel_count = len([char for char in token if char in vowel_set])
        if vowel_count > 2:
            total += 1
    return total

In [20]:
#Counting total words

def total_word_count(text):
    """Return the number of tokens ``tokenizer`` produces for ``text``."""
    return len(tokenizer(text))

In [21]:
# Load the uncertainty dictionary used by uncertainty_score: the file is read
# whole, lower-cased and split on newlines, one entry per line.
# A file ending in a newline leaves a trailing '' entry; tokenizer() only
# emits non-empty tokens, so that entry can never match.
uncertainty_dictionaryFile = 'uncertainty_dictionary.txt'
with open(uncertainty_dictionaryFile ,'r') as uncertain_dict:
    uncertainDict = uncertain_dict.read().lower()
uncertainDictionary = uncertainDict.split('\n')

def uncertainty_score(text):
    """Return how many tokens of ``text`` appear in ``uncertainDictionary``."""
    matches = 0
    for token in tokenizer(text):
        matches += 1 if token in uncertainDictionary else 0
    return matches

In [22]:
# Load the constraining dictionary used by the constraining-word counters: the
# file is read whole, lower-cased and split on newlines, one entry per line.
# A file ending in a newline leaves a trailing '' entry; tokenizer() only
# emits non-empty tokens, so that entry can never match.
constraining_dictionaryFile = 'constraining_dictionary.txt'
with open(constraining_dictionaryFile ,'r') as constraining_dict:
    constrainDict = constraining_dict.read().lower()
constrainDictionary = constrainDict.split('\n')

def constraining_score(text):
    """Return how many tokens of ``text`` appear in ``constrainDictionary``."""
    constraining_tokens = filter(lambda token: token in constrainDictionary, tokenizer(text))
    return len(list(constraining_tokens))

# Additional Variables: positive/negative and uncertainty/constraining word proportion 


In [23]:
# Calculating positive word proportion

def positive_word_prop(positivescore, wordcount):
    """Return ``positivescore / wordcount``, or 0 when ``wordcount`` is 0."""
    if wordcount == 0:
        return 0
    return positivescore / wordcount

In [24]:
# Calculating negative word proportion

def negative_word_prop(negativescore, wordcount):
    """Return ``negativescore / wordcount``, or 0 when ``wordcount`` is 0."""
    return negativescore / wordcount if wordcount != 0 else 0

In [25]:
# Calculating uncertain word proportion

def uncertain_word_prop(uncertainscore, wordcount):
    """Return ``uncertainscore / wordcount``, or 0 when ``wordcount`` is 0."""
    has_words = wordcount != 0
    if has_words:
        return uncertainscore / wordcount
    return 0

In [26]:
# Calculating constraining word proportion

def constraining_word_prop(constrainingscore, wordcount):
    """Return ``constrainingscore / wordcount``, or 0 when ``wordcount`` is 0."""
    if wordcount != 0:
        proportion = constrainingscore / wordcount
    else:
        proportion = 0
    return proportion

# Additional Variable: Constraining words for whole report

In [27]:
# Calculating constraining words for the whole report.
# The get_* extractors store 0 when a section is missing; cast the three text
# columns to str so they can be combined as text below.
df[["mda_text", "qqd_text", "rf_text"]] = df[["mda_text", "qqd_text", "rf_text"]].astype(str) 
def constrain_word_whole(mda_text,qqd_text,rf_text):
    """Count constraining-dictionary tokens across all three report sections.

    Parameters
    ----------
    mda_text, qqd_text, rf_text : str
        The extracted section texts for one filing.

    Returns
    -------
    int
        Number of tokens of the combined text found in ``constrainDictionary``.
    """
    # Join with a space: plain concatenation fused the last token of one
    # section with the first token of the next, so a dictionary word sitting
    # on a section boundary was tokenized as part of a longer non-word and
    # missed.
    whole_doc = ' '.join((str(mda_text), str(qqd_text), str(rf_text)))
    return sum(1 for token in tokenizer(whole_doc) if token in constrainDictionary)

In [28]:
import numpy as np
df = pd.DataFrame(df)


def _add_section_metrics(frame, text_col, prefix):
    """Add the 14 text-metric columns for one report section to ``frame``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame to extend in place.
    text_col : str
        Name of the column holding the section text.
    prefix : str
        Prefix for the new column names (e.g. ``'mda'``).

    Columns are created in a fixed order: positive/negative/polarity scores,
    average sentence length, complex-word percentage, fog index, complex-word
    count, word count, uncertainty/constraining scores, then the four word
    proportions.
    """
    text = frame[text_col]

    frame[f'{prefix}_positive_score'] = text.apply(positive_score)
    frame[f'{prefix}_negative_score'] = text.apply(negative_score)
    # otypes=[float] pins the result dtype. Left to itself np.vectorize infers
    # the dtype from the first row's result, so a first row that yields an int
    # (the *_word_prop helpers return int 0 for empty text) would truncate the
    # whole column to integers; it also lets an empty frame pass through.
    frame[f'{prefix}_polarity_score'] = np.vectorize(polarity_score, otypes=[float])(
        frame[f'{prefix}_positive_score'], frame[f'{prefix}_negative_score'])
    frame[f'{prefix}_average_sentence_length'] = text.apply(average_sentence_length)
    frame[f'{prefix}_percentage_of_complex_words'] = text.apply(percentage_complex_word)
    frame[f'{prefix}_fog_index'] = np.vectorize(fog_index, otypes=[float])(
        frame[f'{prefix}_average_sentence_length'], frame[f'{prefix}_percentage_of_complex_words'])
    frame[f'{prefix}_complex_word_count'] = text.apply(complex_word_count)
    frame[f'{prefix}_word_count'] = text.apply(total_word_count)
    frame[f'{prefix}_uncertainty_score'] = text.apply(uncertainty_score)
    frame[f'{prefix}_constraining_score'] = text.apply(constraining_score)

    # Each proportion divides the matching "<kind>_score" column by the word count.
    proportion_funcs = (
        ('positive', positive_word_prop),
        ('negative', negative_word_prop),
        ('uncertainty', uncertain_word_prop),
        ('constraining', constraining_word_prop),
    )
    for kind, prop_func in proportion_funcs:
        frame[f'{prefix}_{kind}_word_proportion'] = np.vectorize(prop_func, otypes=[float])(
            frame[f'{prefix}_{kind}_score'], frame[f'{prefix}_word_count'])


_add_section_metrics(df, 'mda_text', 'mda')

In [29]:
# Compute the same 14 metrics for the market-risk ("qqdmr") and risk-factor
# ("rf") sections; the loop replaces two copy-pasted runs of assignments and
# creates the columns in the same order.
for text_col, prefix in (('qqd_text', 'qqdmr'), ('rf_text', 'rf')):
    section_text = df[text_col]

    df[f'{prefix}_positive_score'] = section_text.apply(positive_score)
    df[f'{prefix}_negative_score'] = section_text.apply(negative_score)
    # otypes=[float] pins the result dtype. Left to itself np.vectorize infers
    # the dtype from the first row's result, so a first row that yields an int
    # (the *_word_prop helpers return int 0 for empty text) would truncate the
    # whole column to integers; it also lets an empty frame pass through.
    df[f'{prefix}_polarity_score'] = np.vectorize(polarity_score, otypes=[float])(
        df[f'{prefix}_positive_score'], df[f'{prefix}_negative_score'])
    df[f'{prefix}_average_sentence_length'] = section_text.apply(average_sentence_length)
    df[f'{prefix}_percentage_of_complex_words'] = section_text.apply(percentage_complex_word)
    df[f'{prefix}_fog_index'] = np.vectorize(fog_index, otypes=[float])(
        df[f'{prefix}_average_sentence_length'], df[f'{prefix}_percentage_of_complex_words'])
    df[f'{prefix}_complex_word_count'] = section_text.apply(complex_word_count)
    df[f'{prefix}_word_count'] = section_text.apply(total_word_count)
    df[f'{prefix}_uncertainty_score'] = section_text.apply(uncertainty_score)
    df[f'{prefix}_constraining_score'] = section_text.apply(constraining_score)
    df[f'{prefix}_positive_word_proportion'] = np.vectorize(positive_word_prop, otypes=[float])(
        df[f'{prefix}_positive_score'], df[f'{prefix}_word_count'])
    df[f'{prefix}_negative_word_proportion'] = np.vectorize(negative_word_prop, otypes=[float])(
        df[f'{prefix}_negative_score'], df[f'{prefix}_word_count'])
    df[f'{prefix}_uncertainty_word_proportion'] = np.vectorize(uncertain_word_prop, otypes=[float])(
        df[f'{prefix}_uncertainty_score'], df[f'{prefix}_word_count'])
    df[f'{prefix}_constraining_word_proportion'] = np.vectorize(constraining_word_prop, otypes=[float])(
        df[f'{prefix}_constraining_score'], df[f'{prefix}_word_count'])

# Constraining-word count over the three sections combined; the result is a
# count, so the output dtype is pinned to int.
df['constraining_words_whole_report'] = np.vectorize(constrain_word_whole, otypes=[int])(
    df['mda_text'], df['qqd_text'], df['rf_text'])

In [30]:
# Raise pandas' column display limit so the wide metrics frame below is not
# truncated.
pd.set_option('display.max_columns', 52)
# Show the frame with all computed metric columns.
df

Unnamed: 0,CIK,CONAME,FYRMO,FDATE,FORM,SECFNAME,mda_text,qqd_text,rf_text,mda_positive_score,mda_negative_score,mda_polarity_score,mda_average_sentence_length,mda_percentage_of_complex_words,mda_fog_index,mda_complex_word_count,mda_word_count,mda_uncertainty_score,mda_constraining_score,mda_positive_word_proportion,mda_negative_word_proportion,mda_uncertainty_word_proportion,mda_constraining_word_proportion,qqdmr_positive_score,qqdmr_negative_score,qqdmr_polarity_score,qqdmr_average_sentence_length,qqdmr_percentage_of_complex_words,qqdmr_fog_index,qqdmr_complex_word_count,qqdmr_word_count,qqdmr_uncertainty_score,qqdmr_constraining_score,qqdmr_positive_word_proportion,qqdmr_negative_word_proportion,qqdmr_uncertainty_word_proportion,qqdmr_constraining_word_proportion,rf_positive_score,rf_negative_score,rf_polarity_score,rf_average_sentence_length,rf_percentage_of_complex_words,rf_fog_index,rf_complex_word_count,rf_word_count,rf_uncertainty_score,rf_constraining_score,rf_positive_word_proportion,rf_negative_word_proportion,rf_uncertainty_word_proportion,rf_constraining_word_proportion,constraining_words_whole_report
0,3662,SUNBEAM CORP/FL/,199803,1998-03-06,10-K405,edgar/data/3662/0000950170-98-000413.txt,item 7. management's discussion and analysi...,0,0,59,63,-0.032787,27,0.374299,10.94972,868,2319,30,11,0.025442,0.027167,0.012937,0.004743,0,0,0.0,1,0.0,0.4,0,1,0,0,0.0,0.0,0.0,0.0,0,0,0.0,1,0.0,0.4,0,1,0,0,0.0,0.0,0.0,0.0,11
1,3662,SUNBEAM CORP/FL/,199805,1998-05-15,10-Q,edgar/data/3662/0000950170-98-001001.txt,item 2. management's discussion and analysis ...,0,0,34,57,-0.252747,30,0.425174,12.17007,733,1724,50,3,0.019722,0.033063,0.029002,0.00174,0,0,0.0,1,0.0,0.4,0,1,0,0,0.0,0.0,0.0,0.0,0,0,0.0,1,0.0,0.4,0,1,0,0,0.0,0.0,0.0,0.0,3
2,3662,SUNBEAM CORP/FL/,199808,1998-08-13,NT 10-Q,edgar/data/3662/0000950172-98-000783.txt,0,0,0,0,0,0.0,1,0.0,0.4,0,1,0,0,0.0,0.0,0.0,0.0,0,0,0.0,1,0.0,0.4,0,1,0,0,0.0,0.0,0.0,0.0,0,0,0.0,1,0.0,0.4,0,1,0,0,0.0,0.0,0.0,0.0,0
3,3662,SUNBEAM CORP/FL/,199811,1998-11-12,10-K/A,edgar/data/3662/0000950170-98-002145.txt,item 7. management's discussion and analysis ...,0,0,132,107,0.104603,25,0.406752,10.162701,1771,4354,72,50,0.030317,0.024575,0.016537,0.011484,0,0,0.0,1,0.0,0.4,0,1,0,0,0.0,0.0,0.0,0.0,0,0,0.0,1,0.0,0.4,0,1,0,0,0.0,0.0,0.0,0.0,50
4,3662,SUNBEAM CORP/FL/,199811,1998-11-16,NT 10-Q,edgar/data/3662/0000950172-98-001203.txt,0,0,0,0,0,0.0,1,0.0,0.4,0,1,0,0,0.0,0.0,0.0,0.0,0,0,0.0,1,0.0,0.4,0,1,0,0,0.0,0.0,0.0,0.0,0,0,0.0,1,0.0,0.4,0,1,0,0,0.0,0.0,0.0,0.0,0
5,3662,SUNBEAM CORP/FL/,199811,1998-11-25,10-Q/A,edgar/data/3662/0000950170-98-002278.txt,item 2. management's discussion and analysis ...,0,0,79,80,-0.006289,23,0.42524,9.370096,1331,3130,78,29,0.02524,0.025559,0.02492,0.009265,0,0,0.0,1,0.0,0.4,0,1,0,0,0.0,0.0,0.0,0.0,0,0,0.0,1,0.0,0.4,0,1,0,0,0.0,0.0,0.0,0.0,29
6,3662,SUNBEAM CORP/FL/,199812,1998-12-22,10-Q,edgar/data/3662/0000950170-98-002401.txt,item 2. management's discussion and analysis ...,0,0,106,127,-0.090129,23,0.412426,9.36497,1872,4539,100,41,0.023353,0.02798,0.022031,0.009033,0,0,0.0,1,0.0,0.4,0,1,0,0,0.0,0.0,0.0,0.0,0,0,0.0,1,0.0,0.4,0,1,0,0,0.0,0.0,0.0,0.0,41
7,3662,SUNBEAM CORP/FL/,199812,1998-12-22,10-Q,edgar/data/3662/0000950170-98-002402.txt,item 2.management's discussion and analysis of...,0,0,103,126,-0.100437,22,0.415844,8.966338,1811,4355,99,39,0.023651,0.028932,0.022732,0.008955,0,0,0.0,1,0.0,0.4,0,1,0,0,0.0,0.0,0.0,0.0,0,0,0.0,1,0.0,0.4,0,1,0,0,0.0,0.0,0.0,0.0,39
8,3662,SUNBEAM CORP/FL/,199903,1999-03-31,NT 10-K,edgar/data/3662/0000950172-99-000362.txt,0,0,0,0,0,0.0,1,0.0,0.4,0,1,0,0,0.0,0.0,0.0,0.0,0,0,0.0,1,0.0,0.4,0,1,0,0,0.0,0.0,0.0,0.0,0,0,0.0,1,0.0,0.4,0,1,0,0,0.0,0.0,0.0,0.0,0
9,3662,SUNBEAM CORP/FL/,199905,1999-05-11,10-K,edgar/data/3662/0000950170-99-000775.txt,0,0,0,0,0,0.0,1,0.0,0.4,0,1,0,0,0.0,0.0,0.0,0.0,0,0,0.0,1,0.0,0.4,0,1,0,0,0.0,0.0,0.0,0.0,0,0,0.0,1,0.0,0.4,0,1,0,0,0.0,0.0,0.0,0.0,0


In [31]:
# Write the final frame to an Excel workbook. The context manager saves and
# closes the file on exit; the explicit ExcelWriter.save() call is not
# available in pandas 2.x.
with pd.ExcelWriter("Output Data Structure.xlsx", engine='xlsxwriter') as writer:
    df.to_excel(writer, sheet_name='Sheet1')
    # Handles to the underlying xlsxwriter objects, kept for any later
    # formatting; they must be used before the with-block exits.
    workbook  = writer.book
    worksheet = writer.sheets['Sheet1']

In [32]:
# Sanity check: (rows, columns) of the final frame.
df.shape

(152, 52)