In [1]:
#dependencies
#!pip install openpyxl

### Importing the required libraries

In [2]:

import pandas as pd
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
import requests
from bs4 import BeautifulSoup
import re
nltk.download('punkt')
nltk.download("stopwords")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Loading the dataset

In [3]:
# Read the filing index from Excel. Per the preview, each row is one filing with
# its CIK, company name, filing date, form type and EDGAR path (SECFNAME).
df=pd.read_excel('cik_list.xlsx')
df.head()

Unnamed: 0,CIK,CONAME,FYRMO,FDATE,FORM,SECFNAME
0,3662,SUNBEAM CORP/FL/,199803,1998-03-06,10-K405,edgar/data/3662/0000950170-98-000413.txt
1,3662,SUNBEAM CORP/FL/,199805,1998-05-15,10-Q,edgar/data/3662/0000950170-98-001001.txt
2,3662,SUNBEAM CORP/FL/,199808,1998-08-13,NT 10-Q,edgar/data/3662/0000950172-98-000783.txt
3,3662,SUNBEAM CORP/FL/,199811,1998-11-12,10-K/A,edgar/data/3662/0000950170-98-002145.txt
4,3662,SUNBEAM CORP/FL/,199811,1998-11-16,NT 10-Q,edgar/data/3662/0000950172-98-001203.txt


### Loading the master dictionary so that we can extract the positive and negative word lists

In [4]:
# Master dictionary workbook; its 'Word', 'Positive' and 'Negative' columns are
# used further down to build the positive and negative word lists.
master_dic = pd.read_excel('LoughranMcDonald_MasterDictionary_2018.xlsx')

### Prepending 'https://www.sec.gov/Archives/' to SECFNAME to build the full filing URLs

In [5]:
# Turn the relative EDGAR paths in SECFNAME into absolute URLs.
# The prefix is only added to values that do not already carry it, so re-running
# this cell does not produce 'https://...https://...' URLs.
SEC_ARCHIVE_PREFIX = 'https://www.sec.gov/Archives/'

secfname = df['SECFNAME'].astype(str)
already_prefixed = secfname.str.startswith(SEC_ARCHIVE_PREFIX)
# Series.where keeps the value where the condition holds and takes `other` elsewhere.
df['SECFNAME'] = secfname.where(already_prefixed, SEC_ARCHIVE_PREFIX + secfname)
df.head()

Unnamed: 0,CIK,CONAME,FYRMO,FDATE,FORM,SECFNAME
0,3662,SUNBEAM CORP/FL/,199803,1998-03-06,10-K405,https://www.sec.gov/Archives/edgar/data/3662/0...
1,3662,SUNBEAM CORP/FL/,199805,1998-05-15,10-Q,https://www.sec.gov/Archives/edgar/data/3662/0...
2,3662,SUNBEAM CORP/FL/,199808,1998-08-13,NT 10-Q,https://www.sec.gov/Archives/edgar/data/3662/0...
3,3662,SUNBEAM CORP/FL/,199811,1998-11-12,10-K/A,https://www.sec.gov/Archives/edgar/data/3662/0...
4,3662,SUNBEAM CORP/FL/,199811,1998-11-16,NT 10-Q,https://www.sec.gov/Archives/edgar/data/3662/0...


### Collecting all the links in a new variable

In [6]:
# Series of filing URLs; the download loop below iterates over it.
links = df['SECFNAME']

### Putting the text of all 152 files into `reports`

In [7]:
# Download every filing and keep only its text content (markup is stripped by
# BeautifulSoup's get_text()). `reports` ends up with one string per URL in `links`.
#
# NOTE(review): SEC's fair-access guidance asks automated clients to send a
# User-Agent that identifies the requester; the browser-style string below is kept
# unchanged from the original — confirm it is still accepted.
REQUEST_HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.152 Safari/537.36'}
# Without a timeout a single stalled connection would hang the notebook forever.
REQUEST_TIMEOUT_SECONDS = 30

reports = []
# One session for all requests so the connection to the host is reused.
with requests.Session() as session:
    session.headers.update(REQUEST_HEADERS)
    for url in links:
        response = session.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
        # Fail loudly on 4xx/5xx instead of silently analysing an error page as if
        # it were the filing text.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        reports.append(soup.get_text())

In [8]:
# Sanity check: number of downloaded report texts.
print(len(reports))

152


### Loading the stop words

In [9]:
# NLTK's English stop-word list; main() passes it to remove_stopwords1().
stop_words1=stopwords.words('english')

### Extracting the positive and negative words from the master dictionary

In [10]:
# Rows whose 'Positive' / 'Negative' flag is non-zero belong to the respective
# sentiment word list; the words are lower-cased to match the tokenizer output.
is_positive = master_dic['Positive'] != 0
is_negative = master_dic['Negative'] != 0

positive_dictionary = [word.lower() for word in master_dic[is_positive]['Word']]
negative_dictionary = [word.lower() for word in master_dic[is_negative]['Word']]

In [11]:
# Sizes of the positive and negative word lists.
print(len(positive_dictionary))
print(len(negative_dictionary))

354
2355


In [12]:
# Peek at the first five entries of each word list.
print(positive_dictionary[:5])
print(negative_dictionary[:5])

['able', 'abundance', 'abundant', 'acclaimed', 'accomplish']
['abandon', 'abandoned', 'abandoning', 'abandonment', 'abandonments']


### Loading the uncertainty and constraining word dictionaries

In [13]:
# Uncertainty word list: read the workbook and lower-case its 'Word' column.
uncertainity = pd.read_excel('uncertainty_dictionary.xlsx')
uncertainity_words = [word.lower() for word in uncertainity['Word']]

# Constraining word list, prepared the same way.
constraining = pd.read_excel('constraining_dictionary.xlsx')
constraining_words = [word.lower() for word in constraining['Word']]


In [14]:
# Sizes of the uncertainty and constraining word lists.
print(len(uncertainity_words))
print(len(constraining_words))

297
184


### Writing the functions for tokenizing, stop-word removal, polarity, syllable count and fog index
### We are going to use them later

In [15]:
def tokenize(text):
    """Lower-case ``text``, blank out every non-letter and split it into word tokens.

    Every character outside A-Z/a-z (digits, punctuation, whitespace, ...) is
    replaced by a single space before the text is passed to NLTK's
    ``word_tokenize``; the list it produces is returned.
    """
    letters_only = re.sub(r'[^A-Za-z]', ' ', text.lower())
    return word_tokenize(letters_only)

def remove_stopwords1(words, stop_words1):
    """Return the tokens of ``words`` that are not stop words, in original order.

    Parameters
    ----------
    words : iterable of str
        Tokens to filter.
    stop_words1 : iterable of str
        Words to drop.

    Returns
    -------
    list of str
        ``words`` without any element that occurs in ``stop_words1``.
    """
    # Build the lookup once: testing membership against a list is a linear scan
    # per token, which adds up over long documents.
    stop_set = frozenset(stop_words1)
    return [word for word in words if word not in stop_set]
    
def polarity(positive_score, negative_score):
    """Return the polarity score (positive - negative) / (positive + negative + 1e-6).

    The small constant in the denominator keeps the division defined when both
    scores are zero.
    """
    difference = positive_score - negative_score
    total = (positive_score + negative_score) + 0.000001
    return difference / total
def avgsentlength(total_word, sentence_legnth):
    """Return the average number of words per sentence.

    Parameters
    ----------
    total_word : int
        Number of words in the document.
    sentence_legnth : int
        Number of sentences in the document.

    Returns
    -------
    float
        ``total_word / sentence_legnth``, or ``0.0`` when there are no sentences
        (e.g. an empty document) instead of raising ``ZeroDivisionError``.
    """
    if sentence_legnth == 0:
        return 0.0
    return total_word / sentence_legnth
     
def syllable_morethan2(word):
    """Return True when ``word`` has more than two syllables (a "complex" word).

    Syllables are approximated by counting the vowels a/e/i/o/u. A trailing
    "es" or "ed" is not counted as a syllable: the suffix is dropped before
    counting, but the rest of the word is still evaluated. (Previously any word
    ending in "es"/"ed" was reported as not complex, so words such as
    "incorporated" or "liabilities" were never counted.)

    Parameters
    ----------
    word : str
        A single token.

    Returns
    -------
    bool
    """
    lowered = word.lower()
    # Discount the suffix only; words of length <= 2 ("ed", "es") are left intact.
    if len(lowered) > 2 and lowered.endswith(('es', 'ed')):
        lowered = lowered[:-2]

    vowel_count = sum(1 for char in lowered if char in 'aeiou')
    return vowel_count > 2
    
def fog_index_cal(average_sentence_length, percentage_complexwords):
    """Return the fog index: 0.4 * (average sentence length + complex-word share)."""
    combined = average_sentence_length + percentage_complexwords
    return 0.4 * combined

### Defining one main function that calls the tokenize and stop-word-removal functions

In [16]:
def main(reports):
    """Tokenize one report and drop its stop words.

    ``reports`` is converted with ``str()``, tokenized by ``tokenize`` and then
    filtered against the module-level ``stop_words1`` list. Returns the list of
    remaining tokens.
    """
    tokens = list(tokenize(str(reports)))
    return remove_stopwords1(tokens, stop_words1)

### Storing the words of all 152 documents in a list.
### The list is nested: each inner list contains the words of one document

In [17]:
# One list of cleaned tokens per report, in the same order as `reports`.
empty = [main(report_text) for report_text in reports]

### Calculating the word count of each document

In [18]:
# Number of tokens left in each document after stop-word removal.
word_length = [len(tokens) for tokens in empty]
df['word_count'] = word_length

### Calculating the positive and negative scores

In [19]:
# Count, for every document, how many tokens appear in the positive and in the
# negative word list.
# Sets give constant-time membership tests; scanning the word lists for every
# token is needlessly slow on long filings.
positive_lookup = set(positive_dictionary)
negative_lookup = set(negative_dictionary)

pos_score = []
neg_score = []
for tokens in empty:
    neg_score.append(sum(1 for token in tokens if token in negative_lookup))
    pos_score.append(sum(1 for token in tokens if token in positive_lookup))

# Bug fix: the positive counts used to be written to 'negative_score' and the
# negative counts to 'positive_score'. Column order is kept as before.
df['negative_score'] = neg_score
df['positive_score'] = pos_score

### calculating polarity_point

In [20]:
# Polarity of each document from its positive and negative word counts.
polarity_point = [
    polarity(n_positive, n_negative)
    for n_positive, n_negative in zip(pos_score, neg_score)
]
df['polarity_point'] = polarity_point

### Calculating the average sentence length

In [21]:
# Number of sentences in each raw report text, as split by NLTK's sent_tokenize.
total_sent1 = [len(sent_tokenize(report_text)) for report_text in reports]

# Words per sentence: the cleaned word count divided by the sentence count.
Avg_sent_lenght = [
    avgsentlength(n_words, n_sentences)
    for n_words, n_sentences in zip(word_length, total_sent1)
]

df['Avg_sent_lenght'] = Avg_sent_lenght

### Calculating the number of complex words

In [24]:


# For each document, count the tokens that syllable_morethan2() flags as complex.
complex_word_list = [
    sum(1 for token in tokens if syllable_morethan2(token))
    for tokens in empty
]

df['complex_word_count'] = complex_word_list

### Calculating the percentage of complex words

In [26]:
# Share of each document's tokens that are complex words. Despite the name this
# is a fraction (complex count / word count), not a 0-100 percentage.
# A document with no tokens gets 0.0 instead of raising ZeroDivisionError.
percentage_complex_word = [
    n_complex / n_words if n_words else 0.0
    for n_complex, n_words in zip(complex_word_list, word_length)
]
df['percentage_of_complex_words'] = percentage_complex_word

### calculating uncertainty_score and constraining_score

In [27]:
# Count, for every document, how many tokens appear in the uncertainty and in the
# constraining word list.
# Sets give constant-time membership tests instead of a list scan per token.
uncertainty_lookup = set(uncertainity_words)
constraining_lookup = set(constraining_words)

uncer = []
constr = []
for tokens in empty:
    uncer.append(sum(1 for token in tokens if token in uncertainty_lookup))
    constr.append(sum(1 for token in tokens if token in constraining_lookup))

df['uncertainty_score'] = uncer
df['constraining_score'] = constr

### calculating Fog Index

In [28]:
# Fog index of each document from its average sentence length and complex-word share.
fog_Score = [
    fog_index_cal(sentence_length, complex_share)
    for sentence_length, complex_share in zip(Avg_sent_lenght, percentage_complex_word)
]
df['fog_index'] = fog_Score

### calculating the word proportion

In [29]:
## positive_word_proportion

def _word_proportion(counts, totals):
    """Return counts[i] / totals[i] per document; 0.0 where a document has no words."""
    return [count / total if total else 0.0 for count, total in zip(counts, totals)]


# Each proportion is a category's word count divided by the document's word count.
# The shared helper replaces four copies of the same comprehension and guards
# against empty documents, which used to raise ZeroDivisionError.

## positive_word_proportion
positive_word_proportion = _word_proportion(pos_score, word_length)
df['positive_word_proportion'] = positive_word_proportion

## negative_word_proportion
negative_word_proportion = _word_proportion(neg_score, word_length)
df['negative_word_proportion'] = negative_word_proportion

## uncertainity_word_proportion
uncertainity_word_proportion = _word_proportion(uncer, word_length)
df['uncertainity_word_proportion'] = uncertainity_word_proportion

## constraining_word_proportion
constraining_word_proportion = _word_proportion(constr, word_length)
df['constraining_word_proportion'] = constraining_word_proportion


In [30]:
### calculating the constraining_words_whole_report

In [31]:
# Total number of tokens across all documents.
sum_of_total_word = sum(word_length)

# Each document's constraining-word count divided by that corpus-wide total.
# NOTE(review): the denominator is the word count of ALL documents, not of the
# document itself — confirm this is the intended definition of the metric.
constraining_words_whole_report = [count / sum_of_total_word for count in constr]
df['constraining_words_whole_report'] = constraining_words_whole_report
df['constraining_words_whole_report']

0      2.252099e-04
1      1.584194e-04
2      7.572628e-07
3      1.084400e-04
4      6.058102e-07
           ...     
147    2.211207e-05
148    4.543577e-07
149    3.786314e-06
150    2.196062e-05
151    5.906649e-06
Name: constraining_words_whole_report, Length: 152, dtype: float64

In [32]:
### Exporting to csv

In [34]:
# Write the filing index together with all computed columns to CSV, without the
# DataFrame's row index.
df.to_csv('blackoffer_output.csv', index=False)