# Importing Libraries


In [None]:
import pandas as pd
import re
import numpy as np
import string
import time
from nltk.tokenize import sent_tokenize, RegexpTokenizer

In [None]:
import nltk
# Download the NLTK 'punkt' tokenizer package (sent_tokenize is imported above and used later).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
!pip3 install requests



In [None]:
import requests

# Importing CIK List

In [None]:
# Load the filing index; its SECFNAME column is passed to extract_data further down.
cik_list = pd.read_csv('cik_list.csv')

# Data Extraction

In [None]:
def extract_data(link, timeout=60):
    """Download one document from the SEC archive and return its text.

    Args:
        link: Path relative to ``https://www.sec.gov/Archives/``; leading and
            trailing whitespace is stripped before the URL is built.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` waits indefinitely, so a single stalled connection
            would hang the whole download.

    Returns:
        The response body decoded as text. The HTTP status is not checked,
        so an error page is returned as-is.

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.
    """
    url = 'https://www.sec.gov/Archives/' + link.strip()
    # Browser-like User-Agent sent with every request.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36"}
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.text


In [None]:
# Download every filing listed in SECFNAME (one HTTP request per row).
report=cik_list['SECFNAME'].apply(extract_data)

In [None]:
def clean_data(text):
    """Strip markup and non-textual characters from a raw filing.

    Steps, in order:
      1. remove HTML tags and character entities,
      2. turn newlines and tabs into spaces,
      3. replace everything except ASCII letters, the punctuation
         ``. , ! ? / : ; " '`` and whitespace with a space,
      4. collapse runs of spaces into a single space.

    Args:
        text: Raw document text.

    Returns:
        The cleaned text.
    """
    # Remove HTML tags and entities such as &nbsp; / &#160; / &#xa0;
    text = re.sub('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});', '', text)

    # Remove line breaks and tabs
    text = text.replace('\n', ' ')
    text = text.replace('\t', ' ')

    # Remove numbers and special characters.  The range must be A-Z: the
    # ASCII range A-z also contains [ \ ] ^ _ and `, which would be kept.
    text = re.sub(r'[^a-zA-Z.,!?/:;\"\'\s]', ' ', text)

    # Collapse multiple spaces
    text = re.sub(' +', ' ', text)

    return text

In [None]:
# Put the downloaded filings in a DataFrame and add a cleaned copy of each text.
report_data = {'raw_text': report}
report_df = pd.DataFrame(report_data)
report_df['clean_data'] = report_df['raw_text'].apply(clean_data)

Stopwords

In [None]:
# Read the whole stop-word file, lower-cased.  The context manager guarantees
# the file handle is closed once the contents have been read.
with open("StopWords_Generic.txt", "r") as f:
    stop_words = f.read().lower()

In [None]:
# One stop word per line.  splitlines() handles both \n and \r\n line endings,
# and blank lines / surrounding whitespace are dropped so that no empty or
# '\r'-suffixed entry ends up in the list.
stopWordList = [line.strip() for line in stop_words.splitlines() if line.strip()]

In [None]:
def tokenize_text(text):
    """Split text into lower-case word tokens with stop words removed.

    Args:
        text: Text to tokenize.

    Returns:
        List of tokens (runs of word characters, so punctuation is dropped)
        that are not present in ``stopWordList``, in their original order.
    """
    tokenizer = RegexpTokenizer(r'\w+')  # keeps word characters only
    tokens = tokenizer.tokenize(text.lower())
    # Build the lookup set once per call: set membership is O(1), whereas
    # testing against the list rescans it for every token.
    stop_word_lookup = set(stopWordList)
    return [token for token in tokens if token not in stop_word_lookup]

In [None]:
# Tokenize the cleaned text; 'filtered' holds a list of stop-word-free tokens per filing.
report_df['filtered'] = report_df['clean_data'].apply(tokenize_text)

In [None]:
# Positive-Table.csv has no header row: its first line is the word ABLE.  Read
# it with header=None, otherwise pandas uses ABLE as the column name and the
# word never reaches the positive dictionary.
positive_df = pd.read_csv('Positive-Table.csv', header=None)
# NOTE(review): Negative-Table.csv presumably has the same header-less layout,
# in which case its first word is being dropped too - confirm against the file
# and add header=None if so.
negative_df = pd.read_csv('Negative-Table.csv')

In [None]:
# Inspect the loaded positive-word table.
print(positive_df)

             ABLE
0       ABUNDANCE
1        ABUNDANT
2       ACCLAIMED
3      ACCOMPLISH
4    ACCOMPLISHED
..            ...
348           WIN
349        WINNER
350       WINNERS
351       WINNING
352        WORTHY

[353 rows x 1 columns]


In [None]:
# Lower-case the first column so the words compare equal to the lower-cased tokens.
positiveWords = positive_df.iloc[:,0].apply(lambda x:x.lower())

In [None]:
# Inspect the lower-cased positive words.
print(positiveWords)

0         abundance
1          abundant
2         acclaimed
3        accomplish
4      accomplished
           ...     
348             win
349          winner
350         winners
351         winning
352          worthy
Name: ABLE, Length: 353, dtype: object


In [None]:
# Plain Python list of the positive words, used for membership tests.
positiveWordsList = positiveWords.tolist()

In [None]:
# Lower-case the first column of the negative-word table.
negativeWords = negative_df.iloc[:,0].apply(lambda x:x.lower())

In [None]:
# Plain Python list of the negative words, used for membership tests.
negativeWordsList = negativeWords.tolist()

In [None]:
# Drop dictionary words that are also stop words; stop words are removed from
# the tokens, so such entries could never match.
positiveWordsList = [word for word in positiveWordsList if word not in stopWordList]


In [None]:
# Drop dictionary words that are also stop words; stop words are removed from
# the tokens, so such entries could never match.
negativeWordsList = [word for word in negativeWordsList if word not in stopWordList]

Positive and Negative Scores

In [None]:
def positive_score(token):
    """Count the tokens that appear in the positive-word dictionary.

    Args:
        token: Iterable of word tokens.

    Returns:
        Number of tokens (repeats included) found in ``positiveWordsList``.
    """
    # Set built once per call so each membership test is O(1) instead of a
    # scan over the whole dictionary list.
    positive_lookup = set(positiveWordsList)
    return sum(1 for word in token if word in positive_lookup)

In [None]:
def negative_score(token):
    """Count the tokens that appear in the negative-word dictionary.

    Args:
        token: Iterable of word tokens.

    Returns:
        Number of tokens (repeats included) found in ``negativeWordsList``,
        as a non-negative integer.
    """
    # Set built once per call so each membership test is O(1) instead of a
    # scan over the whole dictionary list.
    negative_lookup = set(negativeWordsList)
    return sum(1 for word in token if word in negative_lookup)

In [None]:
def polarity_score(positiveScore, negativeScore):
    """Return (positive - negative) / (positive + negative + 0.000001).

    Args:
        positiveScore: Count of positive words.
        negativeScore: Count of negative words.

    Returns:
        The polarity as a float; 0.0 when both scores are 0.
    """
    difference = positiveScore - negativeScore
    # The small constant keeps the denominator non-zero when both scores are 0.
    total = (positiveScore + negativeScore) + 0.000001
    return difference / total

In [None]:
# Positive-word count per filing.
report_df['positive_score'] = report_df['filtered'].apply(positive_score)

In [None]:
# Negative-word count per filing.
report_df['negative_score'] = report_df['filtered'].apply(negative_score)

In [None]:
# Row-wise polarity computed from the two score columns above.
report_df['polarity_score'] = report_df.apply(lambda x: polarity_score(x.positive_score,x.negative_score),axis=1)

In [None]:
def average_sentence_length(text,word_token):
    """Return the rounded average number of words per sentence.

    Args:
        text: Text that is split into sentences with ``sent_tokenize``.
        word_token: Sequence of word tokens; only its length is used.

    Returns:
        ``round(len(word_token) / number_of_sentences)``, or 0 when the text
        contains no sentences.
    """
    sentences = sent_tokenize(text)
    word_total = len(word_token)
    sentence_total = len(sentences)
    if sentence_total == 0:
        return round(0)
    return round(word_total / sentence_total)

In [None]:

# Sentences are taken from the cleaned text, while the word count is the number
# of stop-word-filtered tokens.
report_df['average_sentence_length'] = report_df.apply(lambda x: average_sentence_length(x.clean_data,x.filtered),axis=1)

In [None]:
def syllable_count(word):
    """Estimate the number of syllables in a word by counting its vowels.

    A trailing "es" or "ed" is not counted as a syllable: the suffix is
    removed and the vowels of the remaining stem are counted.  (Returning 0
    for the whole word would make every such word, e.g. "accomplished",
    look like it has no syllables at all.)

    Args:
        word: The word to analyse; case is ignored.

    Returns:
        Number of vowels (a, e, i, o, u) in the word, excluding a trailing
        "es"/"ed" suffix.
    """
    word = word.lower()
    if word.endswith(('es', 'ed')):
        word = word[:-2]
    return sum(1 for letter in word if letter in 'aeiou')

In [None]:
def complex_word_count(token):
    """Count the complex words, i.e. words with more than two syllables.

    Args:
        token: Iterable of word tokens.

    Returns:
        Number of tokens for which ``syllable_count`` returns more than 2.
    """
    return sum(1 for word in token if syllable_count(word) > 2)

In [None]:
def complex_word_percentage(token):
    """Return the share of complex words among the tokens.

    Args:
        token: Sequence of word tokens.

    Returns:
        ``complex_word_count(token) / len(token)`` as a fraction between 0
        and 1, or 0 when there are no tokens (avoids a ZeroDivisionError for
        empty documents, matching the guarded ``*_word_proportion`` helpers).
    """
    totalWords = len(token)
    if totalWords == 0:
        return 0
    return complex_word_count(token) / totalWords

In [None]:
# Share of complex words per filing (complex words divided by total words).
report_df['percentage_of_complex_words'] = report_df['filtered'].apply(complex_word_percentage)

In [None]:
def fog_index(avg_sentence_length,percentage_complex):
    """Return 0.4 * (average sentence length + percentage of complex words).

    Args:
        avg_sentence_length: Average number of words per sentence.
        percentage_complex: Complex-word measure added to the sentence length.

    Returns:
        The fog index as a float.
    """
    combined = avg_sentence_length + percentage_complex
    return 0.4 * combined

In [None]:
# NOTE(review): percentage_of_complex_words is a 0-1 fraction (complex / total),
# so it barely moves the index next to the sentence length - confirm whether a
# 0-100 percentage was intended here.
report_df['fog_index'] = report_df.apply(lambda x:fog_index(x.average_sentence_length,x.percentage_of_complex_words),axis=1)

In [None]:
# Number of stop-word-filtered tokens per filing.
report_df['word_count'] = report_df['filtered'].apply(len)


In [None]:
# Absolute number of complex words per filing.
report_df['complex_word_count'] = report_df['filtered'].apply(complex_word_count)

In [None]:

# Load the uncertainty dictionary; the words are read from its 'Word' column below.
uncertainty_df = pd.read_csv('uncertainty_dictionary.csv')

In [None]:
# Lower-case the dictionary words to match the lower-cased tokens, then keep
# them as a plain list for membership tests.
uncertainWords = uncertainty_df['Word'].apply(lambda x:x.lower())
uncertainWordsList = uncertainWords.tolist()

In [None]:
def uncertainty_score(token):
    """Count the tokens that appear in the uncertainty dictionary.

    Args:
        token: Iterable of word tokens.

    Returns:
        Number of tokens (repeats included) found in ``uncertainWordsList``.
    """
    # Set built once per call so each membership test is O(1) instead of a
    # scan over the whole dictionary list.
    uncertain_lookup = set(uncertainWordsList)
    return sum(1 for word in token if word in uncertain_lookup)

In [None]:
# constraining_dictionary.csv has no header row: its first line is the word
# ABIDE.  Read it with header=None, otherwise pandas uses ABIDE as the column
# name and the word never reaches the constraining dictionary.
constraining_df = pd.read_csv('constraining_dictionary.csv', header=None)


In [None]:
# Inspect the loaded constraining-word table.
print(constraining_df)

              ABIDE
0           ABIDING
1             BOUND
2           BOUNDED
3            COMMIT
4        COMMITMENT
..              ...
178        STRICTER
179       STRICTEST
180        STRICTLY
181  UNAVAILABILITY
182     UNAVAILABLE

[183 rows x 1 columns]


In [None]:
# Lower-case the first column to match the lower-cased tokens, then keep the
# words as a plain list for membership tests.
constrainWords = constraining_df.iloc[:,0].apply(lambda x:x.lower())
constrainWordsList = constrainWords.tolist()

In [None]:
def constraining_score(token):
    """Count the tokens that appear in the constraining dictionary.

    Args:
        token: Iterable of word tokens.

    Returns:
        Number of tokens (repeats included) found in ``constrainWordsList``.
    """
    # Set built once per call so each membership test is O(1) instead of a
    # scan over the whole dictionary list.  The local name is deliberately
    # different from the module-level ``constrainWords`` to avoid shadowing it.
    constrain_lookup = set(constrainWordsList)
    return sum(1 for word in token if word in constrain_lookup)

In [None]:
# Uncertainty-word count per filing.
report_df['uncertainty_score'] = report_df['filtered'].apply(uncertainty_score)

In [None]:
# Constraining-word count per filing.
report_df['constraining_score'] = report_df['filtered'].apply(constraining_score)

In [None]:
def positive_word_proportion(positiveScore,wordcount):
    """Return the positive-word count divided by the total word count.

    Args:
        positiveScore: Number of positive words.
        wordcount: Total number of words.

    Returns:
        ``positiveScore / wordcount``, or 0 when ``wordcount`` is 0.
    """
    if wordcount == 0:
        return 0
    return positiveScore / wordcount

In [None]:
def negative_word_proportion(negativeScore,wordcount):
    """Return the negative-word count divided by the total word count.

    Args:
        negativeScore: Number of negative words.
        wordcount: Total number of words.

    Returns:
        ``negativeScore / wordcount``, or 0 when ``wordcount`` is 0.
    """
    return negativeScore / wordcount if wordcount != 0 else 0

In [None]:
# Positive words as a share of the filing's word count.
report_df['positive_word_proportion'] = report_df.apply(lambda x:positive_word_proportion(x.positive_score,x.word_count),axis=1)

In [None]:
# Negative words as a share of the filing's word count.
report_df['negative_word_proportion'] = report_df.apply(lambda x:negative_word_proportion(x.negative_score,x.word_count),axis=1)

In [None]:
def uncertain_word_proportion(uncertainScore,wordcount):
    """Return the uncertainty-word count divided by the total word count.

    Args:
        uncertainScore: Number of uncertainty words.
        wordcount: Total number of words.

    Returns:
        ``uncertainScore / wordcount``, or 0 when ``wordcount`` is 0.
    """
    if wordcount == 0:
        return 0
    return uncertainScore / wordcount

In [None]:
def constrain_word_proportion(constrainScore,wordcount):
    """Return the constraining-word count divided by the total word count.

    Args:
        constrainScore: Number of constraining words.
        wordcount: Total number of words.

    Returns:
        ``constrainScore / wordcount``, or 0 when ``wordcount`` is 0.
    """
    return constrainScore / wordcount if wordcount != 0 else 0

In [None]:
# Uncertainty words as a share of the filing's word count.
report_df['uncertainty_word_proportion'] = report_df.apply(lambda x:uncertain_word_proportion(x.uncertainty_score,x.word_count),axis=1)

In [None]:
# Constraining words as a share of the filing's word count.
report_df['constraining_word_proportion'] = report_df.apply(lambda x:constrain_word_proportion(x.constraining_score,x.word_count),axis=1)

In [None]:
# Same computation as the 'constraining_score' column: constraining_score
# applied to the filtered tokens of each filing.
report_df['constraining_words_whole_report'] = report_df['filtered'].apply(constraining_score)

In [None]:
# Attach the computed metrics to the CIK index.  The three text columns are
# excluded by name rather than by position, so the export stays correct even
# if the column order of report_df changes.
final_report = cik_list.join(report_df.drop(columns=['raw_text', 'clean_data', 'filtered']))

In [None]:
# Write the result to Output.csv (to_csv also writes the DataFrame index by default).
final_report.to_csv('Output.csv')