In [None]:
# Importing necessary libraries
import pandas as pd
from bs4 import BeautifulSoup
import requests
import urllib.request 
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import cmudict
import string


# Mounting my google drive which will be used for the necessary files
from google.colab import drive
drive.mount('/content/drive',force_remount=True)


# The directory containing the necessary files for this program
%cd "drive/MyDrive/Blackcoffer"


# Making dataframes of the data so that we can manipulate and update them

# Input that was provided
excel_pd = pd.read_excel('Input.xlsx')

# Publicly and universally accepted list of stop words which will help us find the stop words in any document.
# header=None keeps the first line as data instead of consuming it as a column name
# (assumes the file is a bare list with one stop word per line and no header row - TODO confirm).
stop_pd = pd.read_csv('StopWords_Generic.txt', header=None)

# Final excel sheet with the scores of each of the entries in the input excel.
# The file is an .xlsx workbook, so it has to be parsed with read_excel; read_csv cannot read it.
final_excel = pd.read_excel('Output Data Structure.xlsx')

# This excel sheet contains all the words and states if a word is a negative or positive word and so on.
Words = pd.read_excel('LoughranMcDonald_MasterDictionary_2020.xlsx')



# Creating two data frames for further use.
# Each one has a single column labelled 0 and a fresh 0..k-1 index. The rows are selected with a
# boolean mask in one step instead of growing the dataframe one cell at a time in a Python loop.

# Dataframe which will store the Positive words: a word is positive when its 'Positive' column
# in 'Words' is greater than zero.
Positive = pd.DataFrame({0: Words.loc[Words['Positive'] > 0, 'Word'].to_numpy()})

# Dataframe which will store the Negative words: a word is negative when its 'Negative' column
# in 'Words' is greater than zero.
Negative = pd.DataFrame({0: Words.loc[Words['Negative'] > 0, 'Word'].to_numpy()})




# We now calculate all the scores by running a for loop iterating over all the rows (and hence each of the websites)

# word_tokenize and cmudict both rely on NLTK data packages that are not bundled with the library, so
# we fetch them first. Newer NLTK releases load the tokenizer model from 'punkt_tab', older ones from 'punkt'.
for nltk_package in ('cmudict', 'punkt', 'punkt_tab'):
  nltk.download(nltk_package, quiet=True)

# We are creating a dictionary of all the words using this inbuilt library.
# It maps a lowercase word to its list of phonemes; vowel phonemes carry a trailing stress digit.
syllables = dict(cmudict.entries())


def _as_word_set(column):
  """Return the upper-cased, non-empty entries of a dataframe column as a set for exact membership tests."""
  return set(column.dropna().astype(str).str.strip().str.upper())


def _count_syllables(word):
  """Return the number of syllables in `word`.

  The pronouncing dictionary is used when the word is listed there. Otherwise the groups of
  consecutive vowels are counted, so that a word missing from the dictionary does not raise a KeyError.
  """
  phonemes = syllables.get(word.lower())
  if phonemes is not None:
    # Stripping the letters leaves only the stress digit, which is present on vowel phonemes only
    return sum(1 for phoneme in phonemes if phoneme.strip(string.ascii_letters))

  vowel_groups = 0
  previous_was_vowel = False
  for character in word.lower():
    is_vowel = character in 'aeiouy'
    if is_vowel and not previous_was_vowel:
      vowel_groups += 1
    previous_was_vowel = is_vowel
  return vowel_groups


# The word lists are turned into sets once, so every lookup below is an exact, constant-time match
# (a substring/regex search over the whole column would also match parts of longer words).
stop_words = _as_word_set(stop_pd.iloc[:, 0])
positive_words = _as_word_set(Positive.iloc[:, 0])
negative_words = _as_word_set(Negative.iloc[:, 0])

# Tokens that mark the end of a sentence
sentence_endings = {'.', '!', '?'}

# The scores are written into the existing columns of the output sheet, addressed by position.
# NOTE(review): assumes the column order of the output template matches the positions used below - confirm.
output_columns = final_excel.columns

# Iterating over the entire input excel sheet; the second column holds the address of the website
for i, url in enumerate(excel_pd.iloc[:, 1]):

  # Corresponding to the row number (variable i), we fetch the website using the get function.
  # A page that cannot be fetched is reported and skipped, leaving its row in the output untouched.
  try:
    html_text = requests.get(url, timeout=30)
    html_text.raise_for_status()
  except requests.RequestException as error:
    print(f'Skipping row {i} ({url}): {error}')
    continue

  # We use BeautifulSoup to get the content of the document (the parser is named explicitly so the
  # result does not depend on which parsers happen to be installed)
  soup = BeautifulSoup(html_text.content, 'html.parser')

  # The condition for the text to be chosen is that it should be enclosed within <p>....</p>. Hence this filter. Paras contain all the data enclosed within this filter
  paras = soup.find_all('p')

  # Each of these variables is reset to zero whenever a new iteration (and hence a new document) is encountered
  word_count = positive_score = negative_score = sentences = 0
  complex_count = overall_syllables = total_characters = 0

  # We now iterate over each paragraph of the document
  for para in paras:

    # We now use the tokenizer to split the paragraph so that we can iterate over each individual word
    for token in word_tokenize(para.text):

      # There is a flaw in this logic, but a sentence is counted whenever we encounter the tokens '!', '.' and '?'.
      # They should not be counted for any of the other scores, hence continuing with the for loop
      if token in sentence_endings:
        sentences += 1
        continue

      # Any other token without a letter or digit is pure punctuation and is not a word
      if not any(character.isalnum() for character in token):
        continue

      # We upper-case each word because all the words in our word lists are compared in upper case
      word = token.upper()

      # If the word is in the stop word list (hence a stopword) it is ignored
      if word in stop_words:
        continue

      # Counting the words which have passed through, and their characters for the average word length
      word_count += 1
      total_characters += len(token)

      # Checking to see if the word is in the positive list (and hence a positive word)
      if word in positive_words:
        positive_score += 1

      # Checking to see if the word is in the negative list (and hence a negative word)
      if word in negative_words:
        negative_score += 1

      # Counting the number of syllables using the dict we had made
      syllables_length = _count_syllables(token)
      overall_syllables += syllables_length

      # If the syllable count is above 2, then we count it as a complex word
      if syllables_length > 2:
        complex_count += 1

  # The derived scores. The small constants and the explicit checks keep a document without any
  # counted words or sentences from causing a division by zero.
  polarity = (positive_score - negative_score) / ((positive_score + negative_score) + 0.0000001)
  subjectivity = (positive_score + negative_score) / (word_count + 0.000001)
  percentage_complex = complex_count / word_count if word_count else 0.0
  avg_sentence_length = word_count / sentences if sentences else 0.0
  avg_word_length = total_characters / word_count if word_count else 0.0
  fog_index = 0.4 * (avg_sentence_length + percentage_complex)

  # We now fill the empty columns in the final_excel dataframe based on the scores we found and the formulas given.
  # The keys are the positions of the columns in the output sheet; position 13 is not computed here.
  # NOTE(review): position 12 receives the total syllable count, as before; if that column is meant
  # to hold syllables per word it should be divided by word_count - confirm against the template.
  scores = {
    2: positive_score,
    3: negative_score,
    4: polarity,
    5: subjectivity,
    6: avg_sentence_length,
    7: percentage_complex,
    8: fog_index,
    9: avg_sentence_length,
    10: complex_count,
    11: word_count,
    12: overall_syllables,
    14: avg_word_length,
  }
  for position, value in scores.items():
    final_excel.loc[i, output_columns[position]] = value




# We now convert the final_excel dataframe to an excel sheet.
# index=False keeps the row index out of the workbook: this file is also the template that is read
# back at the start, and an extra leading index column would shift every column position on the next run.
final_excel.to_excel("Output Data Structure.xlsx", index=False)