In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from selenium import webdriver
from textblob import TextBlob
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.corpus import stopwords
import string
import re
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
sia=SentimentIntensityAnalyzer()
stop=stopwords.words("english")

In [2]:
def get_cleaned(text_tokens):
    clean_wordTokens=[x for x in text_tokens if x.upper() not in stopwords]
    return clean_wordTokens
    
def get_positiveScore(cleaned_text):
    positive_score=0
    for i in cleaned_text:
        if i in positive:
            positive_score+=1
    return positive_score

def get_negativeScore(cleaned_text):
    negative_score=0
    for i in cleaned_text:
        if i in negative:
            negative_score+=1
    return negative_score

def get_polarity(pos,neg):
    polarity=(pos - neg)/ ((pos + neg) + 0.000001)
    return polarity

def get_subjectivity(pos,neg,clean_textTokens):
    subjectivity=(positive_score + negative_score)/ ((len(clean_textTokens)) + 0.000001)
    return subjectivity

def get_avg_wordPersent(word_tokens,sent_tokens):
    avg_wordPersent=len(word_tokens)/len(sent_tokens)
    return avg_wordPersent

def get_complexWords(cleaned_text):
    vowels=['a','e','i','o','u']
    count=0
    complex_Word_Count=0
    for i in word_tokens:
        x=re.compile('[es|ed]$')
        if x.match(i.lower()):
            count+=0
        else:
            for j in i:
                if(j.lower() in vowels ):
                    count+=1
        if(count>2):
            complex_Word_Count+=1
        count=0
    return complex_Word_Count

def get_syallableCount(word_tokens):
    vowels=['a','e','i','o','u']
    count=0
    for i in word_tokens:
        x=re.compile('[es|ed]$')
        if x.match(i.lower()):
            count+=0
        else:
            for j in i:
                if(j.lower() in vowels ):
                    count+=1
    syllable_count=count
    return syllable_count

def get_avg_wordLen(clean_wordtokens):
    words =[x for x in clean_wordtokens if x not in string.punctuation]
    total_characters = sum(len(word) for word in words)
    total_words = len(words)
    if total_words > 0:
        avg_word_length = total_characters / total_words
    else:
        avg_word_length = 0
    
    return avg_word_length

def get_wordCount(cleantokens):
    words =[x for x in cleantokens if x not in string.punctuation]
    return len(words)

def get_pronouns(text):
    pronounRegex = re.compile(r'I|we|my|ours|us|you|they',re.I)
    pronouns = pronounRegex.findall(str(text))
    for i in pronouns:
        if i=="US":
            pronouns.remove("US")
    return len(pronouns)

def average_sentence_length(sent_tokens):
    total_characters = sum(len(sentence) for sentence in sent_tokens)
    total_sentences = len(sent_tokens)
    if total_sentences > 0:
        avg_sentence_length = total_characters / total_sentences
    else:
        avg_sentence_length = 0
    return avg_sentence_length

def get_complexPercentage(complex_word,word_tokens):
    percentage_complex=complex_word/len(word_tokens)
    return percentage_complex

def get_fogIndex(avg_sentLength,complex_percent):
    fog_index= 0.4*(avg_sentLength+complex_percent)
    return fog_index

In [3]:
data=pd.read_excel("Input.xlsx")
positive=pd.read_csv("positive-words.txt",header=None,index_col=False).values
negative=pd.read_csv("negative-words.txt",encoding="ISO-8859-1",header=None).values
stop_auditor=pd.read_csv("stopwords/StopWords_Auditor.txt",header=None).values
stop_currency=pd.read_csv("stopwords/StopWords_Currencies.txt",sep='|',encoding="ISO-8859-1",header=None).values
stop_datesAndnum=pd.read_csv("stopwords/StopWords_DatesandNumbers.txt",header=None).values
stop_generic=pd.read_csv("stopwords/StopWords_Generic.txt",header=None).values
stop_genericLong=pd.read_csv("stopwords/StopWords_GenericLong.txt",header=None).values
stop_names=pd.read_csv("stopwords/StopWords_Names.txt",sep='|',header=None)
stop_names=stop_names.drop(1,axis=1)
stop_names=stop_names.values

In [4]:
stop_auditor=stop_auditor.reshape(8,)
stop_currency=stop_currency.reshape(170,)
stop_datesAndnum=stop_datesAndnum.reshape(109,)
stop_generic=stop_generic.reshape(121,)
stop_genericLong=stop_genericLong.reshape(571,)
stop_names=stop_names.reshape(13014,)

stopwords=[]
for words in stop_auditor:
    stopwords.append(words)
for words in stop_currency:
    stopwords.append(words)
for words in stop_datesAndnum:
    stopwords.append(words)
for words in stop_generic:
    stopwords.append(words)
for words in stop_genericLong:
    stopwords.append(words)
for words in stop_names:
    stopwords.append(words)

In [5]:
driver=webdriver.Firefox()
list1=[]
output=[None]*100
for i in range(len(data)):
    try:
        driver.get(data["URL"][i])
        soup=BeautifulSoup(driver.page_source,"html.parser")
        try:
            content=soup.find("div","td-post-content tagdiv-type").text
        except Exception as e:
            content=soup.find_all("div","tdb-block-inner td-fix-index")
            for div in content:
                paragraphs = div.find_all('p')
                for paragraph in paragraphs:
                    list1.append(paragraph.get_text())
            content=str(list1)
        content=content.replace("\n","")
        content=content.replace("\xa0","")
        word_tokens=word_tokenize(str(content))
        sent_tokens=sent_tokenize(str(content))
        clean_wordTokens=get_cleaned(word_tokens)
    
        positive_score=get_positiveScore(clean_wordTokens)
        negative_score=get_negativeScore(clean_wordTokens)
        polarity=get_polarity(positive_score,negative_score)
        subjectivity_score=get_subjectivity(positive_score,negative_score,clean_wordTokens)
        avg_wordPersent=int(get_avg_wordPersent(word_tokens,sent_tokens))
        complex_words=get_complexWords(clean_wordTokens)
        syllable_count=get_syallableCount(word_tokens)
        avg_word_length=int(get_avg_wordLen(clean_wordTokens))
        word_count=get_wordCount(clean_wordTokens)
        pronouns=get_pronouns(clean_wordTokens)
        avg_sentLength=average_sentence_length(sent_tokens)
        percentage_complex=get_complexPercentage(complex_words,word_tokens)
        fog_index= get_fogIndex(avg_sentLength,percentage_complex)
        
        output[i]=[data["URL_ID"][i],data["URL"][i],positive_score,negative_score,polarity,subjectivity_score,avg_sentLength,
                  percentage_complex,fog_index,avg_wordPersent,complex_words,word_count,syllable_count,pronouns,avg_word_length]
    except Exception as e:
        print(i)
        print(e)

In [6]:
output

[['blackassign0001',
  'https://insights.blackcoffer.com/rising-it-cities-and-its-impact-on-the-economy-environment-infrastructure-and-city-life-by-the-year-2040-2/',
  37,
  5,
  0.7619047437641728,
  0.05809128622670638,
  116.67796610169492,
  0.20150943396226415,
  46.751790214262876,
  22,
  267,
  596,
  2179,
  298,
  6],
 ['blackassign0002',
  'https://insights.blackcoffer.com/rising-it-cities-and-their-impact-on-the-economy-environment-infrastructure-and-city-life-in-future/',
  58,
  29,
  0.33333332950191574,
  0.08537782130973717,
  150.1875,
  0.3055555555555556,
  60.19722222222222,
  25,
  506,
  835,
  3119,
  570,
  7],
 ['blackassign0003',
  'https://insights.blackcoffer.com/internet-demands-evolution-communication-impact-and-2035s-alternative-pathways/',
  37,
  23,
  0.23333332944444451,
  0.0747198006541472,
  167.19565217391303,
  0.3582842724978974,
  67.02157457856438,
  25,
  426,
  668,
  2496,
  501,
  8],
 ['blackassign0004',
  'https://insights.blackcoffer.

In [7]:
cols=["URL_ID","URL","Positive Score","Negative Score","Polarity Score","Subjectivity Score","Average Sentence Length",
    "Percentage of Complex Words","Fog Index","Average Number of Words Per Sentence","Complex Word Count","Word Count","Syllable Per Word","Personal Pronouns","Average Word Length"]
info=pd.DataFrame(output,columns=cols)
info.to_excel("Output_Data.xlsx")