**IMPORTING NECESSARY LIBRARIES**

In [1]:
import os
import re
import string
import pandas as pd

import syllapy

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

**STOPWORDS AND PUNCTUATIONS**

In [2]:
# stopwords 
stop = stopwords.words('english')
print(stop)

# punctuations
punctuations = list(string.punctuation)
print(punctuations)


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [3]:
# getting list of files in StopWords directory
files = os.listdir('Input_data/StopWords')
print(files)

['StopWords_Names.txt', 'StopWords_Auditor.txt', 'StopWords_Geographic.txt', 'StopWords_Generic.txt', 'StopWords_Currencies.txt', 'StopWords_DatesandNumbers.txt', 'StopWords_GenericLong.txt']


In [4]:
stopwords = []
for file in files:
    path = 'Input_data/StopWords/' + file
    print('----------path : ', path)
    try:
        with open(path, 'r', encoding='UTF-8') as f:
            words = f.readlines()
            print(len(words))
            for i in words:
                stopwords.append(i.split()[0].strip('\n').lower())
        
    except UnicodeDecodeError:  # for file StopWords_Currencies.txt
        with open(path, 'r', encoding='latin-1') as f:
            words = f.readlines()
            print(len(words))
            for i in words:
                stopwords.append(i.split('|')[0].strip().lower())

print('length of stopwords : ', len(stopwords))


----------path :  Input_data/StopWords/StopWords_Names.txt
13014
----------path :  Input_data/StopWords/StopWords_Auditor.txt
8
----------path :  Input_data/StopWords/StopWords_Geographic.txt
199
----------path :  Input_data/StopWords/StopWords_Generic.txt
121
----------path :  Input_data/StopWords/StopWords_Currencies.txt
85
----------path :  Input_data/StopWords/StopWords_DatesandNumbers.txt
109
----------path :  Input_data/StopWords/StopWords_GenericLong.txt
571
length of stopwords :  14107


In [5]:
stopwords = stopwords + stop + punctuations
print(len(stopwords))

14318


**MASTERDICTIONARY**

In [6]:
# list of files in MasterDictionary directory
files = os.listdir('Input_data/MasterDictionary')
print(files)

['positive-words.txt', 'negative-words.txt']


In [7]:
positive_words, negative_words = [], []

path = 'Input_data/MasterDictionary/positive-words.txt'
with open(path, 'r', encoding='latin-1') as f:
    words = f.readlines()
    for i in words:
        positive_words.append(i.rstrip('\n').lower())
print('num of positive words : ', len(positive_words))


path = 'Input_data/MasterDictionary/negative-words.txt'
with open(path, 'r', encoding='latin-1') as f:
    words = f.readlines()
    for i in words:
        negative_words.append(i.rstrip('\n').lower())
print('num of negative words : ', len(negative_words))

num of positive words :  2006
num of negative words :  4783


In [8]:
# stopwords ---> contains list of stopwords from nltk library, punctuations, and stopwords provided
# positive_words ---> contains list of positive words
# negative_words ---> contains list of negative words

**FUNCTIONS**

In [9]:
# function to clean words
def clean(words):
    clean_word = [w for w in words if not w.lower() in stopwords]
    return clean_word


# function to calculate positive score, negative score, polarity score and subjectivity score
def sentimental_analysis(clean_word):
    result = []
    positive_score, negative_score = 0, 0

    for i in clean_word:
        if i.lower() in positive_words:
            positive_score += 1
        elif i.lower() in negative_words:
            negative_score += 1

    result.append(positive_score)
    result.append(negative_score)

    polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)
    subjectivity_score = (positive_score + negative_score) / ((len(clean_word)) + 0.000001)

    result.append(polarity_score)
    result.append(subjectivity_score)

    return result


# function to count syllables
def count_syllables(word):     
    return syllapy.count(word)


# function to count complex word
def count_complex_words(words):
    count = 0
    for word in words:
        n = count_syllables(word.lower())
        if n > 2:
            count += 1
    return count


# function for readability analysis (average sentence length, percentage of complex words, and fog index)
def readability(words, sentences, cnt_complex_word):
    avg_sentence_len = len(words) / len(sentences)
    percent_complex_word = cnt_complex_word / len(words)
    fog_index = 0.4 * (avg_sentence_len + percent_complex_word)

    return [avg_sentence_len, percent_complex_word, fog_index]


# funtion to count total syllables
def count_total_syllable(words):
    count = 0
    for word in words:
        cnt = count_syllables(word.lower())
        count += cnt
    return count


# function to count personal pronouns
def count_personal_pronouns(text):
    pattern = r'\b(I|we|my|ours|us)\b'
    regex = re.compile(pattern, flags=re.IGNORECASE)
    matches = regex.findall(text)
    count = len(matches)
    return count


# function to calculate average word length
def calculate_avg_word_length(words):
    cnt = 0
    for word in words:
        cnt += len(word)
    
    return cnt / len(words)

**FOR ONE FILE**

In [10]:
# access the data scrapped
path = 'Text_File/' + 'blackassign0001.txt'
with open(path, 'r') as f:
    content = f.readlines()
    content[0] = content[0].rstrip('\n')
    # print(content)
    text = ' '.join(content)
    print(text)

Rising IT cities and its impact on the economy, environment, infrastructure, and city life by the year 2040. We have seen a huge development and dependence of people on technology in recent years. We have also seen the development of AI and ChatGPT in recent years. So it is a normal thing that we will become fully dependent on technology by 2040. Information technology will be a major power for all the developing nations. As a member of a developing nation, India is rapidly growing its IT base. It has also grown some IT cities which will be the major control centres for Information technology by 2040. Rising IT cities Kolkata:- Kolkata in West Bengal is an emerging major IT hub. The new Kolkata i.e. Saltlake Sector  5, New town, Rajarhat area of Kolkata is a major IT hub. The government is giving the software companies land at almost free of cost to set up the companies there. Many large companies like Google, Microsoft, IBM, Infosys and others have set up their companies here. Kolkata

In [11]:
token_sent = sent_tokenize(text)
token_word = word_tokenize(text)
print(token_sent, '\n', len(token_sent))
print(token_word, '\n', len(token_word))

['Rising IT cities and its impact on the economy, environment, infrastructure, and city life by the year 2040.', 'We have seen a huge development and dependence of people on technology in recent years.', 'We have also seen the development of AI and ChatGPT in recent years.', 'So it is a normal thing that we will become fully dependent on technology by 2040.', 'Information technology will be a major power for all the developing nations.', 'As a member of a developing nation, India is rapidly growing its IT base.', 'It has also grown some IT cities which will be the major control centres for Information technology by 2040.', 'Rising IT cities Kolkata:- Kolkata in West Bengal is an emerging major IT hub.', 'The new Kolkata i.e.', 'Saltlake Sector\xa0 5, New town, Rajarhat area of Kolkata is a major IT hub.', 'The government is giving the software companies land at almost free of cost to set up the companies there.', 'Many large companies like Google, Microsoft, IBM, Infosys and others hav

In [12]:
# removing stop words from token words
# clean_word = [w for w in token_word if not w.lower() in stopwords]
clean_word = clean(token_word)
print('clean_word : ', len(clean_word))
print(clean_word)

clean_word :  149
['Rising', 'cities', 'impact', 'economy', 'environment', 'infrastructure', 'life', '2040', 'huge', 'development', 'dependence', 'people', 'technology', 'recent', 'years', 'development', 'ChatGPT', 'recent', 'years', 'normal', 'thing', 'fully', 'dependent', 'technology', '2040', 'Information', 'technology', 'developing', 'member', 'developing', 'rapidly', 'growing', 'base', 'grown', 'cities', 'control', 'centres', 'Information', 'technology', '2040', 'Rising', 'cities', 'Kolkata', 'Kolkata', 'Bengal', 'emerging', 'hub', 'Kolkata', 'i.e', 'Saltlake', 'Sector', '5', 'town', 'Rajarhat', 'area', 'Kolkata', 'hub', 'government', 'giving', 'software', 'companies', 'cost', 'set', 'companies', 'companies', 'Google', 'Microsoft', 'IBM', 'Infosys', 'set', 'companies', 'Kolkata', 'market', 'base', 'billions', 'dollars', 'great', 'job', 'boosting', 'national', 'economy', 'Impact', 'Economy', 'huge', 'impact', 'rising', 'cities', 'economy', 'effects', 'are-', 'Impact', 'Environment'

In [13]:
# calculate positive score, negative score, polarity score and subjectivity score
score = sentimental_analysis(clean_word)
print(score)

POSITIVE_SCORE = score[0]
NEGATIVE_SCORE = score[1]
POLARITY_SCORE = score[2]
SUBJECTIVITY_SCORE = score[3]

[2, 1, 0.33333322222225925, 0.020134228052790415]


In [14]:
# counting number of complex words
COMPLEX_WORDS = count_complex_words(clean_word)
print(COMPLEX_WORDS)

48


In [15]:
# Average Sentence Length = the number of words / the number of sentences
# Percentage of Complex words = the number of complex words / the number of words 
# Fog Index = 0.4 * (Average Sentence Length + Percentage of Complex words)

# calculating average sentence length, percentage of complex words, and fog index
results = readability(clean_word, token_sent, COMPLEX_WORDS)
print(results)

AVERAGE_SENTENCE_LENGTH = results[0]
PERCENTAGE_OF_COMPLEX_WORDS = results[1]
FOG_INDEX = results[2]

[5.96, 0.3221476510067114, 2.512859060402685]


In [16]:
# Average Number of Words Per Sentence = the total number of words / the total number of sentences
AVERAGE_NUMBER_OF_WORDS_PER_SENTENCE = len(token_word) / len(token_sent)
print(AVERAGE_NUMBER_OF_WORDS_PER_SENTENCE)

15.8


In [17]:
# word count is total cleaned words present in the text 
WORD_COUNT = len(clean_word)
print(WORD_COUNT)

149


In [18]:
# counting number of syllables
SYLLABLE_COUNT_PER_WORD = count_total_syllable(clean_word)
print(SYLLABLE_COUNT_PER_WORD)

321


In [19]:
# calculating personal pronouns
PERSONAL_PRONOUNS = count_personal_pronouns(text)
print(PERSONAL_PRONOUNS)

4


In [20]:
# calculating average word length
AVERAGE_WORD_LENGTH = calculate_avg_word_length(clean_word)
print(AVERAGE_WORD_LENGTH)

6.651006711409396


**FOR ALL TEXT FILES**

In [21]:
# list of all the file id in input.xlsx file
input = pd.read_excel('Input_data/Input.xlsx')
url_id = input['URL_ID'].to_list()
urls = input['URL'].to_list()
print(len(url_id), url_id)
print(len(urls), urls)

100 ['blackassign0001', 'blackassign0002', 'blackassign0003', 'blackassign0004', 'blackassign0005', 'blackassign0006', 'blackassign0007', 'blackassign0008', 'blackassign0009', 'blackassign0010', 'blackassign0011', 'blackassign0012', 'blackassign0013', 'blackassign0014', 'blackassign0015', 'blackassign0016', 'blackassign0017', 'blackassign0018', 'blackassign0019', 'blackassign0020', 'blackassign0021', 'blackassign0022', 'blackassign0023', 'blackassign0024', 'blackassign0025', 'blackassign0026', 'blackassign0027', 'blackassign0028', 'blackassign0029', 'blackassign0030', 'blackassign0031', 'blackassign0032', 'blackassign0033', 'blackassign0034', 'blackassign0035', 'blackassign0036', 'blackassign0037', 'blackassign0038', 'blackassign0039', 'blackassign0040', 'blackassign0041', 'blackassign0042', 'blackassign0043', 'blackassign0044', 'blackassign0045', 'blackassign0046', 'blackassign0047', 'blackassign0048', 'blackassign0049', 'blackassign0050', 'blackassign0051', 'blackassign0052', 'blacka

In [22]:
# file 'Output Data Structure.xlsx' represents the structure for output
output = pd.read_excel(r'Output_format/Output Data Structure.xlsx')
print(output.shape)
cols = list(output.columns)
print(cols)

(100, 15)
['URL_ID', 'URL', 'POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE', 'SUBJECTIVITY SCORE', 'AVG SENTENCE LENGTH', 'PERCENTAGE OF COMPLEX WORDS', 'FOG INDEX', 'AVG NUMBER OF WORDS PER SENTENCE', 'COMPLEX WORD COUNT', 'WORD COUNT', 'SYLLABLE PER WORD', 'PERSONAL PRONOUNS', 'AVG WORD LENGTH']


In [23]:
# creating dataframe
df = pd.DataFrame(columns=cols)
print(len(df))

0


In [24]:
# accessing all files in folder 'Text_File'
files = os.listdir('Text_File')
print(len(files), files)

98 ['blackassign0019.txt', 'blackassign0031.txt', 'blackassign0025.txt', 'blackassign0024.txt', 'blackassign0030.txt', 'blackassign0018.txt', 'blackassign0026.txt', 'blackassign0032.txt', 'blackassign0033.txt', 'blackassign0027.txt', 'blackassign0023.txt', 'blackassign0037.txt', 'blackassign0022.txt', 'blackassign0034.txt', 'blackassign0020.txt', 'blackassign0008.txt', 'blackassign0009.txt', 'blackassign0021.txt', 'blackassign0035.txt', 'blackassign0091.txt', 'blackassign0085.txt', 'blackassign0052.txt', 'blackassign0046.txt', 'blackassign0047.txt', 'blackassign0053.txt', 'blackassign0084.txt', 'blackassign0090.txt', 'blackassign0086.txt', 'blackassign0092.txt', 'blackassign0079.txt', 'blackassign0045.txt', 'blackassign0051.txt', 'blackassign0050.txt', 'blackassign0044.txt', 'blackassign0078.txt', 'blackassign0093.txt', 'blackassign0087.txt', 'blackassign0083.txt', 'blackassign0097.txt', 'blackassign0040.txt', 'blackassign0054.txt', 'blackassign0068.txt', 'blackassign0069.txt', 'blacka

In [25]:
# calculating all parameters and adding into dataframe
for id, link in zip(url_id, urls):
    filename = str(id) + '.txt'
    if filename not in files:
        new_row = {
            'URL_ID': id,
            'URL': link,
            'POSITIVE SCORE': -1,
            'NEGATIVE SCORE': -1, 
            'POLARITY SCORE': -1, 
            'SUBJECTIVITY SCORE': -1, 
            'AVG SENTENCE LENGTH': -1, 
            'PERCENTAGE OF COMPLEX WORDS': -1, 
            'FOG INDEX': -1, 
            'AVG NUMBER OF WORDS PER SENTENCE': -1, 
            'COMPLEX WORD COUNT': -1, 
            'WORD COUNT': -1, 
            'SYLLABLE PER WORD': -1, 
            'PERSONAL PRONOUNS': -1, 
            'AVG WORD LENGTH':-1 
        }
        df.loc[len(df)] = new_row
        continue

    path = 'Text_File/' + filename
    with open(path, 'r') as f:
        content = f.readlines()
        content[0] = content[0].rstrip('\n')
        text = ' '.join(content)

    # tokenization
    token_sent = sent_tokenize(text)
    token_word = word_tokenize(text)

    # removing stopwords
    clean_word = clean(token_word)
    
    # calculating
    score = sentimental_analysis(clean_word)
    POSITIVE_SCORE = score[0]
    NEGATIVE_SCORE = score[1]
    POLARITY_SCORE = score[2]
    SUBJECTIVITY_SCORE = score[3]

    COMPLEX_WORDS = count_complex_words(clean_word)

    results = readability(clean_word, token_sent, COMPLEX_WORDS)
    AVERAGE_SENTENCE_LENGTH = results[0]
    PERCENTAGE_OF_COMPLEX_WORDS = results[1]
    FOG_INDEX = results[2]

    AVERAGE_NUMBER_OF_WORDS_PER_SENTENCE = len(token_word) / len(token_sent)

    WORD_COUNT = len(clean_word)

    SYLLABLE_COUNT_PER_WORD = count_total_syllable(clean_word)

    PERSONAL_PRONOUNS = count_personal_pronouns(text)

    AVERAGE_WORD_LENGTH = calculate_avg_word_length(clean_word)

    # adding into dataframe
    new_row = {
        'URL_ID': id,
        'URL': link,
        'POSITIVE SCORE': POSITIVE_SCORE,
        'NEGATIVE SCORE': NEGATIVE_SCORE, 
        'POLARITY SCORE': POLARITY_SCORE, 
        'SUBJECTIVITY SCORE': SUBJECTIVITY_SCORE, 
        'AVG SENTENCE LENGTH': AVERAGE_SENTENCE_LENGTH, 
        'PERCENTAGE OF COMPLEX WORDS': PERCENTAGE_OF_COMPLEX_WORDS, 
        'FOG INDEX': FOG_INDEX, 
        'AVG NUMBER OF WORDS PER SENTENCE': AVERAGE_NUMBER_OF_WORDS_PER_SENTENCE, 
        'COMPLEX WORD COUNT': COMPLEX_WORDS, 
        'WORD COUNT': WORD_COUNT, 
        'SYLLABLE PER WORD': SYLLABLE_COUNT_PER_WORD, 
        'PERSONAL PRONOUNS': PERSONAL_PRONOUNS, 
        'AVG WORD LENGTH':AVERAGE_WORD_LENGTH 
    }

    df.loc[len(df)] = new_row
    print(df.shape, '\t', path, '\t', 'completed....')

(1, 15) 	 Text_File/blackassign0001.txt 	 completed....
(2, 15) 	 Text_File/blackassign0002.txt 	 completed....
(3, 15) 	 Text_File/blackassign0003.txt 	 completed....
(4, 15) 	 Text_File/blackassign0004.txt 	 completed....
(5, 15) 	 Text_File/blackassign0005.txt 	 completed....
(6, 15) 	 Text_File/blackassign0006.txt 	 completed....
(7, 15) 	 Text_File/blackassign0007.txt 	 completed....
(8, 15) 	 Text_File/blackassign0008.txt 	 completed....
(9, 15) 	 Text_File/blackassign0009.txt 	 completed....
(10, 15) 	 Text_File/blackassign0010.txt 	 completed....
(11, 15) 	 Text_File/blackassign0011.txt 	 completed....
(12, 15) 	 Text_File/blackassign0012.txt 	 completed....
(13, 15) 	 Text_File/blackassign0013.txt 	 completed....
(14, 15) 	 Text_File/blackassign0014.txt 	 completed....
(15, 15) 	 Text_File/blackassign0015.txt 	 completed....
(16, 15) 	 Text_File/blackassign0016.txt 	 completed....
(17, 15) 	 Text_File/blackassign0017.txt 	 completed....
(18, 15) 	 Text_File/blackassign0018.txt

In [26]:
# saving output
df.to_excel('Output.xlsx', index=False)