## Text Extraction

In [1]:
import pandas as pd

In [2]:
import openpyxl
import os
import requests
from bs4 import BeautifulSoup

# Specify the input file path
input_file = 'Input.xlsx'

# Load the workbook and select the active worksheet
workbook = openpyxl.load_workbook(input_file)
worksheet = workbook.active

# Get the index of the columns containing the link and URL ID
link_column = None
url_id_column = None
for col_idx, cell in enumerate(worksheet[1]):
    if cell.value == 'URL':
        link_column = col_idx
    elif cell.value == 'URL_ID':
        url_id_column = col_idx
if link_column is None or url_id_column is None:
    raise ValueError('Could not find columns for link and URL ID')

if not os.path.exists('output'):
    os.makedirs('output')

# Loop over each row in the worksheet and extract the link and URL ID
for row in worksheet.iter_rows(min_row=2, values_only=True):
    try:
        link = row[link_column]
        url_id = row[url_id_column]

        # Send a GET request to the URL and save the content to a file
        response = requests.get(link)
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract the title
        title_element = soup.find('h1', {'class': 'entry-title'})
        title = title_element.text.strip()

        # Extract the text content
        article_body = soup.find('div', {'class': 'td-post-content tagdiv-type'})
        paragraphs = article_body.find_all('p')
        text = '\n'.join([p.text.strip() for p in paragraphs])

        # Save the title and text content to a file
        with open(os.path.join('output', str(url_id) + '.txt'), 'w', encoding='utf-8') as f:
            f.write(title + '\n\n' + text)
    except Exception as e:
        print(f"Error processing URL: {link}. {str(e)}")

Error processing URL: https://insights.blackcoffer.com/ai-in-healthcare-to-improve-patient-outcomes/. 'NoneType' object has no attribute 'text'
Error processing URL: https://insights.blackcoffer.com/how-neural-networks-can-be-applied-in-various-areas-in-the-future/. 'NoneType' object has no attribute 'text'
Error processing URL: https://insights.blackcoffer.com/future-of-work-how-ai-has-entered-the-workplace/. 'NoneType' object has no attribute 'text'
Error processing URL: https://insights.blackcoffer.com/covid-19-environmental-impact-for-the-future/. 'NoneType' object has no attribute 'text'
Error processing URL: https://insights.blackcoffer.com/human-rights-outlook/. 'NoneType' object has no attribute 'text'
Error processing URL: https://insights.blackcoffer.com/how-voice-search-makes-your-business-a-successful-business/. 'NoneType' object has no attribute 'text'
Error processing URL: https://insights.blackcoffer.com/estimating-the-impact-of-covid-19-on-the-world-of-work-3/. 'NoneTyp

Some of the links were not working(Web page not found) So I used try and except to tackle that problem

## Stopwords

In [3]:
import os

directory = r"C:\Users\User\Downloads\StopWords"
words_list = []

# Loop through all files in the directory
for filename in os.listdir(directory):
    filepath = os.path.join(directory, filename)
    if os.path.isfile(filepath):
        # Open the file and read the lines
        with open(filepath, 'r') as file:
            lines = file.readlines()
            for line in lines:
                # Split the line by whitespace and take the first word
                first_word = line.split()[0]
                # Append the first word to the list
                words_list.append(first_word)

print(words_list[:10])

['ERNST', 'YOUNG', 'DELOITTE', 'TOUCHE', 'KPMG', 'PRICEWATERHOUSECOOPERS', 'PRICEWATERHOUSE', 'COOPERS', 'AFGHANI', 'ARIARY']


Now that we have all the stop words in one list we can just use that and remove the stopwords in the text files.

In [4]:
import os
import chardet

directory = r"C:\Users\User\Downloads\output"

# Loop through all files in the directory
for filename in os.listdir(directory):
    filepath = os.path.join(directory, filename)
    if os.path.isfile(filepath):
        with open(filepath, 'rb') as f:
            encoding = chardet.detect(f.read())['encoding']
        # Open the file and read the lines
        with open(filepath, 'r',encoding=encoding) as file:
            lines = file.readlines()
        # Open the file again in write mode
        with open(filepath, 'w', encoding=encoding) as file:
            # Loop through the lines and remove words from the list
            for line in lines:
                words = line.strip().split()
                # Remove words from the list
                new_words = [word for word in words if word not in words_list]
                # Write the new line to the file
                file.write(' '.join(new_words) + '\n')

## Creating a Positive and Negative Dictionary of words

In [5]:
p_file = r"C:\Users\User\Downloads\positive-words.txt"
with open(p_file, 'r') as f:
    p = f.readlines()
positive = [item.replace('\n', '') for item in p]
positive[:10]

['a+',
 'abound',
 'abounds',
 'abundance',
 'abundant',
 'accessable',
 'accessible',
 'acclaim',
 'acclaimed',
 'acclamation']

In [6]:
n_file = r"C:\Users\User\Downloads\negative-words.txt"
with open(n_file, 'r') as f:
    n = f.readlines()
negative = [item.replace('\n', '') for item in n]
negative[:10]

['2-faced',
 '2-faces',
 'abnormal',
 'abolish',
 'abominable',
 'abominably',
 'abominate',
 'abomination',
 'abort',
 'aborted']

In [7]:
#calculating the positive score
def calculate_positive_score(text):
    words = text.split()
    return sum([1 for word in words if word in positive])

def calculate_negative_score(text):
    words = text.split()
    return sum([-1*-1 for word in words if word in negative]) 

In [8]:
directory = r"C:\Users\User\Downloads\output"
# Specify the input file path
input_file = 'Input.xlsx'
link = []
url_id = []
# Load the workbook and select the active worksheet
workbook = openpyxl.load_workbook(input_file)
worksheet = workbook.active
# results = pd.DataFrame(columns=['URL_ID', 'URL', 'POSITIVE SCORE'])
for row in worksheet.iter_rows(min_row=2, values_only=True):
    link.append(row[link_column])
    url_id.append(row[url_id_column])
#     results = results.append({'URL_ID': url_id, 'URL':link},ignore_index=True)
link = pd.DataFrame(link, columns=['URL'])
url_id = pd.DataFrame(url_id, columns=['URL_ID'])

In [9]:
positive_score = []
negative_score = []
polarity_score = []
for filename in os.listdir(directory):
    if filename.endswith('.txt'):
        filepath = os.path.join(directory, filename)
        with open(filepath, 'rb') as f:
            encoding = chardet.detect(f.read())['encoding']
        with open(filepath, 'r', encoding=encoding) as f:
            text = f.read()
            positive_score.append(calculate_positive_score(text))
            negative_score.append(calculate_negative_score(text))
#             polarity_score.append(calculate_polarity(text))
positive_score = pd.DataFrame(positive_score)
negative_score = pd.DataFrame(negative_score)
results = pd.concat([url_id, link, positive_score, negative_score], axis=1, ignore_index=True)

In [10]:
results.columns = ['URL_ID', 'URL', 'POSITIVE SCORE', 'NEGATIVE SCORE']
results.head()

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE
0,37.0,https://insights.blackcoffer.com/ai-in-healthc...,3.0,2.0
1,38.0,https://insights.blackcoffer.com/what-if-the-c...,23.0,37.0
2,39.0,https://insights.blackcoffer.com/what-jobs-wil...,6.0,22.0
3,40.0,https://insights.blackcoffer.com/will-machine-...,26.0,48.0
4,41.0,https://insights.blackcoffer.com/will-ai-repla...,21.0,31.0


In [11]:
results['POLARITY SCORE'] = (results['POSITIVE SCORE'] - results['NEGATIVE SCORE']) / (results['POSITIVE SCORE'] + results['NEGATIVE SCORE']) * 0.000001

In [12]:
results.head()

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE
0,37.0,https://insights.blackcoffer.com/ai-in-healthc...,3.0,2.0,2e-07
1,38.0,https://insights.blackcoffer.com/what-if-the-c...,23.0,37.0,-2.333333e-07
2,39.0,https://insights.blackcoffer.com/what-jobs-wil...,6.0,22.0,-5.714286e-07
3,40.0,https://insights.blackcoffer.com/will-machine-...,26.0,48.0,-2.972973e-07
4,41.0,https://insights.blackcoffer.com/will-ai-repla...,21.0,31.0,-1.923077e-07


In [13]:
for filename in os.listdir(directory):
    if filename.endswith('.txt'):
        filepath = os.path.join(directory, filename)
        with open(filepath, 'rb') as f:
            encoding = chardet.detect(f.read())['encoding']
        with open(filepath, 'r', encoding=encoding) as f:
            text = f.read()
            words = text.split()
        results['SUBJECTIVE SCORE'] = (results['POSITIVE SCORE'] + results['NEGATIVE SCORE']) / (len(words) + 0.000001) 
results.head()

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVE SCORE
0,37.0,https://insights.blackcoffer.com/ai-in-healthc...,3.0,2.0,2e-07,0.009434
1,38.0,https://insights.blackcoffer.com/what-if-the-c...,23.0,37.0,-2.333333e-07,0.113208
2,39.0,https://insights.blackcoffer.com/what-jobs-wil...,6.0,22.0,-5.714286e-07,0.05283
3,40.0,https://insights.blackcoffer.com/will-machine-...,26.0,48.0,-2.972973e-07,0.139623
4,41.0,https://insights.blackcoffer.com/will-ai-repla...,21.0,31.0,-1.923077e-07,0.098113


In [14]:
import nltk
directory = r"C:\Users\User\Downloads\output"
avg_sentence_length = []

# Loop through each file in the folder
for filename in os.listdir(directory):
    # Check if the file is a text file
    if filename.endswith('.txt'):
        filepath = os.path.join(directory, filename)
        with open(filepath, 'rb') as f:
            encoding = chardet.detect(f.read())['encoding']
        # Read the contents of the file
        with open(os.path.join(directory, filename), 'r', encoding=encoding) as file:
            text = file.read()
        
        # Tokenize the text into sentences
        sentences = nltk.sent_tokenize(text)
        
        # Count the number of sentences in the file
        sentence_count = len(sentences)
        
        # Count the number of words in the file
        words = nltk.word_tokenize(text)
        word_count = len(words)
        
        # Calculate the average sentence length
        avg_length = word_count / sentence_count
        
        # Append the average sentence length to the overall list
        avg_sentence_length.append(avg_length)

# Create a DataFrame from the average sentence length list
df = pd.DataFrame({'Average Sentence Length': avg_sentence_length})

In [15]:
results['AVG SENTENCE LENGTH'] = df

In [16]:
def count_syllables(word):
    vowels = "aeiouy"
    count = 0
    for index in range(len(word)):
        if word[index] in vowels:
            if index == 0:
                count += 1
            elif word[index - 1] not in vowels:
                count += 1
    if word.endswith("e"):
        count -= 1
    if count == 0:
        count += 1
    return count

def find_complex_words(file_path):
    with open(file_path, 'rb') as f:
        encoding = chardet.detect(f.read())['encoding']
    with open(file_path, "r", encoding=encoding) as file:
        text = file.read()
    words = re.findall(r"\b\w+\b", text)
    complex_words = []
    for word in words:
        if count_syllables(word) > 2:
            complex_words.append(word)
    return complex_words

In [17]:
!pip install syllables

Defaulting to user installation because normal site-packages is not writeable


In [18]:
import syllables
directory = r"C:\Users\User\Downloads\output"
percent_complex_words = []

# Loop through each file in the folder
for filename in os.listdir(directory):
    # Check if the file is a text file
    if filename.endswith('.txt'):
        filepath = os.path.join(directory, filename)
        with open(filepath, 'rb') as f:
            encoding = chardet.detect(f.read())['encoding']
        # Read the contents of the file
        with open(os.path.join(directory, filename), 'r', encoding=encoding) as file:
            text = file.read()
        
        # Split the text into words
        words = text.split()
        
        # Count the number of words in the file
        word_count = len(words)
        
        # Count the number of complex words in the file
        complex_word_count = sum(1 for word in words if syllables.estimate(str(word)) > 2)
        
        # Calculate the percentage of complex words
        percent_complex = (complex_word_count / word_count) * 100
        
        # Append the percentage of complex words to the overall list
        percent_complex_words.append(percent_complex)

# Create a DataFrame from the percentage of complex words list
df = pd.DataFrame({'Percent Complex Words': percent_complex_words})

In [19]:
results['PERCENTAGE OF COMPLEX WORDS'] = df

In [20]:
results['FOG INDEX'] = 0.4 * (results['AVG SENTENCE LENGTH'] + results['PERCENTAGE OF COMPLEX WORDS'])

In [21]:
import re
import nltk
directory = r"C:\Users\User\Downloads\output"
avg_words_per_sentence = []

# Loop through each file in the folder
for filename in os.listdir(directory):
    # Check if the file is a text file
    if filename.endswith('.txt'):
        filepath = os.path.join(directory, filename)
        with open(filepath, 'rb') as f:
            encoding = chardet.detect(f.read())['encoding']
        # Read the contents of the file
        with open(os.path.join(directory, filename), 'r',encoding=encoding) as file:
            text = file.read()
        
        # Tokenize the text into sentences
        sentences = nltk.sent_tokenize(text)
        
        # Count the total number of sentences in the file
        total_sentence_count = len(sentences)
        
        # Count the total number of words in the file
        words = nltk.word_tokenize(text)
        total_word_count = len(words)
        
        # Calculate the average number of words per sentence
        avg_words = total_word_count / total_sentence_count
        
        # Append the average number of words per sentence to the overall list
        avg_words_per_sentence.append(avg_words)

# Create a DataFrame from the average number of words per sentence list
df = pd.DataFrame({'Average Words per Sentence': avg_words_per_sentence})

In [22]:
results['AVERAGE NUMBER OF WORDS PER SENTENCE'] = df

In [23]:
results.head()

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVE SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVERAGE NUMBER OF WORDS PER SENTENCE
0,37.0,https://insights.blackcoffer.com/ai-in-healthc...,3.0,2.0,2e-07,0.009434,11.4,25.0,14.56,11.4
1,38.0,https://insights.blackcoffer.com/what-if-the-c...,23.0,37.0,-2.333333e-07,0.113208,11.8,35.528596,18.931438,11.8
2,39.0,https://insights.blackcoffer.com/what-jobs-wil...,6.0,22.0,-5.714286e-07,0.05283,13.457143,36.266667,19.889524,13.457143
3,40.0,https://insights.blackcoffer.com/will-machine-...,26.0,48.0,-2.972973e-07,0.139623,14.28,44.354839,23.453935,14.28
4,41.0,https://insights.blackcoffer.com/will-ai-repla...,21.0,31.0,-1.923077e-07,0.098113,15.868421,35.010482,20.351561,15.868421


In [24]:
import os
import re

# Define a regular expression pattern to match syllables
# We'll use a simple pattern here that just counts vowels
syllable_pattern = re.compile("[aeiouy]+", re.IGNORECASE)

# Define a function to count the number of syllables in a word
def count_syllables(word):
    return len(syllable_pattern.findall(word))

# Define a function to count the number of complex words in a file
def count_complex_words(filename):
    with open(filename, 'rb') as f:
        encoding = chardet.detect(f.read())['encoding']
    # Open the file for reading
    with open(filename, "r",encoding=encoding) as file:
        # Read the file contents into a string
        contents = file.read()
        # Split the contents into words
        words = contents.split()
        # Count the number of complex words
        complex_word_count = sum(1 for word in words if count_syllables(str(word)) > 2)
        # Return the count
        return complex_word_count

# Define a function to count the number of complex words in a folder
def count_complex_words_in_folder(folder_path):
    # Initialize the results list
    result = []
    # Iterate over all files in the folder
    for filename in os.listdir(folder_path):
        # If the file is a text file, count the complex words
        if filename.endswith(".txt"):
            file_path = os.path.join(folder_path, filename)
            complex_word_count = count_complex_words(file_path)
            # Add the result to the list
            result.append({"filename": filename, "complex_word_count": complex_word_count})
    # Create a DataFrame from the results
    df = pd.DataFrame(result)
    # Return the DataFrame
    return df

In [25]:
folder_path = r"C:\Users\User\Downloads\output"
df = count_complex_words_in_folder(folder_path)
results['COMPLEX WORD COUNT'] = df['complex_word_count']

In [26]:
import string
directory = r"C:\Users\User\Downloads\output"
word_counts = []

# Loop through each file in the folder
for filename in os.listdir(directory):
    # Check if the file is a text file
    if filename.endswith('.txt'):
        filepath = os.path.join(directory, filename)
        with open(filepath, 'rb') as f:
            encoding = chardet.detect(f.read())['encoding']
        # Read the contents of the file
        with open(os.path.join(directory, filename), 'r', encoding=encoding) as file:
            text = file.read()
        
        # Split the text into words
        words = text.split()
        
        # Count the number of words in the file
        word_count = len(words)
        
        # Append the word count to the overall list
        word_counts.append(word_count)

# Create a DataFrame from the word count list
df = pd.DataFrame({'Word Count': word_counts})

# Print the DataFrame
print(df)

     Word Count
0            92
1           577
2           375
3           620
4           477
..          ...
99          569
100         996
101         509
102         595
103         530

[104 rows x 1 columns]


In [27]:
results['WORD COUNT'] = df

In [28]:
results.head()

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVE SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVERAGE NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT
0,37.0,https://insights.blackcoffer.com/ai-in-healthc...,3.0,2.0,2e-07,0.009434,11.4,25.0,14.56,11.4,23.0,92.0
1,38.0,https://insights.blackcoffer.com/what-if-the-c...,23.0,37.0,-2.333333e-07,0.113208,11.8,35.528596,18.931438,11.8,205.0,577.0
2,39.0,https://insights.blackcoffer.com/what-jobs-wil...,6.0,22.0,-5.714286e-07,0.05283,13.457143,36.266667,19.889524,13.457143,136.0,375.0
3,40.0,https://insights.blackcoffer.com/will-machine-...,26.0,48.0,-2.972973e-07,0.139623,14.28,44.354839,23.453935,14.28,275.0,620.0
4,41.0,https://insights.blackcoffer.com/will-ai-repla...,21.0,31.0,-1.923077e-07,0.098113,15.868421,35.010482,20.351561,15.868421,167.0,477.0


In [29]:
!pip install pyphen

Defaulting to user installation because normal site-packages is not writeable


In [30]:
import os
import pandas as pd
import pyphen

# Create a Pyphen object for English language
dic = pyphen.Pyphen(lang='en')

# Path to the folder containing the text files
folder_path = r"C:\Users\User\Downloads\output"

# List to store the syllable count per file
syllable_counts = []

# Loop through each file in the folder
for filename in os.listdir(folder_path):
    # Check if the file is a text file
    if filename.endswith('.txt'):
        filepath = os.path.join(directory, filename)
        with open(filepath, 'rb') as f:
            encoding = chardet.detect(f.read())['encoding']
        # Read the contents of the file
        with open(os.path.join(folder_path, filename), 'r', encoding=encoding) as file:
            text = file.read()
        
        # Split the text into words
        words = text.split()
        
        # List to store the syllable count per word in the file
        syllables_per_word = []
        
        # Loop through each word in the file
        for word in words:
            # Count the syllables of the word
            syllables = len(dic.inserted(word).split('-'))
            
            # Append the syllable count to the list
            syllables_per_word.append(syllables)
        
        # Append the syllable count list to the overall list
        syllable_counts.append(syllables_per_word)

# Create a DataFrame from the syllable count list
syllable_count = pd.DataFrame({'Syllable Counts': syllable_counts})

# Print the DataFrame
print(syllable_count)

                                       Syllable Counts
0    [1, 2, 2, 1, 2, 2, 2, 5, 2, 2, 1, 1, 1, 1, 2, ...
1    [2, 2, 2, 2, 1, 2, 3, 3, 1, 1, 2, 4, 3, 1, 2, ...
2    [1, 4, 4, 1, 3, 3, 2, 1, 2, 1, 3, 1, 1, 2, 1, ...
3    [1, 3, 4, 4, 4, 3, 2, 3, 1, 2, 3, 2, 3, 1, 2, ...
4    [2, 3, 3, 2, 1, 4, 3, 2, 1, 1, 1, 1, 1, 2, 1, ...
..                                                 ...
99   [2, 2, 1, 1, 1, 1, 1, 1, 4, 2, 1, 1, 1, 3, 1, ...
100  [2, 2, 2, 2, 1, 2, 3, 2, 2, 1, 2, 1, 4, 3, 3, ...
101  [3, 2, 4, 2, 1, 2, 1, 2, 1, 6, 1, 1, 1, 2, 1, ...
102  [2, 2, 1, 1, 2, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, ...
103  [4, 2, 3, 1, 1, 3, 5, 3, 2, 4, 1, 2, 1, 1, 2, ...

[104 rows x 1 columns]


In [31]:
results['SYLLABLE PER WORD'] = syllable_count

In [32]:
results.head()

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVE SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVERAGE NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD
0,37.0,https://insights.blackcoffer.com/ai-in-healthc...,3.0,2.0,2e-07,0.009434,11.4,25.0,14.56,11.4,23.0,92.0,"[1, 2, 2, 1, 2, 2, 2, 5, 2, 2, 1, 1, 1, 1, 2, ..."
1,38.0,https://insights.blackcoffer.com/what-if-the-c...,23.0,37.0,-2.333333e-07,0.113208,11.8,35.528596,18.931438,11.8,205.0,577.0,"[2, 2, 2, 2, 1, 2, 3, 3, 1, 1, 2, 4, 3, 1, 2, ..."
2,39.0,https://insights.blackcoffer.com/what-jobs-wil...,6.0,22.0,-5.714286e-07,0.05283,13.457143,36.266667,19.889524,13.457143,136.0,375.0,"[1, 4, 4, 1, 3, 3, 2, 1, 2, 1, 3, 1, 1, 2, 1, ..."
3,40.0,https://insights.blackcoffer.com/will-machine-...,26.0,48.0,-2.972973e-07,0.139623,14.28,44.354839,23.453935,14.28,275.0,620.0,"[1, 3, 4, 4, 4, 3, 2, 3, 1, 2, 3, 2, 3, 1, 2, ..."
4,41.0,https://insights.blackcoffer.com/will-ai-repla...,21.0,31.0,-1.923077e-07,0.098113,15.868421,35.010482,20.351561,15.868421,167.0,477.0,"[2, 3, 3, 2, 1, 4, 3, 2, 1, 1, 1, 1, 1, 2, 1, ..."


In [33]:
# List to store the word counts per file
word_counts = []

# Loop through each file in the folder
for filename in os.listdir(folder_path):
    # Check if the file is a text file
    if filename.endswith('.txt'):
        filepath = os.path.join(directory, filename)
        with open(filepath, 'rb') as f:
            encoding = chardet.detect(f.read())['encoding']
        # Read the contents of the file
        with open(os.path.join(folder_path, filename), 'r',encoding=encoding) as file:
            text = file.read()
        
        # Split the text into words
        words = text.split()
        
        # Initialize counters for each word
        i_count = 0
        we_count = 0
        my_count = 0
        ours_count = 0
        us_count = 0
        
        # Loop through each word in the file
        for word in words:
            # Check if the word is one of the target words
            if word.lower() == 'i':
                i_count += 1
            elif word.lower() == 'we':
                we_count += 1
            elif word.lower() == 'my':
                my_count += 1
            elif word.lower() == 'ours':
                ours_count += 1
            elif word.lower() == 'us' and (words.index(word) == 0 or words[words.index(word) - 1].lower() != 'united'):
                us_count += 1
        
        # Append the word counts to the overall list
        word_counts.append([i_count, we_count, my_count, ours_count, us_count])

# Create a DataFrame from the word count list
df = pd.DataFrame({'Word Counts': word_counts})

In [34]:
results['PERSONAL PRONOUNS'] = df

In [35]:
avg_word_lengths = []

# Loop through each file in the folder
for filename in os.listdir(folder_path):
    # Check if the file is a text file
    if filename.endswith('.txt'):
        filepath = os.path.join(directory, filename)
        with open(filepath, 'rb') as f:
            encoding = chardet.detect(f.read())['encoding']
        # Read the contents of the file
        with open(os.path.join(folder_path, filename), 'r', encoding=encoding) as file:
            text = file.read()
        
        # Split the text into words
        words = text.split()
        
        # Calculate the sum of the total number of characters in each word
        total_char_count = sum(len(word) for word in words)
        
        # Calculate the total number of words in the file
        total_word_count = len(words)
        
        # Calculate the average word length
        avg_word_length = total_char_count / total_word_count
        
        # Append the average word length to the overall list
        avg_word_lengths.append(avg_word_length)

# Create a DataFrame from the average word length list
df = pd.DataFrame({'Average Word Length': avg_word_lengths})

# Print the DataFrame
print(df)

     Average Word Length
0               6.163043
1               6.603120
2               6.581333
3               7.216129
4               6.748428
..                   ...
99              6.820738
100             6.993976
101             7.007859
102             6.786555
103             6.966038

[104 rows x 1 columns]


In [36]:
results['AVG WORD LENGTH'] = df

In [37]:
results.head()

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVE SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVERAGE NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,37.0,https://insights.blackcoffer.com/ai-in-healthc...,3.0,2.0,2e-07,0.009434,11.4,25.0,14.56,11.4,23.0,92.0,"[1, 2, 2, 1, 2, 2, 2, 5, 2, 2, 1, 1, 1, 1, 2, ...","[0, 0, 0, 0, 0]",6.163043
1,38.0,https://insights.blackcoffer.com/what-if-the-c...,23.0,37.0,-2.333333e-07,0.113208,11.8,35.528596,18.931438,11.8,205.0,577.0,"[2, 2, 2, 2, 1, 2, 3, 3, 1, 1, 2, 4, 3, 1, 2, ...","[0, 2, 0, 0, 0]",6.60312
2,39.0,https://insights.blackcoffer.com/what-jobs-wil...,6.0,22.0,-5.714286e-07,0.05283,13.457143,36.266667,19.889524,13.457143,136.0,375.0,"[1, 4, 4, 1, 3, 3, 2, 1, 2, 1, 3, 1, 1, 2, 1, ...","[0, 0, 0, 0, 0]",6.581333
3,40.0,https://insights.blackcoffer.com/will-machine-...,26.0,48.0,-2.972973e-07,0.139623,14.28,44.354839,23.453935,14.28,275.0,620.0,"[1, 3, 4, 4, 4, 3, 2, 3, 1, 2, 3, 2, 3, 1, 2, ...","[0, 0, 0, 0, 0]",7.216129
4,41.0,https://insights.blackcoffer.com/will-ai-repla...,21.0,31.0,-1.923077e-07,0.098113,15.868421,35.010482,20.351561,15.868421,167.0,477.0,"[2, 3, 3, 2, 1, 4, 3, 2, 1, 1, 1, 1, 1, 2, 1, ...","[0, 0, 0, 0, 0]",6.748428


In [38]:
results.to_csv('output.csv')