In [6]:
import os
import nltk
import pandas as pd
from textblob import TextBlob
import pyphen

# Download the required NLTK data (if you haven't already)
nltk.download('punkt')

# Function to calculate the number of complex words in a text
def count_complex_words(text):
    # Define a list of exceptions for words ending in "es" or "ed"
    exception_suffixes = ["es", "ed"]
    
    # Initialize a variable to count complex words
    complex_word_count = 0
    
    # Function to count syllables in a word
    def count_syllables(word):
        dic = pyphen.Pyphen(lang='en')
        word = word.lower()
        syllables = len(dic.inserted(word).split('-'))
        return syllables

    # Tokenize the text into words
    words = nltk.word_tokenize(text)
    
    # Calculate the number of complex words
    for word in words:
        syllable_count = count_syllables(word)
        if syllable_count > 2 and not any(word.endswith(suffix) for suffix in exception_suffixes):
            complex_word_count += 1

    return complex_word_count

# Function to calculate various parameters for a given text
def calculate_parameters(text):
    blob = TextBlob(text)
    word_count = len(nltk.word_tokenize(text))
    sentence_count = len(blob.sentences)
    avg_sentence_length = word_count / sentence_count
    complex_word_count = count_complex_words(text)
    
    # Calculate polarity and subjectivity scores
    polarity_score = blob.sentiment.polarity
    subjectivity_score = blob.sentiment.subjectivity

    # Calculate percentage of complex words
    percentage_of_complex_words = (complex_word_count / word_count) * 100

    # Calculate Fog Index
    fog_index = 0.4 * (avg_sentence_length + percentage_of_complex_words)

    # Calculate average number of words per sentence
    avg_words_per_sentence = word_count / sentence_count

    # Calculate syllables per word
    syllables_per_word = sum(len(word) for word in text.split()) / word_count

    # Count personal pronouns
    personal_pronouns = text.count('I') + text.count('me') + text.count('my') + text.count('mine') + text.count('myself')

    # Calculate average word length
    avg_word_length = sum(len(word) for word in text.split()) / word_count

    return {
        "POSITIVE SCORE": polarity_score,
        "NEGATIVE SCORE": (-polarity_score*(-1)),  # Invert polarity for negative score
        "POLARITY SCORE": polarity_score,
        "SUBJECTIVITY SCORE": subjectivity_score,
        "AVG SENTENCE LENGTH": avg_sentence_length,
        "PERCENTAGE OF COMPLEX WORDS": percentage_of_complex_words,
        "FOG INDEX": fog_index,
        "AVG NUMBER OF WORDS PER SENTENCE": avg_words_per_sentence,
        "COMPLEX WORD COUNT": complex_word_count,
        "WORD COUNT": word_count,
        "SYLLABLE PER WORD": syllables_per_word,
        "PERSONAL PRONOUNS": personal_pronouns,
        "AVG WORD LENGTH": avg_word_length,
    }

# Create an empty DataFrame to store the results
results_df = pd.DataFrame(columns=[
    "File",
    "POSITIVE SCORE",
    "NEGATIVE SCORE",
    "POLARITY SCORE",
    "SUBJECTIVITY SCORE",
    "AVG SENTENCE LENGTH",
    "PERCENTAGE OF COMPLEX WORDS",
    "FOG INDEX",
    "AVG NUMBER OF WORDS PER SENTENCE",
    "COMPLEX WORD COUNT",
    "WORD COUNT",
    "SYLLABLE PER WORD",
    "PERSONAL PRONOUNS",
    "AVG WORD LENGTH",
])

# Directory containing the text files
directory = "C:\\Users\\inevi\\extracted_articles"

# Loop through each text file in the directory
for filename in os.listdir(directory):
    if filename.endswith(".txt"):
        with open(os.path.join(directory, filename), "r", encoding="utf-8") as file:
            text = file.read()
            parameters = calculate_parameters(text)
            results_df = results_df.append({
                "File": filename,
                **parameters
            }, ignore_index=True)

# Save the results to an Excel file
results_df.to_excel("text_analysis_results.xlsx", index=False)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\inevi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  result

  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = resul

  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = resul

  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({


In [1]:
--allow-chromium-download

NameError: name 'allow' is not defined