In [3]:
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd

# create a folder to store the articles
if not os.path.exists("articles"):
    os.mkdir("articles")

# read the Excel file containing the URLs
df = pd.read_excel('Input.xlsx')

# add an index column to the dataframe
df['Index'] = df.index

# sort the dataframe by the index column
df = df.sort_values(by='Index')

# iterate through each row in the DataFrame
for index, row in df.iterrows():
    url = row['URL']
    res = requests.get(url)
    soup = BeautifulSoup(res.content, 'html.parser')
    try:
        # get the article title
        title = soup.find('h1').text.strip()
    except AttributeError:
        # if there's no h1 tag, use the page title instead
        title = soup.title.text.strip()

    article = soup.find('article')
    article_text = ""

    if article:
        for p in article.find_all('p'):
            article_text += p.get_text().strip() + " "
    # save title and article_text to a text file with index and URL name in the "articles" folder
    filename = "articles/" + str(row['Index']) + "_" + url.split("/")[-2] + ".txt"
    with open(filename, "w", encoding="utf-8") as file:
        file.write(title + "\n\n")
        file.write(article_text)

    print("Article saved to", filename)


Article saved to articles/0_ai-in-healthcare-to-improve-patient-outcomes.txt
Article saved to articles/1_what-if-the-creation-is-taking-over-the-creator.txt
Article saved to articles/2_what-jobs-will-robots-take-from-humans-in-the-future.txt
Article saved to articles/3_will-machine-replace-the-human-in-the-future-of-work.txt
Article saved to articles/4_will-ai-replace-us-or-work-with-us.txt
Article saved to articles/5_man-and-machines-together-machines-are-more-diligent-than-humans-blackcoffe.txt
Article saved to articles/6_in-future-or-in-upcoming-years-humans-and-machines-are-going-to-work-together-in-every-field-of-work.txt
Article saved to articles/7_how-neural-networks-can-be-applied-in-various-areas-in-the-future.txt
Article saved to articles/8_how-machine-learning-will-affect-your-business.txt
Article saved to articles/9_deep-learning-impact-on-areas-of-e-learning.txt
Article saved to articles/10_how-to-protect-future-data-and-its-privacy-blackcoffer.txt
Article saved to article

In [1]:
import os
import pandas as pd
from textblob import TextBlob
from textstat import flesch_reading_ease, text_standard, lexicon_count, sentence_count, syllable_count, dale_chall_readability_score

# set the directory path to the directory containing the text files
dir_path = r'C:/Users/manin/articles'

# initialize the DataFrame to hold the results
output_df = pd.DataFrame(columns=['FILE', 'POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE', 'SUBJECTIVITY SCORE',
                                  'AVG SENTENCE LENGTH', 'PERCENTAGE OF COMPLEX WORDS', 'FOG INDEX',
                                  'AVG NUMBER OF WORDS PER SENTENCE', 'COMPLEX WORD COUNT', 'WORD COUNT',
                                  'SYLLABLE PER WORD', 'PERSONAL PRONOUNS', 'AVG WORD LENGTH'])

# iterate through each file in the directory
for filename in os.listdir(dir_path):
    if filename.endswith('.txt'):
        file_path = os.path.join(dir_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
            # perform text analysis using TextBlob and textstat
            blob = TextBlob(text)
            polarity = blob.sentiment.polarity
            subjectivity = blob.sentiment.subjectivity
            positive_score = sum(1 for sentence in blob.sentences if sentence.sentiment.polarity > 0)
            negative_score = sum(1 for sentence in blob.sentences if sentence.sentiment.polarity < 0)
            avg_sentence_length = len(blob.sentences)
            avg_word_length = sum(len(word) for word in blob.words) / len(blob.words)
            complex_words = [word for word in blob.words if syllable_count(word) >= 3]
            complex_word_count = len(complex_words)
            percentage_complex_words = (complex_word_count / len(blob.words)) * 100
            fog = 0.4 * ( (len(blob.words) / len(blob.sentences)) + 100 * (complex_word_count / len(blob.words)) )

            avg_words_per_sentence = len(blob.words) / avg_sentence_length
            syllables_per_word = syllable_count(text) / lexicon_count(text)
            personal_pronouns = sum(1 for word, pos in blob.tags if pos == 'PRP')
            
            # add the results to the DataFrame
            output_df = output_df.append({
                'FILE': filename,
                'POSITIVE SCORE': positive_score,
                'NEGATIVE SCORE': negative_score,
                'POLARITY SCORE': polarity,
                'SUBJECTIVITY SCORE': subjectivity,
                'AVG SENTENCE LENGTH': avg_sentence_length,
                'PERCENTAGE OF COMPLEX WORDS': percentage_complex_words,
                'FOG INDEX': fog,
                'AVG NUMBER OF WORDS PER SENTENCE': avg_words_per_sentence,
                'COMPLEX WORD COUNT': complex_word_count,
                'WORD COUNT': len(blob.words),
                'SYLLABLE PER WORD': syllables_per_word,
                'PERSONAL PRONOUNS': personal_pronouns,
                'AVG WORD LENGTH': avg_word_length
            }, ignore_index=True)

# save the output dataframe to a CSV file
output_df.to_csv('text_analysis_results1.csv', index=False)


  output_df = output_df.append({
  output_df = output_df.append({
  output_df = output_df.append({
  output_df = output_df.append({
  output_df = output_df.append({
  output_df = output_df.append({
  output_df = output_df.append({
  output_df = output_df.append({
  output_df = output_df.append({
  output_df = output_df.append({
  output_df = output_df.append({
  output_df = output_df.append({
  output_df = output_df.append({
  output_df = output_df.append({
  output_df = output_df.append({
  output_df = output_df.append({
  output_df = output_df.append({
  output_df = output_df.append({
  output_df = output_df.append({
  output_df = output_df.append({
  output_df = output_df.append({
  output_df = output_df.append({
  output_df = output_df.append({
  output_df = output_df.append({
  output_df = output_df.append({
  output_df = output_df.append({
  output_df = output_df.append({
  output_df = output_df.append({
  output_df = output_df.append({
  output_df = output_df.append({
  output_d

In [2]:
output_df


Unnamed: 0,FILE,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,0_ai-in-healthcare-to-improve-patient-outcomes...,44,10,0.136936,0.463364,75,20.487265,17.826906,24.08,370,1806,1.774157,22,5.603544
1,100_what-do-you-think-is-the-lesson-or-lessons...,2,4,-0.001188,0.340219,8,13.580247,17.582099,30.375,33,243,1.521186,8,4.72428
2,101_coronavirus-the-unexpected-challenge-for-t...,2,6,-0.00787,0.392086,14,17.692308,14.505495,18.571429,46,260,1.581395,12,4.996154
3,102_industrial-revolution-4-0-pros-and-cons.txt,3,1,0.063102,0.311879,7,24.074074,18.886772,23.142857,39,162,1.761006,3,5.179012
4,103_impact-of-covid-19-coronavirus-on-the-indi...,23,10,0.043301,0.452315,38,15.555556,16.169591,24.868421,147,945,1.589247,8,4.926984
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109,96_continued-demand-for-sustainability.txt,13,3,0.041275,0.419084,27,22.781065,19.127241,25.037037,154,676,1.808157,7,5.673077
110,97_coronavirus-disease-covid-19-effect-the-imp...,5,2,0.021755,0.346657,13,17.431193,17.034016,25.153846,57,327,1.708978,6,5.33945
111,98_should-people-wear-fabric-gloves-seeking-ev...,9,0,0.171146,0.372299,16,8.045977,11.918391,21.75,28,348,1.38806,9,4.482759
112,99_why-is-there-a-severe-immunological-and-inf...,5,4,0.015526,0.485565,21,14.425428,13.560647,19.47619,59,409,1.556931,11,4.757946
