In [1]:
# IMPORTING IMPORTANT LIBRARIES
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os

In [2]:
# FILE PATH
file_path = '/content/Input.xlsx'
input_data = pd.read_excel(file_path)

In [3]:
# CREATING A FOLDER TO SAVE SCRAPPED CONTENTS FROM WEBSITES
output_folder = 'scraped_articles'
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

In [4]:
def scrape_article(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'lxml')

            # EXTRACTING TITLE AND BODIES
            title = soup.find('h1').get_text() if soup.find('h1') else 'No Title'
            article_body = soup.find('div', class_='td-post-content').get_text() if soup.find('div', class_='td-post-content') else 'No Article Text'

            return title, article_body
        else:
            print(f"Failed to fetch URL: {url}, Status Code: {response.status_code}")
            return None, None
    except Exception as e:
        print(f"Error occurred while fetching URL: {url}. Error: {str(e)}")
        return None, None

In [5]:
for index, row in input_data.iterrows():
    url_id = row['URL_ID']
    url = row['URL']

    # SCRAPPING THE ARTICLE CONTENTS
    title, content = scrape_article(url)

    if title and content:
        # SAVING THE CONTENT AS TXT WITH FILE NAME AS URL_ID
        file_name = f"{url_id}.txt"
        file_path = os.path.join(output_folder, file_name)

        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(f"Title: {title}\n\n{content}")

        print(f"Saved article: {file_name}")

print("Scraping complete!")

Saved article: bctech2011.txt
Saved article: bctech2012.txt
Saved article: bctech2013.txt
Saved article: bctech2014.txt
Saved article: bctech2015.txt
Saved article: bctech2016.txt
Saved article: bctech2017.txt
Saved article: bctech2018.txt
Saved article: bctech2019.txt
Saved article: bctech2020.txt
Saved article: bctech2021.txt
Saved article: bctech2022.txt
Saved article: bctech2023.txt
Saved article: bctech2024.txt
Saved article: bctech2025.txt
Saved article: bctech2026.txt
Saved article: bctech2027.txt
Saved article: bctech2028.txt
Saved article: bctech2029.txt
Saved article: bctech2030.txt
Saved article: bctech2031.txt
Saved article: bctech2032.txt
Saved article: bctech2033.txt
Saved article: bctech2034.txt
Saved article: bctech2035.txt
Saved article: bctech2036.txt
Saved article: bctech2037.txt
Saved article: bctech2038.txt
Saved article: bctech2039.txt
Saved article: bctech2040.txt
Saved article: bctech2041.txt
Saved article: bctech2042.txt
Saved article: bctech2043.txt
Saved arti

In [6]:
################################SCRAPPING DONE NOW FINDING THE VALUES OF POSITIVE NEGATIVE AND ALL THE FACTORES################################
!pip install textstat
!pip install nltk


Collecting textstat
  Downloading textstat-0.7.4-py3-none-any.whl.metadata (14 kB)
Collecting pyphen (from textstat)
  Downloading pyphen-0.16.0-py3-none-any.whl.metadata (3.2 kB)
Downloading textstat-0.7.4-py3-none-any.whl (105 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.1/105.1 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyphen-0.16.0-py3-none-any.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m29.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyphen, textstat
Successfully installed pyphen-0.16.0 textstat-0.7.4


In [None]:
#importing and downloading directories for text analyzation of articles. we will do it by importing the natural language toolkit directories
#This library is knows as VADER it is specially designed for analyzing sentiments VADER IS KNOWN AS VALENCE AWARE DICTIONARY AND sENTIMENT REASONER


In [7]:

import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [8]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer



In [9]:
# Download the VADER lexicon (required for the first time)
nltk.download('vader_lexicon')

# Initialize the VADER sentiment analyzer
sia = SentimentIntensityAnalyzer()
scraped_articles_folder = 'scraped_articles'



# Function to analyze sentiment using VADER
def analyze_sentiment_vader(text):
  sentiment = sia.polarity_scores(text)


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In [10]:
import os
from nltk.sentiment import SentimentIntensityAnalyzer
import pandas as pd
import nltk
nltk.download('vader_lexicon')

# Initialize VADER sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Function to calculate Positive, Negative, and Polarity Scores using VADER
def calculate_sentiment_scores(text):
    sentiment = sia.polarity_scores(text)
    positive_score = sentiment['pos']
    negative_score = sentiment['neg']
    polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)
    return positive_score, negative_score, polarity_score

# Folder containing the scraped articles
scraped_articles_folder = 'scraped_articles'

# Create an empty list to store data for each article
data = []

# Loop through each article in the folder
for file_name in os.listdir(scraped_articles_folder):
    if file_name.endswith('.txt'):
        file_path = os.path.join(scraped_articles_folder, file_name)

        # Read the article content
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()
        except Exception as e:
            print(f"Error reading file {file_name}: {str(e)}")
            continue  # Skip this file if there is a reading issue

        # Calculate sentiment scores
        positive_score, negative_score, polarity_score = calculate_sentiment_scores(text)

        # Append the results to the data list
        data.append({
            'File Name': file_name,
            'Positive Score': positive_score,
            'Negative Score': negative_score,
            'Polarity Score': polarity_score
        })

# Convert the data list to a pandas DataFrame
df = pd.DataFrame(data)

# Save the DataFrame to an Excel file
output_excel_file = 'sentiment_analysis_results.xlsx'
df.to_excel(output_excel_file, index=False)

print(f"Sentiment analysis results saved to {output_excel_file}")


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Sentiment analysis results saved to sentiment_analysis_results.xlsx


In [11]:
import os
import re
from textblob import TextBlob
import textstat
import nltk
nltk.download('punkt')

# Define a function to calculate syllables per word using textstat
def syllable_count(word):
    return textstat.syllable_count(word)

# Define a function to check if a word is complex (3 or more syllables)
def is_complex_word(word):
    return syllable_count(word) >= 3

# Define a function to calculate personal pronouns
def count_personal_pronouns(text):
    pronouns = ['I', 'we', 'my', 'ours', 'us', 'he', 'she', 'him', 'her', 'they', 'them', 'you']
    words = text.lower().split()
    return sum(1 for word in words if word in pronouns)

# Define the main function to calculate all factors
def analyze_text_metrics(text):
    # Initialize TextBlob for sentiment and subjectivity
    blob = TextBlob(text)

    # Sentence Tokenization
    sentences = nltk.sent_tokenize(text)
    num_sentences = len(sentences)

    # Word Tokenization
    words = nltk.word_tokenize(text)
    num_words = len(words)

    # Word count and average word length
    total_word_length = sum(len(word) for word in words)
    avg_word_length = total_word_length / num_words if num_words > 0 else 0

    # Subjectivity Score
    subjectivity_score = blob.sentiment.subjectivity

    # Avg Sentence Length (Number of words per sentence)
    avg_sentence_length = num_words / num_sentences if num_sentences > 0 else 0

    # Complex word count
    complex_words = [word for word in words if is_complex_word(word)]
    num_complex_words = len(complex_words)

    # Percentage of complex words
    percentage_complex_words = (num_complex_words / num_words) * 100 if num_words > 0 else 0

    # FOG Index = 0.4 * (average sentence length + percentage of complex words)
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)

    # Avg Number of Words per Sentence
    avg_words_per_sentence = num_words / num_sentences if num_sentences > 0 else 0

    # Syllables per word
    syllables = sum(syllable_count(word) for word in words)
    avg_syllables_per_word = syllables / num_words if num_words > 0 else 0

    # Count personal pronouns
    personal_pronouns_count = count_personal_pronouns(text)

    return {
        'subjectivity_score': subjectivity_score,
        'avg_sentence_length': avg_sentence_length,
        'percentage_complex_words': percentage_complex_words,
        'fog_index': fog_index,
        'avg_words_per_sentence': avg_words_per_sentence,
        'complex_word_count': num_complex_words,
        'word_count': num_words,
        'avg_syllables_per_word': avg_syllables_per_word,
        'personal_pronouns': personal_pronouns_count,
        'avg_word_length': avg_word_length
    }

# Folder containing the scraped articles
scraped_articles_folder = 'scraped_articles'

# Loop through each article and calculate metrics
for file_name in os.listdir(scraped_articles_folder):
    if file_name.endswith('.txt'):
        file_path = os.path.join(scraped_articles_folder, file_name)

        # Read the article content
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()

        # Analyze text metrics
        metrics = analyze_text_metrics(text)

        # Print the result for each article
        print(f"File: {file_name}")
        print(f"Subjectivity Score: {metrics['subjectivity_score']}")
        print(f"Avg Sentence Length: {metrics['avg_sentence_length']}")
        print(f"Percentage of Complex Words: {metrics['percentage_complex_words']}")
        print(f"FOG Index: {metrics['fog_index']}")
        print(f"Avg Words per Sentence: {metrics['avg_words_per_sentence']}")
        print(f"Complex Word Count: {metrics['complex_word_count']}")
        print(f"Word Count: {metrics['word_count']}")
        print(f"Avg Syllables per Word: {metrics['avg_syllables_per_word']}")
        print(f"Personal Pronouns: {metrics['personal_pronouns']}")
        print(f"Avg Word Length: {metrics['avg_word_length']}")
        print("\n" + "-"*40 + "\n")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


File: bctech2058.txt
Subjectivity Score: 0.28650568181818176
Avg Sentence Length: 123.4
Percentage of Complex Words: 20.42139384116694
FOG Index: 57.528557536466785
Avg Words per Sentence: 123.4
Complex Word Count: 126
Word Count: 617
Avg Syllables per Word: 1.7747163695299837
Personal Pronouns: 4
Avg Word Length: 6.165316045380875

----------------------------------------

File: bctech2130.txt
Subjectivity Score: 0.4
Avg Sentence Length: 113.0
Percentage of Complex Words: 29.20353982300885
FOG Index: 56.88141592920354
Avg Words per Sentence: 113.0
Complex Word Count: 33
Word Count: 113
Avg Syllables per Word: 1.9734513274336283
Personal Pronouns: 0
Avg Word Length: 7.371681415929204

----------------------------------------

File: bctech2076.txt
Subjectivity Score: 0.4807017543859649
Avg Sentence Length: 56.75
Percentage of Complex Words: 18.28193832599119
FOG Index: 30.01277533039648
Avg Words per Sentence: 56.75
Complex Word Count: 83
Word Count: 454
Avg Syllables per Word: 1.594713

In [None]:
############################### OUTPUT FORMATING USING PANDAS TEXTBLOB TEXTSTAT NLTK ####################################

In [12]:
import os
import re
import pandas as pd
from textblob import TextBlob
import textstat
import nltk
nltk.download('punkt')

# Define a function to calculate syllables per word using textstat
def syllable_count(word):
    return textstat.syllable_count(word)

# Define a function to check if a word is complex (3 or more syllables)
def is_complex_word(word):
    return syllable_count(word) >= 3

# Define a function to calculate personal pronouns
def count_personal_pronouns(text):
    pronouns = ['I', 'we', 'my', 'ours', 'us', 'he', 'she', 'him', 'her', 'they', 'them', 'you']
    words = text.lower().split()
    return sum(1 for word in words if word in pronouns)

# Define the main function to calculate all factors
def analyze_text_metrics(text):
    # Initialize TextBlob for sentiment and subjectivity
    blob = TextBlob(text)

    # Sentence Tokenization
    sentences = nltk.sent_tokenize(text)
    num_sentences = len(sentences)

    # Word Tokenization
    words = nltk.word_tokenize(text)
    num_words = len(words)

    # Word count and average word length
    total_word_length = sum(len(word) for word in words)
    avg_word_length = total_word_length / num_words if num_words > 0 else 0

    # Subjectivity Score
    subjectivity_score = blob.sentiment.subjectivity

    # Avg Sentence Length (Number of words per sentence)
    avg_sentence_length = num_words / num_sentences if num_sentences > 0 else 0

    # Complex word count
    complex_words = [word for word in words if is_complex_word(word)]
    num_complex_words = len(complex_words)

    # Percentage of complex words
    percentage_complex_words = (num_complex_words / num_words) * 100 if num_words > 0 else 0

    # FOG Index = 0.4 * (average sentence length + percentage of complex words)
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)

    # Avg Number of Words per Sentence
    avg_words_per_sentence = num_words / num_sentences if num_sentences > 0 else 0

    # Syllables per word
    syllables = sum(syllable_count(word) for word in words)
    avg_syllables_per_word = syllables / num_words if num_words > 0 else 0

    # Count personal pronouns
    personal_pronouns_count = count_personal_pronouns(text)

    return {
        'subjectivity_score': subjectivity_score,
        'avg_sentence_length': avg_sentence_length,
        'percentage_complex_words': percentage_complex_words,
        'fog_index': fog_index,
        'avg_words_per_sentence': avg_words_per_sentence,
        'complex_word_count': num_complex_words,
        'word_count': num_words,
        'avg_syllables_per_word': avg_syllables_per_word,
        'personal_pronouns': personal_pronouns_count,
        'avg_word_length': avg_word_length
    }

# Folder containing the scraped articles
scraped_articles_folder = 'scraped_articles'

# Create an empty list to store data for each article
data = []

# Loop through each article and calculate metrics
for file_name in os.listdir(scraped_articles_folder):
    if file_name.endswith('.txt'):
        file_path = os.path.join(scraped_articles_folder, file_name)

        # Read the article content
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()

        # Analyze text metrics
        metrics = analyze_text_metrics(text)

        # Append the results to the data list
        data.append({
            'File Name': file_name,
            'Subjectivity Score': metrics['subjectivity_score'],
            'Avg Sentence Length': metrics['avg_sentence_length'],
            'Percentage of Complex Words': metrics['percentage_complex_words'],
            'FOG Index': metrics['fog_index'],
            'Avg Words per Sentence': metrics['avg_words_per_sentence'],
            'Complex Word Count': metrics['complex_word_count'],
            'Word Count': metrics['word_count'],
            'Avg Syllables per Word': metrics['avg_syllables_per_word'],
            'Personal Pronouns': metrics['personal_pronouns'],
            'Avg Word Length': metrics['avg_word_length']
        })

# Convert the data list to a pandas DataFrame
df = pd.DataFrame(data)

# Save the DataFrame to an Excel file
output_excel_file = 'Updated_Output_Data_Structure.xlsx'
df.to_excel(output_excel_file, index=False)

print(f"Analysis results saved to {output_excel_file}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Analysis results saved to Updated_Output_Data_Structure.xlsx


In [14]:
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd
df = pd.read_csv('/content/drive/My Drive//content/Updated_Output_Data_Structure.xlsx')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/My Drive//content/Updated_Output_Data_Structure.xlsx'