<a href="https://colab.research.google.com/github/jordan-dsouza/Projects/blob/main/WebScrapper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Requests sends HTTP requests.<br>
BeautifulSoup parses HTML and XML documents.

# **Import libraries:**

In [None]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import word_tokenize
import os
from nltk.corpus import stopwords
import string
import shutil
import string
import re

In [None]:
# Download the NLTK resources used by the tokenizers and the stop-word filter.
nltk.download('punkt')
# Newer NLTK releases (3.9 and later) load the Punkt models from 'punkt_tab'
# instead of the pickled 'punkt' package; without it word_tokenize and
# sent_tokenize raise LookupError. Requesting both keeps old and new versions working.
nltk.download('punkt_tab')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
!pip install syllables

Collecting syllables
  Downloading syllables-1.0.9-py3-none-any.whl (15 kB)
Collecting cmudict<2.0.0,>=1.0.11 (from syllables)
  Downloading cmudict-1.0.18-py3-none-any.whl (939 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m939.4/939.4 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting importlib-metadata<7.0,>=5.1 (from syllables)
  Downloading importlib_metadata-6.11.0-py3-none-any.whl (23 kB)
Installing collected packages: importlib-metadata, cmudict, syllables
  Attempting uninstall: importlib-metadata
    Found existing installation: importlib-metadata 7.0.1
    Uninstalling importlib-metadata-7.0.1:
      Successfully uninstalled importlib-metadata-7.0.1
Successfully installed cmudict-1.0.18 importlib-metadata-6.11.0 syllables-1.0.9


In [None]:
#Load the input workbook; later cells read its "URL_ID" and "URL" columns:
df = pd.read_excel("Input.xlsx")
#Preview the first rows:
df.head()

In [None]:
#(rows, columns) of the input sheet:
df.shape

# Extract Article Text:

## Function definition:

In [None]:
def extract_article_text(url, timeout=30):
    """Download a page and pull out its title and main article text.

    The article body is looked up in the first <div> carrying one of the
    'td-post-content' / 'tagdiv-type' classes, falling back to a <div> with
    'tdb-block-inner' / 'td-fix-index'. The text of every <p>, followed by
    the text of every <li>, inside that container is collected, one item
    per line.

    Args:
        url: Address of the article page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive host would stall the whole run.

    Returns:
        (title, article_text) on success, or (None, None) when the request
        fails, the server answers with an HTTP error status, the page has no
        <title>, or no article container is found. Failures are reported
        with print() rather than raised, so a batch run keeps going.
    """
    try:
        response = requests.get(url, timeout=timeout)
        #Treat 4xx/5xx answers as failures instead of scraping the error page:
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        #Extract article title:
        title_tag = soup.find('title')
        if title_tag is None:
            print(f"No title found for URL: {url}")
            return None, None
        title = title_tag.text

        #Find main content container:
        article_content = soup.find('div', class_=['td-post-content', 'tagdiv-type'])  #First class
        if article_content is None:
            article_content = soup.find('div', class_=['tdb-block-inner', 'td-fix-index'])  #Second class

        if article_content is None:
            print(f"No article content found for URL: {url}")
            return None, None

        #Paragraph text first, then list items, each terminated by a newline:
        text_nodes = [*article_content.find_all('p'), *article_content.find_all('li')]
        article_text = "".join(node.text + "\n" for node in text_nodes)

        return title, article_text
    except Exception as e:
        #Deliberately broad: any failure for one URL must not abort the batch.
        print(f"Error extracting article from {url}: {e}")
        return None, None


## Output folder with extracted text:

In [None]:
#Folder to save the articles; created on first run, reused afterwards:
output_folder = "extracted_articles"
os.makedirs(output_folder, exist_ok=True)

In [None]:
#Walk the input sheet one row at a time:
for index, row in df.iterrows():
    url_id, url = row["URL_ID"], row["URL"]

    #Scrape the page; both values are None when extraction failed:
    title, text = extract_article_text(url)

    if not (title and text):
        print(f"Failed to extract article title and text for URL_ID: {url_id}")
        continue

    #Store title and body as <URL_ID>.txt inside the output folder:
    file_path = os.path.join(output_folder, f"{url_id}.txt")
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(f"Title: {title}\n\n{text}")
        print(f"Article title and text saved to {file_path}")

## Optional: Create zip file for extracted folder.

In [None]:
import os
import shutil
from google.colab import files

# Folder whose contents are archived
folder_path = "/content/extracted_articles"

if not os.path.exists(folder_path):
    # Nothing to archive
    print("Error: Folder not found.")
else:
    # Base name of the archive (make_archive adds the ".zip" extension)
    zip_file_name = "ExtractedArticles"

    # Build the archive; the call returns the path of the file it wrote
    zip_file_path = shutil.make_archive(f"/content/{zip_file_name}", 'zip', folder_path)

    if not os.path.exists(zip_file_path):
        print("Error: Zip file creation failed.")
    else:
        # Hand the archive to Colab's download helper
        files.download(zip_file_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# **Text analysis:**

## Unpack StopWords.zip:

In [None]:
import zipfile

# Path to the uploaded zip file
zip_file_path = '/content/StopWords.zip'

# Directory to extract the contents of the zip file (relative to the working directory)
extract_folder = 'stop_words_folder'

# Extract the contents of the zip file; a later cell reads the word lists
# from '/content/stop_words_folder/StopWords/StopWords'
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_folder)


## Stopwords list:

In [None]:
#Function to extract stop words from a file:
def extract_stop_words(file_path):
    """Read a stop-word file and return its entries as a set.

    Each line of the file is one entry. Surrounding whitespace is removed and
    blank lines are skipped, so stray spaces or empty lines in a word list
    cannot produce entries that never match a token.

    Args:
        file_path: Path of a latin-1 encoded text file, one entry per line.

    Returns:
        Set of the non-empty, stripped lines (letter case is left unchanged).
    """
    with open(file_path, 'r', encoding='latin-1') as file:
        stripped_lines = (line.strip() for line in file.read().splitlines())
        return {entry for entry in stripped_lines if entry}

#Function to remove stop words from text
def remove_stop_words(text, stop_words):
    """Return `text` without the words whose lowercase form is in `stop_words`.

    The text is split on whitespace and the surviving words are re-joined
    with single spaces.
    """
    kept = filter(lambda token: token.lower() not in stop_words, text.split())
    return ' '.join(kept)

#Folder with stop words files:
stop_words_folder = '/content/stop_words_folder/StopWords/StopWords'

#Full path of every entry in the folder, then one set of entries per file
#(despite its name, stop_words_list_upper is a list of sets with unchanged case):
stop_words_files = [os.path.join(stop_words_folder, entry) for entry in os.listdir(stop_words_folder)]
stop_words_list_upper = list(map(extract_stop_words, stop_words_files))

In [None]:
#Show the loaded stop words (a list holding one set per file):
print(stop_words_list_upper)

In [None]:
#Flatten the per-file sets into one list of lowercased stop words
#(a word present in several files appears once per file):
stop_words_list = [word.lower() for stop_words_set in stop_words_list_upper for word in stop_words_set]
print(stop_words_list)

In [None]:
#Number of entries in the flattened list:
print(len(stop_words_list))

## Clean the text:

In [None]:
#Colab helper; in this cell it is only referenced by the commented-out download at the end:
from google.colab import files
#Tokenizer models used by word_tokenize:
nltk.download('punkt')
#Path to the folder containing articles:
articles_folder_path = "/content/extracted_articles"

#Function to clean the text using stopwords:
def clean_text(text):
    """Remove stop words from `text`.

    The text is tokenized with word_tokenize; every token whose lowercase
    form is in the module-level `stop_words_list` is dropped and the
    remaining tokens are joined with single spaces.

    Args:
        text: Article text to filter.

    Returns:
        The surviving tokens joined by spaces.
    """
    #Set membership is O(1); testing against the list itself rescanned
    #every stop word for every single token:
    stop_words = set(stop_words_list)
    word_tokens = word_tokenize(text)
    cleaned_tokens = [word for word in word_tokens if word.lower() not in stop_words]
    return ' '.join(cleaned_tokens)

#Folder to store cleaned articles:
cleaned_articles_folder = "/content/cleaned_articles"
os.makedirs(cleaned_articles_folder, exist_ok=True)

#Process each article, clean text, and save to the cleaned_articles folder:
for article_file in os.listdir(articles_folder_path):
    #Skip anything that is not an article file (e.g. sub-folders), as the analysis cells do:
    if not article_file.endswith(".txt"):
        continue

    #The scraper wrote these files as UTF-8, so read them back the same way
    #instead of relying on the platform default encoding:
    article_path = os.path.join(articles_folder_path, article_file)
    with open(article_path, 'r', encoding="utf-8") as file:
        article_text = file.read()
    cleaned_text = clean_text(article_text)

    #Save the cleaned text to a new file in the cleaned articles folder
    #(explicit encoding so every character read above can be written):
    cleaned_article_path = os.path.join(cleaned_articles_folder, article_file)
    with open(cleaned_article_path, 'w', encoding="utf-8") as file:
        file.write(cleaned_text)

#Zip the cleaned articles folder (the returned archive path is the cell output):
shutil.make_archive("/content/cleaned_articles", 'zip', cleaned_articles_folder)

# Download the zip file
#files.download("/content/cleaned_articles.zip")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


'/content/cleaned_articles.zip'

In [None]:
#files.download("/content/cleaned_articles.zip")

# **Positive Negative:**

1. POSITIVE SCORE
2. NEGATIVE SCORE
3. POLARITY SCORE
4. SUBJECTIVITY SCORE

## Unzip MasterDictionary:

In [None]:
#Path to the uploaded zip file:
zip_file_path = '/content/MasterDictionary.zip'

#Directory to extract the contents of the zip file:
extract_folder = 'MasterDictionary'

#Extract the contents of the zip file (uses the `zipfile` import from the
#stop-words cell above); later cells read the word lists from
#"MasterDictionary/MasterDictionary":
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_folder)


1. Average Sentence Length
2. Percentage of Complex Words
3. Fog Index
4. Average Number of Words per Sentence
5. Complex Words Count

In [None]:
from nltk.tokenize import sent_tokenize

def count_syllables_per_word(word):
    """Estimate the number of syllables in `word` by counting vowel groups.

    Heuristic, applied case-insensitively:
      1. A trailing "es" or "ed" is dropped, because that ending is usually
         not pronounced as a syllable of its own ("makes", "jumped").
      2. Every run of consecutive vowels (a, e, i, o, u) counts as one syllable.
      3. A final "e" is treated as silent when the word has another syllable.
      4. Every token counts as at least one syllable.

    Args:
        word: A single token.

    Returns:
        The estimated syllable count (always >= 1).
    """
    word = word.lower()

    #Only "es"/"ed" are treated as silent endings. Suffixes such as "ing",
    #"ly", "ness", "er", "est", "ful", "ment" and "tion" are spoken syllables;
    #stripping them made words like "nation" or "beautiful" come out short:
    for suffix in ("es", "ed"):
        if word.endswith(suffix):
            word = word[:-len(suffix)]
            break

    #Count the vowel groups (a vowel that does not directly follow another vowel):
    vowels = "aeiou"
    syllable_count = sum(
        1
        for i, char in enumerate(word)
        if char in vowels and (i == 0 or word[i - 1] not in vowels)
    )

    #When 'e' at the end is silent:
    if word.endswith("e") and syllable_count > 1:
        syllable_count -= 1

    #Words with no vowels still count as one syllable:
    return max(syllable_count, 1)

# Folder holding the cleaned articles
folder_path = "/content/cleaned_articles"

# One dict of readability metrics per article
results = []

for filename in os.listdir(folder_path):
    #Only .txt files are analysed:
    if not filename.endswith(".txt"):
        continue

    with open(os.path.join(folder_path, filename), "r") as file:
        text = file.read()

    #Sentence and word tokens with their counts:
    sentences = sent_tokenize(text)
    words = nltk.word_tokenize(text)
    num_words = len(words)
    num_sentences = len(sentences)

    #Tokens with more than two estimated syllables count as complex:
    num_complex_words = len([word for word in words if count_syllables_per_word(word) > 2])

    #Words per sentence; the same value is reported under two names:
    if num_sentences > 0:
        avg_sentence_length = num_words / num_sentences
    else:
        avg_sentence_length = 0
    avg_words_per_sentence = avg_sentence_length

    #Share of complex words, as a fraction of all tokens:
    if num_words > 0:
        percentage_complex_words = num_complex_words / num_words
    else:
        percentage_complex_words = 0

    #Fog Index:
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)

    results.append({
        "Filename": filename,
        "Average Sentence Length": avg_sentence_length,
        "Percentage of Complex Words": percentage_complex_words,
        "Fog Index": fog_index,
        "Average Number of Words per Sentence": avg_words_per_sentence,
        "Complex Words Count": num_complex_words,
    })


df_results56789 = pd.DataFrame(results)

Word count

In [None]:
# Clean and count words in text
def count_clean_words(text):
    """Count the words of `text` that remain after cleaning.

    Tokens come from word_tokenize. NLTK's English stop words are dropped
    (compared in lowercase), punctuation characters are stripped from each
    remaining token, and tokens that end up empty are discarded.

    Returns the number of tokens left.
    """
    tokens = word_tokenize(text)
    english_stop_words = set(stopwords.words('english'))
    strip_punctuation = str.maketrans('', '', string.punctuation)

    kept = 0
    for token in tokens:
        if token.lower() in english_stop_words:
            continue
        # A token made only of punctuation becomes empty and is not counted
        if token.translate(strip_punctuation):
            kept += 1
    return kept

# Folder holding the cleaned articles
folder_path = "/content/cleaned_articles"

# One {"Filename", "Word Count"} record per article
task10 = []

for filename in os.listdir(folder_path):
    # Only .txt files are analysed
    if not filename.endswith(".txt"):
        continue

    with open(os.path.join(folder_path, filename), "r") as file:
        text = file.read()

    # Cleaned word count for this article
    num_clean_words = count_clean_words(text)
    task10.append({"Filename": filename, "Word Count": num_clean_words})

# Collect the records into a DataFrame
df_task10 = pd.DataFrame(task10)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
#Order the rows by file name and renumber the index from 0:
df_task10 = df_task10.sort_values(by="Filename").reset_index(drop=True)

#Show the first rows of the sorted DataFrame:
df_task10.head()

# Personal Pronouns:

In [None]:
import re

def count_personal_pronouns(text):
    """Count the personal pronouns in `text`.

    Pronouns are matched as whole words, ignoring case. The all-caps token
    "US" is not counted, since it is taken to be the country name; the
    pronoun in any other spelling ("us", "Us") is counted.

    Args:
        text: Text to scan.

    Returns:
        Number of pronoun occurrences found.
    """
    # Define the list of personal pronouns
    personal_pronouns = ['I', 'me', 'my', 'mine', 'myself',
                         'we', 'us', 'our', 'ours', 'ourselves',
                         'you', 'your', 'yours', 'yourself', 'yourselves',
                         'he', 'him', 'his', 'himself',
                         'she', 'her', 'hers', 'herself',
                         'it', 'its', 'itself',
                         'they', 'them', 'their', 'theirs', 'themselves']

    # Whole-word alternation over all pronouns
    pattern = r'\b(?:{})\b'.format('|'.join(personal_pronouns))

    # Find all matches of the pattern in the text
    matches = re.findall(pattern, text, flags=re.IGNORECASE)

    # Exclude only the upper-case "US" (country name). Comparing the
    # lowercased match against "us" discarded every occurrence of the pronoun.
    return sum(1 for match in matches if match != "US")

# Folder holding the cleaned articles
folder_path = "/content/cleaned_articles"

# One {"Filename", "Personal Pronoun Count"} record per article
results = []

for filename in os.listdir(folder_path):
    # Only .txt files are analysed
    if not filename.endswith(".txt"):
        continue

    with open(os.path.join(folder_path, filename), "r") as file:
        text = file.read()

    # Pronoun occurrences in this article
    pronoun_count = count_personal_pronouns(text)
    results.append({"Filename": filename, "Personal Pronoun Count": pronoun_count})

# Collect the records into a DataFrame
df_personal_pronouns = pd.DataFrame(results)

In [None]:
#Order the rows by file name and renumber the index from 0:
df_personal_pronouns = df_personal_pronouns.sort_values(by="Filename").reset_index(drop=True)

#Show the first rows of the sorted DataFrame:
df_personal_pronouns.head()

# Syllable count per word:

# PosNeg (positive/negative sentiment scores)

In [None]:
#Load Positive Words:
with open(os.path.join("MasterDictionary/MasterDictionary", "positive-words.txt"), "r", encoding = "latin-1") as f:
    positive_words = set(f.read().splitlines())
#Load Negative Words:
with open(os.path.join("MasterDictionary/MasterDictionary", "negative-words.txt"), "r", encoding = "latin-1") as f:
    negative_words = set(f.read().splitlines())
#NOTE(review): tokens are lowercased before lookup, so only lowercase
#dictionary entries can match - confirm the word lists are lowercase.

#Path to the directory containing the cleaned articles:
directory_path = "cleaned_articles"

#One record per article. The DataFrame is built once at the end because
#DataFrame.append was removed in pandas 2.0 (and copied the frame on every call):
score_records = []

#Loop over each file in the directory:
for filename in os.listdir(directory_path):
    #Skip anything that is not an article file, as the other analysis cells do:
    if not filename.endswith(".txt"):
        continue
    file_path = os.path.join(directory_path, filename)

    #Read text from file:
    with open(file_path, "r") as f:
        text = f.read()

    #Tokenize the lowercased text:
    words = word_tokenize(text.lower())

    #Positive / negative score = number of tokens found in each word list
    #(both are non-negative counts):
    positive_score = sum(1 for word in words if word in positive_words)
    negative_score = sum(1 for word in words if word in negative_words)

    #Polarity in [-1, 1]; the small constant avoids division by zero:
    polarity_score = (positive_score - negative_score) / (positive_score + negative_score + 0.000001)

    #Subjectivity: share of tokens that carry sentiment:
    total_words = len(words)
    subjectivity_score = (positive_score + negative_score) / (total_words + 0.000001)

    score_records.append({
        "Filename": filename,
        "Positive Score": positive_score,
        "Negative Score": negative_score,
        "Polarity Score": polarity_score,
        "Subjectivity Score": subjectivity_score
    })

#Explicit columns keep the frame's layout even when no article was found:
scores_df = pd.DataFrame(score_records, columns=["Filename", "Positive Score", "Negative Score", "Polarity Score", "Subjectivity Score"])


In [None]:
#Order the rows by file name and renumber the index from 0:
scores_df = scores_df.sort_values(by="Filename").reset_index(drop=True)

#Show the first rows of the sorted DataFrame:
scores_df.head()

In [None]:
# Folder holding the cleaned articles
folder_path = "/content/cleaned_articles"

# One {"Filename", "Total Syllable Count"} record per article
results1 = []

import syllables

for filename in os.listdir(folder_path):
    # Only .txt files are analysed
    if not filename.endswith(".txt"):
        continue

    with open(os.path.join(folder_path, filename), "r") as file:
        text = file.read()

    # Runs of word characters only, so punctuation never reaches the counter
    wordstk = re.findall(r'\b\w+\b', text)

    # Per-word estimates from count_syllables_per_word
    # (alternative left disabled: syllables.estimate(word) for word in wordstk)
    syllable_counts = list(map(count_syllables_per_word, wordstk))
    total_syllables = sum(syllable_counts)

    results1.append({"Filename": filename, "Total Syllable Count": total_syllables})

# Collect the records into a DataFrame
df_syllable_count = pd.DataFrame(results1)


In [None]:
#Order the rows by file name and renumber the index from 0:
df_syllable_count = df_syllable_count.sort_values(by="Filename").reset_index(drop=True)

#Show the first rows of the sorted DataFrame:
df_syllable_count.head()

# **FINAL FUNCTIONS TOGETHER:**

In [None]:
#Load Positive Words (same file as in the PosNeg cell above; one entry per line, read as latin-1):
with open(os.path.join("MasterDictionary/MasterDictionary", "positive-words.txt"), "r", encoding = "latin-1") as f:
    positive_words = set(f.read().splitlines())
#Load Negative Words (same format):
with open(os.path.join("MasterDictionary/MasterDictionary", "negative-words.txt"), "r", encoding = "latin-1") as f:
    negative_words = set(f.read().splitlines())



def count_syllables_per_word(word):
    """Estimate the number of syllables in `word` by counting vowel groups.

    Heuristic, applied case-insensitively:
      1. A trailing "es" or "ed" is dropped, because that ending is usually
         not pronounced as a syllable of its own ("makes", "jumped").
      2. Every run of consecutive vowels (a, e, i, o, u) counts as one syllable.
      3. A final "e" is treated as silent when the word has another syllable.
      4. Every token counts as at least one syllable.

    Args:
        word: A single token.

    Returns:
        The estimated syllable count (always >= 1).
    """
    word = word.lower()

    #Only "es"/"ed" are treated as silent endings. Suffixes such as "ing",
    #"ly", "ness", "er", "est", "ful", "ment" and "tion" are spoken syllables;
    #stripping them made words like "nation" or "beautiful" come out short:
    for suffix in ("es", "ed"):
        if word.endswith(suffix):
            word = word[:-len(suffix)]
            break

    #Count the vowel groups (a vowel that does not directly follow another vowel):
    vowels = "aeiou"
    syllable_count = sum(
        1
        for i, char in enumerate(word)
        if char in vowels and (i == 0 or word[i - 1] not in vowels)
    )

    #When 'e' at the end is silent:
    if word.endswith("e") and syllable_count > 1:
        syllable_count -= 1

    #Words with no vowels still count as one syllable:
    return max(syllable_count, 1)


# Clean and count words in text
def count_clean_words(text):
    """Count the words of `text` that remain after cleaning.

    Tokens come from word_tokenize. NLTK's English stop words are dropped
    (compared in lowercase), punctuation characters are stripped from each
    remaining token, and tokens that end up empty are discarded.

    Returns the number of tokens left.
    """
    tokens = word_tokenize(text)
    english_stop_words = set(stopwords.words('english'))
    strip_punctuation = str.maketrans('', '', string.punctuation)

    kept = 0
    for token in tokens:
        if token.lower() in english_stop_words:
            continue
        # A token made only of punctuation becomes empty and is not counted
        if token.translate(strip_punctuation):
            kept += 1
    return kept

def count_personal_pronouns(text):
    """Count the personal pronouns in `text`.

    Pronouns are matched as whole words, ignoring case. The all-caps token
    "US" is not counted, since it is taken to be the country name; the
    pronoun in any other spelling ("us", "Us") is counted.

    Args:
        text: Text to scan.

    Returns:
        Number of pronoun occurrences found.
    """
    personal_pronouns = ['I', 'me', 'my', 'mine', 'myself',
                         'we', 'us', 'our', 'ours', 'ourselves',
                         'you', 'your', 'yours', 'yourself', 'yourselves',
                         'he', 'him', 'his', 'himself',
                         'she', 'her', 'hers', 'herself',
                         'it', 'its', 'itself',
                         'they', 'them', 'their', 'theirs', 'themselves']

    # Whole-word alternation over all pronouns
    pattern = r'\b(?:{})\b'.format('|'.join(personal_pronouns))

    # Find all matches of the pattern in the text
    matches = re.findall(pattern, text, flags=re.IGNORECASE)

    # Exclude only the upper-case "US" (country name). Comparing the
    # lowercased match against "us" discarded every occurrence of the pronoun.
    return sum(1 for match in matches if match != "US")

In [None]:
# Define the folder containing the cleaned articles
folder_path = "/content/cleaned_articles"

# Create an empty list to store the results (one dict of metrics per article)
results = []


# Iterate over each file in the folder:
for filename in os.listdir(folder_path):
    # Only .txt files are analysed
    if not filename.endswith(".txt"):
        continue

    with open(os.path.join(folder_path, filename), "r") as file:
        text = file.read()

    #Lowercased tokens, so the sentiment-dictionary lookups ignore case:
    words = word_tokenize(text.lower())

    #Sentences (sent_tokenize is imported in the readability cell above):
    sentences = sent_tokenize(text)

    #Positive / negative score = number of tokens found in each word list:
    positive_score = sum(1 for word in words if word in positive_words)
    negative_score = sum(1 for word in words if word in negative_words)

    #Polarity in [-1, 1]; the small constant avoids division by zero:
    polarity_score = (positive_score - negative_score) / (positive_score + negative_score + 0.000001)

    #Subjectivity: share of tokens that carry sentiment:
    total_words = len(words)
    subjectivity_score = (positive_score + negative_score) / (total_words + 0.000001)

    #Count the number of sentences:
    num_sentences = len(sentences)

    #Tokens with more than two estimated syllables count as complex:
    num_complex_words = sum(count_syllables_per_word(word) > 2 for word in words)

    #Tokens per sentence; the same value is reported under two names:
    avg_sentence_length = total_words / num_sentences if num_sentences > 0 else 0
    avg_words_per_sentence = avg_sentence_length

    #Share of complex words, as a fraction of all tokens:
    percentage_complex_words = num_complex_words / total_words if total_words > 0 else 0

    #Fog Index:
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)

    #Count the cleaned words in the text:
    num_clean_words = count_clean_words(text)

    #Count personal pronouns in the text:
    pronoun_count = count_personal_pronouns(text)

    #Syllables are summed over runs of word characters only, as in the
    #stand-alone syllable cell. The tokenizer output also contains
    #punctuation tokens, and count_syllables_per_word returns at least 1 for
    #any token, so summing over `words` added one syllable per punctuation mark:
    syllable_words = re.findall(r'\b\w+\b', text.lower())
    syllable_counts = [count_syllables_per_word(word) for word in syllable_words]
    total_syllables = sum(syllable_counts)

    results.append({"Filename": filename,
                    "Positive Score": positive_score,
                    "Negative Score": negative_score,
                    "Polarity Score": polarity_score,
                    "Subjectivity Score": subjectivity_score,
                    "Average Sentence Length": avg_sentence_length,
                    "Percentage of Complex Words": percentage_complex_words,
                    "Fog Index": fog_index,
                    "Average Number of Words per Sentence": avg_words_per_sentence,
                    "Complex Words Count": num_complex_words,
                    "Word Count": num_clean_words,
                    "Personal Pronoun Count": pronoun_count,
                    "Syllable count": total_syllables})
scores_df = pd.DataFrame(results)

In [None]:
#Order the rows by file name and renumber the index from 0:
scores_df = scores_df.sort_values(by="Filename").reset_index(drop=True)

#Show the first rows of the sorted DataFrame:
scores_df.head()