# *Downloading all the necessary libraries*

In [20]:
!pip install pandas 
!pip install requests 
!pip install beautifulsoup4 
!pip install openpyxl
!pip install nltk
!pip install syllapy



# *Data Extraction*

***During data extraction, two articles (Article 36 and Article 49) were found to be unavailable on the internet, so I removed those entries from "Input.xlsx" and performed all the analysis on the remaining articles.***

In [35]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os

def extract_article_content(soup):
    """Return the article body text from a parsed page, or "" if none is found.

    The main content ``div`` is looked up by its ``class`` value, trying the
    short form first and the longer template-specific form second.

    Args:
        soup: Parsed document exposing ``find(name, class_=...)``; the
            extraction loop in this cell passes a ``BeautifulSoup`` object.

    Returns:
        str: The text fragments of the first matching container, each stripped
        and joined by a single space, or an empty string when neither
        container is present.
    """
    candidate_classes = (
        'td-post-content tagdiv-type',
        'td_block_wrap tdb_single_content tdi_130 td-pb-border-top td_block_template_1 td-post-content tagdiv-type',
    )
    for class_name in candidate_classes:
        content_div = soup.find('div', class_=class_name)
        if content_div:
            # Without a separator the text of adjacent tags is glued together
            # ("...end.Next paragraph"), which merges words across paragraphs
            # and distorts the word and sentence statistics computed later.
            return content_div.get_text(separator=' ', strip=True)
    return ""

# Spreadsheet listing the articles to download; the loop below reads its
# 'URL_ID' and 'URL' columns.
excel_file = 'C:/Users/HITESH/OneDrive/Desktop/BlackCoffer/Input.xlsx'
df = pd.read_excel(excel_file)

# NOTE(review): this directory is relative to the current working directory,
# while the cleaning step reads an absolute path ending in the same folder
# name -- confirm the notebook is run from that project folder.
output_dir = 'articles_extracted_content'
os.makedirs(output_dir, exist_ok=True)

# Seconds to wait for a server before giving up. Without a timeout a single
# unresponsive host would stall the whole run indefinitely.
request_timeout = 30

for index, row in df.iterrows():
    url_id = str(row['URL_ID'])
    # Coerce to str so that a blank/NaN cell is reported by the handlers below
    # instead of raising TypeError while the error message is being built.
    url = str(row['URL'])

    try:
        response = requests.get(url, timeout=request_timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        title_tag = soup.find('title')
        title = title_tag.get_text(strip=True) if title_tag else 'No Title Found'

        content = extract_article_content(soup)

        # File layout: title line, two blank lines, then the article body.
        file_content = "Title: " + title + "\n\n\n" + content

        file_path = os.path.join(output_dir, url_id + ".txt")
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(file_content)

        print("Successfully saved: " + file_path)

    except requests.RequestException as e:
        # Covers connection errors, timeouts and HTTP error statuses.
        print("Failed to fetch " + url + ": " + str(e))
    except Exception as e:
        # Best effort: report the failure and carry on with the next article.
        print("An error occurred for " + url + ": " + str(e))

Successfully saved: articles_extracted_content\blackassign0001.txt
Successfully saved: articles_extracted_content\blackassign0002.txt
Successfully saved: articles_extracted_content\blackassign0003.txt
Successfully saved: articles_extracted_content\blackassign0004.txt
Successfully saved: articles_extracted_content\blackassign0005.txt
Successfully saved: articles_extracted_content\blackassign0006.txt
Successfully saved: articles_extracted_content\blackassign0007.txt
Successfully saved: articles_extracted_content\blackassign0008.txt
Successfully saved: articles_extracted_content\blackassign0009.txt
Successfully saved: articles_extracted_content\blackassign0010.txt
Successfully saved: articles_extracted_content\blackassign0011.txt
Successfully saved: articles_extracted_content\blackassign0012.txt
Successfully saved: articles_extracted_content\blackassign0013.txt
Successfully saved: articles_extracted_content\blackassign0014.txt
Successfully saved: articles_extracted_content\blackassign0015

# *Data Cleaning and Pre-Processing*

In [36]:
import os
import re

def load_stop_words(stop_words_dir):
    """Merge the entries of every file in a directory into one set.

    Args:
        stop_words_dir: Directory whose entries are each read with
            ``read_file_with_encodings``.

    Returns:
        set: Union of the stripped, lower-cased lines of all files.
    """
    entry_paths = (
        os.path.join(stop_words_dir, entry)
        for entry in os.listdir(stop_words_dir)
    )
    return set().union(*(read_file_with_encodings(path) for path in entry_paths))

def read_file_with_encodings(file_path, encodings=('utf-8', 'latin-1', 'ISO-8859-1')):
    """Read a text file as a set of lines, trying several encodings in turn.

    Args:
        file_path: Path of the file to read.
        encodings: Encoding names to try, in order. With the default tuple the
            'latin-1' attempt accepts any byte sequence, so the error path is
            only reached when a stricter list is supplied.

    Returns:
        set: Every line of the file, stripped of surrounding whitespace and
        lower-cased (a blank line contributes the empty string).

    Raises:
        UnicodeDecodeError: If the file cannot be decoded with any encoding.
        ValueError: If ``encodings`` is empty.
    """
    last_error = None
    for encoding in encodings:
        try:
            with open(file_path, 'r', encoding=encoding) as file:
                return {line.strip().lower() for line in file}
        except UnicodeDecodeError as error:
            last_error = error
    if last_error is None:
        raise ValueError("No encodings were given to decode file " + file_path)
    # UnicodeDecodeError requires (encoding, object, start, end, reason);
    # calling it with a single message string raises TypeError instead.
    raise UnicodeDecodeError(
        last_error.encoding,
        last_error.object,
        last_error.start,
        last_error.end,
        "Cannot decode file " + file_path + " with available encodings",
    )

def remove_punctuation(text):
    """Return ``text`` with every ? ! . ; , : and - character deleted."""
    punctuation_pattern = re.compile(r'[?!.;,:-]')
    return punctuation_pattern.sub('', text)

def remove_stop_words(text, stop_words):
    """Drop stop words from ``text`` and strip punctuation from the rest.

    Args:
        text: Input text; it is split on whitespace.
        stop_words: Collection of lower-case stop words.

    Returns:
        str: The remaining words, punctuation removed, joined by single spaces.
    """
    # Same characters that remove_punctuation deletes; used here only to trim
    # the edges of a word before the stop-word lookup.
    edge_punctuation = '?!.;,:-'
    kept_words = []
    for word in text.split():
        # Compare without leading/trailing punctuation so that "the," or "is."
        # are recognised as stop words. Inner punctuation is left alone for
        # the lookup so an abbreviation such as "U.S." is not mistaken for "us".
        if word.strip(edge_punctuation).lower() in stop_words:
            continue
        cleaned_word = remove_punctuation(word)
        # A punctuation-only token becomes empty; skipping it avoids doubled
        # spaces in the joined result.
        if cleaned_word:
            kept_words.append(cleaned_word)
    return ' '.join(kept_words)

stop_words_dir = 'C:/Users/HITESH/OneDrive/Desktop/BlackCoffer/StopWords'
articles_dir = 'C:/Users/HITESH/OneDrive/Desktop/BlackCoffer/articles_extracted_content'
cleaned_articles_dir = 'C:/Users/HITESH/OneDrive/Desktop/BlackCoffer/cleaned_articles'

stop_words = load_stop_words(stop_words_dir)
os.makedirs(cleaned_articles_dir, exist_ok=True)

# NOTE(review): remove_stop_words deletes '.', '?' and '!', so the cleaned
# files no longer contain sentence boundaries. The scoring step runs
# sent_tokenize on these files -- confirm the sentence-based metrics are
# meant to be computed from the cleaned text.

# Sorted so that the processing order is the same on every run.
for file_name in sorted(os.listdir(articles_dir)):
    file_path = os.path.join(articles_dir, file_name)

    # Skip sub-directories, which cannot be opened as text files.
    if not os.path.isfile(file_path):
        continue

    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # An empty file has no title line; report it instead of raising IndexError.
    if not lines:
        print("Skipping empty file:", file_path)
        continue

    # Extracted files hold the title on the first line and the body after the
    # blank separator lines.
    title = lines[0].strip()
    content = ' '.join(line.strip() for line in lines[2:])
    cleaned_content = remove_stop_words(content, stop_words)
    cleaned_file_content = title + "\n\n\n" + cleaned_content
    cleaned_file_path = os.path.join(cleaned_articles_dir, file_name)
    with open(cleaned_file_path, 'w', encoding='utf-8') as cleaned_file:
        cleaned_file.write(cleaned_file_content)

    print("Successfully cleaned and saved:", cleaned_file_path)

print("Cleaning process completed.")

Successfully cleaned and saved: C:/Users/HITESH/OneDrive/Desktop/BlackCoffer/cleaned_articles\blackassign0001.txt
Successfully cleaned and saved: C:/Users/HITESH/OneDrive/Desktop/BlackCoffer/cleaned_articles\blackassign0002.txt
Successfully cleaned and saved: C:/Users/HITESH/OneDrive/Desktop/BlackCoffer/cleaned_articles\blackassign0003.txt
Successfully cleaned and saved: C:/Users/HITESH/OneDrive/Desktop/BlackCoffer/cleaned_articles\blackassign0004.txt
Successfully cleaned and saved: C:/Users/HITESH/OneDrive/Desktop/BlackCoffer/cleaned_articles\blackassign0005.txt
Successfully cleaned and saved: C:/Users/HITESH/OneDrive/Desktop/BlackCoffer/cleaned_articles\blackassign0006.txt
Successfully cleaned and saved: C:/Users/HITESH/OneDrive/Desktop/BlackCoffer/cleaned_articles\blackassign0007.txt
Successfully cleaned and saved: C:/Users/HITESH/OneDrive/Desktop/BlackCoffer/cleaned_articles\blackassign0008.txt
Successfully cleaned and saved: C:/Users/HITESH/OneDrive/Desktop/BlackCoffer/cleaned_art

In [7]:
import nltk

# Newer NLTK releases (3.9+) load the sentence/word tokenizer models from the
# 'punkt_tab' resource; older releases use 'punkt'. Requesting both keeps
# word_tokenize / sent_tokenize working with whichever version pip installed
# (a release that does not know 'punkt_tab' just reports it and returns False).
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HITESH\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

# *Performing the Sentiment Analysis and creating the Excel file*

In [37]:
import pandas as pd
import os
from nltk.tokenize import word_tokenize, sent_tokenize
import re

def load_master_dictionary(positive_file, negative_file):
    """Load the positive and negative word lists.

    Args:
        positive_file: Path of the file holding the positive words.
        negative_file: Path of the file holding the negative words.

    Returns:
        tuple: ``(positive_words, negative_words)``, each being the set
        returned by ``read_file_with_encodings`` for the respective file.
    """
    return (
        read_file_with_encodings(positive_file),
        read_file_with_encodings(negative_file),
    )

def read_file_with_encodings(file_path, encodings=('utf-8', 'latin-1', 'ISO-8859-1')):
    """Read a text file as a set of lines, trying several encodings in turn.

    Args:
        file_path: Path of the file to read.
        encodings: Encoding names to try, in order. With the default tuple the
            'latin-1' attempt accepts any byte sequence, so the error path is
            only reached when a stricter list is supplied.

    Returns:
        set: Every line of the file, stripped of surrounding whitespace and
        lower-cased (a blank line contributes the empty string).

    Raises:
        UnicodeDecodeError: If the file cannot be decoded with any encoding.
        ValueError: If ``encodings`` is empty.
    """
    decode_failure = None
    for encoding in encodings:
        try:
            with open(file_path, 'r', encoding=encoding) as file:
                words = {line.strip().lower() for line in file}
            return words
        except UnicodeDecodeError as error:
            decode_failure = error
    if decode_failure is None:
        raise ValueError("No encodings were given to decode file " + file_path)
    # UnicodeDecodeError requires (encoding, object, start, end, reason);
    # calling it with a single message string raises TypeError instead.
    raise UnicodeDecodeError(
        decode_failure.encoding,
        decode_failure.object,
        decode_failure.start,
        decode_failure.end,
        "Cannot decode file " + file_path + " with available encodings",
    )

def count_syllables(word):
    """Estimate the syllables of a word by counting its vowel letters.

    The word is lower-cased and reduced to the letters a-z; a trailing "es" or
    "ed" is dropped before the vowels (a, e, i, o, u) are counted.

    Args:
        word: The token to analyse.

    Returns:
        int: Number of vowel letters found, never less than 1.
    """
    letters = re.sub(r'[^a-z]', '', word.lower())  # Remove non-alphabet characters
    if letters.endswith(('es', 'ed')):
        letters = letters[:-2]
    vowel_total = sum(letters.count(vowel) for vowel in 'aeiou')
    # Ensure at least one syllable
    return vowel_total if vowel_total > 1 else 1

def count_complex_words(tokens):
    """Count the tokens for which ``count_syllables`` returns more than 2.

    Args:
        tokens: Iterable of word strings.

    Returns:
        int: Number of tokens considered complex.
    """
    complex_total = 0
    for token in tokens:
        if count_syllables(token) > 2:
            complex_total += 1
    return complex_total

def count_personal_pronouns(text):
    """Count occurrences of the pronouns I, we, my, ours and us in ``text``.

    Matching is case-insensitive and limited to whole words, except that the
    all-capitals form "US" is treated as the country abbreviation and is not
    counted.

    Args:
        text: Text to search; its original casing must be preserved so that
            "US" can be told apart from "us".

    Returns:
        int: Number of pronoun occurrences.
    """
    pronouns_pattern = r'\b(i|we|my|ours|us)\b'
    # The previous lookahead `(?!\bUS\b)` was evaluated *after* the matched
    # word, where "US" can never start, so it excluded nothing. Filter the
    # matched text itself instead.
    return sum(
        1
        for match in re.finditer(pronouns_pattern, text, re.IGNORECASE)
        if match.group(0) != 'US'
    )

def calculate_average_word_length(tokens):
    """Return the mean number of characters per token.

    A small constant (0.000001) is added to the token count so that an empty
    list yields 0 instead of a division error.

    Args:
        tokens: List of word strings.

    Returns:
        float: Total characters divided by (number of tokens + 0.000001).
    """
    character_total = sum(map(len, tokens))
    return character_total / (len(tokens) + 0.000001)

def scores_and_matrices(text, positive_words, negative_words):
    """Compute the sentiment and readability metrics for one article.

    Args:
        text: Article text. It is lower-cased for word tokenization; sentence
            splitting and pronoun counting use the original casing.
        positive_words: Collection of lower-case positive words.
        negative_words: Collection of lower-case negative words.

    Returns:
        tuple: ``(positive_score, negative_score, polarity_score,
        subjectivity_score, average_sentence_length,
        percentage_complex_words, fog_index,
        average_number_words_per_sentence, syllable_counts,
        complex_words_count, total_words, personal_pronouns_count,
        average_word_length)``. ``syllable_counts`` is a list with one entry
        per token; ``percentage_complex_words`` is a fraction of the tokens.
    """
    tokens = word_tokenize(text.lower())
    sentences = sent_tokenize(text)
    total_words = len(tokens)
    total_sentences = len(sentences)

    # NOTE(review): every token returned by word_tokenize is counted as a
    # word; if that includes punctuation tokens, the word-based metrics are
    # inflated -- confirm whether they should be filtered out.
    positive_score = sum(1 for token in tokens if token in positive_words)
    negative_score = sum(1 for token in tokens if token in negative_words)
    # The small constant keeps these ratios defined when a denominator is 0.
    polarity_score = (positive_score - negative_score) / (positive_score + negative_score + 0.000001)
    subjectivity_score = (positive_score + negative_score) / (total_words + 0.000001)

    # Guard the plain divisions so that empty text yields 0.0 instead of
    # ZeroDivisionError; results for non-empty text are unchanged.
    average_sentence_length = total_words / total_sentences if total_sentences else 0.0
    complex_words_count = count_complex_words(tokens)
    percentage_complex_words = complex_words_count / total_words if total_words else 0.0
    fog_index = 0.4 * (average_sentence_length + percentage_complex_words)
    # Same quantity as average_sentence_length, reported under its own name.
    average_number_words_per_sentence = average_sentence_length

    syllable_counts = [count_syllables(word) for word in tokens]

    personal_pronouns_count = count_personal_pronouns(text)
    average_word_length = calculate_average_word_length(tokens)

    return (positive_score, negative_score, polarity_score, subjectivity_score,
            average_sentence_length, percentage_complex_words, fog_index,
            average_number_words_per_sentence, syllable_counts, complex_words_count,
            total_words, personal_pronouns_count, average_word_length)

cleaned_articles_dir = 'C:/Users/HITESH/OneDrive/Desktop/BlackCoffer/cleaned_articles'
positive_words_file = 'C:/Users/HITESH/OneDrive/Desktop/BlackCoffer/MasterDictionary/positive-words.txt'
negative_words_file = 'C:/Users/HITESH/OneDrive/Desktop/BlackCoffer/MasterDictionary/negative-words.txt'
final_submission_file = 'C:/Users/HITESH/OneDrive/Desktop/BlackCoffer/Final_Submission.xlsx'
# Same spreadsheet the extraction step reads; used here to restore each
# article's URL, which is not stored in the text files.
input_file = 'C:/Users/HITESH/OneDrive/Desktop/BlackCoffer/Input.xlsx'

positive_words, negative_words = load_master_dictionary(positive_words_file, negative_words_file)
# Sorted so that the rows of the output file appear in a stable order.
article_files = sorted(os.listdir(cleaned_articles_dir))

# URL_ID -> URL lookup. The IDs are converted with str() exactly as the
# extraction step does when it names the article files.
input_df = pd.read_excel(input_file)
url_by_id = dict(zip(input_df['URL_ID'].astype(str), input_df['URL']))

final_submission_data = []

for file_name in article_files:
    file_path = os.path.join(cleaned_articles_dir, file_name)
    # Skip sub-directories, which cannot be opened as text files.
    if not os.path.isfile(file_path):
        continue
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    (positive_score, negative_score, polarity_score, subjectivity_score,
     average_sentence_length, percentage_complex_words, fog_index,
     average_number_words_per_sentence, syllable_counts, complex_words_count,
     total_words, personal_pronouns_count, average_word_length) = scores_and_matrices(text, positive_words, negative_words)

    # Files are named "<URL_ID>.txt"; splitext undoes exactly that suffix even
    # if the ID itself contains a dot.
    url_id = os.path.splitext(file_name)[0]
    url = url_by_id.get(url_id, "")

    # Report one figure per article (mean syllables per token) rather than
    # dumping the whole per-token list into a single spreadsheet cell.
    syllables_per_word = sum(syllable_counts) / len(syllable_counts) if syllable_counts else 0.0

    final_submission_data.append({
        'URL_ID': url_id,
        'URL': url,
        'POSITIVE SCORE': positive_score,
        'NEGATIVE SCORE': negative_score,
        'POLARITY SCORE': polarity_score,
        'SUBJECTIVITY SCORE': subjectivity_score,
        'AVG SENTENCE LENGTH': average_sentence_length,
        'PERCENTAGE OF COMPLEX WORDS': percentage_complex_words,
        'FOG INDEX': fog_index,
        'AVG NUMBER OF WORDS PER SENTENCE': average_number_words_per_sentence,
        'COMPLEX WORD COUNT': complex_words_count,
        'WORD COUNT': total_words,
        'SYLLABLE PER WORD': syllables_per_word,
        'PERSONAL PRONOUNS': personal_pronouns_count,
        'AVG WORD LENGTH': average_word_length
    })

final_submission_df = pd.DataFrame(final_submission_data)
final_submission_df.to_excel(final_submission_file, index=False)

print("Scores calculated and Final Submission Excel file created successfully.")

Scores calculated and Final Submission Excel file created successfully.


# ***Thank You***