<a href="https://colab.research.google.com/github/kartik2627/Machine-Learning/blob/main/sentimentanalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
import requests
from bs4 import BeautifulSoup
import os

In [None]:
# Read the input workbook; the path expects Google Drive to be available
# under /content/drive (Colab layout).
input_path = "/content/drive/MyDrive/Input.xlsx"
input_df = pd.read_excel(input_path)

In [None]:
# Function to extract article text from URL
def extract_article_text(url, timeout=30):
    """Download a web page and return its title and paragraph text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive host would block the calling loop
            indefinitely.

    Returns:
        A tuple ``(article_title, article_text)``. The title comes from the
        page's ``<title>`` tag (empty string when the page has none) and the
        text is every ``<p>`` element joined with newlines. Returns
        ``(None, None)`` when the response status is not 200 or when any
        exception is raised while fetching or parsing; in both cases a
        message is printed instead of raising.
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to fetch URL: {url}. Status code: {response.status_code}")
            return None, None

        soup = BeautifulSoup(response.text, 'html.parser')
        # Assuming article title is within <title> tag and article text is within <p> tags.
        # soup.title is None when the page has no <title>; fall back to an
        # empty title rather than reporting a bogus "fetching" error.
        title_tag = soup.title
        article_title = title_tag.text.strip() if title_tag is not None else ""
        article_text = "\n".join(p.get_text() for p in soup.find_all('p'))
        return article_title, article_text
    except Exception as e:
        # Deliberately broad: one bad URL must not abort a batch run.
        print(f"An error occurred while fetching URL: {url}. Error: {e}")
        return None, None



In [None]:
# Create a directory to store extracted articles.
# exist_ok=True makes this safe to re-run and avoids the gap between
# checking for the directory and creating it.
os.makedirs('extracted_articles', exist_ok=True)


In [None]:
# Walk the input table row by row, download each article and store it as
# extracted_articles/<URL_ID>.txt
for index, row in input_df.iterrows():
    url_id, url = row['URL_ID'], row['URL']
    article_title, article_text = extract_article_text(url)

    # Nothing to save when either the title or the body is missing/empty
    if not (article_title and article_text):
        continue

    target_path = f"extracted_articles/{url_id}.txt"
    with open(target_path, 'w', encoding='utf-8') as article_file:
        # Title header first, then the article body
        article_file.writelines([f"Title: {article_title}\n\n", article_text])
    print(f"Article text saved for URL_ID: {url_id}")

Article text saved for URL_ID: blackassign0001
Article text saved for URL_ID: blackassign0002
Article text saved for URL_ID: blackassign0003
Article text saved for URL_ID: blackassign0004
Article text saved for URL_ID: blackassign0005
Article text saved for URL_ID: blackassign0006
Article text saved for URL_ID: blackassign0007
Article text saved for URL_ID: blackassign0008
Article text saved for URL_ID: blackassign0009
Article text saved for URL_ID: blackassign0010
Article text saved for URL_ID: blackassign0011
Article text saved for URL_ID: blackassign0012
Article text saved for URL_ID: blackassign0013
Article text saved for URL_ID: blackassign0014
Article text saved for URL_ID: blackassign0015
Article text saved for URL_ID: blackassign0016
Article text saved for URL_ID: blackassign0017
Article text saved for URL_ID: blackassign0018
Article text saved for URL_ID: blackassign0019
Article text saved for URL_ID: blackassign0020
Article text saved for URL_ID: blackassign0021
Article text 

In [None]:
pip install textstat




In [None]:
from textblob import TextBlob
import textstat
import nltk

In [None]:
# Tokenizer models used by nltk.sent_tokenize / nltk.word_tokenize.
# Recent NLTK releases load the 'punkt_tab' resource instead of the older
# 'punkt' pickles, so fetch both to work across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
# Load every saved article, not just the first one, so the analysis below
# covers the whole extracted_articles directory. File names are sorted so
# the order of extracted_texts is reproducible between runs.
articles_dir = "extracted_articles"
extracted_texts = []
for article_name in sorted(os.listdir(articles_dir)):
    if not article_name.endswith(".txt"):
        continue  # ignore anything that is not a saved article
    with open(os.path.join(articles_dir, article_name), "r", encoding="utf-8") as file:
        extracted_texts.append(file.read())

In [None]:
def perform_text_analysis(text):
    """Compute sentiment and readability metrics for one piece of text.

    Args:
        text: The text to analyse.

    Returns:
        A 6-tuple ``(polarity_score, subjectivity_score, avg_sentence_length,
        percentage_complex_words, fog_index, avg_words_per_sentence)``.
        For empty or whitespace-only text every metric is ``0.0``.
    """
    # Blank input has no sentences or words; return zeros instead of
    # hitting ZeroDivisionError in the averages below.
    if not text or not text.strip():
        return 0.0, 0.0, 0.0, 0.0, 0.0, 0.0

    # Tokenize text into sentences and words
    sentences = nltk.sent_tokenize(text)
    words = nltk.word_tokenize(text)
    sentence_count = len(sentences)
    word_count = len(words)

    # Polarity and subjectivity scores from TextBlob
    sentiment = TextBlob(text).sentiment
    polarity_score = sentiment.polarity
    subjectivity_score = sentiment.subjectivity

    # Average sentence length, counted in whitespace-separated words
    whitespace_word_total = sum(len(sentence.split()) for sentence in sentences)
    avg_sentence_length = whitespace_word_total / sentence_count if sentence_count else 0.0

    # Share of "complex" tokens (more than two syllables). Stored as a
    # fraction of all word tokens, i.e. it is not multiplied by 100.
    complex_word_count = sum(1 for word in words if textstat.syllable_count(word) > 2)
    percentage_complex_words = complex_word_count / word_count if word_count else 0.0

    # FOG Index as defined by this notebook.
    # NOTE(review): the usual Gunning Fog formula puts the complex-word term
    # on a 0-100 scale; here it is a fraction - confirm this is intended.
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)

    # Average number of word tokens per sentence.
    # NOTE(review): presumably word_tokenize also yields punctuation tokens,
    # which would make this differ from avg_sentence_length - confirm.
    avg_words_per_sentence = word_count / sentence_count if sentence_count else 0.0

    return polarity_score, subjectivity_score, avg_sentence_length, percentage_complex_words, fog_index, avg_words_per_sentence

# Analyse every loaded text; each row holds the six metrics in the order
# perform_text_analysis returns them.
results = [list(perform_text_analysis(text)) for text in extracted_texts]



In [None]:
# One row per analysed text, columns in the order perform_text_analysis returns them
output_df = pd.DataFrame(
    results,
    columns=[
        'POLARITY SCORE',
        'SUBJECTIVITY SCORE',
        'AVG SENTENCE LENGTH',
        'PERCENTAGE OF COMPLEX WORDS',
        'FOG INDEX',
        'AVG NUMBER OF WORDS PER SENTENCE',
    ],
)
# Write a real Excel workbook: to_csv() put CSV text into a file named
# .xlsx, which spreadsheet applications refuse to open as a workbook.
output_df.to_excel('output.xlsx', index=False)