In [2]:
!pip install spacy textblob pandas nltk



In [2]:
from google.colab import drive

# Attach Google Drive to this runtime under a single named mount point.
mount_point = '/content/gdrive'
drive.mount(mount_point)

Mounted at /content/gdrive


In [6]:
import spacy
import textblob
from textblob import TextBlob
import nltk
from nltk.tokenize import word_tokenize
import re
import os
import pandas as pd
#from google.colab import files
import requests
from bs4 import BeautifulSoup

def load_stop_words(stop_words_folder):
    """Collect the stop words listed in every ``.txt`` file of a folder.

    Each non-blank line of each file is one entry. Entries are stripped of
    surrounding whitespace and lower-cased, because ``clean_text`` tests
    ``word.lower()`` for membership in the returned set; an upper-case or
    space-padded entry would otherwise never match.

    Args:
        stop_words_folder: Directory containing the stop-word ``.txt`` files.
            Files with any other extension are ignored.

    Returns:
        set[str]: The normalised stop words from all files combined.

    Raises:
        OSError: If the folder or one of its files cannot be read.
    """
    stop_words = set()
    for filename in os.listdir(stop_words_folder):
        if not filename.endswith(".txt"):
            continue
        file_path = os.path.join(stop_words_folder, filename)
        with open(file_path, 'r', encoding='latin-1') as file:
            for line in file.read().splitlines():
                entry = line.strip().lower()
                # Blank lines would otherwise add a useless '' entry.
                if entry:
                    stop_words.add(entry)
    return stop_words


def _read_word_set(path):
    """Read one word per line from ``path`` into a normalised set."""
    with open(path, 'r', encoding='ISO-8859-1') as file:
        lines = file.read().splitlines()
    # Strip and lower-case so entries can match the lower-cased tokens they
    # are compared against; drop blank lines.
    return {line.strip().lower() for line in lines if line.strip()}


def load_positive_negative_words(positive_words_path, negative_words_path):
    """Load the positive and negative sentiment dictionaries.

    Both files are read as ISO-8859-1 text with one word per line. Entries
    are stripped and lower-cased, since ``analyze_text`` looks up lower-cased
    tokens in these sets.

    Args:
        positive_words_path: Path to the positive-word list.
        negative_words_path: Path to the negative-word list.

    Returns:
        tuple[set[str], set[str]]: ``(positive_words, negative_words)``.

    Raises:
        OSError: If either file cannot be opened.
    """
    positive_words = _read_word_set(positive_words_path)
    negative_words = _read_word_set(negative_words_path)
    return positive_words, negative_words


def clean_text(article_text, stop_words):
    """Return the article reduced to its meaningful words.

    The text is tokenised with ``word_tokenize``; a token is kept only when
    it is purely alphabetic and its lower-cased form is not in
    ``stop_words``. Kept tokens are lower-cased and joined with single
    spaces.
    """
    kept = []
    for token in word_tokenize(article_text):
        if not token.isalpha():
            continue
        lowered = token.lower()
        if lowered in stop_words:
            continue
        kept.append(lowered)
    return ' '.join(kept)

def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def analyze_text(article_text, stop_words, positive_words, negative_words):
    """Compute sentiment and readability metrics for one article.

    Relies on the module-level spaCy pipeline ``nlp`` and on the sibling
    helpers ``clean_text``, ``syllable_count`` and
    ``count_personal_pronouns``.

    Args:
        article_text: Raw article text.
        stop_words: Words removed before scoring (see ``clean_text``).
        positive_words: Dictionary of positive words.
        negative_words: Dictionary of negative words.

    Returns:
        list: ``[positive_score, negative_score, polarity_score,
        subjectivity_score, avg_sentence_length,
        percentage_of_complex_words, fog_index, avg_words_per_sentence,
        complex_word_count, word_count, syllable_per_word,
        personal_pronouns, avg_word_length]``. Ratios whose denominator is
        zero (empty article, or nothing left after cleaning) are reported
        as 0.0 instead of raising ``ZeroDivisionError``.
    """
    cleaned_text = clean_text(article_text, stop_words)
    # clean_text already yields lower-cased, alphabetic, stop-word-free
    # tokens joined by single spaces, so splitting recovers them without a
    # second tokenise-and-filter pass.
    cleaned_tokens = cleaned_text.split()

    sentiment_words = TextBlob(cleaned_text).words
    positive_score = sum(1 for word in sentiment_words if word in positive_words)
    negative_score = sum(1 for word in sentiment_words if word in negative_words)
    # The small epsilon keeps both scores defined when no sentiment words occur.
    polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)
    subjectivity_score = (positive_score + negative_score) / (len(cleaned_tokens) + 0.000001)

    doc = nlp(article_text)
    num_sentences = len(list(doc.sents))
    word_count = len(re.findall(r'\b\w+\b', cleaned_text))

    # len(doc) counts every spaCy token of the raw text (punctuation included).
    avg_sentence_length = _safe_ratio(len(doc), num_sentences)
    avg_words_per_sentence = _safe_ratio(word_count, num_sentences)

    # A complex word has more than two syllables; the previous check compared
    # the number of *characters* to 2, which flagged almost every word.
    complex_word_count = sum(
        1 for token in doc if token.is_alpha and syllable_count(token.text) > 2
    )
    # NOTE(review): complex words are counted over the raw text while
    # word_count comes from the cleaned text -- confirm this is intended.
    percentage_of_complex_words = _safe_ratio(complex_word_count, word_count) * 100
    fog_index = 0.4 * (avg_sentence_length + percentage_of_complex_words)

    syllable_per_word = _safe_ratio(syllable_count(cleaned_text), word_count)
    personal_pronouns = count_personal_pronouns(doc)
    avg_word_length = _safe_ratio(sum(len(word) for word in cleaned_tokens), len(cleaned_tokens))

    return [positive_score, negative_score, polarity_score, subjectivity_score,
            avg_sentence_length, percentage_of_complex_words, fog_index,
            avg_words_per_sentence, complex_word_count, word_count,
            syllable_per_word, personal_pronouns, avg_word_length]


def syllable_count(word):
    """Approximate a syllable count as the number of vowel characters.

    Every character whose lower-cased form is one of ``a e i o u y`` counts
    once. This is a rough estimate, not a linguistic one. The result is
    never less than 1.
    """
    vowel_letters = "aeiouy"
    vowel_hits = sum(1 for letter in word if letter.lower() in vowel_letters)
    # Floor of one syllable, even for input without any vowel.
    return vowel_hits if vowel_hits > 1 else 1

def count_personal_pronouns(doc):
    """Count first- and second-person pronoun tokens in ``doc``.

    Args:
        doc: Iterable of tokens exposing a ``text`` attribute (e.g. a spaCy
            ``Doc``).

    Returns:
        int: Number of tokens whose lower-cased text is one of the listed
        pronouns. The all-caps token ``US`` is skipped so the country
        abbreviation is not counted as the pronoun "us".
    """
    # Entries must be lower-case because tokens are lower-cased before the
    # lookup; the former 'I' entry could never match.
    personal_pronouns = {'i', 'me', 'my', 'mine', 'myself', 'you', 'your', 'yours', 'yourself', 'we', 'us', 'our', 'ours', 'ourselves'}
    return sum(
        1 for token in doc
        if token.text != 'US' and token.text.lower() in personal_pronouns
    )

# --- Article download and extraction ---

def extract_article(url, output_directory, output_filename=None, timeout=30):
    """Download an article, save it as text, and return its title and body.

    The body is the text of every ``<p>`` element joined by spaces, with the
    site footer removed. The file written contains the title, a blank line,
    and the body.

    Args:
        url: Page to download.
        output_directory: Directory the text file is written to.
        output_filename: Name of the text file. When omitted, the
            module-level variable ``output_filename`` is used, which is how
            this function originally obtained the name.
        timeout: Seconds to wait for the server before giving up, so one
            unresponsive host cannot stall the whole run.

    Returns:
        tuple: ``(title, article_text)`` on success, ``(None, None)`` if
        anything fails (the error is printed, not raised).
    """
    try:
        if output_filename is None:
            # Backward compatibility with callers that set a global instead
            # of passing the name explicitly.
            output_filename = globals().get('output_filename')
        if not output_filename:
            raise ValueError("no output filename was provided")

        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # A page without <title> still has usable body text; use an empty
        # title rather than discarding the article.
        title_tag = soup.title
        raw_title = title_tag.text if title_tag is not None else ''
        # Strip after removing the site suffix so no trailing space is left.
        title = raw_title.replace('- Blackcoffer Insights', '').strip()

        article_paragraphs = soup.find_all('p')
        article_text = ' '.join([p.text.strip() for p in article_paragraphs])

        footer_text = "Contact us: hello@blackcoffer.com © All Right Reserved, Blackcoffer(OPC) Pvt. Ltd"
        article_text = article_text.replace(footer_text, '')

        full_output_path = os.path.join(output_directory, output_filename)
        print(f"Output Filename: {full_output_path}")

        with open(full_output_path, 'w', encoding='utf-8') as file:
            file.write(f"{title}\n\n{article_text}")

        return title, article_text
    except Exception as e:
        # Best-effort by design: report the failure and let the caller skip
        # this URL.
        print(f"Error extracting {url}: {e}")
        return None, None



# Root folder holding the inputs (Input.xlsx, StopWords/, MasterDictionary/)
# and receiving the outputs. Change this one constant when the project moves.
BASE_DIR = "C:/Users/senth/OneDrive/Desktop/New folder"

# Column names for the 13 values returned by analyze_text, in the same order.
METRIC_COLUMNS = [
    'POSITIVE SCORE',
    'NEGATIVE SCORE',
    'POLARITY SCORE',
    'SUBJECTIVITY SCORE',
    'AVG SENTENCE LENGTH',
    'PERCENTAGE OF COMPLEX WORDS',
    'FOG INDEX',
    'AVG NUMBER OF WORDS PER SENTENCE',
    'COMPLEX WORD COUNT',
    'WORD COUNT',
    'SYLLABLE PER WORD',
    'PERSONAL PRONOUNS',
    'AVG WORD LENGTH',
]


def _parse_url_id(raw_id):
    """Return the integer id encoded in an Input.xlsx URL_ID cell, or None.

    Text ids such as ``"blackassign0007"`` yield their digits as an int.
    Numeric cells (which pandas may deliver as int or float) are converted
    directly. Empty cells and ids without digits yield None.
    """
    if isinstance(raw_id, str):
        digits = ''.join(filter(str.isdigit, raw_id))
        return int(digits) if digits else None
    if pd.isna(raw_id):
        return None
    try:
        return int(raw_id)
    except (TypeError, ValueError):
        return None


# Set the working directory
os.chdir(BASE_DIR)
# Load spaCy model
nlp = spacy.load("en_core_web_sm")
nltk.download('punkt')
# Recent NLTK releases look for the 'punkt_tab' resource in word_tokenize.
nltk.download('punkt_tab')

# Load Stop Words, Positive Words, and Negative Words
stop_words_folder = f'{BASE_DIR}/StopWords'
positive_words_path = f'{BASE_DIR}/MasterDictionary/positive-words.txt'
negative_words_path = f'{BASE_DIR}/MasterDictionary/negative-words.txt'
stop_words = load_stop_words(stop_words_folder)
positive_words, negative_words = load_positive_negative_words(positive_words_path, negative_words_path)

# Read input data from Excel file
df = pd.read_excel(f'{BASE_DIR}/Input.xlsx')

# Directory for the extracted article text files
output_directory = "output_data"
os.makedirs(output_directory, exist_ok=True)

# Column names listed in the output template.
# NOTE(review): the results file written at the end of this cell has the same
# name and (after os.chdir) the same folder as this template, so it overwrites
# it -- confirm that is intended.
output_columns = pd.read_excel(f'{BASE_DIR}/Output Data Structure.xlsx', header=None, names=['Variable'])['Variable'].tolist()

data_list = []

for index, row in df.iterrows():
    url_id = _parse_url_id(row['URL_ID'])
    if url_id is None:
        # No usable id in this row; nothing to name the output file after.
        continue
    url = row['URL']

    # extract_article reads this module-level name to decide where to write.
    output_filename = f"blackassign{url_id:04d}.txt"

    title, article_text = extract_article(url, output_directory)
    if article_text is None:
        # Download or parsing failed; extract_article already reported it.
        continue

    analysis_results = analyze_text(article_text, stop_words, positive_words, negative_words)

    new_row = {'URL_ID': url_id, 'URL': url}
    new_row.update(zip(METRIC_COLUMNS, analysis_results))
    data_list.append(new_row)

# Build the frame once from the collected records.
df_output = pd.DataFrame(data_list)

output_file = 'Output Data Structure.xlsx'
df_output.to_excel(output_file, index=False)
print(f"Textual analysis results saved to {output_file}")


OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

In [13]:
# Read the saved results workbook back from the mounted Drive folder.
results_path = '/content/gdrive/MyDrive/NLP BLACKCOFFER PROJECT/Output Data Structure.xlsx'
df_result = pd.read_excel(results_path)

# Display the DataFrame
print("Result DataFrame:", df_result, sep="\n")

Result DataFrame:
    URL_ID                                                URL  POSITIVE SCORE  \
0        1  https://insights.blackcoffer.com/rising-it-cit...              11   
1        2  https://insights.blackcoffer.com/rising-it-cit...              60   
2        3  https://insights.blackcoffer.com/internet-dema...              43   
3        4  https://insights.blackcoffer.com/rise-of-cyber...              41   
4        5  https://insights.blackcoffer.com/ott-platform-...              26   
..     ...                                                ...             ...   
93      96  https://insights.blackcoffer.com/what-is-the-r...              32   
94      97  https://insights.blackcoffer.com/impact-of-cov...              26   
95      98  https://insights.blackcoffer.com/contribution-...               5   
96      99  https://insights.blackcoffer.com/how-covid-19-...              18   
97     100  https://insights.blackcoffer.com/how-will-covi...              33   

    NEGAT