In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Load the input spreadsheet; the scraping loop below reads its 'URL_ID' and 'URL' columns.
input_file = "input.xlsx"
df = pd.read_excel(input_file)

# Function to extract article text from URL
def extract_article_text(url, timeout=30):
    """Download a page and return its title and paragraph text.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests.get`` can block indefinitely on an unresponsive host,
        which would stall the whole scraping loop.

    Returns
    -------
    tuple
        ``(title, article_text)`` on success, where ``article_text`` is the
        text of every ``<p>`` element joined with single spaces.
        ``(None, None)`` if anything goes wrong (network error, HTTP error
        status, page without a ``<title>`` element); the reason is printed.
    """
    try:
        # Send a GET request to the URL; raise on 4xx/5xx responses
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')

        # A page without <title> would otherwise fail with an opaque
        # "'NoneType' object has no attribute 'text'" message.
        if soup.title is None:
            print(f"Error extracting data from {url}: page has no <title> element")
            return None, None

        # Extract title and article text
        title = soup.title.text.strip()
        article_text = ' '.join(p.text for p in soup.find_all('p'))

        return title, article_text

    except Exception as e:
        # Deliberately best-effort: one bad URL must not abort the whole run.
        print(f"Error extracting data from {url}: {e}")
        return None, None

# Scrape every listed URL and store each successfully extracted article as "<URL_ID>.txt"
for index, row in df.iterrows():
    url_id, url = row['URL_ID'], row['URL']

    title, article_text = extract_article_text(url)

    # Skip pages for which either the title or the body text is missing/empty
    if not (title and article_text):
        print(f"Failed to extract data from {url}")
        continue

    # Save the extracted article in a text file named after its URL_ID
    output_file = f"{url_id}.txt"
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write(f"Title: {title}\n\n{article_text}")

    print(f"Data extracted from {url} and saved to {output_file}")



Data extracted from https://insights.blackcoffer.com/rising-it-cities-and-its-impact-on-the-economy-environment-infrastructure-and-city-life-by-the-year-2040-2/ and saved to blackassign0001.txt
Data extracted from https://insights.blackcoffer.com/rising-it-cities-and-their-impact-on-the-economy-environment-infrastructure-and-city-life-in-future/ and saved to blackassign0002.txt
Data extracted from https://insights.blackcoffer.com/internet-demands-evolution-communication-impact-and-2035s-alternative-pathways/ and saved to blackassign0003.txt
Data extracted from https://insights.blackcoffer.com/rise-of-cybercrime-and-its-effect-in-upcoming-future/ and saved to blackassign0004.txt
Data extracted from https://insights.blackcoffer.com/ott-platform-and-its-impact-on-the-entertainment-industry-in-future/ and saved to blackassign0005.txt
Data extracted from https://insights.blackcoffer.com/the-rise-of-the-ott-platform-and-its-impact-on-the-entertainment-industry-by-2040/ and saved to blackassi

Data extracted from https://insights.blackcoffer.com/how-python-became-the-first-choice-for-data-science/ and saved to blackassign0053.txt
Data extracted from https://insights.blackcoffer.com/how-google-fit-measure-heart-and-respiratory-rates-using-a-phone/ and saved to blackassign0054.txt
Data extracted from https://insights.blackcoffer.com/what-is-the-future-of-mobile-apps/ and saved to blackassign0055.txt
Data extracted from https://insights.blackcoffer.com/impact-of-ai-in-health-and-medicine/ and saved to blackassign0056.txt
Data extracted from https://insights.blackcoffer.com/telemedicine-what-patients-like-and-dislike-about-it/ and saved to blackassign0057.txt
Data extracted from https://insights.blackcoffer.com/how-we-forecast-future-technologies/ and saved to blackassign0058.txt
Data extracted from https://insights.blackcoffer.com/can-robots-tackle-late-life-loneliness/ and saved to blackassign0059.txt
Data extracted from https://insights.blackcoffer.com/embedding-care-robots-i

In [2]:
import sys
print(sys.version)
!pip list


3.11.4 | packaged by Anaconda, Inc. | (main, Jul  5 2023, 13:38:37) [MSC v.1916 64 bit (AMD64)]
Package                       Version
----------------------------- ---------------
absl-py                       2.0.0
aiobotocore                   2.4.2
aiofiles                      22.1.0
aiohttp                       3.8.3
aioitertools                  0.7.1
aiosignal                     1.2.0
aiosqlite                     0.18.0
alabaster                     0.7.12
anaconda-catalogs             0.2.0
anaconda-client               1.12.0
anaconda-navigator            2.4.2
anaconda-project              0.11.1
ann-visualizer                2.5
annotated-types               0.6.0
anyio                         3.5.0
appdirs                       1.4.4
argon2-cffi                   21.3.0
argon2-cffi-bindings          21.2.0
arrow                         1.2.3
astroid                       2.14.2
astropy                       5.1
asttokens                     2.0.5
astunparse              

In [3]:
!pip install spacy
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
      --------------------------------------- 0.3/12.8 MB 5.6 MB/s eta 0:00:03
     --- ------------------------------------ 1.0/12.8 MB 12.7 MB/s eta 0:00:01
     ---- ----------------------------------- 1.4/12.8 MB 11.0 MB/s eta 0:00:02
     ---- ----------------------------------- 1.4/12.8 MB 11.0 MB/s eta 0:00:02
     ---- ----------------------------------- 1.6/12.8 MB 7.1 MB/s eta 0:00:02
     ----- ---------------------------------- 1.9/12.8 MB 7.0 MB/s eta 0:00:02
     ------ --------------------------------- 2.2/12.8 MB 7.0 MB/s eta 0:00:02
     ------ --------------------------------- 2.2/12.8 MB 7.0 MB/s eta 0:00:02
     -------- ------------------------------- 2.7/12.8 MB 6.6 MB/s eta 0:00:02
     --------- -----------------------

In [19]:
import spacy
import pandas as pd

# Load the spaCy English model `en_core_web_sm`; analyze_text below calls `nlp` on every article.
nlp = spacy.load("en_core_web_sm")

# Function to perform textual analysis and compute variables
def analyze_text(text):
    """Compute basic size statistics for a piece of text.

    Only alphabetic tokens are treated as words. Counting every spaCy token
    (``len(doc)``) would also count punctuation and whitespace tokens, which
    inflates the word count and drags the average word length down.

    Parameters
    ----------
    text : str
        Text to analyse.

    Returns
    -------
    tuple
        ``(word_count, sentence_count, unique_words, avg_word_length)`` where
        ``unique_words`` is case-insensitive and ``avg_word_length`` is the
        mean number of characters per word (0 when the text has no words).
    """
    doc = nlp(text)

    # Words are the alphabetic tokens only
    words = [token.text for token in doc if token.is_alpha]

    word_count = len(words)
    sentence_count = sum(1 for _ in doc.sents)
    unique_words = len({word.lower() for word in words})
    avg_word_length = sum(len(word) for word in words) / word_count if word_count else 0

    return word_count, sentence_count, unique_words, avg_word_length

# Load the spreadsheet that defines the output layout; rows are keyed by their 'URL_ID' column
output_structure_file = "Output Data Structure.xlsx"
output_df = pd.read_excel(output_structure_file)

# Analyse the text file belonging to each row and store the results in that row
for index, row in output_df.iterrows():
    url_id = row['URL_ID']
    file_path = f"{url_id}.txt"

    try:
        # Read the previously scraped article
        with open(file_path, 'r', encoding='utf-8') as file:
            article_content = file.read()

        # Compute the statistics for this article
        word_count, sentence_count, unique_words, avg_word_length = analyze_text(article_content)

        # Write each statistic into its output column
        computed = {
            'Word_Count': word_count,
            'Sentence_Count': sentence_count,
            'Unique_Words': unique_words,
            'Avg_Word_Length': avg_word_length,
        }
        for column, value in computed.items():
            output_df.at[index, column] = value

        print(f"Text analysis completed for {file_path}")

    except Exception as e:
        # A missing or unreadable file is reported and the remaining rows are still processed
        print(f"Error analyzing text for {file_path}: {e}")

# Persist the filled-in table as a new Excel file
output_df.to_excel("text_analysis_output.xlsx", index=False)
print("Text analysis results saved to text_analysis_output.xlsx")


Text analysis completed for blackassign0001.txt
Text analysis completed for blackassign0002.txt
Text analysis completed for blackassign0003.txt
Text analysis completed for blackassign0004.txt
Text analysis completed for blackassign0005.txt
Text analysis completed for blackassign0006.txt
Text analysis completed for blackassign0007.txt
Text analysis completed for blackassign0008.txt
Text analysis completed for blackassign0009.txt
Text analysis completed for blackassign0010.txt
Text analysis completed for blackassign0011.txt
Text analysis completed for blackassign0012.txt
Text analysis completed for blackassign0013.txt
Text analysis completed for blackassign0014.txt
Text analysis completed for blackassign0015.txt
Text analysis completed for blackassign0016.txt
Text analysis completed for blackassign0017.txt
Text analysis completed for blackassign0018.txt
Text analysis completed for blackassign0019.txt
Text analysis completed for blackassign0020.txt
Text analysis completed for blackassign0

In [6]:
!pip install textblob


Collecting textblob
  Using cached textblob-0.17.1-py2.py3-none-any.whl (636 kB)
Installing collected packages: textblob
Successfully installed textblob-0.17.1


In [8]:
!pip install pyphen


Collecting pyphen
  Downloading pyphen-0.14.0-py3-none-any.whl (2.0 MB)
     ---------------------------------------- 0.0/2.0 MB ? eta -:--:--
     - -------------------------------------- 0.1/2.0 MB 1.7 MB/s eta 0:00:02
     ---- ----------------------------------- 0.2/2.0 MB 2.8 MB/s eta 0:00:01
     -------- ------------------------------- 0.4/2.0 MB 3.1 MB/s eta 0:00:01
     ---------- ----------------------------- 0.5/2.0 MB 3.1 MB/s eta 0:00:01
     --------------- ------------------------ 0.8/2.0 MB 3.4 MB/s eta 0:00:01
     -------------------- ------------------- 1.0/2.0 MB 3.7 MB/s eta 0:00:01
     ----------------------- ---------------- 1.1/2.0 MB 3.6 MB/s eta 0:00:01
     -------------------------- ------------- 1.3/2.0 MB 4.0 MB/s eta 0:00:01
     -------------------------- ------------- 1.3/2.0 MB 4.0 MB/s eta 0:00:01
     -------------------------- ------------- 1.3/2.0 MB 4.0 MB/s eta 0:00:01
     ----------------------------- ---------- 1.5/2.0 MB 2.9 MB/s eta 0:00:01

In [11]:
!pip install nltk regex textblob syllables


Collecting syllables
  Obtaining dependency information for syllables from https://files.pythonhosted.org/packages/27/c5/96e282163836a83d9f0cfdc5792e5bbb8baab18ee2e2dca4a85875588f1b/syllables-1.0.9-py3-none-any.whl.metadata
  Downloading syllables-1.0.9-py3-none-any.whl.metadata (2.4 kB)
Collecting cmudict<2.0.0,>=1.0.11 (from syllables)
  Obtaining dependency information for cmudict<2.0.0,>=1.0.11 from https://files.pythonhosted.org/packages/9c/8a/3e16710d858c61232f3ac5423c76d2dae112e77e7e7433b036711382e69e/cmudict-1.0.16-py3-none-any.whl.metadata
  Downloading cmudict-1.0.16-py3-none-any.whl.metadata (3.5 kB)
Collecting importlib-resources>=5 (from cmudict<2.0.0,>=1.0.11->syllables)
  Obtaining dependency information for importlib-resources>=5 from https://files.pythonhosted.org/packages/93/e8/facde510585869b5ec694e8e0363ffe4eba067cb357a8398a55f6a1f8023/importlib_resources-6.1.1-py3-none-any.whl.metadata
  Downloading importlib_resources-6.1.1-py3-none-any.whl.metadata (4.1 kB)
Downl

In [22]:
!pip install textstat


Collecting textstat
  Downloading textstat-0.7.3-py3-none-any.whl (105 kB)
     ---------------------------------------- 0.0/105.1 kB ? eta -:--:--
     -------------------------------------  102.4/105.1 kB 3.0 MB/s eta 0:00:01
     -------------------------------------- 105.1/105.1 kB 2.0 MB/s eta 0:00:00
Installing collected packages: textstat
Successfully installed textstat-0.7.3


In [24]:
import nltk

# clean_text calls stopwords.words("english"), which needs the 'stopwords' corpus.
# Without it NLTK raises LookupError, which the analysis loop catches and reports
# as a failure for every single file.
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Kannu Priya\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [9]:
import spacy
from spacy.tokens import Token
import pandas as pd
from nltk.corpus import stopwords
from textblob import TextBlob

# Load the spaCy English model `en_core_web_sm`; clean_text and analyze_text below both call `nlp`.
nlp = spacy.load("en_core_web_sm")

# Register a custom `token._.syllables` attribute (default None) that analyze_text fills in per token.
# force=True overwrites an existing registration, so re-running this cell does not raise.
Token.set_extension('syllables', default=None, force=True)

# Function to clean text using stop words
def clean_text(text):
    """Return `text` with NLTK English stop words removed.

    The text is tokenised with the spaCy pipeline `nlp`; every token whose
    lower-cased form is a stop word is dropped and the remaining token
    texts are joined with single spaces.
    """
    english_stop_words = set(stopwords.words("english"))

    kept_tokens = []
    for token in nlp(text):
        if token.text.lower() in english_stop_words:
            continue
        kept_tokens.append(token.text)

    return ' '.join(kept_tokens)

# Function to create a dictionary of positive and negative words
def create_sentiment_dictionary():
    """Return the positive and negative word lists keyed by sentiment label.

    The word lists are small placeholders intended to be customised.

    Returns
    -------
    dict
        ``{'positive': [...], 'negative': [...]}``; a new dict with new
        lists is built on every call.
    """
    return {
        'positive': ['good', 'happy', 'positive'],
        'negative': ['bad', 'sad', 'negative'],
    }

# Function to extract derived variables
def extract_derived_variables(text):
    """Count occurrences of two placeholder phrases in `text`.

    This is a template: replace the phrases with the patterns that matter
    for the analysis at hand.

    Returns
    -------
    tuple
        ``(count of 'example_phrase_1', count of 'example_phrase_2')`` using
        non-overlapping, case-sensitive substring matching.
    """
    placeholder_phrases = ('example_phrase_1', 'example_phrase_2')
    first_count, second_count = (text.count(phrase) for phrase in placeholder_phrases)
    return first_count, second_count

# Function to perform textual analysis and compute variables
def analyze_text(text):
    """Compute size, readability and sentiment statistics for `text`.

    Words are the alphabetic spaCy tokens. Using every token (``len(doc)``)
    would count punctuation and whitespace as words and distort every
    per-word average. A word is "complex" when it has more than two
    syllables (as counted by ``count_syllables``), which is the definition
    the Gunning Fog index is based on.

    Parameters
    ----------
    text : str
        Text to analyse.

    Returns
    -------
    tuple
        ``(word_count, sentence_count, unique_words, avg_word_length,
        polarity_score, subjectivity_score, avg_sentence_length,
        percentage_complex_words, fog_index, complex_word_count,
        syllable_per_word, personal_pronouns)``. All ratios are 0 when
        their denominator is 0.
    """
    doc = nlp(text)

    # Words are the alphabetic tokens only
    words = [token for token in doc if token.is_alpha]

    # Size statistics
    word_count = len(words)
    sentence_count = sum(1 for _ in doc.sents)
    unique_words = len({token.text.lower() for token in words})
    avg_word_length = sum(len(token) for token in words) / word_count if word_count else 0

    # Sentiment analysis using TextBlob
    sentiment = TextBlob(text).sentiment
    polarity_score = sentiment.polarity
    subjectivity_score = sentiment.subjectivity

    # Syllable count per word, stored on the registered `syllables` token extension
    for token in words:
        token._.syllables = count_syllables(token.text)

    # Readability: complex words are those with more than two syllables
    avg_sentence_length = word_count / sentence_count if sentence_count else 0
    complex_word_count = sum(1 for token in words if token._.syllables > 2)
    percentage_complex_words = (complex_word_count / word_count) * 100 if word_count else 0
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)

    # Count first-person pronouns; the PRON tag check keeps other uses of
    # the same spelling from being counted.
    first_person_forms = {'i', 'me', 'my', 'mine', 'we', 'us', 'our', 'ours'}
    personal_pronouns = sum(
        1 for token in doc
        if token.pos_ == 'PRON' and token.text.lower() in first_person_forms
    )

    syllable_per_word = sum(token._.syllables for token in words) / word_count if word_count else 0

    return word_count, sentence_count, unique_words, avg_word_length, \
           polarity_score, subjectivity_score, avg_sentence_length, percentage_complex_words, \
           fog_index, complex_word_count, syllable_per_word, personal_pronouns

# Function to count syllables in a word
def count_syllables(word):
    """Estimate the number of syllables in an English word.

    Heuristic: every maximal run of vowels (``aeiouy``) is one syllable, and
    a trailing silent "e" (as in "make") is not counted. The trailing "e" is
    kept when it follows another vowel ("agree"), because it was never
    counted separately, and when the word ends in consonant + "le"
    ("table"), where it forms its own syllable.

    Parameters
    ----------
    word : str
        Word to analyse; case and leading/trailing ``.:;?!`` are ignored.

    Returns
    -------
    int
        Estimated syllable count: at least 1 for any non-empty word, 0 when
        nothing is left after stripping punctuation.
    """
    vowels = 'aeiouy'
    word = word.lower().strip(".:;?!")

    # Nothing left to count (previously raised IndexError on word[0])
    if not word:
        return 0

    # One syllable per run of consecutive vowels
    count = 0
    previous_is_vowel = False
    for char in word:
        is_vowel = char in vowels
        if is_vowel and not previous_is_vowel:
            count += 1
        previous_is_vowel = is_vowel

    # Drop a silent final "e", but only if it was counted as its own syllable
    if len(word) > 1 and word.endswith('e') and word[-2] not in vowels:
        consonant_le = word.endswith('le') and len(word) > 2 and word[-3] not in vowels
        if not consonant_le:
            count -= 1

    # Every real word has at least one syllable
    return max(count, 1)

# Read the output structure file; rows are keyed by their 'URL_ID' column
output_structure_file = "Output Data Structure.xlsx"
output_df = pd.read_excel(output_structure_file)

# The sentiment dictionary does not depend on the article, so build it once
# instead of once per row.
sentiment_dict = create_sentiment_dictionary()

# Iterate through the extracted text files
for index, row in output_df.iterrows():
    url_id = row['URL_ID']
    file_path = f"{url_id}.txt"

    try:
        # Read the content of the text file
        with open(file_path, 'r', encoding='utf-8') as file:
            article_content = file.read()

        # Clean text using stop words
        cleaned_content = clean_text(article_content)

        # Perform textual analysis on the cleaned text
        word_count, sentence_count, unique_words, avg_word_length, \
        polarity_score, subjectivity_score, avg_sentence_length, percentage_complex_words, \
        fog_index, complex_word_count, syllable_per_word, _ = analyze_text(cleaned_content)

        # Personal pronouns must be counted on the ORIGINAL text: NLTK's stop
        # word list contains i/me/my/we/our/ours, so after cleaning almost
        # none are left. This costs a second analysis pass per article.
        personal_pronouns = analyze_text(article_content)[-1]

        # Extract derived variables
        derived_variable_1, derived_variable_2 = extract_derived_variables(cleaned_content)

        # Update the output DataFrame with computed variables
        computed = {
            'Word_Count': word_count,
            'Sentence_Count': sentence_count,
            'Unique_Words': unique_words,
            'Avg_Word_Length': avg_word_length,
            'Derived_Variable_1': derived_variable_1,
            'Derived_Variable_2': derived_variable_2,
            'Polarity_Score': polarity_score,
            'Subjectivity_Score': subjectivity_score,
            'Avg_Sentence_Length': avg_sentence_length,
            'Percentage_Complex_Words': percentage_complex_words,
            'Fog_Index': fog_index,
            'Complex_Word_Count': complex_word_count,
            'Syllable_Per_Word': syllable_per_word,
            'Personal_Pronouns': personal_pronouns,
        }
        for column, value in computed.items():
            output_df.at[index, column] = value

        print(f"Text analysis completed for {file_path}")

    except Exception as e:
        # Best-effort: report the failing file and continue with the next row
        print(f"Error analyzing text for {file_path}: {e}")

# Save the updated output DataFrame to a new Excel file
output_df.to_excel("text_analysis_output.xlsx", index=False)
print("Text analysis results saved to text_analysis_output.xlsx")


Text analysis completed for blackassign0001.txt
Text analysis completed for blackassign0002.txt
Text analysis completed for blackassign0003.txt
Text analysis completed for blackassign0004.txt
Text analysis completed for blackassign0005.txt
Text analysis completed for blackassign0006.txt
Text analysis completed for blackassign0007.txt
Text analysis completed for blackassign0008.txt
Text analysis completed for blackassign0009.txt
Text analysis completed for blackassign0010.txt
Text analysis completed for blackassign0011.txt
Text analysis completed for blackassign0012.txt
Text analysis completed for blackassign0013.txt
Text analysis completed for blackassign0014.txt
Text analysis completed for blackassign0015.txt
Text analysis completed for blackassign0016.txt
Text analysis completed for blackassign0017.txt
Text analysis completed for blackassign0018.txt
Text analysis completed for blackassign0019.txt
Text analysis completed for blackassign0020.txt
Text analysis completed for blackassign0