<a href="https://colab.research.google.com/github/kunalnischal7/CustomerChurnPrediction/blob/main/DataExtractionandNLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Extraction

### Installing and Importing Libraries

In [1]:
import tensorflow as tf
import pandas as pd
import sklearn

In [12]:
import requests
from bs4 import BeautifulSoup
import os

## Fetch the webpages

In [2]:
from google.colab import drive
# Mount Google Drive at /content/drive so later cells can read files stored there.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
# Make the Drive root the working directory; the relative file names used in later cells resolve against it.
%cd "/content/drive/My Drive/"

/content/drive/My Drive


In [8]:
# Input workbook, given relative to the current working directory; it is read with pd.read_excel in the next cell.
file_path = 'Input.xlsx'

In [9]:
# Load the input workbook into a DataFrame; later cells read its 'URL' and 'URL_ID' columns.
df = pd.read_excel(file_path)

In [10]:
# Series of page addresses taken from the 'URL' column; the scraping loop iterates over it.
urls = df['URL']

In [19]:

# Directory that receives one text file per successfully scraped article.
OUTPUT_DIR = 'article_texts'

# Seconds to wait for a server before giving up on a URL. Without a timeout,
# requests waits indefinitely and a single stalled host hangs the whole cell.
REQUEST_TIMEOUT_SECONDS = 30

# exist_ok avoids the check-then-create race and makes the cell safe to re-run.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Texts of the articles that were fetched successfully, in fetch order.
# NOTE(review): URLs that fail are skipped, so positions in this list do not
# necessarily line up with the rows of `df`.
article_texts = []

# Pair each URL with its URL_ID by position. Looking the ID up with
# df.at[counter, 'URL_ID'] is only correct while `df` has a default 0..n-1 index.
# A missing 'URL_ID' column now fails here, once, instead of once per URL.
for url_id, url in zip(df['URL_ID'], urls):
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)

        if response.status_code != 200:
            print(f"Failed to retrieve the webpage at URL: {url}. Status code: {response.status_code}")
            continue  # Skip to the next URL in case of an error

        # Parse the HTML and concatenate the text of every <p> element,
        # one paragraph per line (modify as per your HTML structure).
        soup = BeautifulSoup(response.text, 'html.parser')
        article_text = ''.join(paragraph.text + '\n' for paragraph in soup.find_all('p'))

        # Keep the text in memory for the analysis cells further down.
        article_texts.append(article_text)

        # Persist the text to <OUTPUT_DIR>/<URL_ID>.txt.
        with open(os.path.join(OUTPUT_DIR, f'{url_id}.txt'), 'w', encoding='utf-8') as file:
            file.write(article_text)

        print(f"Saved article from URL: {url} with URL_ID: {url_id}")
    except Exception as e:
        # Best effort: report the failure (network error, timeout, bad URL,
        # file-system error) and carry on with the remaining URLs.
        print(f"Error processing URL: {url} - {str(e)}")

print("Extraction and saving completed.")

# Now, the article_texts list contains the extracted article texts, and you can proceed with your analysis.


Saved article from URL: https://insights.blackcoffer.com/rise-of-telemedicine-and-its-impact-on-livelihood-by-2040-3-2/ with URL_ID: 123.0
Saved article from URL: https://insights.blackcoffer.com/rise-of-e-health-and-its-impact-on-humans-by-the-year-2030/ with URL_ID: 321.0
Saved article from URL: https://insights.blackcoffer.com/rise-of-e-health-and-its-imapct-on-humans-by-the-year-2030-2/ with URL_ID: 2345.0
Saved article from URL: https://insights.blackcoffer.com/rise-of-telemedicine-and-its-impact-on-livelihood-by-2040-2/ with URL_ID: 4321.0
Saved article from URL: https://insights.blackcoffer.com/rise-of-telemedicine-and-its-impact-on-livelihood-by-2040-2-2/ with URL_ID: 432.0
Saved article from URL: https://insights.blackcoffer.com/rise-of-chatbots-and-its-impact-on-customer-support-by-the-year-2040/ with URL_ID: 2893.8
Saved article from URL: https://insights.blackcoffer.com/rise-of-e-health-and-its-imapct-on-humans-by-the-year-2030/ with URL_ID: 3355.6
Saved article from URL: h

In [20]:
# Load 'Output Data Structure.xlsx' (presumably the expected layout of the results — confirm).
# NOTE(review): `output_structure` is not referenced by any later cell shown here; check whether it is still needed.
output_structure = pd.read_excel('Output Data Structure.xlsx')

In [21]:
# Empty results table: one column per text metric, no rows yet.
metric_columns = [
    'POSITIVE SCORE',
    'NEGATIVE SCORE',
    'POLARITY SCORE',
    'SUBJECTIVITY SCORE',
    'AVG SENTENCE LENGTH',
    'PERCENTAGE OF COMPLEX WORDS',
    'FOG INDEX',
    'AVG NUMBER OF WORDS PER SENTENCE',
    'COMPLEX WORD COUNT',
    'WORD COUNT',
    'SYLLABLE PER WORD',
    'PERSONAL PRONOUNS',
    'AVG WORD LENGTH',
]
result_df = pd.DataFrame(columns=metric_columns)

In [17]:
!pip install textstat

Collecting textstat
  Downloading textstat-0.7.3-py3-none-any.whl (105 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.1/105.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyphen (from textstat)
  Downloading pyphen-0.14.0-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyphen, textstat
Successfully installed pyphen-0.14.0 textstat-0.7.3


In [23]:
import re

# Tokenise on word boundaries, which leaves punctuation out of the tokens.
word_pattern = re.compile(r'\b\w+\b')
words = word_pattern.findall(article_text)

# Number of tokens found.
word_count = len(words)

In [25]:
import textstat

# Gunning Fog readability index of `article_text`, as computed by textstat.
# NOTE(review): `article_text` is not defined in this cell; it presumably still holds the last
# article handled by the scraping loop — confirm on a fresh kernel.
fog_index = textstat.gunning_fog(article_text)

In [28]:
import nltk

# Download NLTK's "punkt" resource (presumably required by the nltk.sent_tokenize /
# nltk.word_tokenize calls in later cells — confirm).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [29]:
import nltk

# Split the article into sentences with NLTK's sentence tokenizer.
sentences = nltk.sent_tokenize(article_text)

# Mean number of whitespace-separated words per sentence. A text that yields no
# sentences (e.g. an empty scrape) gives 0.0 instead of raising ZeroDivisionError.
sentence_word_counts = [len(sentence.split()) for sentence in sentences]
avg_sentence_length = (
    sum(sentence_word_counts) / len(sentence_word_counts) if sentence_word_counts else 0.0
)

In [33]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Download the 'vader_lexicon' resource (presumably the data SentimentIntensityAnalyzer loads — confirm).
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [34]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Build the VADER analyzer and score the article text.
analyzer = SentimentIntensityAnalyzer()
sentiment_scores = analyzer.polarity_scores(article_text)

# Pull the positive and negative components out of the score mapping.
positive_score, negative_score = (sentiment_scores[key] for key in ('pos', 'neg'))

In [36]:
# Score the article again and keep the compound value as the polarity score.
sentiment_scores = analyzer.polarity_scores(article_text)
polarity_score = sentiment_scores['compound']

# Label the article: below -0.05 is negative, above 0.05 is positive,
# anything in between is neutral.
if polarity_score < -0.05:
    sentiment = 'Negative'
elif polarity_score > 0.05:
    sentiment = 'Positive'
else:
    sentiment = 'Neutral'

# Report both values.
print(f"Polarity Score: {polarity_score}")
print(f"Sentiment: {sentiment}")

Polarity Score: 0.9994
Sentiment: Positive


In [43]:
from textblob import TextBlob

# Wrap the article in a TextBlob and read the subjectivity component
# (ranges from 0 to 1) of its sentiment.
blob = TextBlob(article_text)
sentiment_summary = blob.sentiment
subjectivity_score = sentiment_summary.subjectivity

# Show the score.
print(f"Subjectivity Score: {subjectivity_score}")

Subjectivity Score: 0.4334609379691348


In [42]:
# Whitespace-separated tokens of the article.
words = article_text.split()

# A word counts as complex when it has at least this many syllables.
# Defined once, outside the counting pass, because it never changes.
complexity_threshold = 3

# Count the tokens whose syllable count reaches the threshold.
complex_word_count = sum(
    1 for word in words if textstat.syllable_count(word) >= complexity_threshold
)

# Share of complex words as a percentage. An empty text gives 0.0 instead of
# raising ZeroDivisionError.
total_words = len(words)
percentage_of_complex_words = (
    (complex_word_count / total_words) * 100 if total_words else 0.0
)

# Print the result
print(f"Percentage of Complex Words: {percentage_of_complex_words:.2f}%")

Percentage of Complex Words: 16.18%


In [46]:
# Treat every '.' as a sentence boundary (this replaces the NLTK-based split).
sentences = article_text.split('.')

# Average number of whitespace-separated words per segment.
sentence_lengths = [len(sentence.split()) for sentence in sentences]
avg_sentence_length = sum(sentence_lengths) / len(sentences)

In [50]:
# Token count of each sentence, using NLTK's word tokenizer.
words_per_sentence = list(map(len, map(nltk.word_tokenize, sentences)))

# Mean token count across all sentences.
avg_words_per_sentence = sum(words_per_sentence) / len(sentences)

# Show the average with two decimals.
print(f"Average Words Per Sentence: {avg_words_per_sentence:.2f}")

Average Words Per Sentence: 22.47


In [54]:
# Use one token list for both the syllable total and the divisor. Dividing by the
# `word_count` left behind by another cell mixes two tokenizations (regex words vs.
# whitespace tokens) and makes the result depend on which cell ran last.
tokens = article_text.split()
total_syllables = sum(textstat.syllable_count(word) for word in tokens)

# Average syllables per token; 0.0 for an empty text instead of ZeroDivisionError.
syllable_per_word = (total_syllables / len(tokens)) if tokens else 0.0

In [56]:
import string

# First-person singular pronouns that are counted.
PERSONAL_PRONOUNS = frozenset({'i', 'me', 'my', 'mine', 'myself'})

# Whitespace splitting leaves punctuation attached ("me," / "I."), so strip ASCII
# punctuation from both ends of each token before comparing; otherwise such
# occurrences are silently missed.
personal_pronouns = sum(
    1
    for word in article_text.split()
    if word.strip(string.punctuation).lower() in PERSONAL_PRONOUNS
)

In [57]:
# Average characters per whitespace token. The divisor is the number of tokens
# actually measured, not the `word_count` left behind by another cell (which is a
# regex count or a split count depending on which cell ran last).
tokens = article_text.split()
avg_word_length = (sum(len(word) for word in tokens) / len(tokens)) if tokens else 0.0

In [58]:
import string

import nltk
import textstat  # syllable counts and the Gunning Fog index
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob

# Output columns, in spreadsheet order.
RESULT_COLUMNS = [
    'POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE', 'SUBJECTIVITY SCORE',
    'AVG SENTENCE LENGTH', 'PERCENTAGE OF COMPLEX WORDS', 'FOG INDEX',
    'AVG NUMBER OF WORDS PER SENTENCE', 'COMPLEX WORD COUNT', 'WORD COUNT',
    'SYLLABLE PER WORD', 'PERSONAL PRONOUNS', 'AVG WORD LENGTH',
]

# A word counts as complex when it has at least this many syllables.
COMPLEX_WORD_MIN_SYLLABLES = 3

# First-person singular pronouns that are counted.
PERSONAL_PRONOUNS = frozenset({'i', 'me', 'my', 'mine', 'myself'})


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def compute_text_metrics(article_text, analyzer):
    """Compute every output metric for a single article.

    Args:
        article_text: Plain text of one article.
        analyzer: VADER ``SentimentIntensityAnalyzer`` used for the positive,
            negative and compound (polarity) scores.

    Returns:
        dict mapping each name in ``RESULT_COLUMNS`` to its value for this article.
        Ratios whose divisor would be zero (empty text) are reported as 0.0.
    """
    # Whitespace tokens are the unit for the word-level metrics.
    words = article_text.split()
    word_count = len(words)

    # Sentences are the '.'-separated segments; blank segments (such as the one
    # after the final full stop) are dropped so they do not deflate the averages.
    sentences = [segment for segment in article_text.split('.') if segment.strip()]
    sentence_count = len(sentences)

    syllables_per_token = [textstat.syllable_count(word) for word in words]
    complex_word_count = sum(
        1 for count in syllables_per_token if count >= COMPLEX_WORD_MIN_SYLLABLES
    )

    sentiment_scores = analyzer.polarity_scores(article_text)

    # Strip ASCII punctuation so "me," or "I." still count as pronouns.
    personal_pronouns = sum(
        1 for word in words
        if word.strip(string.punctuation).lower() in PERSONAL_PRONOUNS
    )

    return {
        'POSITIVE SCORE': sentiment_scores['pos'],
        'NEGATIVE SCORE': sentiment_scores['neg'],
        'POLARITY SCORE': sentiment_scores['compound'],
        'SUBJECTIVITY SCORE': TextBlob(article_text).sentiment.subjectivity,
        'AVG SENTENCE LENGTH': _safe_ratio(
            sum(len(sentence.split()) for sentence in sentences), sentence_count
        ),
        'PERCENTAGE OF COMPLEX WORDS': _safe_ratio(complex_word_count, word_count) * 100,
        # textstat.gunning_fog is the Gunning Fog index (not Flesch-Kincaid).
        'FOG INDEX': textstat.gunning_fog(article_text),
        'AVG NUMBER OF WORDS PER SENTENCE': _safe_ratio(
            sum(len(nltk.word_tokenize(sentence)) for sentence in sentences), sentence_count
        ),
        'COMPLEX WORD COUNT': complex_word_count,
        'WORD COUNT': word_count,
        'SYLLABLE PER WORD': _safe_ratio(sum(syllables_per_token), word_count),
        'PERSONAL PRONOUNS': personal_pronouns,
        'AVG WORD LENGTH': _safe_ratio(sum(len(word) for word in words), word_count),
    }


# Every metric is computed per article. Reusing the module-level values left behind
# by the exploratory cells would repeat the last article's sentiment, complexity,
# syllable and pronoun figures on every row.
analyzer = SentimentIntensityAnalyzer()
records = [compute_text_metrics(text, analyzer) for text in article_texts]

# Build the table in one step instead of growing it row by row.
# NOTE(review): rows follow the order of `article_texts` and carry no URL or URL_ID,
# so they cannot be mapped back to the input sheet if any URL was skipped.
result_df = pd.DataFrame(records, columns=RESULT_COLUMNS)

# Save the results to an Excel file
result_df.to_excel('output_results.xlsx', index=False)
