In [None]:
# Natural Language Processing - text data in form of documents(txt,pdf,doc,etc.),
# webpages, social media posts, text or transaction databases, etc.

# Text data is unstructured data - it has no fixed row/column structure and is
# usually unlabelled (as in unsupervised learning). Vector databases are one
# place where text data is stored.

# Biggest Challenge in text data is cleaning and preprocessing. Remove digits,
# punctuations, special characters, white spaces, emojis, hyper links,
# images, tables, etc.
# Regular Expression Patterns must be known for cleaning text. Most of these
# patterns are predefined. (re library)

# After preprocessing, text must be converted into a matrix of numbers; this step
# is called vectorization. Algorithms can be built only once the text is numeric.

# Model Building using ML Algorithms like Regression, Classification, Clustering
# Deep Learning Models(Neural Networks) like LSTM, GPT Models (Transformers or
# Encoder/Decoder Models).

# NLP Use Cases - Sentiment Analysis, Text Summarization, Auto Fill, ChatBots,
# Reviews Analysis , Generative AI(Prompt Engineering), Scraping/Extract, etc.

# Core library is nltk (natural language tool kit), other libraries are
# textblob, spacy, wordcloud, re, string, bs4, requests, etc.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import nltk # NLP Library
import re # Regular Expressions for pattern based cleaning
import requests # works with urls or html webpages

In [None]:
# Web Scraping - extracting text from web pages via their URLs.
# Only publicly accessible pages whose terms of use allow it should be scraped.
# Web pages are written in HTML, so an HTML parser must be used to extract text.

In [None]:
# Download the Wikipedia article on climate change.
# - A descriptive User-Agent identifies this client, as Wikimedia's User-Agent
#   policy asks scripts to do.
# - timeout stops the cell from hanging forever on a stalled connection.
# - raise_for_status() fails fast, so an HTTP error page is never parsed as if it
#   were the article.
climatewiki = requests.get(
    "https://en.wikipedia.org/wiki/Climate_change",
    headers={"User-Agent": "nlp-notebook/1.0 (educational web-scraping demo)"},
    timeout=30,
)
climatewiki.raise_for_status()

In [None]:
from bs4 import BeautifulSoup

In [None]:
# Parse the downloaded HTML bytes into a navigable BeautifulSoup tree.
# "lxml" names the parser BeautifulSoup should use (the open-source lxml library);
# "html5lib" is another parser that can be selected the same way.
climatedf = BeautifulSoup(climatewiki.content, features="lxml")

In [None]:
# Extract the text of the parsed page as one string.
# separator=" " keeps a space between neighbouring text nodes. With strip=True and
# the default empty separator, text from adjacent elements is glued together
# (e.g. "<h1>Climate</h1><p>change</p>" -> "Climatechange"), which corrupts the
# later sentence/word tokenization.
climatetxt = climatedf.get_text(separator=" ", strip=True)

In [None]:
# Preview only the first 1,000 characters; echoing the whole article floods the
# notebook output. Preprocessing must still be done to remove punctuation, digits,
# special characters, stopwords (is, a, an, the, etc.), etc.
climatetxt[:1000]



In [None]:
# The re (regular expressions) library is crucial for preprocessing.
# re.sub(), re.split(), re.findall(), etc. are predefined functions for text data
# (note: there is no re.replace(); substitution is done with re.sub()).

In [None]:
# Replace footnote references such as [5] with a space.
# \d+ matches one or more digits; the backslash is required - a bare "d+" only
# matches the literal letter "d", so [5] would never be removed.
climatetxt = re.sub(r'\[\d+\]', " ", climatetxt)

In [None]:
# Replace numbers in parentheses, e.g. (2024), with a space.
# The digit class needs its backslash (\d+); "d+" matches only the letter "d".
climatetxt = re.sub(r'\(\d+\)', " ", climatetxt)

In [None]:
# Replace a bracketed word such as [and] with a space.
# \w+ matches one or more word characters (letters, digits, underscore); without
# the backslash, "w+" matches only the literal letter "w".
climatetxt = re.sub(r'\[\w+\]', " ", climatetxt)

In [None]:
# Replace a parenthesised word such as (and) with a space.
# \w+ (with the backslash) matches word characters; "w+" matches only the letter "w".
climatetxt = re.sub(r'\(\w+\)', " ", climatetxt)

In [None]:
# Remove any text that starts with "(" or "[" and runs to the nearest following
# ")" or "]" (shortest match), e.g. "(2024)" or "[citation needed]", replacing it
# with a space.
# A raw string is used so "\(" and "\[" reach the regex engine unchanged instead of
# being read as invalid Python string escapes, which trigger a warning.
climatetxt = re.sub(r'[\(\[].*?[\)\]]', " ", climatetxt)

![image.png](attachment:934fa509-778d-4423-acb0-2efe1715085e.png)

In [None]:
# Drop possessive endings ('s), leaving a space in their place.
# The pattern has no regex metacharacters, so a plain string replace is equivalent.
climatetxt = climatetxt.replace("\'s", " ")

In [None]:
# Drop the temperature unit: a non-breaking space (\xa0) followed by "°C".
# The pattern is a plain literal, so a string replace does the same job.
climatetxt = climatetxt.replace("\xa0°C", " ")

In [None]:
# Replace every run of ASCII digits (0-9) with a single space.
climatetxt = re.compile("[0-9]+").sub(" ", climatetxt)

In [None]:
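# Tokenization - breaking text/content into either sentences or words.
# Sentence Tokenization - breaking content into sentences; boundaries are mainly
# sentence-ending punctuation such as the full stop.
# Word Tokenization - breaking content into words; the delimiter is mainly
# whitespace (nltk's word_tokenize also splits punctuation into separate tokens).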

In [None]:
from nltk.tokenize import sent_tokenize,word_tokenize

In [None]:
# Split the cleaned article text into sentences.
# NOTE(review): sent_tokenize relies on nltk's Punkt sentence models; on a fresh
# environment they presumably need a one-time nltk.download(...) first - confirm.
climate_sentence=sent_tokenize(climatetxt)

In [None]:
# Wrap the tokenized sentences in a one-column DataFrame ("sentence") so that
# per-sentence results can be stored alongside each sentence.
climate_sentence = pd.DataFrame(data=climate_sentence, columns=["sentence"])

In [None]:
# Sentiment Analysis is done using library "textblob"
# Text Blob based sentiment analysis generates 2 metrics
# 1) Polarity score is a float between -1 and 1. Polarity is calculated by
# comparing the text with lexicons/lists of positive, negative & neutral words.
# >0 is Positive, =0 is Neutral & <0 is Negative

# 2) Subjectivity lies between 0 and 1. Close to 1 indicates high personal
# opinion without factual information and close to 0 indicates low personal
# opinion with factual information

In [None]:
# !pip install textblob
from textblob import TextBlob

In [None]:
# Example 1: score a short opinion-only sentence and display its sentiment
# (the recorded output shows polarity=1.0, subjectivity=1.0).
s1=TextBlob("tendulkar is greatest batsman in cricket")
s1.sentiment

Sentiment(polarity=1.0, subjectivity=1.0)

In [None]:
# Example 2: a similar sentence that also states a fact (runs scored); the
# recorded output shows lower scores (polarity=0.8, subjectivity=0.75).
s2=TextBlob("tendulkar is great batsman in cricket who scored 25000 runs")
s2.sentiment

Sentiment(polarity=0.8, subjectivity=0.75)

In [None]:
def analyze_sentiment(text):
    """Label a piece of text by the sign of its TextBlob polarity score.

    Parameters
    ----------
    text : str
        The text to score.

    Returns
    -------
    str
        "Positive" when polarity > 0, "Neutral" when polarity == 0,
        and "Negative" in every other case.
    """
    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0:
        return "Positive"
    if polarity == 0:
        return "Neutral"
    return "Negative"

In [None]:
# Score every sentence and store its label in a new "sentiment" column.
climate_sentence["sentiment"] = list(
    map(lambda text: str(analyze_sentiment(text)), climate_sentence["sentence"])
)

In [None]:
# Count how many sentences received each sentiment label.
climate_sentence['sentiment'].value_counts()

sentiment
Neutral     1678
Positive     402
Negative     197
Name: count, dtype: int64