# **Handling Text Data**

###  **Aim:** Program to perform stemming, lemmatization, and find synonyms and antonyms of words.

In [35]:
# Importing the required libraries
import pandas as pd
import nltk
import re
import string
import requests
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer,WordNetLemmatizer
from nltk.corpus import wordnet,stopwords
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [36]:
# Scraping a paragraph from wikipedia
url='https://en.wikipedia.org/wiki/Natural_disaster'
r=requests.get(url)
soup=BeautifulSoup(r.content,'html.parser')
content=soup.find('div',{'id':'mw-content-text'})
paragraphs=content.find_all('p')
with open('natural_disaster.txt','w') as file:
  for paragraph in paragraphs:
    file.write(paragraph.get_text())

In [22]:
# Preprocessing the text
def preprocess(text):
  # Converting to lower case
  text=text.lower()
  # Removing digits and special characters
  text=re.sub(r'[^\w\s]','',text)
  text=re.sub(r'[^a-zA-Z\s]','',text)
  text=re.sub(r'\d+','',text)
  # Tokenizing the words
  words=text.split()
  stop_words=set(stopwords.words('english'))
  filtered_words=[word for word in words if word not in stop_words]
  preprocessed_text=' '.join(filtered_words)
  return preprocessed_text

In [23]:
# Applying the function to the scraped text
with open('natural_disaster.txt','r') as file:
  text=file.read()
  preprocessed_text=preprocess(text)

In [24]:
# Saving the preprocessed text
with open('preprocessed_text.txt','w') as file:
  file.write(preprocessed_text)

In [25]:
# Displaying the preprocessed text
with open('preprocessed_text.txt','r') as file:
  text=file.read()
  print(text)



In [37]:
# Reading the processed text file
with open('preprocessed_text.txt','r') as file:
  text=file.read()

# Tokenizing the text into words and keep only unique words
unique_words= set(word_tokenize(text))

# Initiliazing stemmer and lemmatizer
stemmer=PorterStemmer()
lemmatizer=WordNetLemmatizer()

# To find synonyms and antonyms of a word
def synonyms_antonyms(words):
  synonyms=set()
  antonyms=set()
  for syn in wordnet.synsets(word):
    for lemma in syn.lemmas():
      synonymns.add(lemma.name())
      if lemma.antonyms():
        antonyms.add(lemma.antonyms()[0].name())
  return synonyms,antonyms

In [38]:
# Creating a df to display the results
word_list = []
lemmatized_list = []
stemmed_list = []
antonyms_list = []
synonyms_list = []
# Process each word
for word in unique_words:
    # Lemmatize the word
    lemmatized_word = lemmatizer.lemmatize(word)

    # Stem the word
    stemmed_word = stemmer.stem(word)

    # Get synonyms and antonyms of the word
    synonyms, antonyms = get_synonyms_antonyms(word)

    # Append data to lists
    word_list.append(word)
    lemmatized_list.append(lemmatized_word)
    stemmed_list.append(stemmed_word)
    antonyms_list.append(antonyms)
    synonyms_list.append(synonyms)

# Create DataFrame
df = pd.DataFrame({
    'Word': word_list,
    'Lemmatized': lemmatized_list,
    'Stemmed': stemmed_list,
    'Antonyms': antonyms_list,
    'Synonyms': synonyms_list
})

In [40]:
# Displaying the dataframe
df.head(20)

Unnamed: 0,Word,Lemmatized,Stemmed,Antonyms,Synonyms
0,could,could,could,{},{}
1,touches,touch,touch,{},"{skin_senses, meet, disturb, touch_modality, t..."
2,manifest,manifest,manifest,{},"{attest, evidence, evident, demonstrate, certi..."
3,driving,driving,drive,{attract},"{tug, repel, driving, get, impulsive, take, dr..."
4,sufficient,sufficient,suffici,{insufficient},{sufficient}
5,citizens,citizen,citizen,{noncitizen},{citizen}
6,psychological,psychological,psycholog,{},{psychological}
7,infants,infant,infant,{},"{infant, babe, baby}"
8,cope,cope,cope,{},"{make_out, make_do, cope, coping, manage, get_..."
9,effect,effect,effect,{},"{outcome, consequence, upshot, essence, burden..."


## **Conclusion**
The paragraph retrieved from Wikipedia underwent preprocessing. During this process, the words were transformed into their root forms, and their respective synonyms and antonyms were identified. These results were then presented using a DataFrame