In [None]:
import pandas as pd
#to display the full text on the notebook without truncation
pd.set_option('display.max_colwidth', 150)



In [None]:

docs_df = pd.DataFrame([ ['d0', 'The sun sets in the west, painting the sky with hues of orange and pink filled.'],
                        ['d1', 'Birds chirped merrily in the lush green forest, creating a symphony of nature.'],
                        ['d2', 'The aroma of freshly brewed coffee filled the cozy cafe, welcoming patrons with its warmth.'],
                        ['d3', 'Waves crashed against the rocky shore, sending sprays of salty water into the air.'],
                        ['d4', 'Laughter echoed through the halls as friends gathered around a crackling bonfire on a starry night painting filled.'],
                        ['d5', 'The quick brown foxes are jumping over the lazy dogs.'],
                        ['d6', 'Cats running was better than the runners running']],
                        columns=["docno", "raw_text"])

In [None]:
# Cleaning functions for English

import re
from snowballstemmer import stemmer
from sklearn.feature_extraction import _stop_words as stp


In [None]:
stp.ENGLISH_STOP_WORDS

frozenset({'a',
           'about',
           'above',
           'across',
           'after',
           'afterwards',
           'again',
           'against',
           'all',
           'almost',
           'alone',
           'along',
           'already',
           'also',
           'although',
           'always',
           'am',
           'among',
           'amongst',
           'amoungst',
           'amount',
           'an',
           'and',
           'another',
           'any',
           'anyhow',
           'anyone',
           'anything',
           'anyway',
           'anywhere',
           'are',
           'around',
           'as',
           'at',
           'back',
           'be',
           'became',
           'because',
           'become',
           'becomes',
           'becoming',
           'been',
           'before',
           'beforehand',
           'behind',
           'being',
           'below',
           'beside',
           'besides'

In [None]:
################################## Removing Stop Words function ########################################################

def remove_stopWords(sentence):
    terms=[]
    stopWords= set(stp.ENGLISH_STOP_WORDS)
    for term in sentence.split() :
        if term not in stopWords :
           terms.append(term)
    return " ".join(terms)


################################## Normalizing function ########################################################


def normalize(text):
  # folding text (lower case)
  lower_string = text.lower()
  # Remove punctuation and numbers
  cleaned_string = re.sub(r'[^a-zA-Z\s]', '', lower_string)
  normalized_string = ' '.join(cleaned_string.split())
  return(normalized_string)


################################### Stemming Function ###########################################################

stemmerObj = stemmer("english")  # Use "english" or another supported language

def stem(sentence):
    return " ".join([stemmerObj.stemWord(i) for i in sentence.split()])


################################### Lemmatization Function ###########################################################

import spacy

# Load the English language model
nlp = spacy.load("en_core_web_sm")

def lemmatize_text(text):
    # Process the text using spaCy
    doc = nlp(text)

    # Lemmatize the text and remove stopwords and non-alphabetic characters
    #cleaned_text = ' '.join([token.lemma_.lower() for token in doc if not token.is_stop and token.is_alpha])

     # Lemmatize the text
    cleaned_text = ' '.join([token.lemma_.lower() for token in doc])

    return cleaned_text


################################################## test the functions ######################################

docs_df["nostop_text"]=docs_df["raw_text"].apply(remove_stopWords)
docs_df["NOR_text"]=docs_df["raw_text"].apply(normalize)
docs_df['text_STEM']=docs_df['raw_text'].apply(stem)
docs_df['text_lemm']=docs_df['raw_text'].apply(lemmatize_text)


display(docs_df)


Unnamed: 0,docno,raw_text,nostop_text,NOR_text,text_STEM,text_lemm
0,d0,"The sun sets in the west, painting the sky with hues of orange and pink filled.","The sun sets west, painting sky hues orange pink filled.",the sun sets in the west painting the sky with hues of orange and pink filled,"The sun set in the west, paint the sky with hue of orang and pink filled.","the sun set in the west , paint the sky with hue of orange and pink fill ."
1,d1,"Birds chirped merrily in the lush green forest, creating a symphony of nature.","Birds chirped merrily lush green forest, creating symphony nature.",birds chirped merrily in the lush green forest creating a symphony of nature,"Bird chirp merrili in the lush green forest, creat a symphoni of nature.","bird chirp merrily in the lush green forest , create a symphony of nature ."
2,d2,"The aroma of freshly brewed coffee filled the cozy cafe, welcoming patrons with its warmth.","The aroma freshly brewed coffee filled cozy cafe, welcoming patrons warmth.",the aroma of freshly brewed coffee filled the cozy cafe welcoming patrons with its warmth,"The aroma of fresh brew coffe fill the cozi cafe, welcom patron with it warmth.","the aroma of freshly brew coffee fill the cozy cafe , welcome patron with its warmth ."
3,d3,"Waves crashed against the rocky shore, sending sprays of salty water into the air.","Waves crashed rocky shore, sending sprays salty water air.",waves crashed against the rocky shore sending sprays of salty water into the air,"Wave crash against the rocki shore, send spray of salti water into the air.","wave crash against the rocky shore , send spray of salty water into the air ."
4,d4,Laughter echoed through the halls as friends gathered around a crackling bonfire on a starry night painting filled.,Laughter echoed halls friends gathered crackling bonfire starry night painting filled.,laughter echoed through the halls as friends gathered around a crackling bonfire on a starry night painting filled,Laughter echo through the hall as friend gather around a crackl bonfir on a starri night paint filled.,laughter echo through the hall as friend gather around a crackling bonfire on a starry night painting fill .
5,d5,The quick brown foxes are jumping over the lazy dogs.,The quick brown foxes jumping lazy dogs.,the quick brown foxes are jumping over the lazy dogs,The quick brown fox are jump over the lazi dogs.,the quick brown fox be jump over the lazy dog .
6,d6,Cats running was better than the runners running,Cats running better runners running,cats running was better than the runners running,Cat run was better than the runner run,cat run be well than the runner run


In [None]:
### Cleaning the data using stem
docs_df["Final_text"]=docs_df["raw_text"].apply(remove_stopWords)
docs_df["Final_text"]=docs_df["Final_text"].apply(normalize)
docs_df['Final_text']=docs_df['Final_text'].apply(stem)
docs_df[['raw_text','Final_text']]

Unnamed: 0,raw_text,Final_text
0,"The sun sets in the west, painting the sky with hues of orange and pink filled.",the sun set west paint sky hue orang pink fill
1,"Birds chirped merrily in the lush green forest, creating a symphony of nature.",bird chirp merrili lush green forest creat symphoni natur
2,"The aroma of freshly brewed coffee filled the cozy cafe, welcoming patrons with its warmth.",the aroma fresh brew coffe fill cozi cafe welcom patron warmth
3,"Waves crashed against the rocky shore, sending sprays of salty water into the air.",wave crash rocki shore send spray salti water air
4,Laughter echoed through the halls as friends gathered around a crackling bonfire on a starry night painting filled.,laughter echo hall friend gather crackl bonfir starri night paint fill
5,The quick brown foxes are jumping over the lazy dogs.,the quick brown fox jump lazi dog
6,Cats running was better than the runners running,cat run better runner run


In [None]:
#cleaning data using lemma
docs_df["lemma_Final_text"]=docs_df["raw_text"].apply(remove_stopWords)
docs_df["lemma_Final_text"]=docs_df["lemma_Final_text"].apply(normalize)
docs_df['lemma_Final_text']=docs_df['lemma_Final_text'].apply(lemmatize_text)
docs_df[['raw_text','lemma_Final_text']]

Unnamed: 0,raw_text,lemma_Final_text
0,"The sun sets in the west, painting the sky with hues of orange and pink filled.",the sun set west paint sky hue orange pink fill
1,"Birds chirped merrily in the lush green forest, creating a symphony of nature.",bird chirp merrily lush green forest create symphony nature
2,"The aroma of freshly brewed coffee filled the cozy cafe, welcoming patrons with its warmth.",the aroma freshly brew coffee fill cozy cafe welcome patron warmth
3,"Waves crashed against the rocky shore, sending sprays of salty water into the air.",wave crash rocky shore send spray salty water air
4,Laughter echoed through the halls as friends gathered around a crackling bonfire on a starry night painting filled.,laughter echo hall friend gather crackle bonfire starry night painting fill
5,The quick brown foxes are jumping over the lazy dogs.,the quick brown fox jump lazy dog
6,Cats running was better than the runners running,cat run well runner run


**Stemming** reduces words to their root or base form, often resulting in non-linguistic roots. **Lemmatization**, on the other hand, reduces words to their dictionary form (lemma), preserving the linguistic meaning. Stemming is faster but may not always produce valid words, whereas lemmatization is more accurate but can be slower.

In [15]:
def stem_preprocess(sentence):
  # apply preprocessing steps on the given sentence
  sentence =remove_stopWords(sentence)
  sentence =normalize(sentence)
  sentence =stem(sentence)
  return sentence


def lemma_preprocess(sentence):
  # apply preprocessing steps on the given sentence
  sentence =remove_stopWords(sentence)
  sentence =normalize(sentence)
  sentence =lemmatize_text(sentence)
  return sentence


docs_df["Fstem_text"]=docs_df["raw_text"].apply(stem_preprocess)
docs_df["Flemma_text"]=docs_df["raw_text"].apply(lemma_preprocess)


docs_df[['raw_text','Fstem_text','Flemma_text']]

Unnamed: 0,raw_text,Fstem_text,Flemma_text
0,"The sun sets in the west, painting the sky with hues of orange and pink filled.",the sun set west paint sky hue orang pink fill,the sun set west paint sky hue orange pink fill
1,"Birds chirped merrily in the lush green forest, creating a symphony of nature.",bird chirp merrili lush green forest creat symphoni natur,bird chirp merrily lush green forest create symphony nature
2,"The aroma of freshly brewed coffee filled the cozy cafe, welcoming patrons with its warmth.",the aroma fresh brew coffe fill cozi cafe welcom patron warmth,the aroma freshly brew coffee fill cozy cafe welcome patron warmth
3,"Waves crashed against the rocky shore, sending sprays of salty water into the air.",wave crash rocki shore send spray salti water air,wave crash rocky shore send spray salty water air
4,Laughter echoed through the halls as friends gathered around a crackling bonfire on a starry night painting filled.,laughter echo hall friend gather crackl bonfir starri night paint fill,laughter echo hall friend gather crackle bonfire starry night painting fill
5,The quick brown foxes are jumping over the lazy dogs.,the quick brown fox jump lazi dog,the quick brown fox jump lazy dog
6,Cats running was better than the runners running,cat run better runner run,cat run well runner run
