<a href="https://colab.research.google.com/github/ilakshmiteja/Text_Preprocessing/blob/main/Single_Function_Text_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import regex as re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer
snbstemmer = SnowballStemmer('english')
lemmat = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
doc_1 = '''I’m a <b>mother pheasant plucker</b>, I pluck mother pheasants.
I’m the most pleasant mother pheasant plucker to ever pluck a mother pheasant.
I’m not the pheasant plucker I’m the pheasant plucker’s wife, I’ve been plucking Mother pheasants my whole pheasant plucking life.
I’m not the pheasant plucker I’m the pheasant plucker’s mate, I’m only plucking Pheasants ’cause the pheasant plucker’s late.'''

doc_2 = '''Peter 1324-3434 Piper picked a peck of pickled peppers.
A peck of pickled peppers Peter Piper picked.
If Peter Piper picked a peck of pickled peppers?
Where’s the peck of pickled peppers Peter Piper picked?'''

doc_3 = '''Silly Sally swiftly shooed seven silly sheep.
The seven silly sheep Silly Sally shooed
Shilly-shallied south.
These sheep shouldn’t sleep in a shack;
Sheep should sleep in a shed.'''

doc_4 = '''All www.google.com I want is a proper cup of coffee,
Made in a proper copper coffee pot
I may be off my dot
But I want a cup of coffee
From a proper coffee pot.'''

doc_5 = '''How much wood would a woodchuck chuck if a woodchuck could chuck wood?
He would chuck, he would, as much as he could, and chuck as much wood,
as a woodchuck would if a woodchuck could chuck wood.'''

In [3]:
text_df_original = pd.DataFrame({'docs':[doc_1,doc_2,doc_3,doc_4,doc_5]})

In [4]:
text_df  = text_df_original.copy()

In [5]:
def text_clean(docs,stem_or_lem):
  # converting to uniform case - preferrably lower
  case_lower = docs.lower()

  # removing html tags
  html_removed = re.sub(r'<.*?>',' ',case_lower)

  # removing urls
  urls_removed = re.sub(r'https?://\S+|www\.S\+',' ',html_removed)

  # removing newline characters
  newline_removed = re.sub(r'\n',' ',urls_removed)

  # removing special characters
  special_removed = re.sub(r'[^a-zA-Z]',' ',newline_removed)

  # removing stopwords
  cleaned_docs = [words for words in special_removed.split() if words not in stopwords.words("english")]

  # Applying Stemming or Lemmatization
  if stem_or_lem.lower() == 'stem':
    final_doc = [snbstemmer.stem(ele) for ele in cleaned_docs]
  else:
    final_doc = [lemmat.lemmatize(ele) for ele in cleaned_docs]

  return ' '.join(final_doc)


In [6]:
text_df['text_stemmer'] = text_df['docs'].apply(text_clean,stem_or_lem = 'stem')

In [7]:
text_df['text_lemmatizer'] = text_df['docs'].apply(text_clean,stem_or_lem = 'lemmatize')

In [8]:
text_df

Unnamed: 0,docs,text_stemmer,text_lemmatizer
0,"I’m a <b>mother pheasant plucker</b>, I pluck ...",mother pheasant plucker pluck mother pheasant ...,mother pheasant plucker pluck mother pheasant ...
1,Peter 1324-3434 Piper picked a peck of pickled...,peter piper pick peck pickl pepper peck pickl ...,peter piper picked peck pickled pepper peck pi...
2,Silly Sally swiftly shooed seven silly sheep.\...,silli salli swift shoo seven silli sheep seven...,silly sally swiftly shooed seven silly sheep s...
3,All www.google.com I want is a proper cup of c...,www googl com want proper cup coffe made prope...,www google com want proper cup coffee made pro...
4,How much wood would a woodchuck chuck if a woo...,much wood would woodchuck chuck woodchuck coul...,much wood would woodchuck chuck woodchuck coul...
