<a href="https://colab.research.google.com/github/joynaomi81/Text-Preprocessing-in-NLP/blob/main/Text_Preprocessing_in_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
df = pd.read_csv('/content/drive/MyDrive/text.csv')

In [7]:
df.head() # Check for the first 5 rows

Unnamed: 0,comment_text,toxic
0,This letter perfectly illustrates why any hope...,1
1,One muslim casualty vs the hundreds and thousa...,1
2,(fuck you Osama bin laden and your afghanistan...,1
3,As long as Trump keeps Stiggin' It to the libs...,1
4,This article is a load of crap.... Another Fa...,1


In [8]:
df.tail() # Check for the last 5 rows

Unnamed: 0,comment_text,toxic
19995,i like smiley pancakes and crap on stick,0
19996,"""\n\n""""żem"""" is not equal to """"że"""". """"żem"""" ...",0
19997,"""\n\n Headlines \n\nCan you please add this co...",0
19998,"Thank You, sorry.–",0
19999,Schooling \n\nI attended Harrison Trimble in M...,0


In [9]:
# Check for the columns lables
df.columns

Index(['comment_text', 'toxic'], dtype='object')

In [10]:
df.info() # Information about the data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   comment_text  20000 non-null  object
 1   toxic         20000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 312.6+ KB


In [11]:
df.describe() # Descriptive statistics of the dataset

Unnamed: 0,toxic
count,20000.0
mean,0.5
std,0.500013
min,0.0
25%,0.0
50%,0.5
75%,1.0
max,1.0


In [12]:
df.shape # Check for the shape of the dataset

(20000, 2)

In [13]:
# Check for unique values
df.nunique()

Unnamed: 0,0
comment_text,19960
toxic,2


# Data cleaning

In [14]:
#check for missing values
df.isna().sum()

Unnamed: 0,0
comment_text,0
toxic,0


In [15]:
# Checking for duplicate rows in the DataFrame
df.duplicated().sum()

39

There are 39 rows in the DataFrame that are duplicates of other rows.

In [16]:
# Drop duplicates rows
df = df.drop_duplicates()


In [17]:
# Check for the new data shape
df.shape

(19961, 2)

In [18]:
df.loc[:1] # locate a specific row

Unnamed: 0,comment_text,toxic
0,This letter perfectly illustrates why any hope...,1
1,One muslim casualty vs the hundreds and thousa...,1


# Data Pre-processing

## Convert data to Lowercase

In [19]:
df['clean_text'] = df['comment_text'].str.lower()
df.sample(5)

Unnamed: 0,comment_text,toxic,clean_text
8693,Yeah but the beer they were hauling sucks........,1,yeah but the beer they were hauling sucks........
8242,"What the hell man!\nLook, I'm really pissed th...",1,"what the hell man!\nlook, i'm really pissed th..."
3123,Shot in the leg?! He needs to resight that suc...,1,shot in the leg?! he needs to resight that suc...
9219,Trumps sexual assaults are only irrelevant to ...,1,trumps sexual assaults are only irrelevant to ...
1138,I wish to thank the Marxist morons of Ontario ...,1,i wish to thank the marxist morons of ontario ...


## Removal of Punctuations

In [20]:
def remove_punctuations(text):
  punctuations = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
  for x in text:
    if x in punctuations:
      text = text.replace(x, "")
  return text

In [21]:
df['clean_text'] = df['clean_text'].apply(remove_punctuations)
df.head()

Unnamed: 0,comment_text,toxic,clean_text
0,This letter perfectly illustrates why any hope...,1,this letter perfectly illustrates why any hope...
1,One muslim casualty vs the hundreds and thousa...,1,one muslim casualty vs the hundreds and thousa...
2,(fuck you Osama bin laden and your afghanistan...,1,fuck you osama bin laden and your afghanistani...
3,As long as Trump keeps Stiggin' It to the libs...,1,as long as trump keeps stiggin it to the libs ...
4,This article is a load of crap.... Another Fa...,1,this article is a load of crap another fake n...


## Removal of Stopwords

In [22]:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [23]:
from nltk.corpus import stopwords
", ".join(stopwords.words('english'))

"i, me, my, myself, we, our, ours, ourselves, you, you're, you've, you'll, you'd, your, yours, yourself, yourselves, he, him, his, himself, she, she's, her, hers, herself, it, it's, its, itself, they, them, their, theirs, themselves, what, which, who, whom, this, that, that'll, these, those, am, is, are, was, were, be, been, being, have, has, had, having, do, does, did, doing, a, an, the, and, but, if, or, because, as, until, while, of, at, by, for, with, about, against, between, into, through, during, before, after, above, below, to, from, up, down, in, out, on, off, over, under, again, further, then, once, here, there, when, where, why, how, all, any, both, each, few, more, most, other, some, such, no, nor, not, only, own, same, so, than, too, very, s, t, can, will, just, don, don't, should, should've, now, d, ll, m, o, re, ve, y, ain, aren, aren't, couldn, couldn't, didn, didn't, doesn, doesn't, hadn, hadn't, hasn, hasn't, haven, haven't, isn, isn't, ma, mightn, mightn't, mustn, mus

In [24]:
words = set(stopwords.words('english'))
def remove_stopwords(text):
  return " ".join([word for word in str(text).split() if word not in words])

In [25]:
df['clean_text'] = df['clean_text'].apply(lambda a: remove_stopwords(a))
df.head()

Unnamed: 0,comment_text,toxic,clean_text
0,This letter perfectly illustrates why any hope...,1,letter perfectly illustrates hoped reconciliat...
1,One muslim casualty vs the hundreds and thousa...,1,one muslim casualty vs hundreds thousands vict...
2,(fuck you Osama bin laden and your afghanistan...,1,fuck osama bin laden afghanistani terrorist cunts
3,As long as Trump keeps Stiggin' It to the libs...,1,long trump keeps stiggin libs palinamericans w...
4,This article is a load of crap.... Another Fa...,1,article load crap another fake news poll


##  Removal of Frequent Words

In [26]:
from collections import Counter
word_count = Counter()
for text in df['clean_text'].values:
  for word in text.split():
    word_count[word] += 1

word_count.most_common(10)

[('article', 4083),
 ('like', 3222),
 ('page', 3115),
 ('would', 2904),
 ('one', 2866),
 ('dont', 2775),
 ('people', 2653),
 ('==', 2536),
 ('wikipedia', 2445),
 ('fuck', 2249)]

In [27]:
FREQ_WORDS = set([w for (w, wc) in word_count.most_common(4)])
def remove_freqwords(text):
  return " ".join([word for word in str(text).split() if word not in FREQ_WORDS])

In [28]:
df['clean_text'] = df['clean_text'].apply(lambda a: remove_freqwords(a))
df.head()

Unnamed: 0,comment_text,toxic,clean_text
0,This letter perfectly illustrates why any hope...,1,letter perfectly illustrates hoped reconciliat...
1,One muslim casualty vs the hundreds and thousa...,1,one muslim casualty vs hundreds thousands vict...
2,(fuck you Osama bin laden and your afghanistan...,1,fuck osama bin laden afghanistani terrorist cunts
3,As long as Trump keeps Stiggin' It to the libs...,1,long trump keeps stiggin libs palinamericans w...
4,This article is a load of crap.... Another Fa...,1,load crap another fake news poll


In [29]:
RARE_WORDS = set([word for (word, wc) in word_count.most_common()[-20: -1]])
RARE_WORDS

{'110s10818',
 '79th',
 'cavalry',
 'cio',
 'growers',
 'jestem',
 'kodak',
 'langage',
 'moncton',
 'pancakes',
 'parmenion',
 'persepolis',
 'questioner',
 'regrupped',
 'rfx',
 'sidney',
 'sorry–',
 'thessalian',
 'xfd'}

In [30]:
def remove_rarewords(text):
  return " ".join([word for word in str(text).split() if word not in RARE_WORDS])

In [31]:
df['clean_text'] = df['clean_text'].apply(lambda a: remove_rarewords(a))
df.head()

Unnamed: 0,comment_text,toxic,clean_text
0,This letter perfectly illustrates why any hope...,1,letter perfectly illustrates hoped reconciliat...
1,One muslim casualty vs the hundreds and thousa...,1,one muslim casualty vs hundreds thousands vict...
2,(fuck you Osama bin laden and your afghanistan...,1,fuck osama bin laden afghanistani terrorist cunts
3,As long as Trump keeps Stiggin' It to the libs...,1,long trump keeps stiggin libs palinamericans w...
4,This article is a load of crap.... Another Fa...,1,load crap another fake news poll


## Removal of Special Characters

In [32]:
import re
def remove_spl_chars(text):
  text = re.sub('[^a-zA-Z0-9]', ' ', text)
  text = re.sub('\s+', ' ', text)
  return text

In [33]:
df['clean_text'] = df['clean_text'].apply(lambda a: remove_spl_chars(a))
df.head()

Unnamed: 0,comment_text,toxic,clean_text
0,This letter perfectly illustrates why any hope...,1,letter perfectly illustrates hoped reconciliat...
1,One muslim casualty vs the hundreds and thousa...,1,one muslim casualty vs hundreds thousands vict...
2,(fuck you Osama bin laden and your afghanistan...,1,fuck osama bin laden afghanistani terrorist cunts
3,As long as Trump keeps Stiggin' It to the libs...,1,long trump keeps stiggin libs palinamericans w...
4,This article is a load of crap.... Another Fa...,1,load crap another fake news poll


## Stemming

In [34]:
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
def stem_words(text):
  return " ".join([stemmer.stem(word) for word in text.split()])

In [35]:
df['stemmed_text'] = df['clean_text'].apply(lambda a: stem_words(a))
df.head()

Unnamed: 0,comment_text,toxic,clean_text,stemmed_text
0,This letter perfectly illustrates why any hope...,1,letter perfectly illustrates hoped reconciliat...,letter perfectli illustr hope reconcili sheer ...
1,One muslim casualty vs the hundreds and thousa...,1,one muslim casualty vs hundreds thousands vict...,one muslim casualti vs hundr thousand victim m...
2,(fuck you Osama bin laden and your afghanistan...,1,fuck osama bin laden afghanistani terrorist cunts,fuck osama bin laden afghanistani terrorist cunt
3,As long as Trump keeps Stiggin' It to the libs...,1,long trump keeps stiggin libs palinamericans w...,long trump keep stiggin lib palinamerican wont...
4,This article is a load of crap.... Another Fa...,1,load crap another fake news poll,load crap anoth fake news poll


## Lemmatization

In [36]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [37]:
import nltk
nltk.download('averaged_perceptron_tagger')
import nltk
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

lemmetizer = WordNetLemmatizer()
nltk.download('wordnet')
wordnet_map = {"N": wordnet.NOUN, "V": wordnet.VERB, "J": wordnet.ADJ, "AV": wordnet.ADV}

def lemmatize_words(text):
  pos_tagged_text = nltk.pos_tag(text.split())
  return " ".join([lemmetizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [38]:
df['lemmatized_text'] = df['clean_text'].apply(lambda a: lemmatize_words(a))
df.head()

Unnamed: 0,comment_text,toxic,clean_text,stemmed_text,lemmatized_text
0,This letter perfectly illustrates why any hope...,1,letter perfectly illustrates hoped reconciliat...,letter perfectli illustr hope reconcili sheer ...,letter perfectly illustrate hoped reconciliati...
1,One muslim casualty vs the hundreds and thousa...,1,one muslim casualty vs hundreds thousands vict...,one muslim casualti vs hundr thousand victim m...,one muslim casualty v hundred thousand victim ...
2,(fuck you Osama bin laden and your afghanistan...,1,fuck osama bin laden afghanistani terrorist cunts,fuck osama bin laden afghanistani terrorist cunt,fuck osama bin laden afghanistani terrorist cunt
3,As long as Trump keeps Stiggin' It to the libs...,1,long trump keeps stiggin libs palinamericans w...,long trump keep stiggin lib palinamerican wont...,long trump keep stiggin libs palinamericans wo...
4,This article is a load of crap.... Another Fa...,1,load crap another fake news poll,load crap anoth fake news poll,load crap another fake news poll


## Removal of URLs

In [47]:
import re

def remove_urls(text):
  return re.sub(r'https?://\S+|www\.\S+', '', text) # remove urls

df['url_text'] = df['clean_text'].apply(lambda a: remove_urls(a))
df.head()

Unnamed: 0,comment_text,toxic,clean_text,stemmed_text,lemmatized_text,url_text,html_text
0,This letter perfectly illustrates why any hope...,1,letter perfectly illustrates hoped reconciliat...,letter perfectli illustr hope reconcili sheer ...,letter perfectly illustrate hoped reconciliati...,letter perfectly illustrates hoped reconciliat...,letter perfectly illustrates hoped reconciliat...
1,One muslim casualty vs the hundreds and thousa...,1,one muslim casualty vs hundreds thousands vict...,one muslim casualti vs hundr thousand victim m...,one muslim casualty v hundred thousand victim ...,one muslim casualty vs hundreds thousands vict...,one muslim casualty vs hundreds thousands vict...
2,(fuck you Osama bin laden and your afghanistan...,1,fuck osama bin laden afghanistani terrorist cunts,fuck osama bin laden afghanistani terrorist cunt,fuck osama bin laden afghanistani terrorist cunt,fuck osama bin laden afghanistani terrorist cunts,fuck osama bin laden afghanistani terrorist cunts
3,As long as Trump keeps Stiggin' It to the libs...,1,long trump keeps stiggin libs palinamericans w...,long trump keep stiggin lib palinamerican wont...,long trump keep stiggin libs palinamericans wo...,long trump keeps stiggin libs palinamericans w...,long trump keeps stiggin libs palinamericans w...
4,This article is a load of crap.... Another Fa...,1,load crap another fake news poll,load crap anoth fake news poll,load crap another fake news poll,load crap another fake news poll,load crap another fake news poll


## Removal of HTML Tags

In [46]:
def remove_html_tags(text):
  return re.sub(r'<.*?>', '', text)

df['html_text'] = df['clean_text'].apply(lambda a: remove_html_tags(a))
df.head()

Unnamed: 0,comment_text,toxic,clean_text,stemmed_text,lemmatized_text,url_text,html_text
0,This letter perfectly illustrates why any hope...,1,letter perfectly illustrates hoped reconciliat...,letter perfectli illustr hope reconcili sheer ...,letter perfectly illustrate hoped reconciliati...,letter perfectly illustrates hoped reconciliat...,letter perfectly illustrates hoped reconciliat...
1,One muslim casualty vs the hundreds and thousa...,1,one muslim casualty vs hundreds thousands vict...,one muslim casualti vs hundr thousand victim m...,one muslim casualty v hundred thousand victim ...,one muslim casualty vs hundreds thousands vict...,one muslim casualty vs hundreds thousands vict...
2,(fuck you Osama bin laden and your afghanistan...,1,fuck osama bin laden afghanistani terrorist cunts,fuck osama bin laden afghanistani terrorist cunt,fuck osama bin laden afghanistani terrorist cunt,fuck osama bin laden afghanistani terrorist cunts,fuck osama bin laden afghanistani terrorist cunts
3,As long as Trump keeps Stiggin' It to the libs...,1,long trump keeps stiggin libs palinamericans w...,long trump keep stiggin lib palinamerican wont...,long trump keep stiggin libs palinamericans wo...,long trump keeps stiggin libs palinamericans w...,long trump keeps stiggin libs palinamericans w...
4,This article is a load of crap.... Another Fa...,1,load crap another fake news poll,load crap anoth fake news poll,load crap another fake news poll,load crap another fake news poll,load crap another fake news poll


## Spelling Correction

In [49]:
!pip install pyspellchecker

Collecting pyspellchecker
  Downloading pyspellchecker-0.8.1-py3-none-any.whl.metadata (9.4 kB)
Downloading pyspellchecker-0.8.1-py3-none-any.whl (6.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m61.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyspellchecker
Successfully installed pyspellchecker-0.8.1


In [50]:
from spellchecker import SpellChecker
spell = SpellChecker()

def correct_spellings(text):
  corrected_text = []
  misspelled_words = spell.unknown(text.split())

In [51]:
df['spell_text'] = df['clean_text'].apply(lambda a: correct_spellings(a))
df.head()

Unnamed: 0,comment_text,toxic,clean_text,stemmed_text,lemmatized_text,url_text,html_text,spell_text
0,This letter perfectly illustrates why any hope...,1,letter perfectly illustrates hoped reconciliat...,letter perfectli illustr hope reconcili sheer ...,letter perfectly illustrate hoped reconciliati...,letter perfectly illustrates hoped reconciliat...,letter perfectly illustrates hoped reconciliat...,
1,One muslim casualty vs the hundreds and thousa...,1,one muslim casualty vs hundreds thousands vict...,one muslim casualti vs hundr thousand victim m...,one muslim casualty v hundred thousand victim ...,one muslim casualty vs hundreds thousands vict...,one muslim casualty vs hundreds thousands vict...,
2,(fuck you Osama bin laden and your afghanistan...,1,fuck osama bin laden afghanistani terrorist cunts,fuck osama bin laden afghanistani terrorist cunt,fuck osama bin laden afghanistani terrorist cunt,fuck osama bin laden afghanistani terrorist cunts,fuck osama bin laden afghanistani terrorist cunts,
3,As long as Trump keeps Stiggin' It to the libs...,1,long trump keeps stiggin libs palinamericans w...,long trump keep stiggin lib palinamerican wont...,long trump keep stiggin libs palinamericans wo...,long trump keeps stiggin libs palinamericans w...,long trump keeps stiggin libs palinamericans w...,
4,This article is a load of crap.... Another Fa...,1,load crap another fake news poll,load crap anoth fake news poll,load crap another fake news poll,load crap another fake news poll,load crap another fake news poll,
