# NLP - Tokenization, Lemmatization and Stemming

## By: Idan Dunsky and Yaniv Kaveh Shtul


In [1]:
import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
import requests
import time
import nltk
import spacy

In [2]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))

file_encoding = 'latin-1' 
df = pd.read_csv('spam.csv',encoding=file_encoding)
df

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/idandunsky/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/idandunsky/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/idandunsky/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


## *Statistical Data*

In [3]:
num_of_msg = df.shape[0]

print("Total number of sms: ", num_of_msg)
print("Number of spam messages: ",df["v1"].value_counts()["spam"])
print("Number of ham messages: ",df["v1"].value_counts()["ham"])


Total number of sms:  5572
Number of spam messages:  747
Number of ham messages:  4825


In [4]:
word_count = sum([len(x.split(" ")) for x in df["v2"]])
avg_num_of_words = word_count / num_of_msg

print("Avarage number of words per message: ",avg_num_of_words)

Avarage number of words per message:  15.60678391959799


In [5]:
# Function to process text
def preprocess(text):
    tokens = word_tokenize(text.lower())
    filtered_tokens = [word for word in tokens if word.isalnum()]
    return filtered_tokens

all_words = []
for text in df['v2']:
    all_words.extend(preprocess(text))

word_freq = Counter(all_words)
most_common_words = word_freq.most_common(5)

print("The 5 most common words are: ")

for i, word in enumerate(most_common_words):
    print(f'{i+1}. "{word[0]}", count: {word[1]}')

The 5 most common words are: 
1. "i", count: 2900
2. "to", count: 2241
3. "you", count: 2228
4. "a", count: 1423
5. "the", count: 1324


In [6]:
words_with_one_occurrence = [word for word, count in word_freq.items() if count == 1]
num_words_with_one_occurrence = len(words_with_one_occurrence)

print("Number of words that only appear once: ", num_words_with_one_occurrence)


Number of words that only appear once:  4077


## *Text Processing*

#### Tokenization

* nltk

In [7]:
def nltk_tok(l):

    all_words_nltk = []
    
    for text in l:
        all_words_nltk.extend(word_tokenize(text))

    return [token.lower() for token in all_words_nltk if token.lower() not in stop_words and token.isalpha()]

In [8]:
from nltk.tokenize import word_tokenize

start_time = time.time()

filtered_words_nltk = nltk_tok(df['v2'])

total_time =  time.time() - start_time

print("The time complexity of nltk tokenization is: ", total_time)

The time complexity of nltk tokenization is:  0.2602710723876953


* spaCy

In [9]:
def spacy_tok(l):
    remove_punctuation_from_list(l)
    
    all_words_spacy = []
    
    for text in l:
        all_words_spacy.extend(tokenizer(text))

    return [token.text.lower() for token in all_words_spacy if not token.is_stop and not token.is_punct]


def remove_punctuation_from_list(words):
    
    text = ' '.join(words)
    doc = nlp(text)
    
    return [token.text for token in doc if not token.is_punct]

In [10]:
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English

nlp = English()
tokenizer = Tokenizer(nlp.vocab)

start_time = time.time()

filtered_words_spacy = spacy_tok(df['v2'])

total_time =  time.time() - start_time

print("The time complexity of spaCy tokenization is: ", total_time)

The time complexity of spaCy tokenization is:  0.8161511421203613


#### Lemmatization

* nltk

In [11]:
def lem_nltk(l):
    return [lemmatizer.lemmatize(word) for word in l]

In [12]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

start_time = time.time()

lemmitaized_nltk = lem_nltk(filtered_words_nltk)

total_time =  time.time() - start_time

print("The time complexity of nltk lemmatization is: ", total_time)

The time complexity of nltk lemmatization is:  0.5794250965118408


* spaCy

In [13]:
def sp_lemmatizer(text):
    
    lemmatized_spacy = []
   
    for word in text:
        lemmatized_spacy.extend(nlp(word))
        
    return [token.lemma_ for token in lemmatized_spacy]

In [None]:

nlp = spacy.load('en_core_web_sm')

start_time = time.time()
        
lemmatized_words = sp_lemmatizer(filtered_words_spacy)

total_time = time.time() - start_time

print("The time complexity of spacy lemmatization is: ", total_time)

#### Stemming

* nltk

In [None]:
def stemin_nltk(l):
    return [snowball.stem(word) for word in l]

In [None]:
from nltk.stem.snowball import SnowballStemmer

snowball = SnowballStemmer(language="english")

start_time = time.time()

stem_nltk = stemin_nltk(lemmitaized_nltk)

total_time =  time.time() - start_time

print("The time complexity of nltk stemming is: ", total_time)

* spaCy

In [None]:
# spaCy doesn't provide stemming tools

#### Conclutions

* Output Format: spaCy generally provides more user-friendly and integrated outputs, while NLTK's outputs often require additional processing.
* Processing Speed: spaCy is way slower than NLTK, particularly for large datasets.
* Language Support: spaCy has broader and more robust language support with pre-trained models available for multiple languages, while NLTK's language support is more limited and less comprehensive.

#### Statistics on the new data

In [None]:
def most_freq(l):
        word_freq = Counter(l)
        most_common_words = word_freq.most_common(5)
        return most_common_words

In [None]:
def num_of_words_with_one_occurrence(l):
        word_freq = Counter(l)
        words_with_one_occurrence = [word for word, count in word_freq.items() if count == 1]
        num_words_with_one_occurrence = len(words_with_one_occurrence)

        return num_words_with_one_occurrence

* nltk

In [None]:
print(f'There are {len(stem_nltk)} words in nltk tokens list')

print("Most 5 common words in lntk: \n")

for i, word in enumerate(most_freq(stem_nltk)):
    print(f'{i+1}. "{word[0]}", count: {word[1]}')

print("\nNumber of words that only appear once in lntk: ", num_of_words_with_one_occurrence(stem_nltk))

* spaCy

In [None]:
print(f'\nThere are {len(filtered_words_spacy)} words in spaCy tokens list')

print("Most 5 common words in spacy: \n")

for i, word in enumerate(most_freq(filtered_words_spacy)):
    print(f'{i+1}. "{word[0]}", count: {word[1]}')

print("\nNumber of words that only appear once in spaCy: ", num_of_words_with_one_occurrence(filtered_words_spacy))

## Web Scraping

In [None]:
# Specify the URL of the Wikipedia page
url = 'https://en.wikipedia.org/wiki/Natural_language_processing'

# Send a GET request to fetch the raw HTML content
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Extract the main content text
    # Wikipedia's main content is typically within <div> tags with the 'mw-parser-output' class
    content_div = soup.find('div', class_='mw-parser-output')
    
    # Initialize an empty list to hold all text content
    all_text = []

    # Extract text from various elements
    for element in content_div.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'blockquote']):
        all_text.append(element.get_text())

    # Combine all text into a single string
    wiki_text = '\n'.join(all_text)

In [None]:
tok_wiki = nltk_tok(wiki_text.split("\n"))
lem_wiki = lem_nltk(tok_wiki)
stem_wiki = stemin_nltk(lem_wiki)

wiki_words = [y for x in wiki_text.split("\n") for y in x.split(" ")]

#### Word Statistics

* Before processing

In [None]:
print(f'There are {len(wiki_words)} words in the wikipedia page before processing')

print("Most 5 common words in the wikipedia page: \n")

for i, word in enumerate(most_freq(wiki_words)):
    print(f'{i+1}. "{word[0]}", count: {word[1]}')

print("\nNumber of words that only appear once in the wikipedia page: ", num_of_words_with_one_occurrence(wiki_words))

* After processing

In [None]:
print(f'There are {len(lem_wiki)} words in the wikipedia page after processing')

print("Most 5 common words in the wikipedia page: \n")

for i, word in enumerate(most_freq(lem_wiki)):
    print(f'{i+1}. "{word[0]}", count: {word[1]}')

print("\nNumber of words that only appear once in the wikipedia page: ", num_of_words_with_one_occurrence(lem_wiki))