In [1]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
#pd.set_option('display' max) -- visualize all content

# Lab | Natural Language Processing
### SMS: SPAM or HAM

### Let's prepare the environment

In [3]:
# %pip install pandas
#pip install matplotlib
# %pip install scikit-learn

In [4]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

- Read Data for the Fraudulent Email Kaggle Challenge
- Reduce the training set to speead up development. 

In [5]:
## Read Data for the Fraudulent Email Kaggle Challenge
data = pd.read_csv("../data/kg_train.csv",encoding='latin-1')

# Reduce the training set to speed up development. 
# Modify for final system
data = data.head(1000)
print(data.shape)
data.fillna("",inplace=True)

(1000, 2)


In [6]:
# Your code
data

Unnamed: 0,text,label
0,"DEAR SIR, STRICTLY A PRIVATE BUSINESS PROPOSAL...",1
1,Will do.,0
2,Nora--Cheryl has emailed dozens of memos about...,0
3,Dear Sir=2FMadam=2C I know that this proposal ...,1
4,fyi,0
...,...,...
995,So what's the latest? It sounds contradictory ...,0
996,"TRANSFER OF 36,759,000.00 MILLION POUNDS TO YO...",1
997,Barb I will call to explain. Are you back in t...,0
998,Yang on travelNot free tonite.May work tomorrow,0


In [7]:
test = pd.read_csv("../data/kg_test.csv",encoding='latin-1')
test = test.head(200) #20% of the reduced train dataset
test.fillna("", inplace=True)
print(test.shape)

(200, 1)


In [8]:
data.head()

Unnamed: 0,text,label
0,"DEAR SIR, STRICTLY A PRIVATE BUSINESS PROPOSAL...",1
1,Will do.,0
2,Nora--Cheryl has emailed dozens of memos about...,0
3,Dear Sir=2FMadam=2C I know that this proposal ...,1
4,fyi,0


### Let's divide the training and test set into two partitions

## Data Preprocessing

In [9]:
#%pip install nltk

In [10]:
import string
from nltk.corpus import stopwords
print(string.punctuation)
print(stopwords.words("english")[100:110])
from nltk.stem.snowball import SnowballStemmer
snowball = SnowballStemmer('english')
import re
print(re.escape(string.punctuation))

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
['needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on']
!"\#\$%\&'\(\)\*\+,\-\./:;<=>\?@\[\\\]\^_`\{\|\}\~


## Now, we have to clean the html code removing words

- First we remove inline JavaScript/CSS
- Then we remove html comments. This has to be done before removing regular tags since comments can contain '>' characters
- Next we can remove the remaining tags

In [11]:
#%pip install bs4

In [12]:
from bs4 import BeautifulSoup

def remove_html_comments(text):
    return re.sub(r'<!--.*?-->', '', text, flags=re.DOTALL)

# Function to remove tags
def remove_tags(html):
    html = remove_html_comments(html)

    # parse html content
    soup = BeautifulSoup(html, "html.parser")

    for data in soup(['style', 'script']):
        # Remove tags
        data.decompose()

    # return data by retrieving the tag content
    return ' '.join(soup.stripped_strings)

In [13]:
data_nocode = data['text'].apply(remove_tags)
test_nocode = test['text'].apply(remove_tags)


If you meant to use Beautiful Soup to parse the web page found at a certain URL, then something has gone wrong. You should use an Python package like 'requests' to fetch the content behind the URL. Once you have the content as a string, you can feed that string into Beautiful Soup.



    
  soup = BeautifulSoup(html, "html.parser")


- [^a-zA-Z0-9\s]: This is a character set.-
- ^: When used inside [], it negates the set, meaning "match any character NOT in this set."
- a-zA-Z: Matches all lowercase and uppercase English letters.
- 0-9: Matches all digits.
- \s: Matches any whitespace character (spaces, tabs, newlines).
- [^a-zA-Z0-9\s] matches any character that is NOT an alphanumeric character and NOT a whitespace character.
- [^a-zA-Z\s] - keeps only letters and spaces

- Remove all the special characters
    
- Remove numbers
    
- Remove all single characters
 
- Remove single characters from the start

- Substitute multiple spaces with single space

- Remove prefixed 'b'

- Convert to Lowercase

In [14]:
def clean_text(text):
    import re
    # Remove special characters (keep only letters, numbers, and spaces)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    
    # Remove all single characters
    text = re.sub(r'\b\w\b', '', text)
    
    # Remove single characters from the start
    text = re.sub(r'^\w\s+', '', text)
    
    # Substitute multiple spaces with single space
    text = re.sub(r'\s+', ' ', text)
    
    # Remove prefixed 'b'
    text = re.sub(r'^b\s*', '', text)
    
    # Convert to lowercase
    text = text.lower()
    
    return text

In [15]:
data_cleaned = data_nocode.apply(clean_text)

In [16]:
print(data_nocode.head(20))
print(data_cleaned.head(20))

0     DEAR SIR, STRICTLY A PRIVATE BUSINESS PROPOSAL...
1                                              Will do.
2     Nora--Cheryl has emailed dozens of memos about...
3     Dear Sir=2FMadam=2C I know that this proposal ...
4                                                   fyi
5     sure -- bottom line - you need a special secur...
6     Dear Sir,I am Engr. Ugo Nzego with the Enginee...
7     Abedin Huma Saturday November 28 2009 5:14 PMH...
8     There is an Oct 16th George Marshall event at ...
9     1 25% for you as the account owner 2 65% for I...
10    STRONG> http://www.cnn.com/2003/WORLD/africa/0...
11    Dear Friend,My name is Edward Moore QC.Princip...
12    Compliment, How are you doing today, Hope you ...
13                                        Who wrote it?
14    accident. On further investigation, I found ou...
15                    Email from EricBackground for you
16    FROM:DESK OF THE CHAIRMAN AWARD OIL COMMITTEE....
17    ("REMITTANCE OF $15 MILLION U.S.A DOLLARS 

## Now let's work on removing stopwords
Remove the stopwords.

In [17]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /Users/work/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/work/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
# Your code
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# nltk.download('stopwords')  # Uncomment if stopwords are not downloaded
# nltk.download('wordnet')    # Uncomment if wordnet is not downloaded
# nltk.download('omw-1.4')    # Uncomment for wordnet data

stop_words = set(stopwords.words('english'))
len(stop_words)

198

In [27]:
def remove_stopwords(text):
    tokens = word_tokenize(text)
    filtered = [word for word in tokens if word.lower() not in stop_words]
    return ' '.join(filtered)

# Example usage:
data_nostopwords = data_cleaned.apply(remove_stopwords)

## Tame Your Text with Lemmatization
Break sentences into words, then use lemmatization to reduce them to their base form (e.g., "running" becomes "run"). See how this creates cleaner data for analysis!

In [31]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/work/nltk_data...


True

In [32]:
# Your code
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    tokens = word_tokenize(text)
    lemmatized = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(lemmatized)

# Example usage:
data_lemmatized = data_nostopwords.apply(lemmatize_text)

## Bag Of Words
Let's get the 10 top words in ham and spam messages (**EXPLORATORY DATA ANALYSIS**)

In [33]:
# Your code
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the vectorizer
vectorizer = CountVectorizer()

# Fit and transform the lemmatized data
X_bow = vectorizer.fit_transform(data_lemmatized)

# To see the shape (documents x unique words)
print(X_bow.shape)

# To see the feature names (words)
print(vectorizer.get_feature_names_out()[:20])  # Show first 20 words

(1000, 19018)
['aac' 'aaclocated' 'aae' 'aag' 'aaronovitchon' 'abacha' 'abachabefore'
 'abachac' 'abachace' 'abachaco' 'abachacthe' 'abachactransferred'
 'abachae' 'abachakanonigeriabusiness' 'abachas' 'aback' 'abacked' 'abad'
 'abandon' 'abandoned']


## Extra features

In [None]:
# We add to the original dataframe two additional indicators (money symbols and suspicious words).
money_simbol_list = "|".join(["euro","dollar","pound","€",r"\$"])
suspicious_words = "|".join(["free","cheap","sex","money","account","bank","fund","transfer","transaction","win","deposit","password"])

data_train['money_mark'] = data_train['preprocessed_text'].str.contains(money_simbol_list)*1
data_train['suspicious_words'] = data_train['preprocessed_text'].str.contains(suspicious_words)*1
data_train['text_len'] = data_train['preprocessed_text'].apply(lambda x: len(x)) 

data_val['money_mark'] = data_val['preprocessed_text'].str.contains(money_simbol_list)*1
data_val['suspicious_words'] = data_val['preprocessed_text'].str.contains(suspicious_words)*1
data_val['text_len'] = data_val['preprocessed_text'].apply(lambda x: len(x)) 

data_train.head()

## How would work the Bag of Words with Count Vectorizer concept?

In [34]:
# Your code
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the vectorizer
vectorizer = CountVectorizer()

# Fit and transform the lemmatized data
X_bow = vectorizer.fit_transform(data_lemmatized)

# To see the shape (documents x unique words)
print(X_bow.shape)

# To see the feature names (words)
print(vectorizer.get_feature_names_out()[:20])  # Show first 20 words

(1000, 19018)
['aac' 'aaclocated' 'aae' 'aag' 'aaronovitchon' 'abacha' 'abachabefore'
 'abachac' 'abachace' 'abachaco' 'abachacthe' 'abachactransferred'
 'abachae' 'abachakanonigeriabusiness' 'abachas' 'aback' 'abacked' 'abad'
 'abandon' 'abandoned']


## TF-IDF

- Load the vectorizer

- Vectorize all dataset

- print the shape of the vetorized dataset

In [35]:
# Your code
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(data_lemmatized)
feature_names = vectorizer.get_feature_names_out()
tfidf_array = tfidf_matrix.toarray()

for i, doc_tfidf in enumerate(tfidf_array):
    print(f"Document {i+1}:")
    for j, score in enumerate(doc_tfidf):
        if score > 0:
            print(f"  {feature_names[j]}: {score:.4f}")
    print()

Document 1:
  abandoned: 0.0671
  account: 0.1363
  address: 0.0427
  agreed: 0.1149
  agricultural: 0.0875
  along: 0.0548
  also: 0.0371
  amount: 0.0456
  appeared: 0.0901
  applies: 0.0627
  appropriate: 0.0735
  approval: 0.0600
  ask: 0.0563
  bank: 0.1770
  banking: 0.0546
  belongs: 0.0671
  bill: 0.0561
  business: 0.1341
  came: 0.0528
  carry: 0.0660
  child: 0.0558
  chukwu: 0.2064
  claim: 0.1697
  colleague: 0.0575
  come: 0.0795
  commence: 0.0726
  completion: 0.0754
  confidential: 0.0489
  contact: 0.0360
  cooperation: 0.0530
  coordinate: 0.0974
  country: 0.0705
  course: 0.0561
  crash: 0.0541
  customer: 0.0580
  dear: 0.0422
  death: 0.0444
  deceased: 0.0526
  department: 0.2462
  designated: 0.0776
  died: 0.0409
  discovered: 0.1020
  discoveryi: 0.1032
  distant: 0.0901
  dollar: 0.0370
  email: 0.0389
  enable: 0.0486
  enclose: 0.1032
  entire: 0.0604
  ever: 0.0640
  exchange: 0.0677
  expecting: 0.0665
  expences: 0.0875
  family: 0.0820
  fax: 0.0431
  

## And the Train a Classifier?

In [None]:
# Your code

### Extra Task - Implement a SPAM/HAM classifier

https://www.kaggle.com/t/b384e34013d54d238490103bc3c360ce

The classifier can not be changed!!! It must be the MultinimialNB with default parameters!

Your task is to **find the most relevant features**.

For example, you can test the following options and check which of them performs better:
- Using "Bag of Words" only
- Using "TF-IDF" only
- Bag of Words + extra flags (money_mark, suspicious_words, text_len)
- TF-IDF + extra flags


You can work with teams of two persons (recommended).

In [None]:
# Your code