In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Lab | Natural Language Processing
### SMS: SPAM or HAM

### Let's prepare the environment

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

- Read Data for the Fraudulent Email Kaggle Challenge
- Reduce the training set to speead up development.

In [3]:
## Read Data for the Fraudulent Email Kaggle Challenge
data = pd.read_csv("kg_train.csv",encoding='latin-1')

# Reduce the training set to speed up development.
# Modify for final system
data = data.head(1000)
print(data.shape)
data.fillna("",inplace=True)

(1000, 2)


### Let's divide the training and test set into two partitions

In [6]:
from sklearn.model_selection import train_test_split

X = data['text']  # Your feature (text data)
y = data['label']  # Your target (label: SPAM or HAM)

# Split the data into 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the size of the resulting splits
print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

Training set size: 800
Test set size: 200


## Data Preprocessing

In [9]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [10]:
import string
from nltk.corpus import stopwords
print(string.punctuation)
print(stopwords.words("english")[100:110])
from nltk.stem.snowball import SnowballStemmer
snowball = SnowballStemmer('english')

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
['needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on']


## Now, we have to clean the html code removing words

- First we remove inline JavaScript/CSS
- Then we remove html comments. This has to be done before removing regular tags since comments can contain '>' characters
- Next we can remove the remaining tags

In [13]:
from bs4 import BeautifulSoup
import re
# Function to clean the HTML content
def clean_html(text):
    # 1. Remove inline JavaScript and CSS
    text = re.sub(r'<script.*?>.*?</script>', '', text, flags=re.DOTALL)  # Remove <script> tags
    text = re.sub(r'<style.*?>.*?</style>', '', text, flags=re.DOTALL)    # Remove <style> tags

    # 2. Remove HTML comments
    text = re.sub(r'<!--.*?-->', '', text, flags=re.DOTALL)  # Remove comments (<!-- -->)

    # 3. Remove remaining HTML tags using BeautifulSoup
    soup = BeautifulSoup(text, "html.parser")
    cleaned_text = soup.get_text()  # Extract text without tags

    return cleaned_text

# Example usage on your dataset (assuming 'data' is your DataFrame with a 'text' column)
data['cleaned_text'] = data['text'].apply(clean_html)

# Now, `data['cleaned_text']` will contain the HTML-free cleaned text
print(data['cleaned_text'].head())



If you meant to use Beautiful Soup to parse the web page found at a certain URL, then something has gone wrong. You should use an Python package like 'requests' to fetch the content behind the URL. Once you have the content as a string, you can feed that string into Beautiful Soup.



    
  soup = BeautifulSoup(text, "html.parser")


0    DEAR SIR, STRICTLY A PRIVATE BUSINESS PROPOSAL...
1                                             Will do.
2    Nora--Cheryl has emailed dozens of memos about...
3    Dear Sir=2FMadam=2C I know that this proposal ...
4                                                  fyi
Name: cleaned_text, dtype: object


- Remove all the special characters
    
- Remove numbers
    
- Remove all single characters

- Remove single characters from the start

- Substitute multiple spaces with single space

- Remove prefixed 'b'

- Convert to Lowercase

In [14]:
def clean_text(text):
    # 1. Remove all special characters (keeping only alphabets and spaces)
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # 2. Remove all numbers
    text = re.sub(r'\d+', '', text)

    # 3. Remove all single characters (but keep important short words, e.g., "a", "I")
    text = re.sub(r'\b[a-zA-Z]\b', '', text)

    # 4. Remove single characters from the start of the text
    text = re.sub(r'^\b[a-zA-Z]\b', '', text)

    # 5. Substitute multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text)

    # 6. Remove any prefixed 'b' (for byte literals like b'abc')
    text = re.sub(r"^b'", '', text)

    # 7. Convert to lowercase
    text = text.lower()

    # Strip any extra leading/trailing spaces
    text = text.strip()

    return text

# Example usage on your dataset (assuming 'data' is your DataFrame with a 'text' column)
data['cleaned_text'] = data['text'].apply(clean_text)

# Check the cleaned text
print(data['cleaned_text'].head())


0    dear sir strictly private business proposal am...
1                                              will do
2    noracheryl has emailed dozens of memos about h...
3    dear sirfmadamc know that this proposal might ...
4                                                  fyi
Name: cleaned_text, dtype: object


## Now let's work on removing stopwords
Remove the stopwords.

In [15]:
from nltk.corpus import stopwords
# Download stopwords if not already available
nltk.download('stopwords')

# Get the list of stopwords
stop_words = set(stopwords.words('english'))

def clean_text_with_stopwords(text):
    # 1. Remove all special characters (keeping only alphabets and spaces)
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # 2. Remove all numbers
    text = re.sub(r'\d+', '', text)

    # 3. Remove all single characters (but keep important short words, e.g., "a", "I")
    text = re.sub(r'\b[a-zA-Z]\b', '', text)

    # 4. Remove single characters from the start of the text
    text = re.sub(r'^\b[a-zA-Z]\b', '', text)

    # 5. Substitute multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text)

    # 6. Remove any prefixed 'b' (for byte literals like b'abc')
    text = re.sub(r"^b'", '', text)

    # 7. Convert to lowercase
    text = text.lower()

    # 8. Remove stopwords
    text = ' '.join([word for word in text.split() if word not in stop_words])

    # Strip any extra leading/trailing spaces
    text = text.strip()

    return text

# Example usage on your dataset (assuming 'data' is your DataFrame with a 'text' column)
data['cleaned_text'] = data['text'].apply(clean_text_with_stopwords)

# Check the cleaned text after removing stopwords
print(data['cleaned_text'].head())


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0    dear sir strictly private business proposal mi...
1                                                     
2    noracheryl emailed dozens memos haiti weekend ...
3    dear sirfmadamc know proposal might surprise e...
4                                                  fyi
Name: cleaned_text, dtype: object


## Tame Your Text with Lemmatization
Break sentences into words, then use lemmatization to reduce them to their base form (e.g., "running" becomes "run"). See how this creates cleaner data for analysis!

In [16]:
from nltk.stem import WordNetLemmatizer


# Download necessary NLTK data if not already available
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')  # For multilingual support in WordNet

# Initialize the lemmatizer and get the stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text_with_lemmatization(text):
    # 1. Remove all special characters (keeping only alphabets and spaces)
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # 2. Remove all numbers
    text = re.sub(r'\d+', '', text)

    # 3. Remove all single characters (but keep important short words, e.g., "a", "I")
    text = re.sub(r'\b[a-zA-Z]\b', '', text)

    # 4. Remove single characters from the start of the text
    text = re.sub(r'^\b[a-zA-Z]\b', '', text)

    # 5. Substitute multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text)

    # 6. Remove any prefixed 'b' (for byte literals like b'abc')
    text = re.sub(r"^b'", '', text)

    # 7. Convert to lowercase
    text = text.lower()

    # 8. Remove stopwords
    text = ' '.join([word for word in text.split() if word not in stop_words])

    # 9. Lemmatization - Reduce words to their base form
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])

    # Strip any extra leading/trailing spaces
    text = text.strip()

    return text

# Example usage on your dataset (assuming 'data' is your DataFrame with a 'text' column)
data['cleaned_text'] = data['text'].apply(clean_text_with_lemmatization)

# Check the cleaned text after lemmatization
print(data['cleaned_text'].head())


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


0    dear sir strictly private business proposal mi...
1                                                     
2    noracheryl emailed dozen memo haiti weekend pl...
3    dear sirfmadamc know proposal might surprise e...
4                                                  fyi
Name: cleaned_text, dtype: object


## Bag Of Words
Let's get the 10 top words in ham and spam messages (**EXPLORATORY DATA ANALYSIS**)

In [17]:
import nltk
import matplotlib.pyplot as plt
from collections import Counter

# Download necessary resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_and_tokenize(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text).lower()
    words = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]
    return words

# Load data and process text
data_train = pd.read_csv('kg_train.csv').fillna('')
data_test = pd.read_csv('kg_test.csv').fillna('')

ham_train = data_train[data_train['label'] == 'ham']['text'].apply(clean_and_tokenize)
spam_train = data_train[data_train['label'] == 'spam']['text'].apply(clean_and_tokenize)

ham_words = [word for msg in ham_train for word in msg]
spam_words = [word for msg in spam_train for word in msg]

# Count frequencies and get top 10 most common words
top_10_ham = Counter(ham_words).most_common(10)
top_10_spam = Counter(spam_words).most_common(10)

def plot_top_words(top_words, title):
    if top_words:
        words, frequencies = zip(*top_words)
        plt.figure(figsize=(10, 5))
        plt.barh(words, frequencies, color='skyblue')
        plt.xlabel('Frequency')
        plt.title(title)
        plt.gca().invert_yaxis()
        plt.show()

plot_top_words(top_10_ham, 'Top 10 Words in Ham Messages (Train)')
plot_top_words(top_10_spam, 'Top 10 Words in Spam Messages (Train)')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## Extra features

In [None]:
# We add to the original dataframe two additional indicators (money symbols and suspicious words).
money_simbol_list = "|".join(["euro","dollar","pound","€","$"])
suspicious_words = "|".join(["free","cheap","sex","money","account","bank","fund","transfer","transaction","win","deposit","password"])

data_train['money_mark'] = data_train['preprocessed_text'].str.contains(money_simbol_list)*1
data_train['suspicious_words'] = data_train['preprocessed_text'].str.contains(suspicious_words)*1
data_train['text_len'] = data_train['preprocessed_text'].apply(lambda x: len(x))

data_val['money_mark'] = data_val['preprocessed_text'].str.contains(money_simbol_list)*1
data_val['suspicious_words'] = data_val['preprocessed_text'].str.contains(suspicious_words)*1
data_val['text_len'] = data_val['preprocessed_text'].apply(lambda x: len(x))

data_train.head()

## How would work the Bag of Words with Count Vectorizer concept?

In [20]:
from sklearn.feature_extraction.text import CountVectorizer

# Sample documents (corpus)
documents = [
    "I love programming",
    "Programming is fun",
    "I love fun programming"
]

# Create an instance of CountVectorizer
vectorizer = CountVectorizer()

# Fit and transform the documents into a matrix
X = vectorizer.fit_transform(documents)

# Convert to a dense array to view the result
print("Vocabulary:", vectorizer.get_feature_names_out())
print("Bag of Words Representation:\n", X.toarray())


Vocabulary: ['fun' 'is' 'love' 'programming']
Bag of Words Representation:
 [[0 0 1 1]
 [1 1 0 1]
 [1 0 1 1]]


## TD-IDF

- Load the vectorizer

- Vectorize all dataset

- print the shape of the vetorized dataset

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample data
documents = [
    "I love programming",
    "Programming is fun",
    "I love fun programming"
]

# Initialize the TfidfVectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the documents into a TF-IDF matrix
X = vectorizer.fit_transform(documents)

# Print the shape of the TF-IDF matrix (documents x features)
print("Shape of TF-IDF matrix:", X.shape)

# If you want to view the actual TF-IDF values as an array
print("TF-IDF Matrix:\n", X.toarray())

# Vocabulary of terms used in the TF-IDF matrix
print("Vocabulary:", vectorizer.get_feature_names_out())


Shape of TF-IDF matrix: (3, 4)
TF-IDF Matrix:
 [[0.         0.         0.78980693 0.61335554]
 [0.54783215 0.72033345 0.         0.42544054]
 [0.61980538 0.         0.61980538 0.48133417]]
Vocabulary: ['fun' 'is' 'love' 'programming']


## And the Train a Classifier?

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.datasets import load_files

# Step 1: Prepare the Data
X = data_train['text']  # Text data
y = data_train['label']  # Labels (ham/spam)

# Step 2: Split the Data into Training and Test Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 3: Vectorize the Text Data using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Step 4: Train a Classifier (Logistic Regression as an example)
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train_tfidf, y_train)

# Step 5: Evaluate the Classifier
y_pred = classifier.predict(X_test_tfidf)

# Print Evaluation Metrics
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9748603351955307
Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98      1002
           1       1.00      0.94      0.97       788

    accuracy                           0.97      1790
   macro avg       0.98      0.97      0.97      1790
weighted avg       0.98      0.97      0.97      1790



### Extra Task - Implement a SPAM/HAM classifier

https://www.kaggle.com/t/b384e34013d54d238490103bc3c360ce

The classifier can not be changed!!! It must be the MultinimialNB with default parameters!

Your task is to find the **best feature representation**.

You can work with teams of two persons (recommended).

In [27]:
from sklearn.naive_bayes import MultinomialNB

# Load datasets
data_train = pd.read_csv("kg_train.csv")
data_test = pd.read_csv("kg_test.csv")

# Fill missing values
data_train['text'].fillna('', inplace=True)
data_test['text'].fillna('', inplace=True)

# Splitting train data for evaluation
X_train, X_val, y_train, y_val = train_test_split(data_train['text'], data_train['label'], test_size=0.2, random_state=42)

# Feature extraction methods
vectorizers = {
    "BoW (Unigram)": CountVectorizer(stop_words='english', max_features=5000),
    "TF-IDF": TfidfVectorizer(stop_words='english', max_features=5000),
    "BoW (Bigram)": CountVectorizer(ngram_range=(2,2), stop_words='english', max_features=5000),
    "BoW (Trigram)": CountVectorizer(ngram_range=(3,3), stop_words='english', max_features=5000),
    "BoW (Unigram + Bigram)": CountVectorizer(ngram_range=(1,2), stop_words='english', max_features=5000),
    "TF-IDF (Bigram)": TfidfVectorizer(ngram_range=(1,2), stop_words='english', max_features=5000)
}

# Store results
results = []

for name, vectorizer in vectorizers.items():
    # Transform data
    X_train_vect = vectorizer.fit_transform(X_train)
    X_val_vect = vectorizer.transform(X_val)

    # Train MultinomialNB
    model = MultinomialNB()
    model.fit(X_train_vect, y_train)

    # Predictions
    y_pred = model.predict(X_val_vect)

    # Evaluate
    accuracy = accuracy_score(y_val, y_pred)
    report = classification_report(y_val, y_pred, output_dict=True)

    # Store results
    results.append({
        "Feature Representation": name,
        "Accuracy": accuracy,
        "Precision (Spam)": report['1']['precision'],
        "Recall (Spam)": report['1']['recall'],
        "F1-Score (Spam)": report['1']['f1-score']
    })

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Display results
print(results_df)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_train['text'].fillna('', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_test['text'].fillna('', inplace=True)


   Feature Representation  Accuracy  Precision (Spam)  Recall (Spam)  \
0           BoW (Unigram)  0.970662          0.954887       0.978805   
1                  TF-IDF  0.981559          0.980658       0.976879   
2            BoW (Bigram)  0.970662          0.982072       0.949904   
3           BoW (Trigram)  0.939648          1.000000       0.861272   
4  BoW (Unigram + Bigram)  0.972339          0.970930       0.965318   
5         TF-IDF (Bigram)  0.978206          0.986193       0.963391   

   F1-Score (Spam)  
0         0.966698  
1         0.978764  
2         0.965720  
3         0.925466  
4         0.968116  
5         0.974659  
