In [None]:
# Widen the notebook's ".container" element so wide outputs use the full
# browser width. `display` and `HTML` are imported from the public
# `IPython.display` module; importing `display` from `IPython.core.display`
# is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Lab | Natural Language Processing
### SMS: SPAM or HAM

### Let's prepare the environment

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

- Read Data for the Fraudulent Email Kaggle Challenge
- Reduce the training set to speed up development.

In [None]:
## Read Data for the Fraudulent Email Kaggle Challenge
data = pd.read_csv("../data/kg_train.csv",encoding='latin-1')

# Reduce the training set to speed up development.
# Modify for final system
# `.copy()` makes the subset an independent frame: later cells add columns to
# `data`, and doing that on the slice returned by `head()` can trigger pandas'
# SettingWithCopyWarning.
data = data.head(1000).copy()
print(data.shape)

# Replace missing values with empty strings so the text-processing cells below
# always receive `str` objects (reassigned rather than mutated in place).
data = data.fillna("")

### Let's divide the training and test set into two partitions

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test partition; the fixed seed keeps the split
# reproducible between runs.
train_data, test_data = train_test_split(
    data,
    random_state=42,
    test_size=0.2,
)

n_train_rows = train_data.shape[0]
n_test_rows = test_data.shape[0]
print(f"Training set size: {n_train_rows} samples")
print(f"Test set size: {n_test_rows} samples")


## Data Preprocessing

In [None]:
import re
from bs4 import BeautifulSoup

def clean_html(text):
    """Strip HTML markup from ``text`` and return the remaining text.

    Processing order:
      1. drop inline ``<script>`` and ``<style>`` elements with their content,
      2. drop HTML comments (before tag removal, because a comment may itself
         contain ``>`` characters),
      3. drop every remaining tag with BeautifulSoup.

    Parameters
    ----------
    text : str
        Raw message body. Any non-string value (e.g. NaN) is treated as
        missing.

    Returns
    -------
    str
        The text without markup, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    # HTML tag names are case-insensitive, so <SCRIPT>/<Style> must match too;
    # `\s*` tolerates whitespace before the closing '>'.
    tag_flags = re.DOTALL | re.IGNORECASE
    text = re.sub(r'<script\b.*?</script\s*>', '', text, flags=tag_flags)
    text = re.sub(r'<style\b.*?</style\s*>', '', text, flags=tag_flags)

    # Remove HTML comments.
    text = re.sub(r'<!--.*?-->', '', text, flags=re.DOTALL)

    # Remove all remaining tags. The separator keeps the text of adjacent
    # elements apart ("<p>a</p><p>b</p>" -> "a b" instead of "ab"); extra
    # whitespace is collapsed later in the pipeline.
    text = BeautifulSoup(text, "html.parser").get_text(separator=" ")

    return text

# Run the HTML cleaner over every message in the 'text' column.
data['cleaned_text'] = data['text'].map(clean_html)

# Show a few messages before and after cleaning.
preview_columns = ['text', 'cleaned_text']
print(data[preview_columns].head())


## Now we have to clean the text by removing the HTML code

- First we remove inline JavaScript/CSS
- Then we remove html comments. This has to be done before removing regular tags since comments can contain '>' characters
- Next we can remove the remaining tags

In [None]:
import re
from bs4 import BeautifulSoup

def clean_html(text):
    """Strip HTML markup from ``text`` and return the remaining text.

    Processing order:
      1. drop inline ``<script>`` and ``<style>`` elements with their content,
      2. drop HTML comments (before tag removal, because a comment may itself
         contain ``>`` characters),
      3. drop every remaining tag with BeautifulSoup.

    Parameters
    ----------
    text : str
        Raw message body. Any non-string value (e.g. NaN) is treated as
        missing.

    Returns
    -------
    str
        The text without markup, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    # Remove inline JavaScript and CSS. HTML tag names are case-insensitive,
    # so <SCRIPT>/<Style> must match too; `\s*` tolerates whitespace before
    # the closing '>'.
    tag_flags = re.DOTALL | re.IGNORECASE
    text = re.sub(r'<script\b.*?</script\s*>', '', text, flags=tag_flags)
    text = re.sub(r'<style\b.*?</style\s*>', '', text, flags=tag_flags)

    # Remove HTML comments.
    text = re.sub(r'<!--.*?-->', '', text, flags=re.DOTALL)

    # Remove all remaining tags. The separator keeps the text of adjacent
    # elements apart ("<p>a</p><p>b</p>" -> "a b" instead of "ab"); extra
    # whitespace is collapsed later in the pipeline.
    text = BeautifulSoup(text, "html.parser").get_text(separator=" ")

    return text

# Run the HTML cleaner over every message in the 'text' column.
data['cleaned_text'] = data['text'].map(clean_html)

# Show a few messages before and after cleaning.
preview_columns = ['text', 'cleaned_text']
print(data[preview_columns].head())


- Remove all the special characters
    
- Remove numbers
    
- Remove all single characters
 
- Remove single characters from the start

- Substitute multiple spaces with single space

- Remove prefixed 'b'

- Convert to Lowercase

In [None]:
import re

def clean_text(text):
    """Normalise ``text`` to lowercase words separated by single spaces.

    Steps, in order:
      1. strip a leading bytes-literal prefix (the ``b`` of ``b'...'`` or
         ``b"..."``) left over from ``str(bytes_obj)`` conversions,
      2. remove every character that is not an ASCII letter or whitespace
         (this covers special characters and digits alike),
      3. remove single-character words,
      4. collapse runs of whitespace and trim both ends,
      5. convert to lowercase.

    Parameters
    ----------
    text : str
        Text to normalise. Any non-string value is treated as missing.

    Returns
    -------
    str
        The normalised text, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    # Only a `b` that directly precedes an opening quote at the very start is
    # a bytes prefix. The previous pattern r'\bb' removed the first letter of
    # every lowercase word beginning with "b" ("bank" -> "ank").
    # This must run before punctuation is removed, while the quote is present.
    text = re.sub(r'^\s*b(?=[\'"])', '', text)

    # Keep letters and whitespace only; digits are removed here as well, so no
    # separate number-removal pass is needed.
    text = re.sub(r'[^A-Za-z\s]', '', text)

    # Remove all single-character words (including one at the start).
    text = re.sub(r'\b\w\b', '', text)

    # Substitute multiple whitespace characters with a single space.
    text = re.sub(r'\s+', ' ', text).strip()

    # Convert to lowercase.
    return text.lower()

# Normalise the HTML-free text produced by the previous step.
data['cleaned_text'] = data['cleaned_text'].map(clean_text)

# Show a sample of the data after cleaning.
cleaned_preview = data[['cleaned_text']].head()
print(cleaned_preview)


## Now let's work on removing stopwords
Remove the stopwords.

In [None]:
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus and keep the English list as a set for fast
# membership tests.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


def _drop_stop_words(text):
    """Return ``text`` without the tokens whose lowercase form is a stop word."""
    kept_tokens = []
    for token in text.split():
        if token.lower() in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)


data['cleaned_text'] = data['cleaned_text'].map(_drop_stop_words)
print(data[['cleaned_text']].head())


## Tame Your Text with Lemmatization
Break sentences into words, then use lemmatization to reduce them to their base form (e.g., "running" becomes "run"). See how this creates cleaner data for analysis!

In [None]:
from nltk.stem import WordNetLemmatizer

# Download the WordNet data the lemmatizer needs.
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()


def _lemmatize_tokens(text):
    """Lemmatize each whitespace-separated token of ``text`` and re-join with spaces."""
    return ' '.join(map(lemmatizer.lemmatize, text.split()))


# Apply lemmatization to every message.
data['cleaned_text'] = data['cleaned_text'].map(_lemmatize_tokens)

# Show a sample of the data after lemmatization.
print(data[['cleaned_text']].head())


## Bag Of Words
Let's get the 10 top words in ham and spam messages (**EXPLORATORY DATA ANALYSIS**)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer


# Ten most frequent words over the whole corpus. `get_feature_names_out()`
# lists the selected vocabulary alphabetically, not by frequency.
vectorizer = CountVectorizer(max_features=10)
X = vectorizer.fit_transform(data['cleaned_text'])
print(vectorizer.get_feature_names_out())

# The section asks for the top words of ham and of spam, so repeat the count
# per value of the 'label' column and show the words ordered by frequency.
for label_value, label_rows in data.groupby('label'):
    label_texts = label_rows['cleaned_text']
    if not label_texts.str.strip().astype(bool).any():
        # CountVectorizer raises ValueError on an empty vocabulary.
        continue
    label_vectorizer = CountVectorizer(max_features=10)
    label_counts = label_vectorizer.fit_transform(label_texts)
    top_words = pd.Series(
        label_counts.toarray().sum(axis=0),
        index=label_vectorizer.get_feature_names_out(),
    ).sort_values(ascending=False)
    print(f"\nTop 10 words for label {label_value!r}:")
    print(top_words)


## Extra features

In [None]:
# We add to the dataframe three additional indicators: money symbols,
# suspicious words and text length.
#
# The patterns are joined into regular-expression alternations, so each term
# is escaped first: an unescaped "$" is the end-of-string anchor and would make
# the money pattern match every message.
money_simbol_list = "|".join(re.escape(term) for term in ["euro","dollar","pound","€","$"])
suspicious_words = "|".join(re.escape(term) for term in ["free","cheap","sex","money","account","bank","fund","transfer","transaction","win","deposit","password"])

# Money symbols are searched in the raw 'text' column because clean_text()
# removes every non-letter character, so "€" and "$" never reach
# 'cleaned_text'. `na=False` treats a missing message as "no match".
data['money_mark'] = data['text'].str.contains(money_simbol_list, case=False, na=False).astype(int)

# The remaining indicators use the preprocessed text, which this notebook
# stores in 'cleaned_text' of `data`.
data['suspicious_words'] = data['cleaned_text'].str.contains(suspicious_words, na=False).astype(int)
data['text_len'] = data['cleaned_text'].str.len()

data.head()

## How would the Bag of Words concept work with CountVectorizer?

In [None]:
from sklearn.feature_extraction.text import CountVectorizer


# Build a bag-of-words matrix restricted to the 1000 most frequent terms.
vectorizer = CountVectorizer(max_features=1000)
X = vectorizer.fit_transform(data['cleaned_text'])

# Densify the sparse counts and label each column with its vocabulary term.
vocabulary_terms = vectorizer.get_feature_names_out()
dense_counts = X.toarray()
bow_df = pd.DataFrame(data=dense_counts, columns=vocabulary_terms)

# Show the first 5 rows.
print(bow_df.head())


## TF-IDF

- Load the vectorizer

- Vectorize all dataset

- Print the shape of the vectorized dataset

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create the TF-IDF vectorizer, capped at the 1000 most frequent terms, and
# vectorize every cleaned message.
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
cleaned_messages = data['cleaned_text']
X_tfidf = tfidf_vectorizer.fit_transform(cleaned_messages)

# Print the dimensions of the vectorized data.
tfidf_shape = X_tfidf.shape
print("Shape of the vectorized dataset:", tfidf_shape)


## And then train a classifier

In [None]:



# These names were only imported by a later cell, so this cell raised
# NameError on "Restart & Run All"; import them where they are first used.
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import MultinomialNB

# Hold out 20% of the TF-IDF rows for evaluation (fixed seed for reproducibility).
# NOTE(review): X_tfidf was fitted on every row, including the ones held out
# here, so the reported scores may be slightly optimistic.
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, data['label'], test_size=0.2, random_state=42)

# Multinomial Naive Bayes with default parameters.
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

y_pred = nb_classifier.predict(X_test)


print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


### Extra Task - Implement a SPAM/HAM classifier

https://www.kaggle.com/t/b384e34013d54d238490103bc3c360ce

The classifier cannot be changed! It must be `MultinomialNB` with default parameters.

Your task is to find the **best feature representation**.

You can work in teams of two (recommended).

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import accuracy_score, classification_report

# Candidate feature representations: unigrams + bigrams, 2000 most frequent terms.
vectorizers = {
    "CountVectorizer": CountVectorizer(max_features=2000, ngram_range=(1,2)),
    "TfidfVectorizer": TfidfVectorizer(max_features=2000, ngram_range=(1,2))
}

# Split the texts once, before vectorizing, so each vectorizer learns its
# vocabulary (and IDF weights) from the training rows only and every candidate
# is scored on the same held-out rows.
texts_train, texts_test, y_train, y_test = train_test_split(
    data['cleaned_text'], data['label'], test_size=0.2, random_state=42
)

best_accuracy = 0
best_vectorizer = None

for name, vectorizer in vectorizers.items():
    X_train = vectorizer.fit_transform(texts_train)
    X_test = vectorizer.transform(texts_test)

    # No TruncatedSVD step: its output contains negative values, which
    # MultinomialNB rejects with a ValueError. Count and TF-IDF features are
    # non-negative and can be passed to the classifier directly.
    model = MultinomialNB()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy:", acc)

    if acc > best_accuracy:
        best_accuracy = acc
        best_vectorizer = name
print(f"\n✅ أفضل تمثيل للميزات هو: {best_vectorizer} بدقة {best_accuracy:.4f}")

# Refit the winning vectorizer on the whole corpus. The text column is
# 'cleaned_text' ('text_cleaned' does not exist), and the vectorizer is looked
# up by name instead of reusing whichever one the loop visited last.
if best_vectorizer is not None:
    X = vectorizers[best_vectorizer].fit_transform(data['cleaned_text'])
print(data.head())
