# Pratical Exam

-> Mail filtering

## Librairies

In [343]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import re


import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score
from sklearn.model_selection import StratifiedKFold

In [344]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

## Import data

In [345]:
df = pd.read_csv('/content/email_spam.csv')

In [346]:
df.head()

Unnamed: 0,title,text,type
0,?? the secrets to SUCCESS,"Hi James,\n\nHave you claim your complimentary...",spam
1,?? You Earned 500 GCLoot Points,"\nalt_text\nCongratulations, you just earned\n...",not spam
2,?? Your GitHub launch code,"Here's your GitHub launch code, @Mortyj420!\n ...",not spam
3,[The Virtual Reward Center] Re: ** Clarifications,"Hello,\n \nThank you for contacting the Virtua...",not spam
4,"10-1 MLB Expert Inside, Plus Everything You Ne...","Hey Prachanda Rawal,\n\nToday's newsletter is ...",spam


## 1. Preprocessing

In [347]:
stop = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [348]:
def preprocessing_data(df):
  # Delete NA
  df = df.dropna(subset=['title', 'text'])

  # Lowercase
  df['title'] = df['title'].str.lower()
  df['text'] = df['text'].str.lower()

  # Remove punctuaction
  df['title'] = df['title'].str.replace(r'[^\w\s]', '', regex=True)
  df['text'] = df['text'].str.replace(r'[^\w\s]', '', regex=True)

  # Remove stopwords
  df['title'] = df['title'].apply(lambda x: ' '.join(word for word in x.split() if word not in stop))
  df['text'] = df['text'].apply(lambda x: ' '.join(word for word in x.split() if word not in stop))

  # Lemmatisation
  df['title'] = df['title'].apply(lambda x: ' '.join(lemmatizer.lemmatize(word) for word in x.split()))
  df['text'] = df['text'].apply(lambda x: ' '.join(lemmatizer.lemmatize(word) for word in x.split()))

  # Tokenization
  df['title'] = df['title'].apply(word_tokenize)
  df['text'] = df['text'].apply(word_tokenize)

  return df

In [349]:
df_prepro = preprocessing_data(df)

In [350]:
df_prepro.head()

Unnamed: 0,title,text,type
0,"[secret, success]","[hi, james, claim, complimentary, gift, yet, i...",spam
1,"[earned, 500, gcloot, point]","[alt_text, congratulation, earned, 500, comple...",not spam
2,"[github, launch, code]","[here, github, launch, code, mortyj420, octoca...",not spam
3,"[virtual, reward, center, clarification]","[hello, thank, contacting, virtual, reward, ce...",not spam
4,"[101, mlb, expert, inside, plus, everything, n...","[hey, prachanda, rawal, today, newsletter, jam...",spam


## 2. Feature extraction and Feature engineering

-> TF-IDF

### Feature extraction

In [351]:
tfidf = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    stop_words='english'
)

In [352]:
def apply_tfidf(df, column, vectorizer):
    df[column + '_str'] = df[column].apply(lambda tokens: ' '.join(tokens))

    tfidf_matrix = vectorizer.fit_transform(df[column + '_str'])

    return pd.DataFrame(
        tfidf_matrix.toarray(),
        columns=vectorizer.get_feature_names_out(),
        index=df.index
    )

In [353]:
tfidf_df_title = apply_tfidf(df_prepro, 'title', tfidf)
tfidf_df_text = apply_tfidf(df_prepro, 'text', tfidf)

In [354]:
data_tfidf = pd.concat([tfidf_df_title, tfidf_df_text], axis=1)

In [355]:
data_tfidf.head()

Unnamed: 0,101,101 mlb,15,15 year,17,17 2023,180month,19,19 new,2023,...,yuyangappencom email,za,za cbc,zandi,zandi tamane,zeroed,zeroed huge,zodiac,zodiac sign,zoom
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.258199,0.258199,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.025679,0.025679,0.0,0.0,0.0


### Feature engineering

In [356]:
keywords = ['win', 'free', 'urgent', 'offer', 'congratulations', 'prize']

for keyword in keywords:
    data_tfidf[f'title_contains_{keyword}'] = df['title'].apply(lambda x: int(keyword in x.lower()))

for keyword in keywords:
    data_tfidf[f'contains_{keyword}'] = df['text'].apply(lambda x: int(keyword in x.lower()))

In [357]:
data_tfidf['title_length'] = df['title'].apply(len)

data_tfidf['text_length'] = df['text'].apply(len)

In [358]:
data_tfidf['num_uppercase_title'] = df['title'].apply(lambda x: sum(1 for char in x if char.isupper()))

data_tfidf['num_uppercase'] = df['text'].apply(lambda x: sum(1 for char in x if char.isupper()))

In [359]:
data_tfidf.head()

Unnamed: 0,101,101 mlb,15,15 year,17,17 2023,180month,19,19 new,2023,...,contains_win,contains_free,contains_urgent,contains_offer,contains_congratulations,contains_prize,title_length,text_length,num_uppercase_title,num_uppercase
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,25,302,7,9
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,1,1,0,31,350,6,23
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,26,166,3,10
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,49,399,6,23
4,0.258199,0.258199,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,1,0,1,0,0,79,6079,24,591


## 3. Model Training

-> SVM with Cross validation

In [360]:
X = data_tfidf
y = df['type']

In [361]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [362]:
model = SVC(kernel='linear', C=1)

skf = StratifiedKFold(n_splits=5)

fold_scores = []

for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model.fit(X_train, y_train)

    score = model.score(X_test, y_test)

    fold_scores.append(score)

In [363]:
print("Scores for each fold:", fold_scores)
print("Mean of scores:", np.mean(fold_scores))

Scores for each fold: [0.7647058823529411, 0.7058823529411765, 0.7058823529411765, 0.8823529411764706, 0.75]
Mean of scores: 0.761764705882353


## 4. Model Testing

In [364]:
y_pred = model.predict(X_test)

In [365]:
print(f"Precision: {precision_score(y_test, y_pred, pos_label='not spam')}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

Precision: 0.7333333333333333
Classification Report:
              precision    recall  f1-score   support

    not spam       0.73      1.00      0.85        11
        spam       1.00      0.20      0.33         5

    accuracy                           0.75        16
   macro avg       0.87      0.60      0.59        16
weighted avg       0.82      0.75      0.69        16



# Test

In [366]:
test = input("please inter your samlpe email for test:")

please inter your samlpe email for test:test


In [373]:
# Supposons que 'keywords' et 'apply_tfidf' sont déjà définis dans votre code.
# Nous allons maintenant corriger et compléter le processus pour appliquer le test.

# Demande d'entrée pour le test
title_test = input("Please enter your sample title email for testing: ")
test = input("Please enter your sample email for testing: ")

# Créer un DataFrame pour les données de test
df_test = pd.DataFrame({'title': [title_test], 'text': [test]})

# Appliquer le prétraitement (utilise la fonction que vous avez définie)
df_test_prepro = preprocessing_data(df_test)

# Appliquer TF-IDF aux titres et textes
# Vous devez vous assurer que la fonction 'apply_tfidf' est correctement définie.
# Si la fonction n'est pas encore définie, voici un exemple :

def apply_tfidf(df, column_name, tfidf_vectorizer):
    """
    Applique le TF-IDF sur la colonne spécifiée du DataFrame
    et retourne un DataFrame avec les caractéristiques TF-IDF.
    """
    tfidf_matrix = tfidf_vectorizer.transform(df[column_name])
    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out(), index=df.index)
    return tfidf_df

# Appliquer TF-IDF sur le titre et le texte
tfidf_df_title = apply_tfidf(df_test_prepro, 'title', tfidf)
tfidf_df_text = apply_tfidf(df_test_prepro, 'text', tfidf)

# Concaténer les deux DataFrames TF-IDF
test_tfidf = pd.concat([tfidf_df_title, tfidf_df_text], axis=1)

# Ajouter des caractéristiques supplémentaires basées sur les mots-clés et la longueur des textes
keywords = ['urgent', 'offer', 'free']  # Exemple de mots-clés

# Ajouter les colonnes pour chaque mot-clé dans le titre
for keyword in keywords:
    test_tfidf[f'title_contains_{keyword}'] = df_test['title'].apply(lambda x: int(keyword in x.lower()))

# Ajouter les colonnes pour chaque mot-clé dans le texte
for keyword in keywords:
    test_tfidf[f'contains_{keyword}'] = df_test['text'].apply(lambda x: int(keyword in x.lower()))

# Ajouter des caractéristiques de longueur du texte
test_tfidf['title_length'] = df_test['title'].apply(len)
test_tfidf['text_length'] = df_test['text'].apply(len)

# Ajouter des caractéristiques basées sur le nombre de majuscules dans le titre et le texte
test_tfidf['num_uppercase_title'] = df_test['title'].apply(lambda x: sum(1 for char in x if char.isupper()))
test_tfidf['num_uppercase'] = df_test['text'].apply(lambda x: sum(1 for char in x if char.isupper()))

# Afficher les résultats du DataFrame transformé
print(test_tfidf.head())

Please enter your sample title email for testing: test
Please enter your sample email for testing: test


AttributeError: 'list' object has no attribute 'lower'

In [368]:
test_tfidf.head()

Unnamed: 0,test,test.1,title_contains_win,title_contains_free,title_contains_urgent,title_contains_offer,title_contains_congratulations,title_contains_prize,contains_win,contains_free,contains_urgent,contains_offer,contains_congratulations,contains_prize,title_length,text_length,num_uppercase_title,num_uppercase
0,1.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,4,4,0,0


In [369]:
X_test.head()

Unnamed: 0,101,101 mlb,15,15 year,17,17 2023,180month,19,19 new,2023,...,contains_win,contains_free,contains_urgent,contains_offer,contains_congratulations,contains_prize,title_length,text_length,num_uppercase_title,num_uppercase
61,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,23,540,2,15
63,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,42,589,0,23
70,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,35,697,1,32
71,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,32,474,2,11
72,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,35,511,5,31


In [370]:
y_pred = model.predict(test_tfidf)

ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- 0127120057
- 020
- 020 999
- 0711
- 0711 099
- ...
