<h1 align="center">Machine Learning for NLP</h1>
    <h2 align="center">LDA & SVM</h2>
    <h3 align="center">Zahra Amini</h3>
<div style="width: 100%; text-align: center;">
    <table>
        <tr>
            <td>
                <a class="link" href="https://t.me/Zahraamini_ai">Telegram</a><br>
                <a class="link" href="https://www.linkedin.com/in/zahraamini-ai/">LinkedIn</a><br>
                <a class="link" href="https://www.youtube.com/@AcademyHobot">YouTube</a><br>
            </td>
            <td>
                <a class="link" href="https://github.com/aminizahra">GitHub</a><br>
                <a class="link" href="https://www.kaggle.com/aminizahra">Kaggle</a><br>
                <a class="link" href="https://www.instagram.com/zahraamini_ai/">Instagram</a><br>
            </td>
        </tr>
    </table>
</div>

In [4]:
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import seaborn as sns
import matplotlib.pyplot as plt

# Download necessary NLTK data files
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [8]:
file_path = "SMS Spam Dataset/spam.csv"
data = pd.read_csv(file_path, encoding='latin-1')

In [10]:
data = data[['v1', 'v2']]

In [12]:
data.columns = ['label', 'message']

In [14]:
def preprocess_text(text):
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    
    # Step 1: Clean the text
    text = re.sub(r"<.*?>", "", text)  # Remove HTML tags
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)  # Remove URLs
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # Remove special characters and numbers
    text = text.lower()  # Convert to lowercase
    
    # Tokenization
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words and len(word) > 2]
    
    return " ".join(tokens)

In [16]:
data['processed_message'] = data['message'].apply(preprocess_text)

In [18]:
data['label'] = data['label'].map({'ham':0, 'spam':1})

In [20]:
data

Unnamed: 0,label,message,processed_message
0,0,"Go until jurong point, crazy.. Available only ...",jurong point crazy available bugis great world...
1,0,Ok lar... Joking wif u oni...,lar joking wif oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win cup final tkts may te...
3,0,U dun say so early hor... U c already then say...,dun say early hor already say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah dont think go usf life around though
...,...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...,time tried contact pound prize claim easy call...
5568,0,Will Ì_ b going to esplanade fr home?,going esplanade home
5569,0,"Pity, * was in mood for that. So...any other s...",pity mood soany suggestion
5570,0,The guy did some bitching but I acted like i'd...,guy bitching acted like interested buying some...


In [22]:
X = data['processed_message']
y=data['label']

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)

In [26]:
def lda_svm_pipeline():
    pipeline = Pipeline([
        ('vectorizer', TfidfVectorizer(max_features=1000)),
        ('lda', LatentDirichletAllocation(n_components=10, random_state=42)),
        ('svm', SVC(kernel='linear', random_state=42))
    ])
    return pipeline

In [28]:
pipeline = lda_svm_pipeline()

In [30]:
# CV cross-vali
param_grid = {
    'lda__n_components': [5, 10, 15, 20, 30],
    'svm__C': [0.1, 1, 10],
    'svm__gamma': ['scale', 'auto']
}

In [32]:
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')

In [34]:
grid_search.fit(X_train, y_train)

In [35]:
grid_search.best_params_

{'lda__n_components': 20, 'svm__C': 10, 'svm__gamma': 'scale'}

In [38]:
best_model =grid_search.best_estimator_
y_pred = best_model.predict(X_test)

In [40]:
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.98      0.91      0.94      1035
           1       0.37      0.70      0.49        80

    accuracy                           0.89      1115
   macro avg       0.67      0.80      0.71      1115
weighted avg       0.93      0.89      0.91      1115



In [44]:
# X_train.shape