In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd  # reads the data into a pandas dataframe

In [2]:
spam_data = pd.read_csv('SPAM text.csv')

In [3]:
spam_data

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [6]:
spam_data['label'] = spam_data.Category.map({'ham': 1, 'spam': 0})

In [7]:
spam_data

Unnamed: 0,Category,Message,label
0,ham,"Go until jurong point, crazy.. Available only ...",1
1,ham,Ok lar... Joking wif u oni...,1
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,0
3,ham,U dun say so early hor... U c already then say...,1
4,ham,"Nah I don't think he goes to usf, he lives aro...",1
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,0
5568,ham,Will ü b going to esplanade fr home?,1
5569,ham,"Pity, * was in mood for that. So...any other s...",1
5570,ham,The guy did some bitching but I acted like i'd...,1


In [16]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    spam_data.Message,
    spam_data.label,
    test_size=0.2,
    random_state=2000,
    stratify=spam_data.label
)

In [17]:
print("Shape of X_train: ", X_train.shape)
print("Shape of X_test: ", X_test.shape)

Shape of X_train:  (4457,)
Shape of X_test:  (1115,)


In [18]:
y_train.value_counts()
y_test.value_counts()

label
1    966
0    149
Name: count, dtype: int64

In [19]:
from sklearn.neighbors import KNeighborsClassifier # use KNN to analyze the data
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

clf = Pipeline([
    ('vectorizer_tfidf' ,TfidfVectorizer()),
    ('KNN', KNeighborsClassifier())
])

# Train the model
clf.fit(X_train, y_train)

# Make prediction
y_pred = clf.predict(X_test)

# Evaluate the performance
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.29      0.45       149
           1       0.90      1.00      0.95       966

    accuracy                           0.90      1115
   macro avg       0.95      0.64      0.70      1115
weighted avg       0.91      0.90      0.88      1115



In [21]:
X_test[:5]

3924    Okay lor... Will they still let us go a not ah...
4883                  As usual u can call me ard 10 smth.
3677    Hey r ü still online? I've finished the format...
3928    Lol ... I really need to remember to eat when ...
3809                Mm you ask him to come its enough :-)
Name: Message, dtype: object

In [22]:
y_test[:5]

3924    1
4883    1
3677    1
3928    1
3809    1
Name: label, dtype: int64

In [23]:
y_pred[:5]

array([1, 1, 1, 1, 1])

In [24]:
from sklearn.ensemble import RandomForestClassifier # use Random Forest to analyze the data

clf = Pipeline([
    ('vectorizer_tfidf' ,TfidfVectorizer()),
    ('Random Forest', RandomForestClassifier())
])

# Train the model
clf.fit(X_train, y_train)

# Make prediction
y_pred = clf.predict(X_test)

# Evaluate the performance
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.83      0.90       149
           1       0.97      1.00      0.99       966

    accuracy                           0.98      1115
   macro avg       0.99      0.91      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [25]:
from sklearn.linear_model import LogisticRegression # use Logistic Regression to analyze the data
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Create the pipeline with TF-IDF vectorizer and Logistic Regression
clf = Pipeline([
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('log_reg', LogisticRegression(max_iter=1000))  # max_iter helps avoid convergence warnings
])

# Train the model
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

# Evaluate performance
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       1.00      0.79      0.88       149
           1       0.97      1.00      0.98       966

    accuracy                           0.97      1115
   macro avg       0.98      0.90      0.93      1115
weighted avg       0.97      0.97      0.97      1115



In [27]:
import spacy
import pydantic
print(f"spaCy version: {spacy.__version__}")
print(f"pydantic version: {pydantic.VERSION}")

spaCy version: 3.8.3
pydantic version: 2.10.6


In [28]:
nlp = spacy.load("en_core_web_sm")

In [29]:
import spacy

nlp = spacy.load("en_core_web_sm")

def preprocess(text):
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)

    return " ".join(filtered_tokens)        

In [30]:
spam_data['preprocessed_txt'] = spam_data['Message'].apply(preprocess)

In [31]:
spam_data.head()

Unnamed: 0,Category,Message,label,preprocessed_txt
0,ham,"Go until jurong point, crazy.. Available only ...",1,jurong point crazy available bugis n great wor...
1,ham,Ok lar... Joking wif u oni...,1,ok lar joke wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,0,free entry 2 wkly comp win FA Cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,1,u dun early hor u c
4,ham,"Nah I don't think he goes to usf, he lives aro...",1,Nah think go usf live


In [33]:
X_train, X_test, y_train, y_test = train_test_split(
    spam_data.preprocessed_txt,
    spam_data.label,
    test_size=0.2,
    random_state=2000,
    stratify=spam_data.label
)

In [34]:
from sklearn.neighbors import KNeighborsClassifier # use KNN to analyze the data
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

clf = Pipeline([
    ('vectorizer_tfidf' ,TfidfVectorizer()),
    ('KNN', KNeighborsClassifier())
])

# Train the model
clf.fit(X_train, y_train)

# Make prediction
y_pred = clf.predict(X_test)

# Evaluate the performance
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.28      0.43       149
           1       0.90      1.00      0.95       966

    accuracy                           0.90      1115
   macro avg       0.95      0.64      0.69      1115
weighted avg       0.91      0.90      0.88      1115



In [35]:
from sklearn.ensemble import RandomForestClassifier # use Random Forest to analyze the data

clf = Pipeline([
    ('vectorizer_tfidf' ,TfidfVectorizer()),
    ('Random Forest', RandomForestClassifier())
])

# Train the model
clf.fit(X_train, y_train)

# Make prediction
y_pred = clf.predict(X_test)

# Evaluate the performance
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.87      0.93       149
           1       0.98      1.00      0.99       966

    accuracy                           0.98      1115
   macro avg       0.99      0.93      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [36]:
from sklearn.linear_model import LogisticRegression # use Logistic Regression to analyze the data
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Create the pipeline with TF-IDF vectorizer and Logistic Regression
clf = Pipeline([
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('log_reg', LogisticRegression(max_iter=1000))  # max_iter helps avoid convergence warnings
])

# Train the model
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

# Evaluate performance
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.99      0.78      0.87       149
           1       0.97      1.00      0.98       966

    accuracy                           0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115

