In [4]:
import pandas as pd

In [5]:
df = pd.read_csv("sms_spam.csv")
df.head()

Unnamed: 0,sms,label
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...\n,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [6]:
df.sms[:5]

0    Go until jurong point, crazy.. Available only ...
1                      Ok lar... Joking wif u oni...\n
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: sms, dtype: object

In [7]:
import spacy
nlp = spacy.load("en_core_web_lg") # if this fails then run "python -m spacy download en_core_web_lg" to download that model


In [8]:
import gensim.downloader as api
wv = api.load('word2vec-google-news-300')

In [9]:
def preprocess(text):
    # remove stop words and lemmatize the text
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)
    return " ".join(filtered_tokens)
        


In [11]:

def preprocess_and_vectorize(text):
    # remove stop words and lemmatize the text
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)
        
    return wv.get_mean_vector(filtered_tokens)

In [13]:
df['sms_vectors'] = df['sms'].apply(lambda text: preprocess_and_vectorize(text))
df.head()

Unnamed: 0,sms,label,sms_vectors
0,"Go until jurong point, crazy.. Available only ...",0,"[0.0056068725, 0.008481386, 0.0075977487, 0.07..."
1,Ok lar... Joking wif u oni...\n,0,"[-0.03199796, 0.023136022, 0.016629472, 0.0432..."
2,Free entry in 2 a wkly comp to win FA Cup fina...,1,"[0.0060087782, -0.018206825, -0.020749342, 0.0..."
3,U dun say so early hor... U c already then say...,0,"[-0.021598155, 0.010206472, 0.04870629, 0.0465..."
4,"Nah I don't think he goes to usf, he lives aro...",0,"[0.02604448, 0.012574284, 0.019120568, 0.07503..."


In [14]:
df['sms_clean'] = df['sms'].apply(lambda text: preprocess(text))
df.head()

Unnamed: 0,sms,label,sms_vectors,sms_clean
0,"Go until jurong point, crazy.. Available only ...",0,"[0.0056068725, 0.008481386, 0.0075977487, 0.07...",go until jurong point crazy available only in ...
1,Ok lar... Joking wif u oni...\n,0,"[-0.03199796, 0.023136022, 0.016629472, 0.0432...",ok lar joke wif u oni \n
2,Free entry in 2 a wkly comp to win FA Cup fina...,1,"[0.0060087782, -0.018206825, -0.020749342, 0.0...",free entry in 2 a wkly comp to win FA Cup fina...
3,U dun say so early hor... U c already then say...,0,"[-0.021598155, 0.010206472, 0.04870629, 0.0465...",U dun say so early hor U c already then say \n
4,"Nah I don't think he goes to usf, he lives aro...",0,"[0.02604448, 0.012574284, 0.019120568, 0.07503...",nah I do not think he go to usf he live around...


In [15]:
df.sms_vectors

0       [0.0056068725, 0.008481386, 0.0075977487, 0.07...
1       [-0.03199796, 0.023136022, 0.016629472, 0.0432...
2       [0.0060087782, -0.018206825, -0.020749342, 0.0...
3       [-0.021598155, 0.010206472, 0.04870629, 0.0465...
4       [0.02604448, 0.012574284, 0.019120568, 0.07503...
                              ...                        
5569    [-0.00016696246, 0.011007422, 0.016672406, 0.0...
5570    [0.013308475, 0.031782545, 0.0286113, 0.049512...
5571    [-0.008843743, -0.0038144826, 0.033675347, 0.0...
5572    [0.038318954, 0.014475661, 0.02008273, 0.06044...
5573    [0.03197228, 0.061292417, 0.054247297, 0.04297...
Name: sms_vectors, Length: 5574, dtype: object

In [16]:
import numpy as np
from sklearn.model_selection import train_test_split

In [17]:
X_train, X_test, y_train, y_test = train_test_split(
    df.sms_vectors.values,  #ovdje sada imamo array u arrayu
    df.label,
    random_state=2022,
    stratify=df.label,
    test_size=0.2
)

In [18]:
X_train_2d = np.stack(X_train) 
X_test_2d = np.stack(X_test) 

In [19]:
X_test_2d.shape

(1115, 300)

In [20]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report

In [27]:
clf = GradientBoostingClassifier()

clf.fit(X_train_2d,y_train)

y_pred = clf.predict(X_test_2d)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       0.99      0.89      0.93       149

    accuracy                           0.98      1115
   macro avg       0.98      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [89]:
test_sms = [ "Hi there! How are you doing today?",
            "Congratulations! You've won a free vacation. Click here to claim your prize now!",
            "You've been selected for a special offer. Buy one, get one free!",
            "Reminder: Your appointment is scheduled for tomorrow at 2 PM."
]

test_sms_vectors = [preprocess_and_vectorize(n) for n in test_sms]
prediction = clf.predict(test_sms_vectors)
prediction

array([0, 1, 1, 0], dtype=int64)

In [85]:
def spam_nospam_detection(text):
    #test_sms_vectors = [preprocess_and_vectorize(n) for n in text]
    test_sms_vectors = [preprocess_and_vectorize(text)]
    prediction_array = clf.predict(test_sms_vectors)

    for prediction in prediction_array:
        if prediction == 0:
            print(f"Poslana poruka: '{text}' nije spam.")
        else:
            print(f"Poslana poruka: '{text}' je spam.")


In [99]:
spam_nospam_detection("You've been selected for a special offer. Buy one, get one free!")

Poslana poruka: 'You've been selected for a special offer. Buy one, get one free!' je spam.
