In [42]:
import pandas as pd
import zipfile
import spacy
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import numpy as np

# load data
with zipfile.ZipFile('/content/sms+spam+collection.zip', 'r') as zip_ref:
    zip_ref.extractall('sms_data')

df = pd.read_csv('sms_data/SMSSpamCollection', sep='\t', names=['label', 'text'])
df

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [43]:
# Preprocessing the text
nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    text = text.lower()
    # tokenize, remove stopwords, and lemmatize
    doc = nlp(text)
    return [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]

df['processed_text'] = df['text'].apply(preprocess_text)

# feature 1
def classify_by_token_length(tokens):
    return 1 if len(tokens) > 13 else 0
    # 1 = spam ; 0 = ham

df['token_length_class'] = df['processed_text'].apply(classify_by_token_length)

# feature 2
common_spam_words = ['prize', 'win', 'free', 'money', 'cash', 'mobile']
# most frequent words in spam messages

def contains_spam_words(tokens):
    return any(word in tokens for word in common_spam_words)

df['contains_spam_words'] = df['processed_text'].apply(contains_spam_words).astype(int)

# encode labels
df['label_encoded'] = df['label'].map({'ham': 0, 'spam': 1})
df

Unnamed: 0,label,text,processed_text,token_length_class,contains_spam_words,label_encoded
0,ham,"Go until jurong point, crazy.. Available only ...","[jurong, point, crazy, available, bugis, n, gr...",1,0,0
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joke, wif, u, oni]",0,0,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, 2, wkly, comp, win, fa, cup, fin...",1,1,1
3,ham,U dun say so early hor... U c already then say...,"[u, dun, early, hor, u, c]",0,0,0
4,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, think, go, usf, live]",0,0,0
...,...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,"[2nd, time, try, 2, contact, u., u, win, £, 75...",1,1,1
5568,ham,Will ü b going to esplanade fr home?,"[ü, b, go, esplanade, fr, home]",0,0,0
5569,ham,"Pity, * was in mood for that. So...any other s...","[pity, mood, suggestion]",0,0,0
5570,ham,The guy did some bitching but I acted like i'd...,"[guy, bitching, act, like, interested, buy, we...",0,1,0


In [44]:
# func to calculate information gain
def calculate_information_gain(df, feature_col, target_col):
    original_entropy = -sum((df[target_col].value_counts(normalize=True) *
                              np.log2(df[target_col].value_counts(normalize=True))).fillna(0))

    feature_values = df[feature_col].value_counts()
    subset_entropy = 0

    for value in feature_values.index:
        subset = df[df[feature_col] == value]
        proportion = len(subset) / len(df)
        subset_entropy_value = -sum((subset[target_col].value_counts(normalize=True) *
                                      np.log2(subset[target_col].value_counts(normalize=True))).fillna(0))
        subset_entropy += proportion * subset_entropy_value

    info_gain = original_entropy - subset_entropy
    return info_gain

# information gain for features
info_gain_length = calculate_information_gain(df, 'token_length_class', 'label_encoded')
info_gain_spam_words = calculate_information_gain(df, 'contains_spam_words', 'label_encoded')

print(f'Information Gain for Token Length Classification: {info_gain_length:.4f}')
print(f'Information Gain for Common Spam Words: {info_gain_spam_words:.4f}')

Information Gain for Token Length Classification: 0.1713
Information Gain for Common Spam Words: 0.1604


In [45]:
X = df[['token_length_class', 'contains_spam_words']]
y = df['label_encoded']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,stratify=y)

In [46]:
X.head()

Unnamed: 0,token_length_class,contains_spam_words
0,1,0
1,0,0
2,1,1
3,0,0
4,0,0


In [47]:
y.head()

Unnamed: 0,label_encoded
0,0
1,0
2,1
3,0
4,0


In [48]:
y_train.value_counts()

Unnamed: 0_level_0,count
label_encoded,Unnamed: 1_level_1
0,3859
1,598


In [49]:
y_test.value_counts()

Unnamed: 0_level_0,count
label_encoded,Unnamed: 1_level_1
0,966
1,149


In [50]:
def classifier(row):
    if row['token_length_class'] == 1:
        if row['contains_spam_words'] == 1:
           return 1  #spam
        else:
          return 0   #ham
    else:
        if row['contains_spam_words'] == 1:
           return 1  #spam
        else:
          return 0  #ham

y_train_pred = X_train.apply(classifier, axis=1)
y_test_pred = X_test.apply(classifier, axis=1)

# evaluate the classifier
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f'Training Accuracy: {train_accuracy:.2f}')
print(f'Testing Accuracy: {test_accuracy:.2f}')

Training Accuracy: 0.91
Testing Accuracy: 0.92
