In [232]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import nltk
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

# Data

In [113]:
def preprocess_text(text):
    cleaned_text = ''.join(char for char in text if char.isalpha() or char.isspace())
    cleaned_text = ' '.join(cleaned_text.split())
    cleaned_text = cleaned_text.lower()
    return cleaned_text

In [115]:
df = pd.read_csv('data/SMS.tsv', sep='\t')
y = pd.get_dummies(df['class']).ham
X = df.text.map(preprocess_text)
nltk.download('stopwords')
tfid = TfidfVectorizer(max_features=2000, stop_words=stopwords.words('english'))
X = tfid.fit_transform(X)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Embedded method

In [116]:
importances = RandomForestClassifier().fit(X, y).feature_importances_
features = tfid.get_feature_names_out()
features_emb = sorted(zip(features, importances), key=lambda x: x[1], reverse=True)
features_emb = [feature for feature, importance in features_emb if importance > np.mean(importances)]

In [117]:
print(len(features_emb), len(features))

305 2000


# Wrapper method

In [118]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [119]:
X_tr = X_train.toarray()
X_ts = X_test.toarray()

In [120]:
def get_matrix(m, col):
    matrix = []
    for vec in m:
        new_vec = []
        for j in col:
            new_vec.append(vec[j])
        matrix.append(new_vec)
    return matrix

In [None]:
best_accuracy = 0.0
best_features = []
while len(best_features) < 30:
    best_columns = best_features.copy()
    for k, feature in enumerate(features):
        print(k, best_features, len(best_features), best_accuracy)
        if not k in best_features:
            columns = best_columns.copy()
            columns.append(k)
            matrix = get_matrix(X_tr, columns)
            model = GaussianNB()
            model.fit(matrix, y_train)
            matrix = get_matrix(X_ts, columns)
            y_pred = model.predict(matrix)
            acc = accuracy_score(y_test, y_pred)
            if acc > best_accuracy:
                best_accuracy = acc
                best_features = columns

In [130]:
features_wr = [features[k] for k in best_features]

# Filter method

In [184]:
filtered = []
features_fl = []
matrix = abs(np.corrcoef(X.toarray(), rowvar=False))
for p in np.argwhere(matrix > matrix.flatten().mean()):
    p =  tuple(p)
    i, j = p
    if i == j:
        continue
    if not i in features_fl and not i in filtered:
        features_fl.append(i)
        filtered.append(j)

In [185]:
features_fm = [features[i] for i in range(len(features)) if not i in filtered]

# Train / Test

In [187]:
print(features_emb[:30])
print(features_wr)
print(features_fm[:30])

['call', 'txt', 'free', 'mobile', 'claim', 'prize', 'text', 'win', 'stop', 'service', 'reply', 'urgent', 'nokia', 'customer', 'new', 'tone', 'contact', 'per', 'ringtone', 'guaranteed', 'cash', 'chat', 'awarded', 'pmin', 'apply', 'box', 'ppm', 'pobox', 'landline', 'draw']
['free', 'reply', 'urgent', 'receive', 'mobile', 'im', 'txt', 'claim', 'experience', 'box', 'service', 'sat', 'hey', 'ill', 'login', 'also', 'apply', 'going', 'cancel', 'code', 'brought', 'bugis', 'age', 'attempt', 'auction', 'may', 'daily', 'put', 'lucky', 'sir']
['aiyah', 'appreciate', 'argue', 'argument', 'arms', 'arrested', 'asap', 'asks', 'asleep', 'assume', 'ate', 'attend', 'bag', 'bahamas', 'bak', 'balance', 'barely', 'basic', 'basically', 'bath', 'bathe', 'bathing', 'battery', 'bay', 'bcums', 'bday', 'becoz', 'bed', 'bedroom', 'beer']


In [244]:
model = SelectKBest(chi2, k=300)
model.fit(X, y)
print(features[model.get_support()][:30])
X_new_chi2 = model.transform(X)
model = SelectFromModel(LogisticRegression(penalty="l1", dual=False, solver='liblinear').fit(X, y), prefit=True)
print(features[model.get_support()][:30])
X_new_reg = model.transform(X)
model = SelectKBest(f_classif, k=300)
model.fit(X, y)
print(features[model.get_support()][:30])
X_new_f = model.transform(X)

['ac' 'account' 'action' 'admirer' 'age' 'alert' 'ampm' 'announcement'
 'anytime' 'apply' 'arcade' 'arrive' 'attempt' 'auction' 'await'
 'awaiting' 'award' 'awarded' 'bid' 'bluetooth' 'bonus' 'box'
 'btnationalrate' 'bx' 'call' 'caller' 'camcorder' 'camera' 'cash'
 'cashbalance']
['ac' 'access' 'admirer' 'age' 'apply' 'attempt' 'award' 'awarded' 'bid'
 'box' 'call' 'camera' 'cant' 'cash' 'charity' 'chat' 'choose' 'claim'
 'club' 'code' 'collect' 'collection' 'come' 'congrats' 'contact'
 'content' 'cost' 'credits' 'currently' 'customer']
['account' 'admirer' 'ae' 'age' 'ampm' 'announcement' 'ansr' 'apply'
 'attempt' 'auction' 'await' 'awaiting' 'award' 'awarded' 'bluetooth'
 'bonus' 'box' 'btnationalrate' 'bx' 'call' 'caller' 'camcorder' 'camera'
 'cash' 'cashbalance' 'cashin' 'cc' 'cd' 'cds' 'chance']


In [245]:
X_ = pd.DataFrame(X.toarray(), columns=features)

In [246]:
X_chi2_train, X_chi2_test, y_train, y_test = train_test_split(X_new_chi2, y, test_size=0.2, random_state=42)
X_reg_train, X_reg_test, y_train, y_test = train_test_split(X_new_reg, y, test_size=0.2, random_state=42)
X_f_train, X_f_test, y_train, y_test = train_test_split(X_new_f, y, test_size=0.2, random_state=42)
X_emb_train, X_emb_test, y_train, y_test = train_test_split(X_[features_emb], y, test_size=0.2, random_state=42)
X_fm_train, X_fm_test, y_train, y_test = train_test_split(X_[features_fm], y, test_size=0.2, random_state=42)
X_wr_train, X_wr_test, y_train, y_test = train_test_split(X_[features_wr], y, test_size=0.2, random_state=42)

In [247]:
print('Logistic Regression')
print(f'default: {accuracy_score(y_test, LogisticRegression().fit(X_train, y_train).predict(X_test))}')
print(f'chi2: {accuracy_score(y_test, LogisticRegression().fit(X_chi2_train, y_train).predict(X_chi2_test))}')
print(f'l1-based selection: {accuracy_score(y_test, LogisticRegression().fit(X_reg_train, y_train).predict(X_reg_test))}')
print(f'f-test: {accuracy_score(y_test, LogisticRegression().fit(X_f_train, y_train).predict(X_f_test))}')
print(f'random forest: {accuracy_score(y_test, LogisticRegression().fit(X_emb_train, y_train).predict(X_emb_test))}')
print(f'correlation: {accuracy_score(y_test, LogisticRegression().fit(X_fm_train, y_train).predict(X_fm_test))}')
print(f'Bayes wrapper: {accuracy_score(y_test, LogisticRegression().fit(X_wr_train, y_train).predict(X_wr_test))}')

Logistic Regression
default: 0.8663677130044843
chi2: 0.95695067264574
l1-based selection: 0.9533632286995516
f-test: 0.957847533632287
random forest: 0.9560538116591928
correlation: 0.9632286995515695
Bayes wrapper: 0.9103139013452914


In [248]:
print('KNN')
print(f'default: {accuracy_score(y_test, KNeighborsClassifier().fit(X_train, y_train).predict(X_test))}')
print(f'chi2: {accuracy_score(y_test, KNeighborsClassifier().fit(X_chi2_train, y_train).predict(X_chi2_test))}')
print(f'l1-based selection: {accuracy_score(y_test, KNeighborsClassifier().fit(X_reg_train, y_train).predict(X_reg_test))}')
print(f'f-test: {accuracy_score(y_test, KNeighborsClassifier().fit(X_f_train, y_train).predict(X_f_test))}')
print(f'random forest: {accuracy_score(y_test, KNeighborsClassifier().fit(X_emb_train, y_train).predict(X_emb_test))}')
print(f'correlation: {accuracy_score(y_test, KNeighborsClassifier().fit(X_fm_train, y_train).predict(X_fm_test))}')
print(f'Bayes wrapper: {accuracy_score(y_test, KNeighborsClassifier().fit(X_wr_train, y_train).predict(X_wr_test))}')

KNN
default: 0.862780269058296
chi2: 0.9381165919282511
l1-based selection: 0.9542600896860987
f-test: 0.9381165919282511
random forest: 0.9345291479820628
correlation: 0.9246636771300448
Bayes wrapper: 0.947085201793722


In [249]:
print('Decision tree')
print(f'default: {accuracy_score(y_test, DecisionTreeClassifier().fit(X_train, y_train).predict(X_test))}')
print(f'chi2: {accuracy_score(y_test, DecisionTreeClassifier().fit(X_chi2_train, y_train).predict(X_chi2_test))}')
print(f'l1-based selection: {accuracy_score(y_test, DecisionTreeClassifier().fit(X_reg_train, y_train).predict(X_reg_test))}')
print(f'f-test: {accuracy_score(y_test, DecisionTreeClassifier().fit(X_f_train, y_train).predict(X_f_test))}')
print(f'random forest: {accuracy_score(y_test, DecisionTreeClassifier().fit(X_emb_train, y_train).predict(X_emb_test))}')
print(f'correlation: {accuracy_score(y_test, DecisionTreeClassifier().fit(X_fm_train, y_train).predict(X_fm_test))}')
print(f'Bayes wrapper: {accuracy_score(y_test, DecisionTreeClassifier().fit(X_wr_train, y_train).predict(X_wr_test))}')

Decision tree
default: 0.7838565022421524
chi2: 0.9623318385650225
l1-based selection: 0.967713004484305
f-test: 0.9659192825112107
random forest: 0.957847533632287
correlation: 0.9623318385650225
Bayes wrapper: 0.9426008968609866
