In [1]:
import nltk 
import string
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Shared Porter stemmer instance, reused for every token during text cleaning.
ps = nltk.stem.PorterStemmer()

In [2]:
# Load the tab-separated SMS file into a DataFrame. The file has no header
# row, so the two columns are named explicitly: the class label and the
# message text.
data = pd.read_csv(
    'SMSSpamCollection.tsv',
    sep='\t',
    header=None,
    names=['label', 'body'],
)
data.head()

Unnamed: 0,label,body
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


In [3]:
# Data Preprocessing
def count_punct(text):
    """Return the percentage of non-space characters in ``text`` that are punctuation.

    Punctuation means any character in ``string.punctuation``. Spaces are
    excluded from the denominator.

    Args:
        text: The message to measure.

    Returns:
        A float percentage rounded to one decimal place. Returns 0.0 when
        ``text`` is empty or consists only of spaces.
    """
    non_space_len = len(text) - text.count(" ")
    if non_space_len == 0:
        # Empty or all-space message: dividing here would raise ZeroDivisionError.
        return 0.0
    punct_count = sum(1 for char in text if char in string.punctuation)
    # Round after scaling to a percentage; rounding first and multiplying by
    # 100 afterwards reintroduces float noise such as 33.300000000000004.
    return round(100 * punct_count / non_space_len, 1)

# Per-message numeric features: length excluding spaces, and the share of
# punctuation characters as a percentage.
data['body_len'] = data['body'].apply(lambda x: len(x) - x.count(" "))
data['punct%'] = data['body'].apply(count_punct)

# Kept as a set: clean_text tests every token for membership, and the list
# returned by words() would make each of those tests a linear scan.
stopwords = set(nltk.corpus.stopwords.words('english'))

def clean_text(text):
    """Turn a raw message into a list of stemmed, lowercase tokens.

    Steps: remove every character found in ``string.punctuation``, lowercase
    the rest, split on runs of non-word characters, drop empty tokens and
    tokens present in the module-level ``stopwords``, then stem each remaining
    token with the module-level ``ps`` stemmer.

    Args:
        text: The message to tokenize.

    Returns:
        A list of stemmed token strings (possibly empty).
    """
    # Iterates over characters, not words: strip punctuation and lowercase.
    text = "".join([char.lower() for char in text if char not in string.punctuation])
    # Raw string: '\W' in a plain literal is an invalid escape sequence.
    tokens = re.split(r'\W+', text)
    # re.split yields '' when the text starts or ends with a separator (or is
    # empty); skip those so an empty string never becomes a feature.
    return [ps.stem(word) for word in tokens if word and word not in stopwords]

# Hold out 20% of the messages for testing. The fixed seed makes the split
# (and every downstream metric) reproducible across kernel restarts, and
# stratifying on the label keeps the class ratio the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    data[['body', 'body_len', 'punct%']],
    data['label'],
    test_size=0.2,
    random_state=42,
    stratify=data['label'],
)

In [4]:
# Fit the TF-IDF vectorizer on the training messages only, then apply the
# fitted vectorizer to both splits.
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
tfidf_vect_fit = tfidf_vect.fit(X_train['body'])

tfidf_train = tfidf_vect_fit.transform(X_train['body'])
tfidf_test = tfidf_vect_fit.transform(X_test['body'])


def _with_tfidf_columns(split, tfidf_matrix):
    """Join a split's two numeric features with its dense TF-IDF columns.

    The split's index is reset so rows line up positionally with the TF-IDF
    rows, and all column labels are converted to strings.
    """
    numeric_part = split[['body_len', 'punct%']].reset_index(drop=True)
    tfidf_part = pd.DataFrame(tfidf_matrix.toarray())
    combined = pd.concat([numeric_part, tfidf_part], axis=1)
    # Convert feature names to strings
    combined.columns = combined.columns.astype(str)
    return combined


X_train_vect = _with_tfidf_columns(X_train, tfidf_train)
X_test_vect = _with_tfidf_columns(X_test, tfidf_test)

In [5]:
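# NOTE: The hyperparameter grid search in this cell is disabled (commented out).
# The RandomForestClassifier settings used in the next cell (n_estimators=100,
# max_depth=None) are values taken from rfc_param_grid below.
# # Define parameter grid for RFC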
# rfc_param_grid = {
#     'n_estimators': [100, 200, 300],
#     'max_depth': [None, 5, 10, 20]
# }

# # Define parameter grid for GBC
# gbc_param_grid = {
#     'n_estimators': [100, 200, 300],
#     'max_depth': [3, 5, 7],
#     'learning_rate': [0.1, 0.2, 0.3]
# }

# # Create instances of RFC and GBC
# rfc = RandomForestClassifier()
# gbc = GradientBoostingClassifier()

# # Perform GridSearchCV for RFC
# rfc_grid = GridSearchCV(rfc, param_grid=rfc_param_grid, scoring='accuracy', cv=5, n_jobs=-1)
# rfc_grid.fit(X_train_vect, y_train)

# rfc_best_params = rfc_grid.best_params_
# rfc_best_model = rfc_grid.best_estimator_

# # Perform GridSearchCV for GBC
# gbc_grid = GridSearchCV(gbc, param_grid=gbc_param_grid, scoring='accuracy', cv=5, n_jobs=-1)
# gbc_grid.fit(X_train_vect, y_train)

# gbc_best_params = gbc_grid.best_params_
# gbc_best_model = gbc_grid.best_estimator_

# # Evaluate the best models on the test data
# rfc_pred = rfc_best_model.predict(X_test_vect)
# rfc_accuracy = accuracy_score(y_test, rfc_pred)
# rfc_precision = precision_score(y_test, rfc_pred, pos_label='spam')
# rfc_recall = recall_score(y_test, rfc_pred, pos_label='spam')

# gbc_pred = gbc_best_model.predict(X_test_vect)
# gbc_accuracy = accuracy_score(y_test, gbc_pred)
# gbc_precision = precision_score(y_test, gbc_pred, pos_label='spam')
# gbc_recall = recall_score(y_test, gbc_pred, pos_label='spam')

# print("RFC - Best Parameters:", rfc_best_params)
# print("RFC - Accuracy:", rfc_accuracy)
# print("RFC - Precision:", rfc_precision)
# print("RFC - Recall:", rfc_recall)

# print("GBC - Best Parameters:", gbc_best_params)
# print("GBC - Accuracy:", gbc_accuracy)
# print("GBC - Precision:", gbc_precision)
# print("GBC - Recall:", gbc_recall)

In [6]:
# Train the random forest on the combined numeric + TF-IDF features. The
# fixed seed makes the fitted forest, and therefore the metrics below,
# reproducible from run to run.
rfc = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=42)
rfc.fit(X_train_vect, y_train)
rfc_pred = rfc.predict(X_test_vect)

# Score on the held-out split; 'spam' is treated as the positive class for
# precision and recall.
rfc_accuracy = accuracy_score(y_test, rfc_pred)
rfc_precision = precision_score(y_test, rfc_pred, pos_label='spam')
rfc_recall = recall_score(y_test, rfc_pred, pos_label='spam')