In [1]:
import pandas as pd
import re
import string
import nltk

from sklearn.feature_extraction.text import TfidfVectorizer

stopwords = nltk.corpus.stopwords.words('english')

ps = nltk.PorterStemmer()

pd.set_option ('display.max_colwidth',100)

data = pd.read_csv('SMSSpamCollection.tsv', sep='\t', header=None)
data.columns = ['label','body_text']

# function to count puncutuation % - feature engineering
def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation]) # count and aggregate
    return round(count/(len(text) - text.count(" ")),3) * 100 #calculate percentage

data['punct%'] = data['body_text'].apply(lambda x: count_punct(x))


# line length - feature engineering

data['body_len']=data['body_text'].apply(lambda x: len(x) - x.count(" "))


def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation]) # remove punctuation
    tokens = re.split('\W+',text) #tokenize sentence, note the regular expression used
    text = [ps.stem(word) for word in tokens if word not in stopwords] # remove stop words, join to form sentence
    return text

tfidf_vector = TfidfVectorizer(analyzer=clean_text)
x_tfidf = tfidf_vector.fit_transform(data['body_text'])

x_features = pd.concat([data['body_len'], data['punct%'], pd.DataFrame(x_tfidf.toarray())], axis=1)
x_features.head()

Unnamed: 0,body_len,punct%,0,1,2,3,4,5,6,7,...,8097,8098,8099,8100,8101,8102,8103,8104,8105,8106
0,160,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,128,4.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,49,4.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,62,3.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,28,7.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

In [6]:
x_train,x_test,y_train,y_test = train_test_split(x_features,data['label'], test_size=0.2)

In [8]:
from sklearn.ensemble import RandomForestClassifier

In [23]:
rd = RandomForestClassifier(n_estimators=50,max_depth=20,n_jobs=-1)
rd_model = rd.fit(x_train,y_train)

In [25]:
sorted(zip(rd_model.feature_importances_,x_train.columns),reverse=True)[0:10]
# from output we see body_len is the most important feature here

[(0.07854908284227977, 'body_len'),
 (0.035091599702330105, 1804),
 (0.03237434714273834, 3135),
 (0.02595533673942914, 5727),
 (0.02337382433731305, 2032),
 (0.02254166038205978, 4799),
 (0.020172748396741103, 7353),
 (0.019883036373528257, 6749),
 (0.01660604136327466, 6288),
 (0.013840747024401865, 7030)]

In [None]:
y_pred=rd_model.predict(x_test)
precision,recall,fscore,support=score(y_test,y_pred, pos_label='spam',average='binary')