In [44]:
import nltk 
import string
import re
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report
from progressbar import ProgressBar
import random

In [2]:
stop_words = stopwords.words("english")
df = pd.read_csv("./spam.csv", header=0, encoding="latin-1", names=["spam", "text", "1", "2", "3"]).fillna("")
df["text"] = df['text']+df["1"]+df["2"]+df["3"]
df.drop(["1", "2", "3"], axis=1, inplace=True)
df["spam"] = df["spam"].map({"ham":0, "spam":1})
df.head()

Unnamed: 0,spam,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
def clean_text(text):
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text) 
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    for word in stop_words:
        text.replace(word, "")
    stemmer = nltk.SnowballStemmer("english")
    text = ' '.join(stemmer.stem(word) for word in text.split(' '))
    return text
    
    
df['text'] = df['text'].apply(clean_text)
df.head()

Unnamed: 0,spam,text
0,0,go until jurong point crazi avail onli in bugi...
1,0,ok lar joke wif u oni
2,1,free entri in a wkli comp to win fa cup final...
3,0,u dun say so earli hor u c alreadi then say
4,0,nah i dont think he goe to usf he live around ...


In [4]:
df_temp, df_test = train_test_split(df, test_size=.2, stratify=df["spam"])
df_train, df_validation = train_test_split(df_temp, test_size=1.0/8.0, stratify=df_temp["spam"])
df_train = df_train.copy()
df_test = df_test.copy()
df_validation = df_validation.copy()
del df_temp

In [5]:
df_train.head()

Unnamed: 0,spam,text
801,0,appt is at lttimegt am not my fault u dont lis...
4459,0,this is wish you a great day moji told me abou...
4852,0,im fine hope you are also
1647,0,even v good if somewhat event laden will fill...
1589,0,i love you you set my soul on fire it is not j...


In [6]:
vect = CountVectorizer()
vect.fit(df_train["text"])
train_encoded = vect.transform(df_train["text"])
test_encoded = vect.transform(df_test["text"])
validation_encoded = vect.transform(df_validation["text"])

In [None]:
bar = ProgressBar()
def iterate(train, test, validation, n = 1000):
    for i in bar(range(n)):
        run = {}
        run["binary"] = random.randint(0,1)
        vect = CountVectorizer(binary=run["binary"])
        train_encoded = vect.transform(train["text"])
        test_encoded = vect.transform(test["text"])
        validation_encoded = vect.transform(validation["text"])
        
        run["tfidf"] = random.randint(0,1) if not binary else 0
        if run["tfidf"]:
            tfidf_transformer = TfidfTransformer()
            tfidf_transformer.fit(train_encoded)
        
        
    
    
    
# tfidf_transformer = TfidfTransformer()
# tfidf_transformer.fit(x_train_dtm)
# x_train_tfidf = tfidf_transformer.transform(x_train_dtm)

In [9]:
clf = DecisionTreeClassifier(criterion="gini", max_depth=None, min_samples_split=2)

DecisionTreeClassifier()

In [None]:
clf.fit(train_encoded, df_train["spam"])

In [41]:
train_info = {"confusion_matrix": confusion_matrix(df_train["spam"], clf.predict(train_encoded)), 
              "report": classification_report(df_train["spam"], clf.predict(train_encoded), output_dict=True)}
train_info

{'confusion_matrix': array([[3376,    0],
        [   0,  523]]),
 'report': {'0': {'precision': 1.0,
   'recall': 1.0,
   'f1-score': 1.0,
   'support': 3376},
  '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 523},
  'accuracy': 1.0,
  'macro avg': {'precision': 1.0,
   'recall': 1.0,
   'f1-score': 1.0,
   'support': 3899},
  'weighted avg': {'precision': 1.0,
   'recall': 1.0,
   'f1-score': 1.0,
   'support': 3899}}}

In [39]:
validation_info = {"confusion matrix": confusion_matrix(df_validation["spam"], clf.predict(validation_encoded)), 
                   "report": classification_report(df_validation["spam"], clf.predict(validation_encoded), output_dict=True)}
validation_info

{'confusion matrix': array([[468,  15],
        [ 12,  63]]),
 'report': {'0': {'precision': 0.975,
   'recall': 0.968944099378882,
   'f1-score': 0.9719626168224299,
   'support': 483},
  '1': {'precision': 0.8076923076923077,
   'recall': 0.84,
   'f1-score': 0.8235294117647058,
   'support': 75},
  'accuracy': 0.9516129032258065,
  'macro avg': {'precision': 0.8913461538461538,
   'recall': 0.904472049689441,
   'f1-score': 0.8977460142935678,
   'support': 558},
  'weighted avg': {'precision': 0.9525124069478907,
   'recall': 0.9516129032258065,
   'f1-score': 0.952011917217897,
   'support': 558}}}

In [40]:
test_info = {"confusion matrix": confusion_matrix(df_test["spam"], clf.predict(test_encoded)),
             "report": classification_report(df_test["spam"], clf.predict(test_encoded), output_dict=True)}
test_info

{'confusion matrix': array([[945,  21],
        [ 25, 124]]),
 'report': {'0': {'precision': 0.9742268041237113,
   'recall': 0.9782608695652174,
   'f1-score': 0.9762396694214875,
   'support': 966},
  '1': {'precision': 0.8551724137931035,
   'recall': 0.8322147651006712,
   'f1-score': 0.8435374149659864,
   'support': 149},
  'accuracy': 0.9587443946188341,
  'macro avg': {'precision': 0.9146996089584074,
   'recall': 0.9052378173329443,
   'f1-score': 0.909888542193737,
   'support': 1115},
  'weighted avg': {'precision': 0.9583172936669755,
   'recall': 0.9587443946188341,
   'f1-score': 0.9585063636691382,
   'support': 1115}}}

In [33]:
clf.get_depth()

58