In [17]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))


# Lab | Natural Language Processing
### SMS: SPAM or HAM

### Let's prepare the environment

In [18]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

- Read Data for the Fraudulent Email Kaggle Challenge
- Reduce the training set to speead up development. 

In [19]:
## Read Data for the Fraudulent Email Kaggle Challenge
data = pd.read_csv("../data/kg_train.csv",encoding='latin-1')

# Reduce the training set to speed up development. 
# Modify for final system
data = data.head(1000)
print(data.shape)
data.fillna("",inplace=True)

display(data)

(1000, 2)


Unnamed: 0,text,label
0,"DEAR SIR, STRICTLY A PRIVATE BUSINESS PROPOSAL...",1
1,Will do.,0
2,Nora--Cheryl has emailed dozens of memos about...,0
3,Dear Sir=2FMadam=2C I know that this proposal ...,1
4,fyi,0
...,...,...
995,So what's the latest? It sounds contradictory ...,0
996,"TRANSFER OF 36,759,000.00 MILLION POUNDS TO YO...",1
997,Barb I will call to explain. Are you back in t...,0
998,Yang on travelNot free tonite.May work tomorrow,0


### Let's divide the training and test set into two partitions

In [20]:
# Your code
import sklearn
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(data["text"], data["label"], test_size=.2)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(800,) (200,) (800,) (200,)


## Data Preprocessing

In [21]:
import string
from nltk.corpus import stopwords
print(string.punctuation)
print(stopwords.words("english")[100:110])
from nltk.stem.snowball import SnowballStemmer
snowball = SnowballStemmer('english')

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
['needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on']


## Now, we have to clean the html code removing words

- First we remove inline JavaScript/CSS
- Then we remove html comments. This has to be done before removing regular tags since comments can contain '>' characters
- Next we can remove the remaining tags

In [None]:
from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
from bs4 import Comment
import warnings

warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)

def remove_tag(text, tags):
    soup = BeautifulSoup(text, 'html.parser')
    for data in soup(tags):
        data.decompose()
    return ' '.join(soup.stripped_strings)

def remove_html_comments(text):
    soup = BeautifulSoup(text, 'html.parser')
    comments = soup.find_all(string=lambda x: isinstance(x, Comment))
    
    for c in comments:
        c.extract()
    return ' '.join(soup.stripped_strings)

def html_to_text(text):
    soup = BeautifulSoup(text, 'html.parser')
    return soup.get_text()

# Remove Javascript and CSS
X_train_cln01 = X_train.apply(lambda x: remove_tag(x, ["script", "style"]))
X_test_cln01  = X_test.apply(lambda x: remove_tag(x, ["script", "style"]))

# Remove Comments
X_train_cln01 = X_train_cln01.apply(lambda x: remove_html_comments(x))
X_test_cln01  = X_test_cln01.apply(lambda x: remove_html_comments(x))

# Remove all other tags
X_train_cln01 = X_train_cln01.apply(lambda x: html_to_text(x))
X_test_cln01  = X_test_cln01.apply(lambda x: html_to_text(x))

print("\nReview 10 first lines before and after:\n")
for i in range(0,10):
    print(f"{i}b:", X_train.iloc[i])
    print(f"{i}a:", X_train_cln01.iloc[i], '\n')


Review 10 first lines before and after:

0b: Goodday,Thanks for your response to my email. How are you today and business inyour country? I am Mr. Chen Jiang, Bank Manager of a Bank in China.I am contacting you concerning a deceased customer that bears the samesurname as you and an investment he placed under our banks managementyears ago. I would respectfully request that you keep the contents of thismail confidential and respect the integrity of the information you come byas a result of this mail.I contacted you independently of our investigation and no one is informedof this communication. I would like to intimate you with certain factsthat I believe would be of interest to you. Please reply back.Chen Jiang.
0a: Goodday,Thanks for your response to my email. How are you today and business inyour country? I am Mr. Chen Jiang, Bank Manager of a Bank in China.I am contacting you concerning a deceased customer that bears the samesurname as you and an investment he placed under our banks 

- Remove all the special characters
    
- Remove numbers
    
- Remove all single characters
 
- Remove single characters from the start

- Substitute multiple spaces with single space

- Remove prefixed 'b'

- Convert to Lowercase

In [None]:
# Your code
import re
def remove_special_chars(text):
    return re.sub(r'[^a-zA-Z0-9\s]', ' ', text) 

def remove_numbers(text):
    return re.sub(r'[0-9]', ' ', text)

def remove_single_chars(text):
    regex = r'^(\w{1}\s)|(\s\w{1}\s)|(\s\w{1})$'
    while re.findall(regex, text):
        text = re.sub(regex, ' ', text)
    return text

def remove_double_spaces(text):
    return re.sub(r'\s{2,}', ' ', text)

# Remove Special Characters
X_train_cln02 = X_train_cln01.apply(lambda x: remove_special_chars(x)) 
X_test_cln02  = X_test_cln01.apply(lambda x: remove_special_chars(x)) 

# Remove Numbers
X_train_cln02 = X_train_cln02.apply(lambda x: remove_numbers(x)) 
X_test_cln02  = X_test_cln02.apply(lambda x: remove_numbers(x)) 

# Remove all single characters (including the ones in the start and end)
X_train_cln02 = X_train_cln02.apply(lambda x: remove_single_chars(x)) 
X_test_cln02  = X_test_cln02.apply(lambda x: remove_single_chars(x)) 

# Substitute multiple spaces
X_train_cln02 = X_train_cln02.apply(lambda x: remove_double_spaces(x)) 
X_test_cln02  = X_test_cln02.apply(lambda x: remove_double_spaces(x)) 

# Remove prefixed 'b'
# With the clean done before there should not be any let's double check
count = 0
for text in X_train_cln02:
    count += len(re.findall(r'\'b\'', text))
for text in X_test_cln02:
    count += len(re.findall(r'\'b\'', text))
print("How many 'b' are in the dataset?", count)

# To lowercase
X_train_cln02 = X_train_cln02.apply(lambda x: x.lower()) 
X_test_cln02  = X_test_cln02.apply(lambda x: x.lower()) 

print("\nReview 10 first lines before and after:\n")
for i in range(0,10):
    print(f"{i}b:", X_train_cln01.iloc[i])
    print(f"{i}a:", X_train_cln02.iloc[i], '\n')



How many 'b' are in the dataset? 0

Review 10 first lines before and after:

0b: Goodday,Thanks for your response to my email. How are you today and business inyour country? I am Mr. Chen Jiang, Bank Manager of a Bank in China.I am contacting you concerning a deceased customer that bears the samesurname as you and an investment he placed under our banks managementyears ago. I would respectfully request that you keep the contents of thismail confidential and respect the integrity of the information you come byas a result of this mail.I contacted you independently of our investigation and no one is informedof this communication. I would like to intimate you with certain factsthat I believe would be of interest to you. Please reply back.Chen Jiang.
0a: goodday thanks for your response to my email how are you today and business inyour country am mr chen jiang bank manager of bank in china am contacting you concerning deceased customer that bears the samesurname as you and an investment he 

## Now let's work on removing stopwords
Remove the stopwords.

In [24]:
from nltk.stem.wordnet import WordNetLemmatizer
def remove_stop_words(text, language="english"):
    stop_words = set(stopwords.words(language))
    wordnet_lemma = WordNetLemmatizer()

    result = []
    for word in text.split():
        if not word in stop_words:
            result.append(word)

    return ' '.join(result)


X_train_cln03 = X_train_cln02.apply(lambda x: remove_stop_words(x)) 
X_test_cln03  = X_test_cln02.apply(lambda x: remove_stop_words(x)) 

print("\nReview 10 first lines before and after:\n")
for i in range(0,10):
    print(f"{i}b:", X_train_cln02.iloc[i])
    print(f"{i}a:", X_train_cln03.iloc[i], '\n')


Review 10 first lines before and after:

0b: goodday thanks for your response to my email how are you today and business inyour country am mr chen jiang bank manager of bank in china am contacting you concerning deceased customer that bears the samesurname as you and an investment he placed under our banks managementyears ago would respectfully request that you keep the contents of thismail confidential and respect the integrity of the information you come byas result of this mail contacted you independently of our investigation and no one is informedof this communication would like to intimate you with certain factsthat believe would be of interest to you please reply back chen jiang 
0a: goodday thanks response email today business inyour country mr chen jiang bank manager bank china contacting concerning deceased customer bears samesurname investment placed banks managementyears ago would respectfully request keep contents thismail confidential respect integrity information come by

## Tame Your Text with Lemmatization
Break sentences into words, then use lemmatization to reduce them to their base form (e.g., "running" becomes "run"). See how this creates cleaner data for analysis!

In [None]:
# Your code
import nltk
from nltk.corpus import wordnet

def get_wordnet_pos(word):
    tag = nltk.pos_tag([word])[0][1][0]
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN) 

def lematize(text):
    wordnet_lemma = WordNetLemmatizer()
    result = []
    for word in text.split():
        result.append(wordnet_lemma.lemmatize(word, pos=get_wordnet_pos(word)).lower())
    
    return ' '.join(result)


X_train_cln04 = X_train_cln03.apply(lambda x: lematize(x)) 
X_test_cln04  = X_test_cln03.apply(lambda x: lematize(x)) 

print("\nReview 10 first lines before and after:\n")
for i in range(0,10):
    print(f"{i}b:", X_train_cln03.iloc[i])
    print(f"{i}a:", X_train_cln04.iloc[i], '\n')


Review 10 first lines before and after:

0b: goodday thanks response email today business inyour country mr chen jiang bank manager bank china contacting concerning deceased customer bears samesurname investment placed banks managementyears ago would respectfully request keep contents thismail confidential respect integrity information come byas result mail contacted independently investigation one informedof communication would like intimate certain factsthat believe would interest please reply back chen jiang
0a: goodday thanks response email today business inyour country mr chen jiang bank manager bank china contact concern decease customer bear samesurname investment place bank managementyears ago would respectfully request keep content thismail confidential respect integrity information come byas result mail contact independently investigation one informedof communication would like intimate certain factsthat believe would interest please reply back chen jiang 

1b: even email ca

## Bag Of Words
Let's get the 10 top words in ham and spam messages (**EXPLORATORY DATA ANALYSIS**)

In [26]:
# Your code
# NOT SPAM
print("Most common words in ham")
train_corpus = " ".join(X_train_cln04[X_train_cln04.index.isin(list(y_train[y_train==0].index))])
display(pd.Series(train_corpus.split(" ")).value_counts()[:10])

# SPAM
#X_train_cln04[X_train_cln04.index.isin(list(y_train[y_train==1].index))]
print("Most common words in spam")
train_corpus = " ".join(X_train_cln04[X_train_cln04.index.isin(list(y_train[y_train==1].index))])
display(pd.Series(train_corpus.split(" ")).value_counts()[:10])


Most common words in ham


pm           122
state        111
work          95
would         94
president     94
call          89
obama         83
time          82
say           79
mr            76
Name: count, dtype: int64

Most common words in spam


money          782
account        744
fund           645
bank           638
u              587
transfer       472
transaction    438
country        420
business       402
mr             399
Name: count, dtype: int64

## Extra features

In [27]:
data_train = pd.DataFrame(X_train_cln04)
data_train.columns = ["preprocessed_text"]

data_val   =  pd.DataFrame(X_test_cln04)
data_val.columns = ["preprocessed_text"]


In [28]:
# We add to the original dataframe two additional indicators (money symbols and suspicious words).
money_simbol_list = "|".join(["euro","dollar","pound","€","$"])
suspicious_words = "|".join(["free","cheap","sex","money","account","bank","fund","transfer","transaction","win","deposit","password"])

data_train['money_mark'] = data_train['preprocessed_text'].str.contains(money_simbol_list)*1
data_train['suspicious_words'] = data_train['preprocessed_text'].str.contains(suspicious_words)*1
data_train['text_len'] = data_train['preprocessed_text'].apply(lambda x: len(x)) 

data_val['money_mark'] = data_val['preprocessed_text'].str.contains(money_simbol_list)*1
data_val['suspicious_words'] = data_val['preprocessed_text'].str.contains(suspicious_words)*1
data_val['text_len'] = data_val['preprocessed_text'].apply(lambda x: len(x)) 

data_train.head()

Unnamed: 0,preprocessed_text,money_mark,suspicious_words,text_len
817,goodday thanks response email today business i...,1,1,458
851,even email call korea,1,0,21
92,checkingi sure willmitchell respond talk tonit...,1,0,66
833,dear sir madam princess adama williams daughte...,1,1,1380
179,dear sir madam letter may come surprise due fa...,1,1,1521


## How would work the Bag of Words with Count Vectorizer concept?

In [29]:
# Your code
from sklearn.feature_extraction.text import CountVectorizer

# Create the Bag of Words model
count_vect = CountVectorizer()
X_train_cv = count_vect.fit_transform(list(X_train_cln04))

# Show the Bag of Words feature names and the document-term matrix
print("Feature Names:", count_vect.get_feature_names_out())
print("Document-Term Matrix:\n", X_train_cv.toarray())

Feature Names: ['aa' 'aaa' 'aabeiawaeaambiqaceqedeqh' ... 'zzz' 'zzzahbxntxe' 'zzzj']
Document-Term Matrix:
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


## TD-IDF

- Load the vectorizer

- Vectorize all dataset

- print the shape of the vetorized dataset

In [30]:
# Your code
from sklearn.feature_extraction.text import TfidfVectorizer

# Create the Bag of Words model
tfidf_vect = TfidfVectorizer(max_features=1000, stop_words="english")
X_train_tdidf = tfidf_vect.fit_transform(list(X_train_cln04))

# Show the Bag of Words feature names and the document-term matrix
print("Feature Names:", tfidf_vect.get_feature_names_out())
print("Document-Term Matrix:\n", X_train_tdidf.toarray())

Feature Names: ['ab' 'abacha' 'abandon' 'abidjan' 'able' 'abroad' 'ac' 'accept'
 'acceptance' 'access' 'accord' 'account' 'accountant' 'accounting'
 'acknowledge' 'acquire' 'act' 'action' 'actual' 'actually' 'ad' 'add'
 'address' 'administration' 'advice' 'advise' 'ae' 'af' 'afghan'
 'afghanistan' 'africa' 'african' 'age' 'agent' 'agf' 'ago' 'agree'
 'agreement' 'ah' 'ai' 'aid' 'airline' 'airport' 'aj' 'ak' 'al' 'allah'
 'allow' 'ally' 'alternative' 'ambassador' 'america' 'american'
 'anticipate' 'ap' 'application' 'applies' 'apply' 'appreciate' 'approval'
 'approve' 'april' 'ar' 'area' 'arm' 'arrange' 'arrangement' 'arrest'
 'arrival' 'arrive' 'aside' 'ask' 'asset' 'assist' 'assistance'
 'assistant' 'associate' 'assurance' 'assure' 'asylum' 'attach' 'attack'
 'attempt' 'attention' 'attorney' 'au' 'audit' 'auditor' 'august'
 'authority' 'available' 'average' 'avoid' 'aw' 'await' 'award' 'aware'
 'away' 'ax' 'az' 'bad' 'balance' 'bank' 'banker' 'banking' 'barack'
 'barrister' 'base' 'bb

## And the Train a Classifier?

In [32]:
# Your code
model = sklearn.linear_model.LogisticRegression(random_state=42)

# Fit the model
model.fit(X_train_tdidf, y_train)

model_predict = model.predict(tfidf_vect.transform(list(X_test_cln04)))


print("Accuracy:  ", sklearn.metrics.accuracy_score(y_test, model_predict))
print("Precision: ", sklearn.metrics.precision_score(y_test, model_predict))
print("Recall:    ", sklearn.metrics.recall_score(y_test, model_predict))
print("F1:        ", sklearn.metrics.f1_score(y_test, model_predict))

Accuracy:   0.975
Precision:  1.0
Recall:     0.9382716049382716
F1:         0.9681528662420382


### Extra Task - Implement a SPAM/HAM classifier

https://www.kaggle.com/t/b384e34013d54d238490103bc3c360ce

The classifier can not be changed!!! It must be the MultinimialNB with default parameters!

Your task is to find the **best feature representation**.

You can work with teams of two persons (recommended).

In [43]:
# Your code
data = pd.read_csv("../data/kg_train.csv",encoding='latin-1')

def data_clean(x):
    x = remove_tag(x, tags=["script", "style"])
    x = remove_html_comments(x)
    x = html_to_text(x)
    x = remove_special_chars(x)
    x = remove_numbers(x)
    x = remove_double_spaces(x)
    x = remove_stop_words(x)
    return lematize(x)

X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(data["text"], data["label"], test_size=.2)

X_train_clean = X_train.apply(lambda x: data_clean(x)) 
X_test_clean = X_test.apply(lambda x: data_clean(x)) 


vectorizers = [sklearn.feature_extraction.text.CountVectorizer(),
               sklearn.feature_extraction.text.TfidfVectorizer(),
               sklearn.feature_extraction.text.TfidfVectorizer(max_features=10000, stop_words="english"),
               sklearn.feature_extraction.text.TfidfVectorizer(max_features=5000, stop_words="english"),
               sklearn.feature_extraction.text.TfidfVectorizer(max_features=1000, stop_words="english"),
               sklearn.feature_extraction.text.TfidfVectorizer(max_features=500, stop_words="english")]

results = []
for vectorizer in vectorizers:
    vectorizer.fit(X_train_clean)

    model = sklearn.naive_bayes.MultinomialNB()
    model.fit(vectorizer.transform(X_train), y_train)

    train_pred = model.predict(vectorizer.transform(X_train))
    test_pred  = model.predict(vectorizer.transform(X_test))

    result = {}
    result["Accuracy Train"] = sklearn.metrics.accuracy_score(y_train, train_pred)
    result["Precision Train"] = sklearn.metrics.precision_score(y_train, train_pred)
    result["Recall Train"] = sklearn.metrics.recall_score(y_train, train_pred)
    result["F1 Train"] = sklearn.metrics.f1_score(y_train, train_pred)
    result["Accuracy Test"] = sklearn.metrics.accuracy_score(y_test, test_pred)
    result["Precision Test"] = sklearn.metrics.precision_score(y_test, test_pred)
    result["Recall Test"] = sklearn.metrics.recall_score(y_test, test_pred)
    result["F1 Test"] = sklearn.metrics.f1_score(y_test, test_pred)

    results.append(result)

pd.DataFrame(results)

Unnamed: 0,Accuracy Train,Precision Train,Recall Train,F1 Train,Accuracy Test,Precision Test,Recall Test,F1 Test
0,0.95305,0.918107,0.978661,0.947418,0.937133,0.886165,0.98062,0.931003
1,0.93712,0.886404,0.980116,0.930907,0.917016,0.84808,0.984496,0.911211
2,0.982813,0.986248,0.973812,0.97999,0.9715,0.958175,0.976744,0.96737
3,0.979878,0.986152,0.967022,0.976494,0.973177,0.965385,0.972868,0.969112
4,0.974638,0.977843,0.963143,0.970437,0.963956,0.953935,0.963178,0.958534
5,0.969818,0.973346,0.956353,0.964775,0.961442,0.948473,0.963178,0.955769
