In [1]:
from IPython.core.display import display, HTML

display(HTML("<style>.container { width:100% !important; }</style>"))

  from IPython.core.display import display, HTML


# Lab | Natural Language Processing
### SMS: SPAM or HAM

### Let's prepare the environment

In [2]:
import nltk
import re
import pandas as pd
import nltk.corpus.reader.wordnet as wordnet
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hibi9\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

- Read Data for the Fraudulent Email Kaggle Challenge
- Reduce the training set to speead up development. 

In [3]:
## Read Data for the Fraudulent Email Kaggle Challenge
data = pd.read_csv("../data/kg_train.csv", encoding='latin-1')

# Reduce the training set to speed up development. 
# Modify for final system
data = data.head(1000)
print(data.shape)
data.fillna("", inplace=True)

(1000, 2)


### Let's divide the training and test set into two partitions

In [4]:
# I will split them later, after all the cleanup below, to prevent doing this on 2 datas

## Data Preprocessing

In [5]:
import string
from nltk.corpus import stopwords

print(string.punctuation)
print(stopwords.words("english")[100:110])
from nltk.stem.snowball import SnowballStemmer

snowball = SnowballStemmer('english')

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
['here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each']


## Now, we have to clean the html code removing words

- First we remove inline JavaScript/CSS
- Then we remove html comments. This has to be done before removing regular tags since comments can contain '>' characters
- Next we can remove the remaining tags

In [6]:
def remove_html_elements(text):
    # Remove inline CSS
    text = re.sub(r'<style.*?</style>', '', text)
    text = re.sub(r'(<[^>]+)\sstyle=".*?"', r'\1', text)

    # Remove Javascript
    text = re.sub(r'<script.*?</script>', '', text)

    # Remove HTML comments
    text = re.sub(r'', '', text)

    # Remove remaining HTML tags
    text = re.sub(r'<[^>]+>', '', text)

    # Remove leading or ending spaces
    return text.strip()


data['preprocessed_text'] = data['text'].apply(remove_html_elements)

- Remove all the special characters
    
- Remove numbers
    
- Remove all single characters
 
- Remove single characters from the start

- Substitute multiple spaces with single space

- Remove prefixed 'b'

- Convert to Lowercase

In [7]:
def clean_text(text):
    # Remove special Characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Remove all single caracters
    text = re.sub(r'\b\w\b', '', text)

    # Remove single caracters at the beginning
    text = re.sub(r'^\w\s', '', text)

    # replace multiple spaces with just 1
    text = re.sub(r'\s+', ' ', text)

    # Remove prefix "b"
    text = re.sub(r'^b\s', '', text)

    # to lowercase
    text = text.lower()

    return text


data['preprocessed_text'] = data['preprocessed_text'].apply(clean_text)

## Now let's work on removing stopwords
Remove the stopwords.

In [8]:
words = stopwords.words('english')


def remove_stopwords(text):
    text = ' '.join([word for word in text.split() if word not in words])
    return text


data['preprocessed_text'] = data['preprocessed_text'].apply(remove_stopwords)

## Tame Your Text with Lemmatization
Break sentences into words, then use lemmatization to reduce them to their base form (e.g., "running" becomes "run"). See how this creates cleaner data for analysis!

In [9]:
from nltk.stem.wordnet import WordNetLemmatizer


def get_wordnet_pos(word):
    tag = nltk.pos_tag([word])[0][1][0]
    tag_dict = {"J": wordnet.ADJ,
                "V": wordnet.VERB,
                "N": wordnet.NOUN,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)


def lemmatize(line):
    final_doc = []
    for word in line.split():
        final_doc.append(WordNetLemmatizer().lemmatize(word, pos=get_wordnet_pos(word)))
    return " ".join(final_doc)


data['preprocessed_text'] = data['preprocessed_text'].apply(lemmatize)

## Bag Of Words
Let's get the 10 top words in ham and spam messages (**EXPLORATORY DATA ANALYSIS**)

In [10]:
data.head()

Unnamed: 0,text,label,preprocessed_text
0,"DEAR SIR, STRICTLY A PRIVATE BUSINESS PROPOSAL...",1,dear sir strictly private business proposal mi...
1,Will do.,0,
2,Nora--Cheryl has emailed dozens of memos about...,0,noracheryl email dozen memo haiti weekend plea...
3,Dear Sir=2FMadam=2C I know that this proposal ...,1,dear sirfmadamc know proposal might surprise e...
4,fyi,0,fyi


In [11]:
from collections import Counter

ham_texts = data[data['label'] == 0]
spam_texts = data[data['label'] == 1]

# Count the words in both textas
ham_word_counts = Counter(" ".join(ham_texts['preprocessed_text']).split())
spam_word_counts = Counter(" ".join(spam_texts['preprocessed_text']).split())

# Top 10 words
top_10_ham_words = ham_word_counts.most_common(10)
top_10_spam_words = spam_word_counts.most_common(10)

In [12]:
# now I split the data

data_train, data_val = train_test_split(data, test_size=0.3, random_state=42, stratify=data['label'])

## Extra features

In [13]:
# We add to the original dataframe two additional indicators (money symbols and suspicious words).
money_simbol_list = "|".join(["euro", "dollar", "pound", "€", "$"])
suspicious_words = "|".join(["free", "cheap", "sex", "money", "account", "bank", "fund", "transfer", "transaction", "win", "deposit", "password"])

data_train['money_mark'] = data_train['preprocessed_text'].str.contains(money_simbol_list) * 1
data_train['suspicious_words'] = data_train['preprocessed_text'].str.contains(suspicious_words) * 1
data_train['text_len'] = data_train['preprocessed_text'].apply(lambda x: len(x))

data_val['money_mark'] = data_val['preprocessed_text'].str.contains(money_simbol_list) * 1
data_val['suspicious_words'] = data_val['preprocessed_text'].str.contains(suspicious_words) * 1
data_val['text_len'] = data_val['preprocessed_text'].apply(lambda x: len(x))

data_train.head()

Unnamed: 0,text,label,preprocessed_text,money_mark,suspicious_words,text_len
381,Mills Cheryl D <MillsCD@state.gov>Sunday Janua...,0,mill cheryl sunday january pmtravel scheduleca...,1,0,104
428,H <hrod17@clintonemail.com >Saturday January 2...,0,saturday january pmsbwhoeopre fyi foreign nati...,1,0,115
849,"DEAR, MY NAME IS MR MR Ken Edward,A former...",1,dear name mr mr ken edwarda former government ...,1,1,875
252,"Dear Sir, I am Engr. Victor Chigoziem with the...",1,dear sir engr victor chigoziem engineering sto...,1,1,2224
380,"Hello,This is Dr.Clive Whittaker. I work for F...",1,hellothis drclive whittaker work fidelity inve...,1,1,852


## How would work the Bag of Words with Count Vectorizer concept?

In [14]:
count_vectorizer = CountVectorizer(min_df=1)

cv_ham = count_vectorizer.fit_transform(ham_texts['preprocessed_text'])
cv_spam = count_vectorizer.fit_transform(spam_texts['preprocessed_text'])

## TD-IDF

- Load the vectorizer

- Vectorize all dataset

- print the shape of the vetorized dataset

In [15]:
tfidfvector = TfidfVectorizer(ngram_range=(2, 2))
tfidf_ham = tfidfvector.fit_transform(ham_texts['preprocessed_text'])
tfidf_spam = tfidfvector.fit_transform(spam_texts['preprocessed_text'])

print(tfidf_ham.shape)
print(tfidf_spam.shape)

(558, 17087)
(442, 46010)


## And the Train a Classifier?

In [16]:
all_texts = pd.concat([ham_texts['preprocessed_text'], spam_texts['preprocessed_text']], ignore_index=True)
all_classes = pd.concat([ham_texts['label'], spam_texts['label']], ignore_index=True)

vectorizer = TfidfVectorizer()

vectorized_data = vectorizer.fit_transform(all_texts)

X_train, X_test, y_train, y_test = train_test_split(vectorized_data, all_classes, test_size=0.2, random_state=42)

classifier = LogisticRegression()
classifier.fit(X_train, y_train)
  
y_pred = classifier.predict(X_test)
  
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy", accuracy)

Accuracy 0.915


### Extra Task - Implement a SPAM/HAM classifier

https://www.kaggle.com/t/b384e34013d54d238490103bc3c360ce

The classifier can not be changed!!! It must be the MultinimialNB with default parameters!

Your task is to find the **best feature representation**.

You can work with teams of two persons (recommended).

In [17]:
# Your code