In [308]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Lab | Natural Language Processing
### SMS: SPAM or HAM

### Let's prepare the environment

In [312]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

- Read Data for the Fraudulent Email Kaggle Challenge
- Reduce the training set to speead up development. 

In [315]:
## Read Data for the Fraudulent Email Kaggle Challenge
data = pd.read_csv("kg_train.csv",encoding='latin-1')

# Reduce the training set to speed up development. 
# Modify for final system
data = data.head(1000)
print(data.shape)
data.fillna("",inplace=True)

(1000, 2)


In [317]:
data.sample(20)

Unnamed: 0,text,label
397,"DEAR BELOVED,I am Sussan AdamsPLEASE ENDEAVOUR...",1
124,Pls print.,0
219,We are meeting with you at noon to discuss.,0
960,Pls print.H <hrod17@clintonemail.com>Monday Fe...,0
837,Ok,0
220,FYI Ã¢ÂÂ today the SFRC will take up at its ...,0
230,thanks,0
976,"Dear friend,=20It is indeed my pleasure to wri...",1
885,Cherie Blair <Tuesday September 29 2009 10:18 ...,0
781,"As you read this, I don't want you to feel sor...",1


### Let's divide the training and test set into two partitions

In [320]:
X = data['text']
y = data['label']
#Dividing the training and test sets into two partitions
from sklearn.model_selection import train_test_split

#loading the datasets
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

## Data Preprocessing

In [323]:
import string
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import nltk

In [325]:
print(f" Punctuations: {string.punctuation}") #provides a string containing all common punctuation marks
print(f"Stopwords: {stopwords.words('english')[100:110]}")#fetches a list of common English stopwords(such as the, is)
snowball = SnowballStemmer('english') #a stemming algorithm that reduces words to their root forms eg running to run

 Punctuations: !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
Stopwords: ['here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each']


In [327]:
#Testing the stemmer
word = 'running'
print(snowball.stem(word))

run


## Now, we have to clean the html code removing words

- First we remove inline JavaScript/CSS
- Then we remove html comments. This has to be done before removing regular tags since comments can contain '>' characters
- Next we can remove the remaining tags

- Remove all the special characters
    
- Remove numbers
    
- Remove all single characters
 
- Remove single characters from the start

- Substitute multiple spaces with single space

- Remove prefixed 'b'

- Convert to Lowercase

In [331]:
import re #module in Python is used for working with regular expressions

In [333]:
#define a function data cleaning
def text_cleaning(input):
    #removing inline Javascript(e.g., <script>....)
    input = re.sub(r'<script.*?>.*?</script>', '', input, flags=re.DOTALL)
    #removing Html comments including the regular tags
    input = re.sub(r'<.*?>', '', input)
    #Remove all the special characters
    input = re.sub(r'[^A-Za-z0-9 .,!?\'"-]', '', input)
    #Remove numbers or digits 
    input = re.sub(r'\d+', '', input)
    #Remove all single characters
    input = re.sub(r'\b\w\b', '', input)
    # Remove extra spaces created after removing single characters
    input = re.sub(r'\s+', ' ', input).strip()
    #Remove single characters from the start
    #input = re.sub(r'^\s*\w\s*', '', input)
    #Substitute multiple spaces with single space
    input = re.sub(r'\s+', ' ', input)
    #Remove prefixed 'b'
    input = re.sub(r'^b\s*', '', input, flags=re.IGNORECASE)
    #Convert to Lowercase
    input = input.lower()
    return input

In [335]:
X_train_cleaned = X_train.apply(text_cleaning)
X_test_cleaned = X_test.apply(text_cleaning)

In [336]:
X_train_cleaned.sample(10)

944    'll follow upmight make most sense for wjc to ...
710    greetings you must be aware now that my countr...
742    ..dear friend, am mr mark boland the bank mana...
637    from dr sani mustapha,the manager of auditand ...
378            release in partbbsee orig traffic bi mine
933    dear friendci am mrevincent nicholasc am an ac...
812    accept certified and notable bank cheques from...
876                  already talked megan about changes.
134                                                  fyi
204    dear sir, am pleased to introduce business opp...
Name: text, dtype: object

## Now let's work on removing stopwords
Remove the stopwords.

In [340]:
#download 
nltk.download("stopwords")
#define a function for stopwords removal
def remove_stopwords(input):
    #get the list of the common English stopwords
    stop_words = set(stopwords.words('english'))
    #split the text into words
    words = input.split()
    #filter out stopwords
    filtered_words = [word for word in words if word.lower() not in stop_words]
    #join the filtered words back into a string
    return ' '.join(filtered_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\somoy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [342]:
X_train_without_stopwords = X_train_cleaned.apply(remove_stopwords)
X_test_without_stopwords = X_test_cleaned.apply(remove_stopwords)

In [344]:
X_test_without_stopwords

521    dear sirc wish go offer consider partnerei mre...
737    take mind balkans second see great plug global...
740                             pls keep updates coming!
660    christ bethel hospital rue abobote,abidjanivor...
411    sbwhoeopfriday february amhre bravo! brava! is...
                             ...                        
408                sorry yes exactlywe shy tomorrow too.
332    dearcgood dayei know message come suprise cons...
208                                                  fyi
613    greetings dear friend please permit contact me...
78                    car way airport. talk? call berry.
Name: text, Length: 200, dtype: object

## Tame Your Text with Lemmatization
Break sentences into words, then use lemmatization to reduce them to their base form (e.g., "running" becomes "run"). See how this creates cleaner data for analysis!

In [347]:
import nltk
nltk.download('punkt') #punkt tokenizer is a pre-trained model in NLTK used for splitting text into sentences and words.
nltk.download('omw-1.4')  # WordNet's multilingual support
nltk.download('averaged_perceptron_tagger')  # For POS tagging
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\somoy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\somoy\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\somoy\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\somoy\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [349]:
from nltk.corpus import wordnet
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

#create a function fo the Part of speech tag
def get_wordnet_pos(treebank_tag):
    #convert treebank POS tags to WordNet POS tag
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN #using Noun as default

#define the lemmatization function, the POS tag helps to define a better lemmatization
def lemmatize_with_pos(input_text):
    words = word_tokenize(input_text)
    lemmatizer = WordNetLemmatizer()
    #Get POS tag for words
    pos_tags = pos_tag(words)
    #lemmatize with POS tags
    lemmatized_words = [lemmatizer.lemmatize(word.lower(), get_wordnet_pos(pos)) for word, pos in pos_tags]
    return lemmatized_words

In [351]:
X_train_lemmatized = X_train_without_stopwords.apply(lemmatize_with_pos)
X_test_lemmatized = X_test_without_stopwords.apply(lemmatize_with_pos)

In [352]:
# put all this cleaning together

def re_fresh(row):
  return " ".join(row)
    
X_train_fresh = X_train_lemmatized.apply(re_fresh)
X_test_fresh = X_test_lemmatized.apply(re_fresh)

In [373]:
#Join X_train, X_tain_cleaned, X_train_without_stopwords and X_train_lemmatized into a dataframe table
X_train_combined = pd.concat([X_train, X_train_cleaned, X_train_without_stopwords, X_train_lemmatized, X_train_fresh], axis = 1)
X_test_combined = pd.concat([X_test, X_test_cleaned, X_test_without_stopwords, X_test_lemmatized, X_test_fresh], axis = 1)
#rename the columns
X_train_combined.columns = ['Original', 'Cleaned', 'Without_Stopwords', 'Lemmatized', 'Fresh data']
X_test_combined.columns = ['Original', 'Cleaned', 'Without_Stopwords', 'Lemmatized', 'Fresh data']
#view 
X_train_combined.sample(10)

Unnamed: 0,Original,Cleaned,Without_Stopwords,Lemmatized,Fresh data
556,Below is Palau's statement on the recent meeti...,elow is palau' statement on the recent meeting...,elow palau' statement recent meeting required ...,"[elow, palau, ', statement, recent, meeting, r...",elow palau ' statement recent meeting require ...
472,FYI below on French statement on Haiti and U.S...,fyi below on french statement on haiti and .. ...,fyi french statement haiti .. supportparis jan...,"[fyi, french, statement, haiti, .., supportpar...",fyi french statement haiti .. supportparis jan...
514,ONE HUNDRED AND FIFTY TWO MILLION DOLLARS...,one hundred and fifty two million dollars dear...,one hundred fifty two million dollars dear sir...,"[one, hundred, fifty, two, million, dollar, de...",one hundred fifty two million dollar dear sirm...
807,"1944 Ã¯Â¿Â½ February 14, 2005} married to Naze...","february , married to nazek audi hariri, was l...","february , married nazek audi hariri, lebanese...","[february, ,, married, nazek, audi, hariri, ,,...","february , married nazek audi hariri , lebanes..."
390,From the Desk of =3A Dr biko zulato=5F=5F=5...,from the desk of dr biko zulatofffffffffffffff...,desk dr biko zulatoffffffffffffffffff fffffff ...,"[desk, dr, biko, zulatoffffffffffffffffff, fff...",desk dr biko zulatoffffffffffffffffff fffffff ...
82,He released his hold. No need for the call.,he released his hold. no need for the call.,released hold. need call.,"[release, hold, ., need, call, .]",release hold . need call .
776,{ONE HUNDRED AND TWENTY SIX MILLION DOLLARS)De...,one hundred and twenty six million dollarsdear...,one hundred twenty six million dollarsdear sir...,"[one, hundred, twenty, six, million, dollarsde...",one hundred twenty six million dollarsdear sir...
782,Great interview w/ her on cnn or msnbc where s...,great interview her on cnn or msnbc where she ...,great interview cnn msnbc talked impt strategi...,"[great, interview, cnn, msnbc, talk, impt, str...",great interview cnn msnbc talk impt strategic ...
628,"The Director, SEEKING FOR IMMEDIATE...","the director, seeking for immediate assistance...","director, seeking immediate assistance transfe...","[director, ,, seek, immediate, assistance, tra...","director , seek immediate assistance transfer ..."
888,Remind me to discuss when we talk,remind me to discuss when we talk,remind discuss talk,"[remind, discus, talk]",remind discus talk


## Bag Of Words
Let's get the 10 top words in ham and spam messages (**EXPLORATORY DATA ANALYSIS**)

In [365]:
from sklearn.feature_extraction.text import CountVectorizer
##Create a bag of words using top 10 words

##let's take only the most common 10 words
bow_vect = CountVectorizer(max_features=10)

# Fit and transform the training data
X_train_data = bow_vect.fit_transform(X_train_fresh.tolist()).toarray()

# Transform the test data
X_test_data = bow_vect.transform(X_test_fresh.tolist()).toarray()

Training Data BOW Matrix:
[[0 0 0 ... 1 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [5 1 1 ... 1 4 2]
 [4 4 2 ... 0 2 2]
 [0 0 0 ... 0 0 0]]


## Extra features

In [381]:
"""
# We add to the original dataframe two additional indicators (money symbols and suspicious words).
money_simbol_list = "|".join(["euro","dollar","pound","€","$"])
suspicious_words = "|".join(["free","cheap","sex","money","account","bank","fund","transfer","transaction","win","deposit","password"])

X_train_combined['money_mark'] = X_train_combined['Fresh data'].str.contains(money_simbol_list)*1
X_train_combined['suspicious_words'] = X_train_combined['Fresh data'].str.contains(suspicious_words)*1
X_train_combined['text_len'] = X_train_combined['Fresh data'].apply(lambda x: len(x)) 

X_test_combined['money_mark'] = X_test_fresh['Fresh data'].str.contains(money_simbol_list)*1
X_test_combined['suspicious_words'] = X_test_fresh['Fresh data'].str.contains(suspicious_words)*1
X_test_combined['text_len'] = X_test_fresh['Fresh data'].apply(lambda x: len(x)) 

KeyError: 'Fresh data'

## How would you create a Bag of Words with the CountVectorizer method?

In [None]:
# Your code

## TD-IDF

- Load the vectorizer

- Vectorize all dataset

- print the shape of the vetorized dataset

In [None]:
# Your code

### Extra Task (optional) - Implement a SPAM/HAM classifier

https://www.kaggle.com/t/b384e34013d54d238490103bc3c360ce

Use a MultinimialNB with default parameters.

Your task is to find the **best feature representation**.

You can work with teams of two persons (recommended).

In [386]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=2,random_state=100)
kmeans.fit(X_train_data)
pred = kmeans.predict(X_test_data)

In [390]:
pred-y_test

521    0
737    1
740    1
660   -1
411    1
      ..
408    1
332    0
208    1
613    0
78     1
Name: label, Length: 200, dtype: int64