# Natural Language Processing

In [None]:
# import required libraries

import pandas as pd
import string as st
import re
import nltk
from nltk import PorterStemmer, WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
# Read the data
# NOTE(review): absolute, machine-specific path - point it at wherever the CSV lives before running.
# Later cells read the 'Message' column of this frame.
data = pd.read_csv('C:/Users/jerry/Work/NLP/data/spam_text_message_data.csv')
data.head()

In [None]:
# Dimensions of the loaded DataFrame as (rows, columns)
data.shape

## Text cleaning and processing steps:
> Remove punctuation <br>
> Convert text to lower-case tokens <br>
> Remove tokens of length less than or equal to 3 <br>
> Remove stopwords using NLTK corpus stopwords list to match <br>
> Apply stemming <br>
> Apply lemmatization <br>

In [None]:
# Remove all punctuations from the text

def remove_punct(text):
    """Return ``text`` with every character listed in ``string.punctuation`` dropped."""
    kept = []
    for symbol in text:
        if symbol in st.punctuation:
            continue
        kept.append(symbol)
    return "".join(kept)

In [None]:
# Strip punctuation from every message and preview the result
data['removed_punc'] = data['Message'].apply(remove_punct)
data.head()

In [None]:
def tokenize(text):
    """Split ``text`` on whitespace and return the pieces as lower-case tokens.

    Whitespace is the separator used here, but the pattern could be swapped
    for special characters, tabs or any other string on which the text
    should be separated into tokens.

    Empty strings that ``re.split`` produces for leading or trailing
    whitespace (or for an empty input) are discarded, so every returned
    item is a non-empty token.
    """
    # Raw string: '\s' inside a plain literal is an invalid escape sequence
    # and triggers a warning on recent Python versions.
    pieces = re.split(r'\s+', text)
    return [piece.lower() for piece in pieces if piece]

In [None]:
# Tokenize each punctuation-free message and preview the result
data['tokens'] = data['removed_punc'].apply(tokenize)
data.head()

In [None]:
# Remove tokens of length less than or equal to 3
def remove_small_words(text, max_small_len=3):
    """Keep only the tokens that are longer than ``max_small_len`` characters.

    Args:
        text: Iterable of tokens (any items supporting ``len``).
        max_small_len: Longest token length that is still discarded.
            Defaults to 3, so tokens of three characters or fewer are removed.

    Returns:
        A list of the tokens whose length exceeds ``max_small_len``,
        in their original order.
    """
    return [token for token in text if len(token) > max_small_len]

In [None]:
# Drop the short tokens from every row's token list and preview the result
data['larger_tokens'] = data['tokens'].apply(remove_small_words)
data.head()

In [None]:
# Cache of the NLTK English stopword set, filled on first use so the corpus
# is not re-read for every call.
_ENGLISH_STOPWORDS_CACHE = {}


def remove_stopwords(text, stop_words=None):
    """Return the tokens of ``text`` that are not stopwords.

    By default the NLTK corpus list for English is used for the match.
    A customized, user-defined collection can be passed instead to limit
    the matches in the input text.

    Args:
        text: Iterable of tokens.
        stop_words: Optional iterable of words to remove. When ``None``,
            ``nltk.corpus.stopwords.words('english')`` is used.

    Returns:
        A list of the tokens not present in the stopword collection,
        in their original order.
    """
    if stop_words is None:
        if 'english' not in _ENGLISH_STOPWORDS_CACHE:
            # Load the corpus once instead of once per token.
            _ENGLISH_STOPWORDS_CACHE['english'] = frozenset(
                nltk.corpus.stopwords.words('english')
            )
        stop_words = _ENGLISH_STOPWORDS_CACHE['english']
    elif not isinstance(stop_words, (set, frozenset)):
        # Hash-based membership test instead of a linear scan per token.
        stop_words = frozenset(stop_words)
    return [word for word in text if word not in stop_words]

In [None]:
# Filter stopwords out of every row's token list and preview the result
data['clean_tokens'] = data['larger_tokens'].apply(remove_stopwords)
data.head()

In [None]:
# Apply stemming to get root words 
def stemming(text):
    """Run every token of ``text`` through ``PorterStemmer.stem`` and return the results as a list."""
    stem = PorterStemmer().stem
    return list(map(stem, text))

In [None]:
# Stem every row's cleaned tokens and preview the result
data['stem_words'] = data['clean_tokens'].apply(stemming)
data.head()

In [None]:
# Apply lemmatization on tokens
def lemmatize(text):
    """Run every token of ``text`` through ``WordNetLemmatizer.lemmatize`` (default arguments) and return the results as a list."""
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token in text:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

In [None]:
# Lemmatize every row's cleaned tokens and preview the result
data['lemma_words'] = data['clean_tokens'].apply(lemmatize)
data.head()

In [None]:
# Create sentences to get clean text as input for vectors

def return_sentences(tokens):
    """Join ``tokens`` into one string, separated by single spaces.

    Args:
        tokens: Iterable of strings.

    Returns:
        The tokens concatenated with a single space between neighbours;
        an empty string when ``tokens`` is empty.
    """
    # str.join accepts any iterable, so no intermediate list copy is needed.
    return " ".join(tokens)

In [None]:
# Rebuild a clean sentence from every row's lemmatized tokens and preview the result
data['clean_text'] = data['lemma_words'].apply(return_sentences)
data.head()