# Natural Language Processing (NLP)

In [None]:
# Libraries
import nltk
from nltk.corpus import stopwords

In [None]:
# Examples of some stop words in English
# (the [0:1000:25] slice keeps every 25th entry among the first 1000 positions of the list)
stopwords.words('english')[0:1000:25]

### First Example

In [None]:
import pandas as pd

#Pulling in the data and looking at the top rows
# NOTE: the path below is an absolute path on the original author's machine; point it at your own copy of spam.csv
# The file is decoded as latin-1
messages = pd.read_csv('/Users/jared/Downloads/Ex_Files_Adv_NLP_Python_ML/Exercise Files/data/spam.csv', encoding = 'latin-1')
messages.head()

In [None]:
#Clean up the data
# Drop the three "Unnamed" columns, then give the remaining columns readable names
messages = messages.drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"])
messages.columns = ["label", "text"]
messages.head()

In [None]:
# Summary stats on our data: shape is a (rows, columns) tuple
messages.shape


In [None]:
# We have a lot more ham values than spam values
# (value_counts() tallies how many rows carry each label)
messages['label'].value_counts()

In [None]:
# Missing Data
# Report how many null entries each of the two columns contains
print(f"Number of nulls in label: {messages['label'].isnull().sum()}")
print(f"Number of nulls in text: {messages['text'].isnull().sum()}")

### Pre-Processing Text Data

In [None]:
import pandas as pd

#Changing the way the tables will be displayed so that we can read more of the data this time around
# (column text is shown up to a width of 100 characters)
pd.set_option('display.max_colwidth', 100)

messages.head()

In [None]:
#Remove Punctuation

#To do this, we need to show python what punctuation looks like
#The standard-library string module has a constant called "punctuation" that we can use for this step
import string

#Showing the punctuation
string.punctuation

In [None]:
#The reason we do this is to remove noise from the data
#The two strings below differ only by the trailing period, so the comparison is False
"This message is spam" == "This message is spam."

In [None]:
#Building a function to remove punctuation
def remove_punct(text):
    """Return ``text`` with every character found in ``string.punctuation`` removed."""
    kept_chars = []
    for symbol in text:
        # Skip anything that counts as punctuation; keep everything else in order
        if symbol in string.punctuation:
            continue
        kept_chars.append(symbol)
    return "".join(kept_chars)

# Strip punctuation from every message and keep the result in a new column
messages['text_clean'] = messages['text'].apply(remove_punct)
messages.head()

In [None]:
# Tokenization - splitting our sentences into a list of words
import re

# \W+ will split a text wherever it sees one or more non-word characters (white space, special characters, etc.)
def tokenize(text):
    """Split ``text`` into a list of tokens on runs of non-word characters.

    Parameters
    ----------
    text : str
        The string to split.

    Returns
    -------
    list of str
        The pieces of ``text`` between separators. When ``text`` starts or
        ends with a non-word character (or is empty), the result contains
        empty strings at the corresponding end.
    """
    # Raw string: '\W' in a plain literal is an invalid escape sequence and triggers a warning
    tokens = re.split(r'\W+', text)
    return tokens

# We lower case each cleaned message (python string comparison is case sensitive) and then apply our tokenize function to it
messages['text_tokenized'] = messages['text_clean'].apply(lambda x: tokenize(x.lower()))

messages.head()

In [None]:
# Remove Stop Words
import nltk

# NOTE: this rebinds the name `stopwords` (imported from nltk.corpus at the top of the notebook)
# to the result of .words('english'), so the earlier `stopwords.words(...)` cell will likely fail if re-run after this one
stopwords = nltk.corpus.stopwords.words('english')

In [None]:
#Define a function to remove the stop words
def remove_stopwords(tokenized_text):
    """Return the tokens of ``tokenized_text`` that are not in the module-level ``stopwords``."""
    kept_tokens = []
    for token in tokenized_text:
        # Stop words are skipped; all other tokens keep their original order
        if token in stopwords:
            continue
        kept_tokens.append(token)
    return kept_tokens

# Filter the stop words out of every tokenized message and keep the result in a new column
messages['text_nostop'] = messages['text_tokenized'].apply(remove_stopwords)

messages.head()

### Term Frequency - Inverse Document Frequency (TF-IDF)
- This creates a document-term matrix; one row per document, one column per word in the corpus
- Generates a weighting for each word/document pair intended to reflect how important a given word is to the document within the context of its frequency within a larger corpus

In [None]:
#Libraries
import pandas as pd
import re
import string
import nltk
# Show up to 100 characters of each text cell when tables are displayed
pd.set_option('display.max_colwidth', 100)

stopwords = nltk.corpus.stopwords.words('english')

#Reformat Data
# Load the raw file, drop the three "Unnamed" columns, and name the remaining columns
mess = pd.read_csv(
    '/Users/jared/Downloads/Ex_Files_Adv_NLP_Python_ML/Exercise Files/data/spam.csv',
    encoding='latin-1',
).drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"])
mess.columns = ["label", "text"]
mess.head()

In [None]:
#One function to pre-process the data
def clean_text(text):
    """Lower-case, strip punctuation, tokenize and drop stop words in one step.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    list of str
        The tokens of ``text`` that are not in the module-level ``stopwords``
        list. Empty strings produced by the split (e.g. when the text starts
        or ends with a non-word character) are kept unless they appear in
        ``stopwords``.
    """
    # Iterating a string yields characters, so punctuation is filtered character by character
    text = "".join([char.lower() for char in text if char not in string.punctuation])
    # Raw string: '\W' in a plain literal is an invalid escape sequence and triggers a warning
    tokens = re.split(r'\W+', text)
    text = [word for word in tokens if word not in stopwords]
    return text

In [None]:
#Fitting a basic TFIDF Vectorizer and view the results
from sklearn.feature_extraction.text import TfidfVectorizer

#Passing clean_text as the analyzer makes the vectorizer use it to turn each raw message into tokens;
#fit_transform then learns the vocabulary and returns our document-term matrix
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
tfidf_vect.fit_transform(mess['text'])