## Natural Language Processing

In this exercise we will attempt to classify text messages as "SPAM" or "HAM" using TF-IDF Vectorization 

In [105]:
#import necessary libraries
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import string
from nltk import word_tokenize, FreqDist
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

In [106]:
#read in data
df_messages = pd.read_csv('spam.csv', usecols=[0,1])

In [107]:
#convert string labels to 1 or 0 
le = LabelEncoder()
df_messages['target'] = le.fit_transform(df_messages['v1'])

In [108]:
#create a function that takes in a string and returns a list of tokens or words 
def tokenize(string):
    '''
    Tokenizes a string
    
    Parameters
    ----------
    string: str object
        String object to tokenize
    Returns
    --------
    tokens : list
        A list containing each word in string as an element 

    '''
    
    tokens = word_tokenize(string)
    return tokens  

In [109]:
#create a function to remove stopwords and punctuation from a list of tokens
def remove_stopwords(tokens): 
    '''
    Removes stopwords from a list of tokens (words)
    
    Parameters
    ----------
    tokens: list object
        List of tokens that need stopwords removed
    Returns
    --------
    stopwords_removed : list object
        A list containing tokens with stopwords removed

    '''
    stopwords_list = stopwords.words('english') + list(string.punctuation)
    stopwords_removed = [token.lower().replace('�','') for token in tokens if token not in stopwords_list]
    return stopwords_removed

In [110]:
#apply tokenization and stop word removal to our dataset 
df_messages['tokens'] = df_messages['v2'].apply(lambda x: tokenize(x))
df_messages['stopwords_removed'] = df_messages['tokens'].apply(lambda x: remove_stopwords(x))

In [111]:
#create a function that outputs the frequency of the n most common words
def frequency_distribution(tokens, n):
    '''
    Get n most common words in a Series of tokens
    
    Parameters
    ----------
    tokens: pandas.Series object
        Pandas series of tokens 
    n : int object
        Integer defining the number of most common words to return
    Returns
    --------
    most_common : list object
        An array of tuples containing word frequency for n most common words

    '''
    messages_concat = []
    for message in tokens:
        messages_concat += message
    messages_freqdist = FreqDist(messages_concat)
    most_common = messages_freqdist.most_common(n)
    return most_common

In [112]:
frequency_distribution(df_messages['stopwords_removed'], 10)

[('i', 1952),
 ('...', 1233),
 ('u', 1118),
 ('call', 576),
 ("'s", 492),
 ('2', 485),
 ("'m", 395),
 ('get', 385),
 ('ur', 381),
 ("n't", 361)]

In [113]:
#generate tf-idf vectorization for our data (split data into train and test here)
def tfidf(X, y): 
    '''
    Generate train and test TF-IDF vectorization for our data set
    
    Parameters
    ----------
    X: pandas.Series object
        Pandas series of text documents to classify 
    y : pandas.Series object
        Pandas series containing label for each document
    Returns
    --------
    tf_idf_train :  sparse matrix, [n_train_samples, n_features]
        Vector representation of train data
    tf_idf_test :  sparse matrix, [n_test_samples, n_features]
        Vector representation of test data

    '''
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    stopwords_list = stopwords.words('english') + list(string.punctuation)
    vectorizer = TfidfVectorizer(stop_words=stopwords_list)

    tf_idf_train = vectorizer.fit_transform(X_train)
    tf_idf_test = vectorizer.transform(X_test)
    return tf_idf_train, tf_idf_test, y_train, y_test, vectorizer

In [114]:
tf_idf_train, tf_idf_test, y_train, y_test, vecotorizer = tfidf(df_messages['v2'], df_messages['target'])

In [115]:
nb_classifier = MultinomialNB()
rf_classifier = RandomForestClassifier(n_estimators=100)

In [116]:
#create a function that takes in a classifier and trains it on our tf-idf vectors and generates test and train predictiions
def classify_text(classifier, tf_idf_train, tf_idf_test, y_train):
    '''
    Train a classifier to identify whether a message is spam or ham
    
    Parameters
    ----------
    classifier: sklearn classifier
       initialized sklearn classifier (MultinomialNB, RandomForestClassifier, etc.)
    tf_idf_train : sparse matrix, [n_train_samples, n_features]
        TF-IDF vectorization of train data
    tf_idf_test : sparse matrix, [n_test_samples, n_features]
        TF-IDF vectorization of test data
    y_train : pandas.Series object
        Pandas series containing label for each document in the train set
    Returns
    --------
    train_preds :  list object
        Predictions for train data
    test_preds :  list object
        Predictions for test data
    '''
    classifier.fit(tf_idf_train, y_train)
    train_preds = classifier.predict(tf_idf_train)
    test_preds = classifier.predict(tf_idf_test)
    return train_preds, test_preds

In [117]:
#generate predictions for Naive Bayes Classifier
nb_train_preds, nb_test_preds = classify_text(nb_classifier,tf_idf_train, tf_idf_test, y_train)

In [118]:
print(confusion_matrix(y_test, nb_test_preds))
print(accuracy_score(y_test, nb_test_preds))

[[1202    0]
 [  44  147]]
0.968413496051687


In [119]:
#generate predictions for Random Forest Classifier
rf_train_preds, rf_test_preds = classify_text(rf_classifier,tf_idf_train, tf_idf_test, y_train)

In [120]:
print(confusion_matrix(y_test, rf_test_preds))
print(accuracy_score(y_test, rf_test_preds))

[[1202    0]
 [  34  157]]
0.9755922469490309


In [121]:
def get_max_tf_idf(tf_idf, doc):
    '''
    Get word with highest TF-IDF value in a document
    
    Parameters
    ----------
    tf_idf : spare matrix 
        TF-IDF vectorization of text data
    doc : int object
        Index of document in vectorization to get max tf-idf for
    --------
    max_tf_idf : str object
        Word with highest TF-IDF value in a document
    '''
    x = tf_idf[doc].toarray()
    max_tf_idf = vecotorizer.get_feature_names()[np.where(x[0] == max(x[0]))[0][0]]
    print(max_tf_idf)

### Explain
The word schools has the highest TF-IDF value in the second document of our test data. What does that tell us about the word school? 

In [122]:
get_max_tf_idf(tf_idf_test, 1)

schools
