## Natural Language Processing

In this exercise we will attempt to classify text messages as "SPAM" or "HAM" using TF-IDF Vectorization

In [150]:
#import necessary libraries
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import string
from nltk import word_tokenize, FreqDist
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

In [151]:
#read in data
df_messages = pd.read_csv('spam.csv', usecols=[0,1])

In [152]:
#convert string labels to 1 or 0 
le = LabelEncoder()
df_messages['target'] = le.fit_transform(df_messages['v1'])

In [45]:
#create a function that takes in a string and returns a list of tokens or words 
def tokenize(string):
    '''
    Tokenizes a string
    
    Parameters
    ----------
    string: str object
        String object to tokenize
    Returns
    --------
    tokens : list
        A list containing each word in string as an element 

    '''
    
    tokens = word_tokenize(string)
    return tokens  

In [None]:
#create a function to remove stopwords from a list of tokens
def remove_stopwords(tokens): 
    '''
    Removes stopwords from a list of tokens (words)
    
    Parameters
    ----------
    tokens: list object
        List of tokens that need stopwords removed
    Returns
    --------
    stopwords_removed : list object
        A list containing tokens with stopwords removed

    '''
    stopwords_list = stopwords.words('english') + list(string.punctuation)
    stopwords_removed = [token.lower().replace('�','') for token in tokens if token not in stopwords_list]
    return stopwords_removed

In [46]:
#apply tokenization and stop word removal to our dataset 
df_messages['tokens'] = df_messages['v2'].apply(lambda x: tokenize(x))
df_messages['stopwords_removed'] = df_messages['tokens'].apply(lambda x: remove_stopwords(x))

In [131]:
#create a function that outputs the frequency of the n most common words
def frequency_distribution(tokens, n):
    '''
    Get n most common words in a Series of tokens
    
    Parameters
    ----------
    tokens: pandas.Series object
        Pandas series of tokens 
    n : int object
        Integer defining the number of most common words to return
    Returns
    --------
    most_common : list object
        An array of tuples containing word frequency for n most common words

    '''
    messages_concat = []
    for message in tokens:
        messages_concat += message
    messages_freqdist = FreqDist(messages_concat)
    most_common = messages_freqdist.most_common(n)
    return most_common

In [149]:
frequency_distribution(df_messages['stopwords_removed'], 10)

[('i', 1952),
 ('...', 1233),
 ('u', 1118),
 ('call', 576),
 ("'s", 492),
 ('2', 485),
 ("'m", 395),
 ('get', 385),
 ('ur', 381),
 ("n't", 361)]

In [133]:
#generate tf-idf vectorization for our data (split data into train and test here)
def tfidf(X, y): 
    '''
    Generate train and test TF-IDF vectorization for our data set
    
    Parameters
    ----------
    X: pandas.Series object
        Pandas series of text documents to classify 
    y : pandas.Series object
        Pandas series containing label for each document
    Returns
    --------
    tf_idf_train :  sparse matrix, [n_train_samples, n_features]
        Vector representation of train data
    tf_idf_test :  sparse matrix, [n_test_samples, n_features]
        Vector representation of test data

    '''
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    stopwords_list = stopwords.words('english') + list(string.punctuation)
    vectorizer = TfidfVectorizer(stop_words=stopwords_list)

    tf_idf_train = vectorizer.fit_transform(X_train)
    tf_idf_test = vectorizer.transform(X_test)
    return tf_idf_train, tf_idf_test

In [135]:
tf_idf_train, tf_idf_test = tfidf(df_messages['v2'], df_messages['target'])

In [136]:
nb_classifier = MultinomialNB()
rf_classifier = RandomForestClassifier(n_estimators=100)

In [138]:
#create a function that takes in a classifier and trains it on our tf-idf vectors and generates test and train predictiions
def classify_text(classifier, tf_idf_train, tf_idf_test, y_train):
    classifier.fit(tf_idf_train, y_train)
    train_preds = classifier.predict(tf_idf_train)
    test_preds = classifier.predict(tf_idf_test)
    return train_preds, test_preds

In [139]:
nb_train_preds, nb_test_preds = classify_text(nb_classifier,tf_idf_train, tf_idf_test, y_train)

In [144]:
print(confusion_matrix(nb_test_preds, y_test))
print(accuracy_score(nb_test_preds, y_test))

[[1202   44]
 [   0  147]]
0.968413496051687


In [142]:
rf_train_preds, rf_test_preds = classify_text(rf_classifier,tf_idf_train, tf_idf_test, y_train)

In [145]:
print(confusion_matrix(rf_test_preds, y_test))
print(accuracy_score(rf_test_preds, y_test))

[[1202   34]
 [   0  157]]
0.9755922469490309
