# Data Preparation for NLP


## 1. Bag of Words (BoW) method

In [1]:
import pandas as pd
import numpy as np

# we'll use this for the spam messages
from nltk.tokenize import TweetTokenizer
from nltk import ngrams
from nltk.corpus import stopwords

# Folder holding the input datasets, given relative to the notebook's working directory.
data_dir = "../nlp_datasets/00_nlp_basics/"

In [2]:
# Load the SMS corpus; the cells below rely on its `outcome` and `text` columns.
df = pd.read_table(data_dir + "SMSSpamCollection.txt")

# Binary target column: 1 where the outcome is 'spam', 0 otherwise.
df['coding'] = (df['outcome'] == 'spam').astype('int64')

# Seeded 80/20 split: sample the training rows, keep the remaining rows for testing.
df_train = df.sample(frac=0.8, random_state=123).copy()
df_test = df.loc[~df.index.isin(df_train.index)].copy()

df_train.head(2)

Unnamed: 0,outcome,text,coding
3237,ham,Aight text me when you're back at mu and I'll ...,0
843,ham,Our Prashanthettan's mother passed away last n...,0


In [3]:
import re

# Set of NLTK's English stop words; text_prepare drops any token found here.
STOPWORDS = set(stopwords.words('english'))

def text_prepare(text):
    """Normalise one raw message into a space-separated string of lowercase words.

    The steps run in this order:
      1. lowercase the text;
      2. blank out http/https URLs;
      3. blank out whitespace-delimited tokens introduced by ``$`` or ``@``;
      4. blank out every remaining character that is not a letter;
      5. drop every token that appears in the module-level ``STOPWORDS`` set.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (possibly empty).
    """
    cleaned = text.lower()
    # Order matters: URLs and $/@ tokens must go before the catch-all
    # non-letter pattern strips the markers that identify them.
    for pattern in (r'https?://\S+', r'\$\S+', r'\@\S+', r'[^a-zA-Z]'):
        cleaned = re.sub(pattern, ' ', cleaned)
    kept_tokens = [token for token in cleaned.split() if token not in STOPWORDS]
    return ' '.join(kept_tokens)

# Add the cleaned text as a new column on both splits, one message at a time.
df_train['text_processed'] = df_train['text'].map(text_prepare)
df_test['text_processed'] = df_test['text'].map(text_prepare)

df_train.head(2)

Unnamed: 0,outcome,text,coding,text_processed
3237,ham,Aight text me when you're back at mu and I'll ...,0,aight text back mu swing need somebody get door
843,ham,Our Prashanthettan's mother passed away last n...,0,prashanthettan mother passed away last night p...


In [4]:
from collections import Counter

# Every token of every cleaned training message, in message order.
ALL_WORDS = ' '.join(df_train['text_processed']).split()

# Token frequencies, plus the complete (word, count) ranking by descending count.
words_counts = Counter(ALL_WORDS)
words_counts_sorted = words_counts.most_common()

# The vocabulary is limited to the DICT_SIZE most frequent training tokens.
DICT_SIZE = 1000
VOCAB = words_counts.most_common(DICT_SIZE)

# Two-way lookup between a vocabulary word and its position (0 = most frequent).
WORDS_TO_INDEX = {word: position for position, (word, _count) in enumerate(VOCAB)}
INDEX_TO_WORDS = {position: word for word, position in WORDS_TO_INDEX.items()}

In [5]:
from scipy import sparse as sp_sparse

def bag_of_words(text, words_to_index, dict_size):
    """Count how often each vocabulary word occurs in ``text``.

    Parameters
    ----------
    text : str
        Message to encode; it is split on whitespace.
    words_to_index : dict
        Maps a vocabulary word to its position in the output vector.
    dict_size : int
        Length of the output vector.

    Returns
    -------
    numpy.ndarray
        Float vector of length ``dict_size``; entry ``i`` is the number of
        occurrences of the word mapped to ``i``. Words missing from
        ``words_to_index`` are ignored.
    """
    counts = np.zeros(dict_size)
    # Lazily translate the in-vocabulary tokens to their vector positions.
    positions = (words_to_index[token] for token in text.split() if token in words_to_index)
    for position in positions:
        counts[position] += 1
    return counts

def prepare_X(df, col="text_processed"):
    """Encode one text column as a sparse bag-of-words matrix.

    Each entry of ``df[col]`` is vectorised with ``bag_of_words`` using the
    module-level ``WORDS_TO_INDEX`` and ``DICT_SIZE``; the per-message vectors
    are then stacked vertically, one row per message.

    Parameters
    ----------
    df : mapping of column name to an iterable of str (e.g. a DataFrame)
        Source of the messages.
    col : str, default "text_processed"
        Name of the column holding the text to encode.

    Returns
    -------
    scipy sparse matrix
        One row per message, ``DICT_SIZE`` columns.
    """
    row_vectors = []
    for message in df[col]:
        counts = bag_of_words(message, WORDS_TO_INDEX, DICT_SIZE)
        # Each dense count vector becomes a single-row sparse matrix.
        row_vectors.append(sp_sparse.csr_matrix(counts))
    return sp_sparse.vstack(row_vectors)

# Vectorise both splits with the same vocabulary lookup.
X_train_bow, X_test_bow = prepare_X(df_train), prepare_X(df_test)

# Sanity check: one row per message, one column per vocabulary slot.
print ("Bow Method")
print('X_train shape ', X_train_bow.shape)
print('X_test shape ', X_test_bow.shape)

Bow Method
X_train shape  (4458, 1000)
X_test shape  (1114, 1000)


In [6]:
from sklearn.linear_model import LogisticRegression
# Default-configured logistic regression, fitted on the training bag-of-words
# matrix against the 0/1 `coding` target.
model = LogisticRegression()
model.fit(X=X_train_bow, y=df_train['coding'])

LogisticRegression()

In [7]:
# Predicted labels for the training messages (not referenced again in the cells shown here).
y_train_pred = model.predict(X_train_bow)

In [8]:
from sklearn.metrics import accuracy_score
# Share of correctly classified messages on each split.
acc_train = accuracy_score(df_train['coding'], model.predict(X_train_bow))
acc_test = accuracy_score(df_test['coding'], model.predict(X_test_bow))

# Report both accuracies as percentages with two decimals. The values are
# interpolated by the f-string itself rather than by a separate `%` operator.
print(f"Training accuracy: {100 * acc_train:1.2f} \nTest accuracy: {100 * acc_test:1.2f}")

Training accuracy: 98.99 
Test accuracy: 98.29
