In [1327]:
import numpy as np
import pandas as pd
from data_funs import read_data, thing, prune_feats
from collections import defaultdict, Counter
import re
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import torch
import torch.nn as nn
nltk.download('stopwords')
import sklearn

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jamielafarr/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# 1. Data Retrieval

In [1344]:
data = read_data()
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Partiton features & labels into train / test

In [1329]:
labels, test_l, features, test = data.iloc[:4000, 0], data.iloc[4000:, 0], data.iloc[:4000, 1], data.iloc[4000:, 1]

## Convert into numpy

In [1330]:
labels, features, test, test_l = labels.to_numpy(), features.to_numpy(), test.to_numpy(), test_l.to_numpy()
for i, example in enumerate(labels):
    if example == 'ham':
        labels[i] = 0
    else:
        labels[i] = 1
for i, example in enumerate(test_l):
    if example == 'ham':
        test_l[i] = 0
    else:
        test_l[i] = 1
labels = labels.astype(float)

# 2. Preprocessing

## Remove punctuation, lowercase, and stem features

In [1331]:
ps = PorterStemmer()
for i in range(features.shape[0]):
    features[i] = re.sub(r'[^\w\s]','',features[i])
    features[i] = ps.stem(features[i]).lower()
for i in range(test.shape[0]):
    test[i] = re.sub(r'[^\w\s]','',test[i])
    test[i] = ps.stem(test[i]).lower()

# 3. Compute feature counts for each label

In [1332]:
l = [0, 1] # [ham, spam]
word_counts = defaultdict(lambda: Counter())
for label in l:
    for lab, example in zip(labels, features):
        if lab != label:
            continue
        word_counts[label] += Counter(example.split(' '))

# 4. Convert features to number of positive / negative counts

In [1333]:
feats = torch.zeros((features.shape[0], 2), dtype=float)
test_feats = torch.zeros((test.shape[0], 2), dtype=float)
for i, example in enumerate(features):
    counts = defaultdict(float)
    for word in example.split(' '):
        for label in l:
            counts[label] += word_counts[label][word]
    feats[i][0] = float(counts[0])
    feats[i][1] = float(counts[1])
for i, example in enumerate(test):
    counts = defaultdict(float)
    for word in example.split(' '):
        for label in l:
            counts[label] += word_counts[label][word]
    test_feats[i][0] = float(counts[0])
    test_feats[i][1] = float(counts[1])

# 5. Create logistic regression model

In [1334]:
class LogisticRegression(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.layer1 = nn.Linear(input_dim, 2)
        self.sigmoid = nn.Sigmoid()
        self.act = nn.Tanh()
        self.layer2 = nn.Linear(2, 1)
        self.layer3 = nn.Linear(1, 1)
        
    def forward(self, x):
        y = self.layer1(x)
        y = self.act(y)
        y = self.layer2(y)
        y = self.act(y)
        y = self.layer3(y)
        return self.sigmoid(y)

# 6. Initialize model & its parameters

In [1335]:
model = LogisticRegression(2, 1)
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=.005)

# 7. Minimize cost function

In [1336]:
epochs = 10000
for i in range(epochs):
    output = model(feats.float())
    output = output.view(4000, 1)
    loss = loss_fn(output, torch.tensor(labels).view(4000, 1).float())
    if i % 10 == 0 or i == epochs - 1:
        pass
        print(f'epoch #{i}, loss: {loss}, weight: {model.layer1.weight}')
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

tensor(0.1451, grad_fn=<BinaryCrossEntropyBackward0>)


# Run model against test set

In [1343]:
with torch.no_grad():
    y_pred=model(test_feats.float()) 
    y_pred = y_pred.round()
    accuracy=((y_pred_class == torch.tensor(test_l.astype(float)).view(1572, 1)).sum()) / 1572
    print(f'Accuracy: {round(accuracy.item() * 100, 2)}%')

Accuracy: 94.66%
