In [1]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor, Lambda, Compose
import matplotlib.pyplot as plt
import pandas as pd
import torch.utils.data as data_utils

In [2]:
# Load the raw SMS export and keep only the columns that have no missing
# values; the two survivors are named v1 (label) and v2 (message text).
df = pd.read_csv('spam.csv', encoding='latin-1').dropna(axis='columns', how='any')
df.columns = ['v1', 'v2']
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Encode the label as an integer flag: 1 for 'spam', 0 for everything else
# ('ham'). An explicit comparison gives a plain 0/1 integer column on every
# pandas version (get_dummies yields booleans on pandas >= 2) and does not
# depend on assigning a one-column DataFrame to a single column.
df['Spam'] = (df['v1'] == 'spam').astype(int)
df = df.drop(columns='v1')
df.head()

Unnamed: 0,v2,Spam
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [4]:
import re
import string
from string import punctuation
from nltk.corpus import stopwords
def text_processing(message, stop_words=None):
    """Strip punctuation and stop words from a single message.

    Parameters
    ----------
    message : str
        Raw message text.
    stop_words : iterable of str, optional
        Lower-case words to discard. When omitted, the NLTK English list is
        used; it is loaded once and cached on the function so that applying
        this over a whole column does not re-read the corpus per row.

    Returns
    -------
    str
        The remaining words (original casing kept) joined by single spaces.
    """
    if stop_words is None:
        cached = getattr(text_processing, "_english_stopwords", None)
        if cached is None:
            # A frozenset gives O(1) membership tests instead of a list scan.
            cached = frozenset(stopwords.words('english'))
            text_processing._english_stopwords = cached
        stop_words = cached
    else:
        stop_words = frozenset(stop_words)

    # Drop every ASCII punctuation character in one pass.
    no_punctuation = message.translate(str.maketrans('', '', string.punctuation))

    # NOTE(review): punctuation is removed *before* the stop-word check, so a
    # contraction such as "don't" is compared as "dont" (it survives in the
    # sample output). Left unchanged to keep the resulting vocabulary stable.
    return ' '.join(
        word for word in no_punctuation.split() if word.lower() not in stop_words
    )
# Clean every raw message (column v2) and store the result alongside it.
df['text'] = df['v2'].map(text_processing)
df.head()

Unnamed: 0,v2,Spam,text
0,"Go until jurong point, crazy.. Available only ...",0,Go jurong point crazy Available bugis n great ...
1,Ok lar... Joking wif u oni...,0,Ok lar Joking wif u oni
2,Free entry in 2 a wkly comp to win FA Cup fina...,1,Free entry 2 wkly comp win FA Cup final tkts 2...
3,U dun say so early hor... U c already then say...,0,U dun say early hor U c already say
4,"Nah I don't think he goes to usf, he lives aro...",0,Nah dont think goes usf lives around though


In [5]:
from sklearn.model_selection import train_test_split
# Features are the cleaned message strings; targets are the 0/1 spam flags.
X = df['text'].to_numpy()
y = df['Spam'].to_numpy()
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Bag-of-words counts followed by TF-IDF re-weighting. Both transformers are
# fitted on the training split only and then re-used, unchanged, on the test
# split; the sparse results are densified for the tensor conversion below.
bow = CountVectorizer()
tfidf = TfidfTransformer()
X_train = tfidf.fit_transform(bow.fit_transform(X_train)).toarray()
X_test = tfidf.transform(bow.transform(X_test)).toarray()

In [7]:
# Inspect the dense TF-IDF training matrix (NumPy abbreviates large arrays).
X_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [8]:
# Wrap the training features/labels as tensors and batch them.
# - as_tensor (rather than torch.tensor) lets this cell be re-run when the
#   inputs are already tensors; float32 is what the model consumes anyway.
# - The dataset gets its own name so it no longer shares the name `train`
#   with the train() function defined later in the notebook.
X_train = torch.as_tensor(X_train, dtype=torch.float32)
y_train = torch.as_tensor(y_train)
train_dataset = data_utils.TensorDataset(X_train, y_train)
train_dataloader = data_utils.DataLoader(train_dataset, batch_size=50, shuffle=True)

In [9]:
# Wrap the held-out features/labels as tensors and batch them.
# - as_tensor keeps the cell re-runnable; float32 matches the model input.
# - The dataset gets its own name so it no longer shares the name `test`
#   with the test() function defined later in the notebook.
# - Evaluation does not need shuffling; a fixed order keeps runs comparable.
X_test = torch.as_tensor(X_test, dtype=torch.float32)
y_test = torch.as_tensor(y_test)
test_dataset = data_utils.TensorDataset(X_test, y_test)
test_dataloader = data_utils.DataLoader(test_dataset, batch_size=50, shuffle=False)

In [10]:
class NeuralNetwork(nn.Module):
    """Fully connected binary classifier: Linear/ReLU stack ending in a sigmoid.

    Parameters
    ----------
    in_features : int, default 8270
        Width of each input vector. This must equal the number of columns of
        the feature matrix fed to the model (the default is the value the
        notebook was originally written with).
    hidden_sizes : sequence of int, default (4000, 1000, 400)
        Output width of each hidden Linear layer, in order.

    With the defaults the layer layout (and therefore the ``state_dict`` keys
    and printed summary) is identical to the original hard-coded network.
    """

    def __init__(self, in_features=8270, hidden_sizes=(4000, 1000, 400)):
        super().__init__()
        layers = []
        width = in_features
        for hidden in hidden_sizes:
            layers.append(nn.Linear(width, hidden))
            layers.append(nn.ReLU())
            width = hidden
        # Single output unit squashed to (0, 1): the predicted spam probability.
        layers.append(nn.Linear(width, 1))
        layers.append(nn.Sigmoid())
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Return sigmoid probabilities of shape ``(batch, 1)`` for input ``x``."""
        return self.linear_relu_stack(x)
# Build the classifier with its default layer sizes and show the layer layout.
model = NeuralNetwork()
print(model)

NeuralNetwork(
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=8270, out_features=4000, bias=True)
    (1): ReLU()
    (2): Linear(in_features=4000, out_features=1000, bias=True)
    (3): ReLU()
    (4): Linear(in_features=1000, out_features=400, bias=True)
    (5): ReLU()
    (6): Linear(in_features=400, out_features=1, bias=True)
    (7): Sigmoid()
  )
)


In [11]:
# Binary cross-entropy matches the model's sigmoid output; Adam updates every
# trainable parameter of the model with a learning rate of 0.001.
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)

In [12]:
def train(dataloader, model, loss_fn, optimizer):
    """Run one optimisation pass over every batch in ``dataloader``.

    Parameters
    ----------
    dataloader : DataLoader
        Yields ``(X, y)`` batches; ``dataloader.dataset`` must support ``len``.
    model : nn.Module
        Called as ``model(X.float())``.
    loss_fn : callable
        ``loss_fn(prediction, target)`` returning a scalar tensor.
    optimizer : torch.optim.Optimizer
        Optimizer holding ``model``'s parameters.

    Prints the current loss every 100 batches. Returns ``None``.
    """
    size = len(dataloader.dataset)
    # test() leaves the model in eval mode; make sure we are training again.
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction error
        pred = model(X.float())
        target = y.float()
        # A model ending in a single output unit yields (batch, 1) while the
        # labels arrive as (batch,). BCELoss needs identical shapes: older
        # PyTorch only warned about the mismatch, newer versions raise.
        if target.shape != pred.shape and target.numel() == pred.numel():
            target = target.view_as(pred)
        loss = loss_fn(pred, target)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

In [13]:
def test(dataloader, model):
    size = len(dataloader.dataset)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            
            pred = model(X.float())
            test_loss += loss_fn(pred, y.float()).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= size
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [14]:
# Alternate one training pass with one evaluation pass for a fixed number of epochs.
epochs = 40
for epoch_index in range(epochs):
    banner = f"Epoch {epoch_index+1}\n-------------------------------"
    print(banner)
    train(train_dataloader, model, loss_fn, optimizer)
    test(test_dataloader, model)
print("Done!")

Epoch 1
-------------------------------


  return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction)


loss: 0.699405  [    0/ 4457]


  return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction)
  return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction)


Test Error: 
 Accuracy: 86.5%, Avg loss: 0.002280 

Epoch 2
-------------------------------
loss: 0.053750  [    0/ 4457]
Test Error: 
 Accuracy: 86.5%, Avg loss: 0.003364 

Epoch 3
-------------------------------
loss: 0.001106  [    0/ 4457]
Test Error: 
 Accuracy: 86.5%, Avg loss: 0.003164 

Epoch 4
-------------------------------
loss: 0.001083  [    0/ 4457]
Test Error: 
 Accuracy: 86.5%, Avg loss: 0.004533 

Epoch 5
-------------------------------
loss: 0.000005  [    0/ 4457]
Test Error: 
 Accuracy: 86.5%, Avg loss: 0.004795 

Epoch 6
-------------------------------
loss: 0.000003  [    0/ 4457]
Test Error: 
 Accuracy: 86.5%, Avg loss: 0.005838 

Epoch 7
-------------------------------
loss: 0.000000  [    0/ 4457]
Test Error: 
 Accuracy: 86.5%, Avg loss: 0.006149 

Epoch 8
-------------------------------
loss: 0.000000  [    0/ 4457]
Test Error: 
 Accuracy: 86.5%, Avg loss: 0.006512 

Epoch 9
-------------------------------
loss: 0.000000  [    0/ 4457]


KeyboardInterrupt: 

In [15]:
from sklearn.metrics import classification_report, confusion_matrix

# Score the whole held-out set in inference mode: eval() fixes layer behaviour
# and no_grad() avoids building an autograd graph for the full matrix.
model.eval()
with torch.no_grad():
    prediction = model(X_test.float())

# Probabilities above 0.5 are labelled spam (1), everything else ham (0).
predictions = (prediction.reshape(-1) > 0.5).long().tolist()

# Give sklearn plain integer NumPy labels rather than a torch tensor.
y_true = y_test.detach().numpy().astype(int)
print(confusion_matrix(y_true, predictions))
print(classification_report(y_true, predictions))

[[962   3]
 [ 19 131]]
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.98      0.87      0.92       150

    accuracy                           0.98      1115
   macro avg       0.98      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115

