# Importing the libraries

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import spacy
import random
import tqdm

# Importing the dataset

In [None]:
dataset = pd.read_csv('../input/sms_spam.csv')

In [237]:
dataset.head()

Unnamed: 0,type,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# Encoding outputs

In [None]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
types = label_encoder.fit_transform(dataset['type'])

# Splitting the dataset

In [None]:
from sklearn.model_selection import train_test_split
train_x, test_x, train_y, test_y = train_test_split(dataset['text'], types, test_size = 0.1, random_state = 1 )

In [None]:
train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size = 0.1, random_state = 1)

In [239]:
len(train_x), len(test_x), len(val_x)

(4062, 558, 452)

# Defining Model

In [None]:
class SpamClassifier(nn.Module):
    def __init__(self, embedding_size, hidden_size, output_size, batch_size):
        super(SpamClassifier, self).__init__()
        
        self.batch_size = batch_size
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(embedding_size, hidden_size)
        self.linear = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, embeddings, hidden):
        output, hidden = self.lstm(embeddings.view(len(inputs), self.batch_size, -1))
        output = self.linear(output)
        output = self.sigmoid(output)
        
        return output, hidden
        
    def init_hidden(self):
        return (torch.zeros(1, self.batch_size, self.hidden_size), torch.zeros(1, self.batch_size, self.hidden_size))
        

# Preparing data

In [None]:
nlp = spacy.load('en_core_web_lg')

In [None]:
processed_text = []
for course in dataset['text']:
    doc = nlp(course.lower())
    words = [word.lemma_ for word in doc if not word.is_punct | word.is_space | word.is_stop]
    processed_text.append(words)

In [240]:
print(len(processed_text))
for i in range(5):
    print(processed_text[i])


5574
['jurong', 'point', 'crazy', 'available', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'get', 'amore', 'wat']
['ok', 'lar', 'joke', 'wif', 'u', 'oni']
['free', 'entry', '2', 'wkly', 'comp', 'win', 'fa', 'cup', 'final', 'tkts', '21st', '2005', 'text', 'fa', '87121', 'receive', 'entry', 'question(std', 'txt', 'rate)t&c', 'apply', '08452810075over18']
['u', 'dun', 'early', 'hor', 'u', 'c']
['nah', 'think', 'go', 'usf', 'live']


#### Creating sequences

In [None]:
def seq(sent):
    inputs = []
    for word in sent:
        inputs.append(list(nlp(word).vector))
        
    return inputs

# Training

In [None]:
EMBEDDING_SIZE = 300
HIDDEN_SIZE = 100
OUTPUT_SIZE = 1
BATCH_SIZE = 1

model = SpamClassifier(EMBEDDING_SIZE, HIDDEN_SIZE, OUTPUT_SIZE, BATCH_SIZE)

In [241]:
model

SpamClassifier(
  (lstm): LSTM(300, 100)
  (linear): Linear(in_features=100, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [None]:
lr = 1e-3
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr)

In [None]:
with torch.no_grad():
    inputs = seq(processed_text[0])
    inputs = torch.tensor(inputs, dtype=torch.float)
    hidden = model.init_hidden()
    outputs, hidden = model(inputs,hidden)
    print(outputs[-1])

In [None]:
train_loss = []
val_loss = []

In [None]:
EPOCHS = 1
hidden = model.init_hidden()

for epoch in range(EPOCHS):
    for i in range(len(train_x)):
        inputs = seq(processed_text[i])
        if len(inputs) >= 1:
            inputs = torch.tensor(inputs, dtype=torch.float)       
            target = types[i]
            target = torch.tensor(target, dtype=torch.float)
            outputs, hidden = model(inputs, hidden)
            loss = loss_fn(outputs[-1].squeeze(), target)
            train_loss.append(loss.item())
            model.zero_grad()
            loss.backward()
            optimizer.step()
    print(loss.item())
    for i in range()


In [None]:
y_pred = []
y_actual = []
ids = []
test_processed = processed_text[-len(test_x):]
test_types = types[-len(test_x):]
for i in range(500):
    inputs = seq(test_processed[i])
    inputs = torch.tensor(inputs, dtype=torch.float)
    targets = test_types[i]
    targets = torch.tensor(targets, dtype=torch.float)
    outputs, h = model(inputs, hidden)
    y_pred.append(outputs[-1].squeeze().item())
    y_actual.append(targets)
    

In [None]:
y_pred = [1 if i>=0.5 else 0 for i in y_pred]

In [236]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_pred, y_actual)

array([[433,   8],
       [  2,  57]])