In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader, TensorDataset
import nltk   
nltk.download('punkt') 
nltk.download('stopwords')
from nltk.corpus import stopwords       
from nltk.stem import PorterStemmer        
import string
import re

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Preprocessing

In [None]:
# Both files are tab-separated with no header row, so pandas names the columns 0 and 1.
df = pd.read_csv(
  "amazon_cells_labelled.txt",
  header=None,
  sep="\t",
)
df2 = pd.read_csv(
  "yelp_labelled.txt",
  header=None,
  sep="\t",
)
df2.head()

Unnamed: 0,0,1
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [None]:
# Column 0 is read as the sentence text and column 1 as its label.
sentences = df[0].tolist()
sentences2 = df2[0].tolist()
labels = np.array(df[1])
labels2 = np.array(df2[1])

# Merge the two corpora: rows from the Amazon file first, rows from the Yelp file after.
sentences = [*sentences, *sentences2]
labels = np.concatenate((labels, labels2))

In [None]:
# Show the first sentence next to its label.
print(f"{sentences[0]} {labels[0]}")

So there is no way for me to plug it in here in the US unless I go by a converter. tensor([1.])


In [None]:
# Tokenise one sample sentence, drop every token that is not purely alphabetic,
# and lower-case what is left.
words = [
  token.lower()
  for token in nltk.word_tokenize(sentences[6])
  if token.isalpha()
]
words

['if',
 'you',
 'have',
 'several',
 'dozen',
 'or',
 'several',
 'hundred',
 'contacts',
 'then',
 'imagine',
 'the',
 'fun',
 'of',
 'sending',
 'each',
 'of',
 'them',
 'one',
 'by',
 'one']

In [None]:
# `stopwords` is the NLTK corpus reader object, not a collection of words.
# `.words("english")` returns the actual list of English stopwords, which is what
# the membership tests in the following cells need.
stopwords_english = stopwords.words("english")
print(stopwords_english)

# All ASCII punctuation characters as a single string.
punc = string.punctuation
print(punc)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [None]:
# `tokens` was never defined anywhere in the notebook (it only existed as leftover
# kernel state), so build it here to keep the cell runnable on a fresh kernel.
# NOTE(review): the first sentence is assumed to be the intended sample because the
# recorded output of this cell matches it - confirm.
tokens = [
  token.lower()
  for token in nltk.word_tokenize(sentences[0])
  if token.isalpha()
]

# Keep only the tokens that are neither stopwords nor punctuation.
clean_sample = [
  token
  for token in tokens
  if token not in stopwords_english and token not in punc
]

print(clean_sample)

['way', 'plug', 'us', 'unless', 'go', 'converter']


In [None]:
# Reduce every cleaned token to its Porter stem.
stemmer = PorterStemmer()

stems = [stemmer.stem(token) for token in clean_sample]

stems

['way', 'plug', 'us', 'unless', 'go', 'convert']

In [None]:
_ENGLISH_STOPWORDS = None  # filled on first use by _english_stopwords()


def _english_stopwords():
  """Return the NLTK English stopwords as a frozenset, reading the corpus only once."""
  global _ENGLISH_STOPWORDS
  if _ENGLISH_STOPWORDS is None:
    _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
  return _ENGLISH_STOPWORDS


def process_sentence(sentence):
  """Turn a raw sentence into a list of stemmed, lower-cased content words.

  Steps: tokenise with NLTK, keep purely alphabetic tokens, lower-case them,
  drop English stopwords, then apply the Porter stemmer.

  Args:
    sentence: the raw text to process.

  Returns:
    A list of stems in their original order (possibly empty).
  """
  # Only the English list is used. Calling stopwords.words() with no argument
  # returns the stopwords of every language in the corpus, which also removes
  # ordinary English words and re-reads all corpus files on each call.
  stop_set = _english_stopwords()
  stemmer = PorterStemmer()

  tokens = (
    token.lower()
    for token in nltk.word_tokenize(sentence)
    if token.isalpha()
  )
  return [stemmer.stem(token) for token in tokens if token not in stop_set]

# Show a raw sentence followed by its processed form, one per line.
print(sentences[6], process_sentence(sentences[6]), sep="\n")

If you have several dozen or several hundred contacts, then imagine the fun of sending each of them one by one.
['sever', 'dozen', 'sever', 'hundr', 'contact', 'imagin', 'fun', 'send']


In [None]:
def count_words(sentences, sentence_labels=None):
  """Count how often each stem occurs under each label.

  Args:
    sentences: sequence of raw sentences.
    sentence_labels: optional sequence of labels aligned with `sentences`
      (sentence_labels[i] belongs to sentences[i]). When omitted, the
      notebook-level `labels` array is used, which keeps existing
      `count_words(sentences)` calls working.

  Returns:
    A dict mapping (stem, label) to the number of occurrences, where label is
    converted to a plain int.

  Raises:
    ValueError: if there are fewer labels than sentences.
  """
  if sentence_labels is None:
    sentence_labels = labels  # backward-compatible fallback to the notebook-level array

  if len(sentence_labels) < len(sentences):
    raise ValueError(
      "got %d labels for %d sentences" % (len(sentence_labels), len(sentences))
    )

  freqs = {}
  for sentence, label in zip(sentences, sentence_labels):
    # Normalise numpy / float / tensor scalars so keys are always (stem, int).
    label = int(label)
    for word in process_sentence(sentence):
      key = (word, label)
      freqs[key] = freqs.get(key, 0) + 1
  return freqs

# Build the (stem, label) -> count table over every sentence of the combined corpus.
freqs = count_words(sentences)

In [None]:
def count_freqs(sentences, freqs):
  """Build a two-column feature matrix from a (stem, label) -> count table.

  For each sentence, column 0 accumulates the label-0 counts of its stems and
  column 1 accumulates their label-1 counts; stems missing from `freqs`
  contribute nothing.

  Args:
    sentences: sequence of raw sentences.
    freqs: mapping from (stem, label) to a count.

  Returns:
    A float array of shape (len(sentences), 2).
  """
  X = np.zeros((len(sentences), 2))

  for row, sentence in enumerate(sentences):
    for token in process_sentence(sentence):
      X[row, 0] += freqs.get((token, 0), 0)
      X[row, 1] += freqs.get((token, 1), 0)

  return X
  
# Feature matrix: one row per sentence, one column per label's accumulated count.
X = count_freqs(sentences, freqs)
# Turn the flat label vector into a single-column matrix so it can sit next to X.
Y = labels[:, None]

In [None]:
# Place the label column to the right of the two feature columns.
data = np.hstack((X, Y))
data.shape

(2000, 2) (2000, 1)


(2000, 3)

## Logistic Regression

In [None]:
# Fraction of the rows used for training; the rest is held out for testing.
training_ratio = 0.8
training_size = int(data.shape[0]*training_ratio)

# Fixed seed so the split and the DataLoader shuffling are reproducible.
SEED = 42
torch.manual_seed(SEED)

# The rows of `data` are ordered by source file (Amazon first, Yelp second), so
# slicing them as-is would leave the test set with Yelp reviews only. Shuffle the
# rows once before splitting; `data` itself is left untouched.
shuffled_rows = np.random.default_rng(SEED).permutation(data.shape[0])
shuffled_data = data[shuffled_rows]

# NOTE(review): `freqs` was counted over every sentence, including the ones that
# end up in the test split, so the test features have seen the test labels.
# Consider building `freqs` from the training rows only.

# Columns 0-1 are the features, column 2 is the label.
train_X = torch.tensor(shuffled_data[:training_size, :2], dtype=torch.float32)
train_Y = torch.tensor(shuffled_data[:training_size, 2], dtype=torch.float32)
test_X = torch.tensor(shuffled_data[training_size:, :2], dtype=torch.float32)
test_Y = torch.tensor(shuffled_data[training_size:, 2], dtype=torch.float32)

train_set = TensorDataset(train_X, train_Y)
test_set = TensorDataset(test_X, test_Y)

train_loader = DataLoader(train_set, batch_size=8, shuffle=True)
test_loader = DataLoader(test_set)

In [None]:
class LogisticRegression(nn.Module):
  """Logistic regression: one linear unit followed by a sigmoid.

  Args:
    dim: number of input features per sample.
  """

  def __init__(self, dim):
    super().__init__()
    self.fc1 = nn.Linear(in_features=dim, out_features=1)
    self.sigmoid = nn.Sigmoid()

  def forward(self, x):
    """Map a batch of feature rows to one sigmoid output per row."""
    return self.sigmoid(self.fc1(x))

# Two input features: the accumulated label-0 and label-1 counts.
model = LogisticRegression(dim=2)
# Smoke-test the untrained model on a single made-up feature row.
model(torch.tensor([[5.0, 5.0]]))

tensor([[0.3168]], grad_fn=<SigmoidBackward>)

In [None]:
num_epoch = 500
losses = []       # summed batch loss of each epoch
accuracies = []
learning_rate = 0.00001
optimizer = optim.SGD(model.parameters(), lr=learning_rate)
criterion = nn.BCELoss()

model.train()
for epoch in range(num_epoch):
  train_loss = 0.0
  # The batch targets are deliberately NOT called `labels`: that name belongs to
  # the notebook-level label array used by the preprocessing cells, and rebinding
  # it here would break those cells when they are re-run.
  for features, batch_labels in train_loader:
    # BCELoss needs targets with the same shape as the model output.
    batch_labels = batch_labels.unsqueeze(1)

    optimizer.zero_grad()

    outputs = model(features)

    loss = criterion(outputs, batch_labels)

    loss.backward()

    optimizer.step()

    train_loss += loss.item()

  losses.append(train_loss)

  # Report the mean batch loss every tenth epoch.
  if epoch%10 == 0:
    print("Epoch: ", epoch, "\t Loss: ", train_loss/len(train_loader))

Epoch:  0 	 Loss:  27.095404286384582
Epoch:  10 	 Loss:  23.403183987736703
Epoch:  20 	 Loss:  17.22718613266945
Epoch:  30 	 Loss:  11.244405031204224
Epoch:  40 	 Loss:  0.5010147988796234
Epoch:  50 	 Loss:  0.4849327639490366
Epoch:  60 	 Loss:  0.48357997395098207
Epoch:  70 	 Loss:  0.48300232164561746
Epoch:  80 	 Loss:  0.48290435910224916
Epoch:  90 	 Loss:  0.48301192104816437
Epoch:  100 	 Loss:  0.48280226476490495
Epoch:  110 	 Loss:  0.4824036371707916
Epoch:  120 	 Loss:  0.48308413103222847
Epoch:  130 	 Loss:  0.4828398561477661
Epoch:  140 	 Loss:  0.4829742429405451
Epoch:  150 	 Loss:  0.4828833883628249
Epoch:  160 	 Loss:  0.4830832676962018
Epoch:  170 	 Loss:  0.4823551890999079
Epoch:  180 	 Loss:  0.48314685840159655
Epoch:  190 	 Loss:  0.4826016634702682
Epoch:  200 	 Loss:  0.4815862651541829
Epoch:  210 	 Loss:  0.4826547375693917
Epoch:  220 	 Loss:  0.48263112440705297
Epoch:  230 	 Loss:  0.4828855089098215
Epoch:  240 	 Loss:  0.4827847657725215
Epoc

In [None]:
# Persist only the learned parameters (the state dict), not the module object.
torch.save(obj=model.state_dict(), f="./LogisticReg1.pth")

In [None]:
test_loss = 0.0
accuracy = 0.0  # running count of correctly classified samples

# Evaluation needs no gradients; no_grad avoids building the autograd graph.
# The model only has a linear layer and a sigmoid, so eval() does not change its output.
model.eval()
with torch.no_grad():
  for feature, label in test_loader:
    # Match the shape of the model output for the loss and the comparison.
    label = label.unsqueeze(1)
    output = model(feature)

    test_loss += criterion(output, label).item()

    # Threshold the sigmoid output at 0.5 to get a hard 0/1 prediction.
    predicted = (output >= 0.5)*1.0

    # Summing makes the count correct for any batch size, not just 1.
    accuracy += (predicted == label).sum().item()

# Loss is averaged per batch, accuracy per sample; max(..., 1) guards an empty loader.
num_batches = max(len(test_loader), 1)
num_samples = max(len(test_loader.dataset), 1)
print("Loss: ", test_loss/num_batches, "\tAccuracy: ", (accuracy/num_samples)*100, "%")

Loss:  0.47895539552283944 	Accuracy:  81.0


In [None]:
def predict(sentence):
  """Classify a single sentence with the trained model.

  Args:
    sentence: the raw text to classify.

  Returns:
    A (probability, prediction) pair of floats. `prediction` is 1.0 when the
    model output is at least 0.5 and 0.0 otherwise; `probability` is the
    probability the model assigns to that predicted class.
  """
  features = torch.Tensor(count_freqs([sentence], freqs))
  positive_prob = model(features)

  label = (positive_prob >= 0.5)*1.0

  # For a negative prediction, report the probability of the negative class.
  confidence = positive_prob if label else 1-positive_prob

  return confidence.item(), label.item()

prob, pred = predict("This product is great however it is a little overpriced")

# `pred` is 1.0 for a positive prediction and 0.0 for a negative one.
if pred:
  print("Predicted positive with probability: ", prob)
else:
  print("Predicted negative with probability: ", prob)


Predicted positive with probability:  0.997711181640625
