Importing data

In [1]:
import pandas as pd
import numpy as np
train_data = np.array(pd.read_csv('train.csv'))
test_data = np.array(pd.read_csv('test.csv'))

Preprocessing data
- tokenisation
- removing un-needed word
- lowering, removing punctutaion
- removing stop words
- stemming (converting word to root word)
- taking out the unique list of words

In [2]:
import nltk
nltk.download("stopwords")
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from gensim.models import Word2Vec

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
#removes punctuation from our tokenised dataset
def tokenize(sentence):
  tokenizer = RegexpTokenizer(r'\w+')
  return tokenizer.tokenize(sentence)

# .lowering and .stemming
from nltk.stem import PorterStemmer
ps = PorterStemmer()

def stemmingText(tokenizedSen):
  stemmedSen = []
  for word in tokenizedSen:
    stemmedSen.append(ps.stem(word.lower()))
  return stemmedSen

In [4]:
def preprocessingData(temp_data):
  # we need a training corpus
  all_word = []
  all_sentences = []
  tags = []

  for sen,tag,x,y in temp_data:
      temp = tokenize(sen) #our tokenised sentence
      if 'URL' in temp: #removing unnecessary words
          temp.remove('URL')
      without_stop_words = []
      for word  in temp:
          if word not in stopwords.words('english'):
              without_stop_words.append(word)
      temp = without_stop_words
      temp2 = stemmingText(temp) #after stemming
      all_word.extend(temp2)#final words are stemmed
      tags.append(tag)
      all_sentences.append(temp2)
          
  all_word = sorted(set(all_word))
  return (all_word,all_sentences,tags)

In [5]:
pre_processed_training_data = preprocessingData(train_data)

Bag of Words

In [6]:
def bagOfWords(tokenizedSentence,allWords):
    tempVector = []   
    for word in allWords:
      if word in tokenizedSentence:
        tempVector.append(1)
      else:
        tempVector.append(0)
    return tempVector

In [7]:
all_word , all_sentences , tags = pre_processed_training_data 

In [8]:
def bag_of_words(all_sentences,all_word):
  final_training_data = []
  for sen in all_sentences:
    temp = bagOfWords(sen,all_word)
    final_training_data.append(np.array(temp))
  return final_training_data

In [9]:
final_training_data = bag_of_words(all_sentences,all_word)

In [29]:
type(final_training_data[0])

numpy.ndarray

Binary Classification of sexist and not sexist</br>
[1 0] - not sexist</br>
[0 1] - sexist

In [10]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
final_training_tag = np.array(ct.fit_transform(np.array(tags).reshape(-1,1)))

Dataset and Dataloader class 

In [11]:
import torch
import torchvision
from torch.utils.data import Dataset,DataLoader
import math

In [12]:
class SentimentDataset(Dataset):
  def __init__(self,final_data,final_tag):
    self.x = torch.from_numpy(np.array(final_data))
    self.y = torch.from_numpy(np.array(final_tag))
    self.n_samples = len(self.x)

  def __getitem__(self,index):
    return (self.x[index] , self.y[index])
  
  def __len__(self):
    return self.n_samples

In [13]:
batch_size = 32
train_dataset = SentimentDataset(final_training_data,final_training_tag)
train_dataloader = DataLoader(dataset=train_dataset , batch_size = batch_size, shuffle =True, num_workers=2)

In [30]:
dataiter = iter(train_dataloader)
data = dataiter.next()
features,labels = data
print(features.shape)

torch.Size([32, 11290])


FeedForward

In [15]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device available for running: ")
print(device)

Device available for running: 
cuda


In [16]:
#hyper - parameters
input_size = len(final_training_data[0])
hidden_size = 100
num_classes = 2
num_epochs = 100
batch_size = 32
learning_rate = 0.001
total_samples = len(train_dataset)
num_iterations = total_samples/batch_size

In [17]:
import torch.nn as nn

In [18]:
class NeuralNet(nn.Module):
  def __init__(self,input_size,hidden_size,num_classes):
    super(NeuralNet,self).__init__()
    self.l1 = nn.Linear(input_size,hidden_size)
    self.relu = nn.ReLU()
    self.l2 = nn.Linear(hidden_size,num_classes)
    self.softmax = nn.Softmax(dim=1)

  def forward(self,x):
    out = self.l1(x)
    out = self.relu(out)
    out = self.l2(out)
    out = self.softmax(out)
    return out

In [19]:
model = NeuralNet(input_size,hidden_size,num_classes)

In [20]:
#loss and optimizer
criterion = nn.CrossEntropyLoss()
#finally CrossEntropy loss applies the softmax function
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

Training

In [21]:
for epoch in range(num_epochs):
  for i, (inputs, labels) in enumerate(train_dataloader):
    #forward pass
    outputs = model(inputs.float())
    # print(outputs)
    loss = criterion(outputs,labels.float())

    if (i+1)/250 >= 1:
      outputs = (outputs>0.5).float()
      correct = (outputs == labels.float()).float()
      sum = 0
      for List in correct:
        if 0 not in List:
          sum+=1
      print(f'epoch {epoch+1}/{num_epochs} loss = {loss.item():.4f} accuracy = {(sum/outputs.shape[0])*100} %')
      
    #backward pass
    optimizer.zero_grad()
    loss.backward() #back propgation
    optimizer.step() #update weights for us

epoch 1/100 loss = 0.5810 accuracy = 71.875 %
epoch 2/100 loss = 0.5052 accuracy = 81.25 %
epoch 3/100 loss = 0.3764 accuracy = 93.75 %
epoch 4/100 loss = 0.4093 accuracy = 90.625 %
epoch 5/100 loss = 0.3913 accuracy = 93.75 %
epoch 6/100 loss = 0.3186 accuracy = 100.0 %
epoch 7/100 loss = 0.4136 accuracy = 90.625 %
epoch 8/100 loss = 0.3514 accuracy = 96.875 %
epoch 9/100 loss = 0.3168 accuracy = 100.0 %
epoch 10/100 loss = 0.3473 accuracy = 96.875 %
epoch 11/100 loss = 0.3760 accuracy = 93.75 %
epoch 12/100 loss = 0.3448 accuracy = 96.875 %
epoch 13/100 loss = 0.3156 accuracy = 100.0 %
epoch 14/100 loss = 0.3449 accuracy = 96.875 %
epoch 15/100 loss = 0.3449 accuracy = 96.875 %
epoch 16/100 loss = 0.3139 accuracy = 100.0 %
epoch 17/100 loss = 0.3136 accuracy = 100.0 %
epoch 18/100 loss = 0.3135 accuracy = 100.0 %
epoch 19/100 loss = 0.3134 accuracy = 100.0 %
epoch 20/100 loss = 0.3447 accuracy = 96.875 %
epoch 21/100 loss = 0.3434 accuracy = 96.875 %
epoch 22/100 loss = 0.3149 accura

Testing

In [22]:
T,test_all_sentences,test_tags = preprocessingData(test_data)

In [23]:
final_testing_data = bag_of_words(test_all_sentences,all_word)
final_testing_tags = np.array(ct.transform(np.array(test_tags).reshape(-1,1))) 

In [24]:
test_dataset = SentimentDataset(final_testing_data,final_testing_tags)
test_dataloader = DataLoader(dataset=test_dataset , batch_size = batch_size, shuffle =True, num_workers=2)

In [25]:
dataiter = iter(test_dataloader)
data = dataiter.next()
features,labels = data

In [26]:
sum = 0
for (features ,labels) in test_dataloader:
  predictions = model(features.float())
  predictions = (predictions>0.5).float()
  correct = (predictions == labels.float()).float()
  for List in correct:
        if 0 not in List:
          sum+=1
print(f'testing accuracy {(sum/len(test_dataset))*100}')

testing accuracy 77.25
