# 1. Import libraries and dataset

## Import dataset from google drive

In [5]:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

id = '1t4GfxnNQr4BDD8Ob6TpuhvkiInLn2iaW'
downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile('FinalBalancedDataset.csv')

## Import libraries

In [None]:
!pip install torchtext
!pip install pytorch-nlp

In [86]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [74]:
import re
import nltk
from nltk.stem.porter import *
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.autograd import Variable
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from torchnlp.encoders.text import StaticTokenizerEncoder, stack_and_pad_tensors, pad_tensor


In [75]:
df = pd.read_csv('FinalBalancedDataset.csv')

In [8]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Data pre-processing

## Data information

In [9]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,Toxicity,tweet
0,0,0,@user when a father is dysfunctional and is s...
1,1,0,@user @user thanks for #lyft credit i can't us...
2,2,0,bihday your majesty
3,3,0,#model i love u take with u all the time in ...
4,4,0,factsguide: society now #motivation


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56745 entries, 0 to 56744
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  56745 non-null  int64 
 1   Toxicity    56745 non-null  int64 
 2   tweet       56745 non-null  object
dtypes: int64(2), object(1)
memory usage: 1.3+ MB


## Data pre-processing

Seems like this dataset doesnt need to do anything...

In [76]:
df.drop('Unnamed: 0', axis=1, inplace = True)

In [77]:
df['Clean'] = df.apply(lambda row: row['tweet'].lower(),axis=1)
df['Clean'] = df.apply(lambda row: re.sub("@[A-Za-z0-9_]+","", row['Clean']),axis=1) #remove @user
df['Clean'] = df.apply(lambda row: re.sub("#[A-Za-z0-9_]+","", row['Clean']),axis=1) #remove hashtags
df['Clean'] = df.apply(lambda row: re.sub('[()!?]'," ", row['Clean']),axis=1)
df['Clean'] = df.apply(lambda row: re.sub('\[.*?\]'," ", row['Clean']),axis=1)
df['Clean'] = df.apply(lambda row: re.sub("[^a-z0-9]"," ", row['Clean']),axis=1)

In [78]:
df.head(20)

Unnamed: 0,Toxicity,tweet,Clean
0,0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so sel...
1,0,@user @user thanks for #lyft credit i can't us...,thanks for credit i can t use cause they do...
2,0,bihday your majesty,bihday your majesty
3,0,#model i love u take with u all the time in ...,i love u take with u all the time in ur ...
4,0,factsguide: society now #motivation,factsguide society now
5,0,[2/2] huge fan fare and big talking before the...,huge fan fare and big talking before they le...
6,0,@user camping tomorrow @user @user @user @use...,camping tomorrow danny
7,0,the next school year is the year for exams.ð...,the next school year is the year for exams ...
8,0,we won!!! love the land!!! #allin #cavs #champ...,we won love the land
9,0,@user @user welcome here ! i'm it's so #gr...,welcome here i m it s so


In [79]:
def words_to_int(words,all_words):
    new_words = []
    for word in words:
        new_words.append(word)
    list_of_os = np.zeros(len(all_words))
    for i in range(len(all_words)):
        if all_words[i] in new_words:
            list_of_os[i] = 1.0
    return list_of_os

In [87]:
all_words = []
tags = []

stemmer = PorterStemmer()

for x_iter,y_iter in tqdm(zip(df['Clean'],df['Toxicity'])):
    x_iter = nltk.word_tokenize(x_iter)
    new_x_iter = []
    for x_iter_i in x_iter:
        new_x_iter.append(stemmer.stem(x_iter_i))
    all_words.extend(new_x_iter)
    tags.append(y_iter)

56745it [00:18, 3078.64it/s]


In [88]:
all_words[:10]

['when', 'a', 'father', 'is', 'dysfunct', 'and', 'is', 'so', 'selfish', 'he']

In [None]:
new_X = []
new_y = []

for X_iter,y_iter in tqdm(zip(df['Clean'],df['Toxicity'])):
    new_X.append(words_to_int(X_iter,all_words))
    new_y.append(tags.index(y_iter))

In [None]:
new_X[:10]

In [64]:
x_train, x_test, y_train, y_test = train_test_split(np.array(new_X), np.array(new_y), test_size = .3, 
                                                    shuffle=True, random_state = 42)
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((39721,), (39721,), (17024,), (17024,))

In [65]:
X_train = torch.from_numpy(x_train).to(device)
y_train = torch.from_numpy(y_train).to(device)
X_test = torch.from_numpy(x_test).to(device)
y_test = torch.from_numpy(y_test).to(device)

TypeError: ignored

## RNN

In [26]:
class RNNModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
        super(RNNModel, self).__init__()
        
        # Number of hidden dimensions
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim
        
        # RNN
        self.rnn = nn.RNN(input_dim, hidden_dim, layer_dim, batch_first=True, nonlinearity='relu')
        
        # Readout layer
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        
        # Initialize hidden state with zeros
        h0 = Variable(torch.zeros(self.layer_dim, x.size(0), self.hidden_dim))
            
        # One time step
        out, hn = self.rnn(x, h0)
        out = self.fc(out[:, -1, :]) 
        return out


In [28]:
model = RNNModel()

TypeError: ignored

In [None]:
batch_size = 100
n_iters = 8000
num_epochs = int(n_iters / (len(x_train) / batch_size))

