In [2]:
from transformers import pipeline
import pandas as pd
from datasets import load_dataset
from datasets import Dataset

from transformers import pipeline
import pandas as pd
import boto3
import numpy as np
from collections import Counter
import os
import random

import re, string
from typing import Dict

import torch, torchtext
import torchvision.models as models
from torchtext.data.utils import get_tokenizer
from torch.utils.data import TensorDataset, DataLoader
from torchtext.vocab import GloVe
from torch import nn, optim
from torch.nn import Module, Embedding, LSTM, RNN, GRU, Linear, Sequential, Dropout
from torch.nn.functional import sigmoid, relu, elu, tanh
from torch.nn import CrossEntropyLoss
from torch.optim import Adam
from torch.nn.utils.rnn import PackedSequence

from transformers import AutoTokenizer

from tqdm import tqdm

SEED = 1234
N_SAMPLES = 10_000

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score, roc_auc_score

import time

import warnings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 500)
pd.options.mode.chained_assignment = None
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
def seed_everything(seed=SEED):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything()

In [4]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Today I'm going to use {device.type}")

Today I'm going to use cuda


In [5]:
classifier = pipeline("sentiment-analysis")
classifier("We are very happy to show you the 🤗 Transformers library.")

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


[{'label': 'POSITIVE', 'score': 0.9997795224189758}]

In [6]:
chunks = pd.read_csv("../data/toxic_data.csv", chunksize=100000)
df = pd.concat(chunks)
df.head()

Unnamed: 0,id,comment_text,split,created_date,publication_id,parent_id,article_id,rating,funny,wow,sad,likes,disagree,toxicity,severe_toxicity,obscene,sexual_explicit,identity_attack,insult,threat,male,female,transgender,other_gender,heterosexual,homosexual_gay_or_lesbian,bisexual,other_sexual_orientation,christian,jewish,muslim,hindu,buddhist,atheist,other_religion,black,white,asian,latino,other_race_or_ethnicity,physical_disability,intellectual_or_learning_disability,psychiatric_or_mental_illness,other_disability,identity_annotator_count,toxicity_annotator_count
0,1083994,He got his money... now he lies in wait till after the election in 2 yrs.... dirty politicians need to be afraid of Tar and feathers again... but they aren't and so the people get screwed.,train,2017-03-06 15:21:53.675241+00,21,,317120,approved,0,0,0,2,0,0.373134,0.044776,0.089552,0.014925,0.0,0.343284,0.014925,,,,,,,,,,,,,,,,,,,,,,,,,0,67
1,650904,Mad dog will surely put the liberals in mental hospitals. Boorah,train,2016-12-02 16:44:21.329535+00,21,,154086,approved,0,0,1,2,0,0.605263,0.013158,0.065789,0.013158,0.092105,0.565789,0.065789,,,,,,,,,,,,,,,,,,,,,,,,,0,76
2,5902188,And Trump continues his lifelong cowardice by not making this announcement himself.\n\nWhat an awful human being .....,train,2017-09-05 19:05:32.341360+00,55,,374342,approved,1,0,2,3,7,0.666667,0.015873,0.031746,0.0,0.047619,0.666667,0.0,,,,,,,,,,,,,,,,,,,,,,,,,0,63
3,7084460,"""while arresting a man for resisting arrest"".\n\nIf you cop-suckers can't see a problem with this, then go suck the barrel of a Glock.",test,2016-11-01 16:53:33.561631+00,13,,149218,approved,0,0,0,0,0,0.815789,0.065789,0.552632,0.592105,0.0,0.684211,0.105263,,,,,,,,,,,,,,,,,,,,,,,,,0,76
4,5410943,Tucker and Paul are both total bad ass mofo's.,train,2017-06-14 05:08:21.997315+00,21,,344096,approved,0,0,0,1,0,0.55,0.0375,0.3375,0.275,0.0375,0.4875,0.0,,,,,,,,,,,,,,,,,,,,,,,,,0,80


In [7]:
df['comment_text'] = df['comment_text'].fillna("")
identity_columns = ['male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish', 'muslim', 'black', 'white', 'psychiatric_or_mental_illness']
for col in identity_columns + ['toxicity']:
    df.loc[:, col] = np.where(df[col] >= 0.5, True, False)

In [8]:
train_df = df[df['split'] == 'train']
test_df = df[df['split'] != 'train']

In [9]:
def preprocess(df):
    cols_to_remove = [col for col in df.columns if col not in ['comment_text', 'toxicity']]
    dfs = Dataset.from_pandas(df)
    dfs = dfs.remove_columns(cols_to_remove)
    dfs = dfs.rename_column("toxicity", "labels")
    dfs = dfs.remove_columns('__index_level_0__')
    
    return dfs

In [10]:
# sample = train_df.sample(N_SAMPLES, random_state=SEED, ignore_index=True)
train_text, val_text  = train_test_split(train_df, test_size=0.2, random_state=SEED)
# train_text, train_label = sample['comment_text'], sample['toxicity']
test_text = test_df
train_text.shape, val_text.shape, test_text.shape

((1443900, 46), (360975, 46), (194641, 46))

In [11]:
train_text = preprocess(train_text)
val_text = preprocess(val_text)
test_text = preprocess(test_text)

In [17]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_function(examples):
    return tokenizer(examples["comment_text"], padding="max_length", truncation=True)

In [None]:
%%time

train_tokenized = train_text.map(tokenize_function, batched=True)
val_tokenized = val_text.map(tokenize_function, batched=True)
test_tokenized = test_text.map(tokenize_function, batched=True)

  0%|          | 0/1444 [00:00<?, ?ba/s]

In [19]:
train_tokenized

NameError: name 'train_tokenized' is not defined

In [13]:
train_tokenized = train_tokenized.remove_columns('comment_text').set_format("torch")
val_tokenized = val_tokenized.remove_columns('comment_text').set_format("torch")
test_tokenized = test_tokenized.remove_columns('comment_text').set_format("torch")

NameError: name 'train_tokenized' is not defined

In [None]:
small_train_dataset = train_tokenized.shuffle(seed=SEED).select(range(N_SAMPLES))
small_eval_dataset = val_tokenized.shuffle(seed=SEED).select(range(N_SAMPLES))

In [None]:
torch.cuda.empty_cache()

In [None]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=16)
val_dataloader = DataLoader(small_eval_dataset, batch_size=16)