# Check Memory in colab

In [0]:
# memory footprint support libraries/code
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!pip install gputil
!pip install psutil
!pip install humanize
!pip install transformers

Collecting gputil
  Downloading https://files.pythonhosted.org/packages/ed/0e/5c61eedde9f6c87713e89d794f01e378cfd9565847d4576fa627d758c554/GPUtil-1.4.0.tar.gz
Building wheels for collected packages: gputil
  Building wheel for gputil (setup.py) ... [?25l[?25hdone
  Created wheel for gputil: filename=GPUtil-1.4.0-cp36-none-any.whl size=7413 sha256=b3e7f75c8e534994b7abb8903c26156c052554476784e46b77c23512a2d08ae5
  Stored in directory: /root/.cache/pip/wheels/3d/77/07/80562de4bb0786e5ea186911a2c831fdd0018bda69beab71fd
Successfully built gputil
Installing collected packages: gputil
Successfully installed gputil-1.4.0
Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/37/ba/dda44bbf35b071441635708a3dd568a5ca6bf29f77389f7c7c6818ae9498/transformers-2.7.0-py3-none-any.whl (544kB)
[K     |████████████████████████████████| 552kB 3.4MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/74/f4/2d5214cbf13d06e7cb2c20d84115ca25b

In [0]:
import psutil
import humanize
import os
import GPUtil as GPU
GPUs = GPU.getGPUs()

# XXX: only one GPU on Colab and isn’t guaranteed
gpu = GPUs[0]
def printm():
  process = psutil.Process(os.getpid())
  print("Gen RAM Free: " + humanize.naturalsize( psutil.virtual_memory().available ), " | Proc size: " + humanize.naturalsize( process.memory_info().rss))
  print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))

printm()

Gen RAM Free: 26.3 GB  | Proc size: 158.4 MB
GPU RAM Free: 16280MB | Used: 0MB | Util   0% | Total 16280MB


# Importations



In [0]:
import os
import torch
import pandas as pd
from scipy import stats
import numpy as np

from tqdm import tqdm
from collections import OrderedDict, namedtuple
import torch.nn as nn
from torch.optim import lr_scheduler
import joblib

import logging
import transformers
from transformers import AdamW, get_linear_schedule_with_warmup, get_constant_schedule, XLMRobertaTokenizer, XLMRobertaModel, XLMRobertaConfig
import sys
from sklearn import metrics, model_selection
from sklearn.model_selection import train_test_split


# transformers
from transformers import PreTrainedModel, PreTrainedTokenizer, PretrainedConfig

from transformers import XLNetModel, BertTokenizer, BertConfig
from transformers import RobertaModel, RobertaTokenizer, RobertaConfig
from transformers import XLNetModel, XLNetTokenizer, XLNetConfig
from transformers import DistilBertModel, DistilBertTokenizer, DistilBertConfig
from transformers import GPT2Model, GPT2Tokenizer, GPT2Config


# Datasets (I hope that this is done ) 



- BERT: [CLS] + tokens + [SEP] + padding

- DistilBERT: [CLS] + tokens + [SEP] + padding

- RoBERTa: [CLS] + prefix_space + tokens + [SEP] + padding

- XLM: [CLS] + tokens + [SEP] + padding

- XLNet: padding + tokens + [SEP] + [CLS]

In [0]:
class Dataset_class() : 
  def __init__ (self, comment_text ,tokenizer ,MAX_Len ,targets=None ,train=True ,index=None ,model_name='bert') : 
    self.model_type = model_name
    self.comment_text = comment_text 
    self.tokenizer = tokenizer 
    self.max_length = MAX_Len  
    self.targets = targets 
    self.index = index 
    self.train = train
  def __len__(self) :

    return len(self.comment_text)

  def __getitem__(self,item) : 

    comment_text = str(self.comment_text[item])
    comment_text = " ".join(comment_text.split())

    CLS = self.tokenizer.cls_token
    SEP = self.tokenizer.sep_token

    if self.model_type in ['roberta']:

      tokens = self.tokenizer.tokenize(comment_text, add_prefix_space=True)[:self.max_length - 2]
      tokens = [CLS] + tokens + [SEP]

    else:

      tokens = self.tokenizer.tokenize(comment_text)[:self.max_length - 2]

      if self.model_type in ['xlnet']:
          tokens = tokens + [SEP] +  [CLS]

      else:
          tokens = [CLS] + tokens + [SEP]

    input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
    segment_ids = [0] * len(tokens)
    input_mask = [1] * len(input_ids)
    padding = [0] * (self.max_length - len(input_ids))
    input_ids += padding
    input_mask += padding
    segment_ids += padding
    if self.train == True : 
      return {
                'ids': torch.tensor(input_ids, dtype=torch.long),
                'mask': torch.tensor(input_mask, dtype=torch.long),
                'seg_id': torch.tensor(segment_ids, dtype=torch.long),
                'targets' : torch.tensor(self.targets[item], dtype=torch.float)            }
    else: 
      return {
                'ids': torch.tensor(input_ids, dtype=torch.long),
                'mask': torch.tensor(input_mask, dtype=torch.long),
                'seg_id': torch.tensor(segment_ids, dtype=torch.long),
                'id' : self.index[item]       }
            


In [0]:
train = pd.read_csv('/content/drive/My Drive/comment/train_clean1.csv')
train.head()

Unnamed: 0,comment_text,toxic
0,ExplanationWhy the edits made under my usernam...,0
1,D'aww ! He matches this background colour I am...,0
2,"Hey man , I am really not trying to edit war ....",0
3,`` MoreI can not make any real suggestions on ...,0
4,"You , sir , are my hero . Any chance you remem...",0


In [0]:
tokenizer = transformers.BertTokenizer.from_pretrained('/content/drive/My Drive/models') 
data_set = Dataset_class(train.comment_text,tokenizer,128,train.toxic,model_name='roberta')

In [0]:
data_set[0]

{'ids': tensor([  101, 72997, 10650, 17427, 10103, 25635, 10107, 11050, 10533, 11153,
         24934, 23993, 34023, 46671, 10923, 10342, 58831, 50944,   136, 10578,
         10342, 10497, 91299, 30164, 12932,   117, 12125, 61091, 10125, 10970,
         14524, 10515,   151, 33759, 10160, 10246, 10560, 62532, 72010,   119,
         10110, 38881, 10154, 10497, 48107, 10103, 79947, 20849, 10195, 10103,
         20220, 13524, 11500,   151, 10345, 18162, 11628,   119, 12844,   119,
         20426,   119, 11330,   119, 10377,   102,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,  

In [0]:
class BertDatasetTest:
    def __init__(self,df):
        self.comment_text = df.comment_text
        self.tokenizer = tokenizer
        self.max_length = Max_len
        self.id = df['id'].values

    def __len__(self):
        return len(self.comment_text)

    def __getitem__(self, item):
        comment_text = str(self.comment_text[item])
        comment_text = " ".join(comment_text.split())

        inputs = self.tokenizer.encode_plus(
            comment_text,
            None,
            add_special_tokens=True,
            max_length=self.max_length,
        )
        ids = inputs["input_ids"]
        token_type_ids = inputs["token_type_ids"]
        mask = inputs["attention_mask"]
        
        padding_length = self.max_length - len(ids)
        
        ids = ids + ([0] * padding_length)
        mask = mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([0] * padding_length)
        
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'id': self.id[item]
        }

# Models Class


In [0]:
class XLMRClassification(nn.Module):
  def __init__(self):
    super().__init__()
    self.linear_1 = nn.Linear(768, 768)
    self.classification_head = nn.Linear(768, 1)
    self.model = XLMRobertaModel.from_pretrained('xlm-roberta-base')
    self.dropout = nn.Dropout(0.3)
    # initializing classification head
    self.classification_head.weight.data.normal_(mean=0.0, std=0.04)
  def forward(self, inputs_ids ):
    transformer_out, _ = self.model(inputs_ids, features_only=True)
    out_1 = F.relu(self.linear_1(transformer_out))
    out_1 = self.dropout(out_1)
    logits = self.classification_head(out_1)
    return logits

In [0]:
class XLNetForJigSaw(XLNetPreTrainedModel):
    def __init__(self, config, out_dim):
        
        super(XLNetForJigSaw, self).__init__(config)
        self.attn_type = config.attn_type
        self.same_length = config.same_length
        self.summary_type = "last"

        self.transformer = XLNetModel('xlnet-base-cased', output_attentions=False, keep_multihead_output=False)
        self.dense = nn.Linear(config.d_model, config.d_model)
        self.activation = nn.Tanh()
        self.linear = nn.Linear(config.d_model, out_dim, bias=True)
        self.apply(self.init_xlnet_weights)

    def forward(self, input_ids, seg_id=None, input_mask=None,
                mems=None, perm_mask=None, target_mapping=None, inp_q=None,
                target=None, output_all_encoded_layers=True, head_mask=None, **kargs):

        output, hidden_states, new_mems = self.transformer(input_ids, seg_id, input_mask,
                                            mems, perm_mask, target_mapping, inp_q,
                                            output_all_encoded_layers, head_mask)
        first_token_tensor = output[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)

        return self.linear(pooled_output)

# Engine GPU

In [0]:
def loss_fn(outputs, targets):
    return nn.BCEWithLogitsLoss()(outputs, targets.view(-1, 1))

In [0]:
def train_fn(data_loader, model, optimizer, scheduler):
    model.train()

    for bi, d in tqdm(enumerate(data_loader), total=len(data_loader)):
        ids = d["ids"]

        ids = ids.to(device, dtype=torch.long)
     
        targets = targets.to(device, dtype=torch.float)

        optimizer.zero_grad()
        outputs = model(ids)
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()

In [0]:
def eval_fn(data_loader, model):

    model.eval()
    fin_targets = []
    fin_outputs = []
    with torch.no_grad():
        for bi, d in tqdm(enumerate(data_loader), total=len(data_loader)):
            input_ids = d["input_ids"]
        

            
            targets = d["targets"]
            input_ids = input_ids.to(device, dtype=torch.long)

            mask = mask.to(device, dtype=torch.long)
      

            outputs = model(
                input_ids
                )
            targets = targets.cpu().detach().numpy().tolist()
            outputs = outputs.cpu().detach().numpy().tolist()
            fin_targets.extend(targets)
            fin_outputs.extend(outputs)    
    return fin_outputs, fin_targets

In [0]:
DEVICE =torch.device("cuda")
device = torch.device("cuda")
def run(model,EPOCHS):
    train_dataset = Dataset_class(df_train.comment_text,tokenizer,128,df_train.toxic,model_name='roberta')

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=TRAIN_BATCH_SIZE,
        num_workers=4
    )
    valid_dataset = Dataset_class(df_test.comment_text,tokenizer,128,df_test.toxic,model_name='roberta')

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=VALID_BATCH_SIZE,
        num_workers=1
    )

    device = torch.device("cuda")
    
    
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]

    num_train_steps = int(len(train_data_loader)) * EPOCHS
    optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_train_steps
    )


    model = nn.DataParallel(model)

    best_accuracy = 0
    for epoch in range(EPOCHS):
        train_fn(train_data_loader, model, optimizer, scheduler)
        outputs, targets = eval_fn(valid_data_loader, model)
        outputs = np.array(outputs)
        accuracy = metrics.roc_auc_score(targets, outputs)
        print(f"Accuracy Score = {accuracy}")
        scheduler.step()
        if accuracy > best_accuracy : 
          torch.save(model.state_dict(), "/content/drive/My Drive/models/XLM-roberta/xlm_roberta_model_2.bin")

# Data preprocessing 

## Loading Data and simple EDA

In [0]:
import imblearn
print(imblearn.__version__)

from imblearn.under_sampling import (RandomUnderSampler, 
                                     ClusterCentroids,
                                     TomekLinks,
                                     NeighbourhoodCleaningRule,
                                     NearMiss)

In [0]:
undersample = RandomUnderSampler(sampling_strategy='majority')
X_under, y_under = undersample.fit_resample(train1['comment_text'].values.reshape(-1, 1),train1['toxic'].values.reshape(-1, 1) )
train1 = pd.DataFrame(X_under,columns={"comment_text"})
train1['toxic'] = y_under
train1['toxic'].value_counts()

In [0]:
# define undersample strategy
undersample = RandomUnderSampler(sampling_strategy='majority')
X_under, y_under = undersample.fit_resample(train2['comment_text'].values.reshape(-1, 1),train2['toxic'].values.reshape(-1, 1) )
train2 = pd.DataFrame(X_under,columns={"comment_text"})
train2['toxic'] = y_under
train2['toxic'].value_counts()

In [0]:
df_train = pd.concat([
    train1[['comment_text', 'toxic']],
    train2[['comment_text', 'toxic']]
])
df_train = df_train.sample(frac=1).reset_index(drop=True)

In [0]:
del train1, train2
import gc; gc.collect();

df_train.shape, df_valid.shape

In [0]:
df_valid['comment_text'] = df_valid['translated']

In [0]:
df_valid = df_valid[['comment_text','toxic']]

# Train Phase 1


# Run XLM 

In [0]:
MAX_Len = 192
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 8 
EPOCHS = 2
tokenizer = transformers.XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

HBox(children=(IntProgress(value=0, description='Downloading', max=5069051, style=ProgressStyle(description_wi…




In [0]:
df_train,df_test = train_test_split(train,test_size = 0.1 , random_state = 42 , stratify=train.toxic.values)

In [0]:
device = torch.device("cuda")
model = XLMRClassification()
model = model.to(device)

In [0]:
run(model,1)

  0%|          | 0/12575 [00:00<?, ?it/s]

KeyError: ignored

In [0]:
MAX_Len = 192
TRAIN_BATCH_SIZE = 12
VALID_BATCH_SIZE = 8 
EPOCHS = 2
tokenizer = transformers.BertTokenizer.from_pretrained("bert-base-multilingual-cased", do_lower_case=True)

# Train Phase 2

In [0]:
valid = pd.read_csv('/content/drive/My Drive/comment/jigsaw_miltilingual_valid_translated.csv')
valid['toxic'] = (valid['toxic']>0.5).astype(int)
df_valid = valid.copy()
valid['comment_text'] = valid['translated']
valid=valid[['comment_text','toxic']]
valid= valid.append(df_valid[['comment_text','toxic']])
df_train , df_valid = train_test_split(valid,test_size = 0.1 , random_state = 42 , stratify=valid.toxic.values)

In [0]:
model.load_state_dict(torch.load('/content/drive/My Drive/models/XLM-roberta/xlm_roberta_model.bin'))
model = model.to(device)

RuntimeError: ignored

In [0]:
MAX_Len = 192
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 8 
EPOCHS = 2
tokenizer = transformers.BertTokenizer.from_pretrained('/content/drive/My Drive/models')

In [0]:
getattr(tqdm, '_instances', {}).clear()
run(model,4)

100%|██████████| 900/900 [05:05<00:00,  2.95it/s]
100%|██████████| 200/200 [00:10<00:00, 18.54it/s]


Accuracy Score = 0.7437162997922446


100%|██████████| 900/900 [05:05<00:00,  2.95it/s]
100%|██████████| 200/200 [00:10<00:00, 18.52it/s]


Accuracy Score = 0.521680717176448


100%|██████████| 900/900 [05:05<00:00,  2.95it/s]
100%|██████████| 200/200 [00:10<00:00, 18.52it/s]


Accuracy Score = 0.841053608098858


100%|██████████| 900/900 [05:05<00:00,  2.95it/s]
100%|██████████| 200/200 [00:10<00:00, 18.43it/s]


Accuracy Score = 0.7807760204633066


In [0]:
valid = pd.read_csv('/content/drive/My Drive/comment/jigsaw_miltilingual_valid_translated.csv')
valid['toxic'] = (valid['toxic']>0.5).astype(int)
df_valid = valid.copy()
valid['comment_text'] = valid['translated']

In [0]:
df_train , df_valid = train_test_split(valid,test_size = 0.1 , random_state = 42 , stratify=valid.toxic.values)

In [0]:
model = BERTBaseUncased()
model.load_state_dict(torch.load('/content/drive/My Drive/models/Bert Model/model_bert.bin'))
model = model.to(device)

In [0]:
getattr(tqdm, '_instances', {}).clear()
run(model,4)

# Pseudo labeling 

# Make Submission 

## bert 

In [0]:
class BERTBaseUncased(nn.Module):
    def __init__(self):
        super(BERTBaseUncased, self).__init__()

        self.bert = transformers.BertModel.from_pretrained('/content/drive/My Drive/models')
        self.bert_drop = nn.Dropout(0.3)
        self.out = nn.Linear(768 * 2, 1)

    def forward(
            self,
            ids,
            mask,
            token_type_ids
    ):
        o1, o2 = self.bert(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids)
        
        apool = torch.mean(o1, 1)
        mpool, _ = torch.max(o1, 1)
        cat = torch.cat((apool, mpool), 1)

        bo = self.bert_drop(cat)
        p2 = self.out(bo)
        return p2

In [0]:
model = BERTBaseUncased()
model.load_state_dict(torch.load('/content/drive/My Drive/models/model_1.bin'))
model = model.to(device)

In [0]:
device = torch.device("cuda")
model = model.to(device)

In [0]:
tokenizer = transformers.BertTokenizer.from_pretrained("/content/drive/My Drive/models", do_lower_case=True)


In [0]:

def prediction (data_loader, model):
    model.eval()
    fin_outputs = []
    indexs=[]
    with torch.no_grad():
        for bi, d in tqdm(enumerate(data_loader), total=len(data_loader)):
            ids = d["ids"]
            token_type_ids = d["token_type_ids"]

            mask = d["mask"]
            index = d['id']
            ids = ids.to(device, dtype=torch.long)
            token_type_ids = token_type_ids.to(device, dtype=torch.long)

            mask = mask.to(device, dtype=torch.long)

            outputs = model(
                ids=ids,
                mask=mask,
                token_type_ids=token_type_ids
            )
            outputs = outputs.cpu().detach().numpy().tolist()
            fin_outputs.extend(outputs)   
            indexs.extend(index.tolist()) 
    return fin_outputs,indexs

In [0]:
df_test = pd.read_csv('/content/drive/My Drive/comment/jigsaw_miltilingual_test_translated.csv')
df_test['comment_text'] = df_test['translated']

In [0]:
Max_len = 128


In [0]:
test_dataset = BertDatasetTest(df_test)

train_data_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=16,
    num_workers=4
)
o,i = prediction(train_data_loader,model)
sub = pd.DataFrame() 
sub['id'] = i 
sub['toxic'] = o

def f(x) : 
  return 1 /(1+np.exp(-x[0]))
sub['toxic'] = sub['toxic'].apply(f)


100%|██████████| 3989/3989 [04:15<00:00, 15.62it/s]


In [0]:
df_test

Unnamed: 0,id,content,lang,translated,comment_text
0,0,Doctor Who adlı viki başlığına 12. doctor olar...,tr,Title named Doctor Who wiki 12. doctor has add...,Title named Doctor Who wiki 12. doctor has add...
1,1,"Вполне возможно, но я пока не вижу необходимо...",ru,"It is possible, but I don't see the need to a...","It is possible, but I don't see the need to a..."
2,2,"Quindi tu sei uno di quelli conservativi , ...",it,"Then you're one of those conservative , who wo...","Then you're one of those conservative , who wo..."
3,3,Malesef gerçekleştirilmedi ancak şöyle bir şey...,tr,"Unfortunately, it was not performed, but had s...","Unfortunately, it was not performed, but had s..."
4,4,:Resim:Seldabagcan.jpg resminde kaynak sorunu ...,tr,:Resim:Seldabagcan.jpg the image of the source...,:Resim:Seldabagcan.jpg the image of the source...
...,...,...,...,...,...
63807,63807,"No, non risponderò, come preannunciato. Prefer...",it,"No, I will not answer, as predicted. I prefer ...","No, I will not answer, as predicted. I prefer ..."
63808,63808,"Ciao, I tecnici della Wikimedia Foundation sta...",it,"Hello, the technicians of The Wikimedia Founda...","Hello, the technicians of The Wikimedia Founda..."
63809,63809,innnazitutto ti ringrazio per i ringraziamenti...,it,innnazitutto thank you for the thanks!! ) is o...,innnazitutto thank you for the thanks!! ) is o...
63810,63810,Kaç olumlu oy gerekiyor? Şu an 7 oldu. Hayır...,tr,How many affirmative votes are required? It's...,How many affirmative votes are required? It's...


In [0]:
!ls 

 ber_submission1.csv   config.json   pytorch_model.bin
 ber_submission.csv    model_1.bin   vocab.txt
'Bert Model'	       model_1.zip   XLM-roberta
 bert_submission.csv   model.zip     xlm_submission.csv


In [0]:
sub.to_csv('ber_submission1.csv',index=False)

In [0]:
sub

Unnamed: 0,id,toxic
0,0,0.000823
1,1,0.000124
2,2,0.164764
3,3,0.000113
4,4,0.000111
...,...,...
63807,63807,0.760219
63808,63808,0.000087
63809,63809,0.177368
63810,63810,0.000083


## xlm 

In [0]:
model = CustomRoberta()
model = nn.DataParallel(model)
tokenizer = transformers.XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
model.load_state_dict(torch.load('/content/drive/My Drive/models/XLM-roberta/xlm_roberta_model.bin'))
model = model.to(device)

In [0]:
def prediction_xlm (data_loader, model):
    model.eval()
    fin_outputs = []
    indexs=[]
    with torch.no_grad():
        for bi, d in tqdm(enumerate(data_loader), total=len(data_loader)):
            ids = d["ids"]
            mask = d["mask"]
            index = d['id']
            ids = ids.to(device, dtype=torch.long)

            mask = mask.to(device, dtype=torch.long)

            outputs = model(
                input_ids=ids,
                attention_mask=mask,
            )
            outputs = outputs.cpu().detach().numpy().tolist()
            fin_outputs.extend(outputs)   
            indexs.extend(index.tolist()) 
    return fin_outputs,indexs

In [0]:
test_dataset = xlmDatasetTest(df_test)

train_data_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=16,
    num_workers=4
)
o,i = prediction_xlm(train_data_loader,model)
sub = pd.DataFrame() 
sub['id'] = i 
sub['toxic'] = o

def f(x) : 
  return 1 /(1+np.exp(-x[0]))
sub['toxic'] = sub['toxic'].apply(f)


100%|██████████| 3989/3989 [06:43<00:00,  9.89it/s]


# Xln