In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Installing the Huggingface's Transformers library
!pip install transformers

In [None]:
!pip install datasets

In [None]:
# IMPORT LIBRARIES

# Utils
import os                                       # Operating system operations
import json                                     # Working with json file
import re                                       # Regular expression
import unicodedata                              # Unicode + regular expression
import random                                   # Random
import collections                              # Counter

# Computation
import pandas as pd
import numpy as np

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# NLP
import nltk
import gensim
import gensim.downloader as gensim_api

# Transformers
import transformers
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast
from transformers import Trainer, TrainingArguments
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Datasets
from datasets import load_dataset

# ML utils
from sklearn.model_selection import train_test_split
from sklearn import feature_extraction, naive_bayes, pipeline, manifold, preprocessing
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, roc_auc_score

# PyTorch
import torch

# Others
from lime import lime_text

In [None]:
# Directory holding the competition CSV files
dirname = '/kaggle/input/nlp-getting-started'

# Read the three CSVs in one pass; the tuple order matches the unpacking order
train, test, sample_submission = (
    pd.read_csv(os.path.join(dirname, csv_name))
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [None]:
# Preview only the first rows instead of rendering the whole frame
sample_submission.head()

In [None]:
# Preview only the first rows instead of rendering the whole frame
test.head()

In [None]:
def compute_metrics(pred):
    """
    This function computes metrics for Transformers' fine tuning

    Args:
        pred: predictions from Transformers' Trainer. Only two attributes are
            read: ``pred.label_ids`` (the true labels) and ``pred.predictions``
            (per-class scores; the last axis is indexed by class, and columns
            0 and 1 are taken as the negative and positive class).

    Returns:
        A dictionary that contains metrics of interest for binary classification:
            (1) Accuracy
            (2) Precision
            (3) Recall
            (4) F1 Score
            (5) AUC
    """
    labels = pred.label_ids
    logits = pred.predictions
    preds = logits.argmax(-1)

    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")
    acc = accuracy_score(labels, preds)

    # ROC AUC is a ranking metric, so it needs a continuous score rather than
    # the hard 0/1 predictions. For two classes the logit difference orders
    # the examples exactly like the softmax probability of class 1.
    positive_scores = logits[..., 1] - logits[..., 0]
    auc = roc_auc_score(labels, positive_scores)

    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall, "auc": auc}

In [None]:
def tokenize(batch):
    """
    Run the notebook-level ``tokenizer`` over one batch of examples.

    Reads ``batch["text"]`` and returns the tokenizer's output, with both
    padding and truncation switched on.
    """
    texts = batch["text"]
    encoding_options = {"padding": True, "truncation": True}
    return tokenizer(texts, **encoding_options)

In [None]:
def set_cuda_seed(seed_val=42):
    """
    Seed every random number generator used in this notebook.

    Args:
        seed_val: seed passed, in order, to Python's ``random``, NumPy,
            PyTorch and PyTorch's CUDA generators.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed_val)

In [None]:
# Pull the tweet texts and their targets out of the training frame as plain lists
all_train_texts = train['text'].to_list()
all_train_labels = train['target'].to_list()

# Hold out 20% of the examples for validation; random_state fixes the shuffle
train_texts, val_texts, train_labels, val_labels = train_test_split(
    all_train_texts,
    all_train_labels,
    random_state=692,
    test_size=0.2,
)

In [None]:
# Rebuild two-column (text, label) frames from the split lists
train_df = pd.DataFrame({'text': train_texts, 'label': train_labels})

val_df = pd.DataFrame({'text': val_texts, 'label': val_labels})

In [None]:
# Keep only the id and text columns of the test frame
test_df = test.loc[:, ['id', 'text']]

In [None]:
# Working directory for the intermediate CSV files
data_path = '/kaggle/working/data'
# exist_ok=True makes this cell safe to re-run and avoids the
# check-then-create race of testing os.path.exists first
os.makedirs(data_path, exist_ok=True)

train_csv_path = os.path.join(data_path, 'train_df.csv')
val_csv_path = os.path.join(data_path, 'val_df.csv')

# Write both splits to disk so that load_dataset can read them back as CSV
train_df.to_csv(train_csv_path, index=False)
val_df.to_csv(val_csv_path, index=False)

tweets_dataset = load_dataset('csv', data_files={'train': train_csv_path,
                                                 'validation': val_csv_path})

In [None]:
# Write the test frame to CSV, then load it back as a dataset with a single 'test' split
test_csv_path = os.path.join(data_path, 'test_df.csv')
test_df.to_csv(test_csv_path, index=False)

tweets_test_dataset = load_dataset('csv', data_files={'test': test_csv_path})

In [None]:
# Display the train/validation dataset object loaded from the CSV files
tweets_dataset

In [None]:
# Display the test dataset object loaded from the CSV file
tweets_test_dataset

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_name = 'bert-base-uncased'
num_labels = 2

# The tokenizer must exist before `tokenize` is mapped over the dataset,
# because `tokenize` looks it up as a notebook-level name
tokenizer = AutoTokenizer.from_pretrained(model_name)
tweets_encoded = tweets_dataset.map(tokenize, batched=True, batch_size=None)

# Seed before building the model: the sequence-classification head placed on
# top of the pretrained encoder is newly initialised from the global RNG, so
# seeding only after this point would leave its starting weights
# non-reproducible across runs.
set_cuda_seed()
model = (AutoModelForSequenceClassification
         .from_pretrained(model_name, num_labels=num_labels)
         .to(device))

In [None]:
set_cuda_seed()

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
# and no longer accept the old keyword. Use whichever name the installed
# version declares, so the cell runs on both old and new releases.
eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    **{eval_strategy_key: "epoch"},  # evaluate on the validation split after every epoch
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tweets_encoded["train"],
    eval_dataset=tweets_encoded["validation"],
    compute_metrics=compute_metrics
)

trainer.train()

In [None]:
# Tokenize the test split with the same `tokenize` function used for training
tweets_test_encoded = tweets_test_dataset.map(function=tokenize, batch_size=None, batched=True)

In [None]:
# Run the fine-tuned model over the encoded test split
preds_raw = trainer.predict(test_dataset=tweets_test_encoded['test'])

In [None]:
def softmax(x, axis=0):
    """
    Compute softmax values for each set of scores in x.

    Args:
        x: array-like of scores. A 1-D input is treated as one set of scores.
        axis: axis along which each set of scores lies. The default of 0
            matches the original behaviour (a 1-D vector, or the columns of a
            2-D array).

    Returns:
        numpy array of the same shape as ``x`` whose entries along ``axis``
        are non-negative and sum to 1.
    """
    scores = np.asarray(x)
    # Subtract the maximum along the softmax axis (not the global maximum) so
    # that every set of scores has a largest exponent of 0. With a global
    # maximum, a set far below it underflows to all zeros and yields 0/0 = NaN.
    shifted = scores - np.max(scores, axis=axis, keepdims=True)
    e_x = np.exp(shifted)
    return e_x / e_x.sum(axis=axis, keepdims=True)

In [None]:
# Softmax each row of raw model outputs, then keep the column for class 1
predictions_prob = np.array([softmax(row_scores) for row_scores in preds_raw.predictions])[:, 1]

In [None]:
# Threshold the class-1 probability into a 0/1 target.
# `> 0.5` gives the same labels as Python's round() for values in [0, 1]
# (round(0.5) == 0), but is vectorised and always yields an integer column.
submission = pd.DataFrame({
    'id': test['id'].to_numpy(),
    'target': (predictions_prob > 0.5).astype(int),
})

In [None]:
# Count how many test tweets were assigned to each target value.
# `collections` is already imported in the notebook's main imports cell.
collections.Counter(submission['target'])

In [None]:
# Write the submission file to the current working directory, without the index column
submission.to_csv(path_or_buf='submission.csv', index=False)