In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import io
import pandas as pd

In [None]:
# Root of the project folder on the mounted Google Drive; data files are read relative to it.
path = "/content/drive/MyDrive/dep-nlp/"
# Load the training split.
df_train = pd.read_csv(path+'data/train.csv')
# Bare last expression so the notebook renders the frame as the cell output.
df_train

In [None]:
# Give every distinct label an integer id, numbered in order of first appearance.
possible_labels = df_train.label.unique()
label_dict = {label: idx for idx, label in enumerate(possible_labels)}
print(label_dict)

# Swap the raw labels in the training frame for their integer ids.
df_train['label'] = df_train.label.replace(label_dict)


In [None]:
df_train.label.unique()

In [None]:
!pip install simpletransformers

In [None]:
import pandas as pd
from simpletransformers.classification import ClassificationModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np

np.set_printoptions(threshold=np.inf)

In [None]:
# Keep only the model inputs. `.copy()` makes df_train an independent frame, so the
# column assignment performed later during preprocessing writes to this frame
# directly and does not trigger pandas' SettingWithCopyWarning.
df_train = df_train[['text', 'label']].copy()
df_train.head()

In [None]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
import re

# Built once instead of on every call: previously the stop-word list was re-read for
# every single token and the patterns/tokenizer were rebuilt for every sentence.
_STOP_WORDS = frozenset(stopwords.words('english'))
_HTML_TAG_RE = re.compile('<.*?>')
_URL_RE = re.compile(r'http\S+')
_DIGITS_RE = re.compile('[0-9]+')
_MENTION_RE = re.compile(r'@\S+')
_WORD_TOKENIZER = RegexpTokenizer(r'\w+')

def preprocess(sentence):
  """Normalize one raw text into a space-joined string of content words.

  Steps, in order: cast to str and lowercase; strip `<...>` tags; strip
  `http...` URLs; strip digit runs; strip `@...` mentions; split on `\\w+`;
  drop tokens of length <= 2 and English stop words.

  Args:
    sentence: the raw text; any object is accepted because it is passed
      through `str()` first.

  Returns:
    The surviving tokens joined by single spaces (possibly an empty string).
  """
  text = str(sentence).lower()
  # The removal order matches the original pipeline: tags, URLs, digits, mentions.
  for pattern in (_HTML_TAG_RE, _URL_RE, _DIGITS_RE, _MENTION_RE):
    text = pattern.sub('', text)
  tokens = _WORD_TOKENIZER.tokenize(text)
  # Set membership keeps the stop-word test O(1) per token.
  return " ".join(w for w in tokens if len(w) > 2 and w not in _STOP_WORDS)

# Clean every training text with preprocess(), then preview the first 20 rows.
cleaned_text = df_train['text'].map(preprocess)
df_train['text'] = cleaned_text
df_train.head(20)

In [None]:
# RoBERTa-base sequence classifier. The number of output classes comes from the label
# mapping built from the training data rather than a hard-coded 3, so the classifier
# head always matches the labels actually present.
# NOTE(review): use_cuda=True presumably requires a GPU runtime — confirm the Colab
# hardware accelerator is enabled before running.
model=ClassificationModel('roberta','roberta-base',num_labels=len(label_dict),use_cuda=True,args={
        "reprocess_input_data" : True,
        "use_cached_eval_features":False,
        "overwrite_output_dir": True,
        "num_train_epochs": 1}
)

In [None]:
model.train_model(df_train)

In [None]:
# from google.colab import files
# uploaded = files.upload()

In [None]:
# Load the dev split and bring it into the same form as the training data:
# same two columns, same text cleaning, same label -> id mapping.
df_eval = pd.read_csv(path+'data/dev.csv')
# `.copy()` so the column assignments below write to an independent frame.
df_eval = df_eval[['text', 'label']].copy()
# The model was trained on preprocess()-ed text; evaluating on raw text would feed it
# differently formatted input than it saw during training.
df_eval['text'] = df_eval['text'].map(preprocess)
df_eval['label'] = df_eval.label.replace(label_dict)
print(df_eval)

In [None]:
# Evaluate the trained classifier on the dev frame. The call returns three values,
# unpacked here as result, model_outputs and wrong_predictions; only `result` is
# printed in this cell, the other two are inspected in the cells that follow.
result, model_outputs, wrong_predictions = model.eval_model(df_eval)
print(result)

In [None]:
print(wrong_predictions)

In [None]:
model_outputs

In [None]:
#from google.colab import files
#uploaded = files.upload()

In [None]:
# NOTE(review): df_test is only another name for the dev frame (no copy is made and no
# separate test file is read), so everything computed from df_test below is computed on
# the same rows that eval_model() was given.
df_test= df_eval
print(df_test)

In [None]:
# Run the classifier over the text column (passed as a plain Python list). The call
# returns two values, unpacked here as predictions and raw_outputs; both are printed
# in full.
predictions, raw_outputs = model.predict(df_test['text'].tolist())
print(predictions)
print(raw_outputs)


In [None]:
import warnings
# Silence all warnings (the second call targets FutureWarning specifically) so the
# report below is not interleaved with library warnings.
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore',category=FutureWarning)
from sklearn.metrics import classification_report
# Compare the integer-encoded labels in df_test with the model's predictions.
print(classification_report(df_test.label,predictions))

In [None]:
from sklearn.metrics import confusion_matrix
# df_test holds only the 'text' and 'label' columns, so the ground truth is read from
# `label` (there is no `category` column on this frame); rows of the matrix are the
# true classes and columns are the predicted classes.
print(confusion_matrix(df_test.label,predictions))

# DistilBERT test

In [None]:
!pip install transformers &> /dev/null

In [None]:
import os
import re
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm # Progress Bar
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [None]:
import transformers
from transformers import DistilBertTokenizer, TFDistilBertModel, DistilBertConfig
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
import warnings
from transformers import logging as hf_logging
hf_logging.set_verbosity_error() # Hidding Huggingface Warnings
warnings.filterwarnings("ignore")

In [None]:
MODEL_NAME = 'distilbert-base-cased'

In [None]:
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME) # Loading the tokenizer


In [None]:
# Same Drive project root as in the RoBERTa section; this section reads the
# already-preprocessed train/dev CSVs from the clean-data folder.
path = "/content/drive/MyDrive/dep-nlp/"
train = pd.read_csv(path+'clean-data/train-preprocess.csv')
dev = pd.read_csv(path+'clean-data/dev-preprocess.csv')

In [None]:
# Keep only the model inputs. `.copy()` makes both frames independent so the label
# column assignment done in the next step does not trigger SettingWithCopyWarning.
train = train[['text', 'label']].copy()
dev = dev[['text', 'label']].copy()

In [None]:
# Give every distinct training label an integer id, numbered in order of first appearance.
possible_labels = train.label.unique()
label_dict = {label: idx for idx, label in enumerate(possible_labels)}
# A bare expression in the middle of a cell is not displayed, so print the mapping
# explicitly (as the RoBERTa section does).
print(label_dict)

# Encode both splits with the mapping derived from the training split.
train['label'] = train.label.replace(label_dict)
dev['label'] = dev.label.replace(label_dict)

In [None]:
train.head()

In [None]:
# Fixed sequence length: tokenize() pads/truncates every sentence to this many tokens
# and the Keras Input layers are declared with the same length.
MAX_LENGTH = 250

# Reloads the tokenizer that was already loaded from MODEL_NAME in an earlier cell.
# NOTE(review): add_special_tokens / max_length / pad_to_max_length look like
# encode-time options; confirm whether from_pretrained honors them. tokenize() below
# passes all three explicitly to encode_plus regardless.
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME,  
                                                add_special_tokens=True,
                                                max_length=MAX_LENGTH, 
                                                pad_to_max_length=True)

def tokenize(sentences, tokenizer):
    """Encode sentences into fixed-length id and attention-mask arrays.

    Each sentence is encoded on its own with special tokens added, then padded or
    truncated to MAX_LENGTH tokens.

    Args:
        sentences: iterable of texts to encode (iterated once, with a progress bar).
        tokenizer: object exposing `encode_plus`, e.g. the DistilBertTokenizer
            loaded in this notebook.

    Returns:
        A tuple `(input_ids, attention_masks)` of int32 NumPy arrays with one row
        per sentence.
    """
    input_ids, input_masks = [], []
    for sentence in tqdm(sentences):
        # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
        # token_type_ids are no longer requested: they were collected but never returned.
        inputs = tokenizer.encode_plus(sentence,
                                       add_special_tokens=True,
                                       max_length=MAX_LENGTH,
                                       padding='max_length',
                                       truncation=True,
                                       return_attention_mask=True)
        input_ids.append(inputs['input_ids'])
        input_masks.append(inputs['attention_mask'])

    return np.asarray(input_ids, dtype='int32'), np.asarray(input_masks, dtype='int32')

In [None]:
# Pretrained DistilBERT encoder used as a frozen feature extractor.
config = DistilBertConfig.from_pretrained(MODEL_NAME, output_hidden_states=True, output_attentions=True)
DistilBERT = TFDistilBertModel.from_pretrained(MODEL_NAME, config=config)
# Freeze the encoder by name instead of slicing `model.layers[:3]`, which only worked
# as long as the two Input layers and the encoder happened to be the first three layers.
DistilBERT.trainable = False

# One output unit per class in the label mapping, rather than a hard-coded 3.
NUM_CLASSES = len(label_dict)

# Two integer inputs of length MAX_LENGTH: token ids and the attention mask.
input_ids_in = tf.keras.layers.Input(shape=(MAX_LENGTH,), name='input_token', dtype='int32')
input_masks_in = tf.keras.layers.Input(shape=(MAX_LENGTH,), name='masked_token', dtype='int32')

# Element [0] of the encoder output feeds a BiLSTM -> max-pool -> dense classifier head.
embedding_layer = DistilBERT(input_ids = input_ids_in, attention_mask = input_masks_in)[0]
X = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True))(embedding_layer)
X = tf.keras.layers.GlobalMaxPool1D()(X)
X = tf.keras.layers.Dense(64, activation='relu')(X)
X = tf.keras.layers.Dropout(0.2)(X)
X = tf.keras.layers.Dense(NUM_CLASSES, activation='softmax')(X)

model = tf.keras.Model(inputs=[input_ids_in, input_masks_in], outputs = X)

model.summary()

In [None]:
x_train = train['text']
x_val = dev['text']

In [None]:
# Each result is the (input_ids, attention_masks) tuple returned by tokenize(),
# in the same order as the model's two inputs.
X_train = tokenize(x_train, tokenizer)
X_val = tokenize(x_val, tokenizer)

In [None]:
# Integer class ids as NumPy arrays. Keras cannot pair plain Python lists of ints with
# the tuple-of-arrays inputs produced by tokenize() ("Failed to find data adapter"),
# so the targets must be arrays as well.
y_train = train['label'].to_numpy()
y_val = dev['label'].to_numpy()

In [None]:
# model_checkpoint = ModelCheckpoint(filepath=output_dir+'/weights.{epoch:02d}.hdf5', save_weights_only=True)

# Stops training once the validation loss has stopped improving.
early_stopping = EarlyStopping(patience=3, # Stop after 3 epochs without improvement
                               monitor='val_loss', # Watch the validation loss
                               min_delta=0, # Any decrease counts as an improvement
                               mode='min', # Lower is better for the monitored quantity
                               restore_best_weights=False, # Keep the last epoch's weights, not the best ones
                               verbose=1) 

# Lowers the learning rate when the validation loss plateaus.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', # Watch the validation loss
                              min_lr=0.000001, # Lower bound of the learning rate
                              patience=1, # Reduce after 1 epoch without sufficient improvement
                              mode='min', # Lower is better for the monitored quantity
                              factor=0.1, # Multiply the learning rate by 0.1 on each reduction
                              min_delta=0.01, # Minimum change that counts as an improvement
                              verbose=1)

In [None]:
X_val

In [None]:
y_val

In [None]:
# Integer targets with a softmax output -> sparse categorical cross-entropy.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# The early-stopping and LR-reduction callbacks were defined earlier but never handed
# to fit(), so they had no effect; they are passed in here.
history = model.fit(X_train,
                    y_train,
                    epochs = 10,
                    batch_size=16,
                    validation_data=(X_val, y_val),
                    callbacks=[early_stopping, reduce_lr])

### GPU tests

In [None]:
!cat /proc/cpuinfo

In [None]:
# Qualified import instead of `from psutil import *`, which dumped every psutil name
# into the notebook namespace and hid where virtual_memory comes from.
import psutil
# System memory statistics of the runtime, shown as the cell output.
psutil.virtual_memory()

In [None]:
#GPU count and name
!nvidia-smi -L

In [None]:
#use this command to see GPU activity while doing Deep Learning tasks, for this command 'nvidia-smi' and for above one to work, go to 'Runtime > change runtime type > Hardware Accelerator > GPU'
!nvidia-smi

In [None]:
!cat /proc/meminfo

In [None]:
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore',category=FutureWarning)

In [None]:
from sklearn.metrics import classification_report
print(classification_report(df_test.label,predictions))

In [None]:
# NOTE(review): `predictions` was produced from df_test (the dev frame), not from the
# rows of test_data.tsv — confirm the row counts match, or re-run model.predict on this
# file's text before assigning.
df_tes = pd.read_csv('test_data.tsv', sep='\t')
df_tes['category'] = predictions
# NOTE(review): unlike the later to_csv calls in this notebook, index=False is not
# passed here, so the row index is written as an extra column — confirm this is intended.
df_tes.to_csv('scube_run2.csv')

In [None]:
from google.colab import files
uploaded = files.upload()

In [None]:
# Re-save roberta.csv (read from the working directory, after the upload cell above)
# as a tab-separated file without the index column, then preview the first 20 rows.
df_tes = pd.read_csv('roberta.csv')
df_tes.to_csv('roberta.tsv', sep='\t',index=False)
df_tes.head(20)

In [None]:
df_tes['category'].value_counts()

In [None]:
import seaborn as sns
sns.set(rc={'figure.figsize':(8,8)})
# Plot the class distribution of the `category` column. It lives on df_tes (loaded
# from roberta.csv); df_test has only 'text' and 'label', so indexing it with
# 'category' raised a KeyError. The series is passed as `x=` because seaborn's
# positional slot is not the x variable in current releases.
sns.countplot(x=df_tes['category'])

In [None]:
from google.colab import files
uploaded = files.upload()

In [None]:
import pandas as pd

In [None]:
df_tes = pd.read_csv('roberta.csv')
df_tes.to_csv('scube_run2.tsv', sep='\t',index=False)
df_tes.head(20)