# Libraries

In [None]:
# Mount Google Drive at /content/gdrive; the data and embedding paths used later live under it.
from google.colab import drive
drive.mount("/content/gdrive")

In [None]:
!pip install contractions 

In [None]:
!pip install transformers

In [None]:
import pandas as pd
import numpy as np
import re
import string
import math
import contractions
from collections import Counter

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK resources required for stop-word removal and WordNet lemmatization.
for corpus_name in ("stopwords", "wordnet", 'omw-1.4'):
    nltk.download(corpus_name)

# Module-level helpers shared by preprocess().
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [None]:
import tensorflow as tf
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, multilabel_confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn import metrics

In [None]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC

# Reading data

In [None]:
# Project root on the mounted Drive; later cells append relative data/embedding paths to it.
path = "/content/gdrive/MyDrive/dep-nlp/"

In [None]:
# Load the training split and preview its first rows.
train_csv = path + 'data/train.csv'
train_data = pd.read_csv(train_csv)
train_data.head()

In [None]:
# train_data = train_data.rename(columns={'PID': 'id', 'Text_data': 'text', 'Label': 'label'})

In [None]:
# Load the test split and preview its first rows.
test_csv = path + 'data/test.csv'
test_data = pd.read_csv(test_csv)
test_data.head()

In [None]:
# test_data = test_data.rename(columns={'Pid': 'id', 'text data': 'text'})

In [None]:
# Load the dev split and preview its first rows.
dev_csv = path + 'data/dev.csv'
dev_data = pd.read_csv(dev_csv)
dev_data.head()

In [None]:
# dev_data = dev_data.rename(columns={'PID': 'id', 'Text data': 'text', 'Label': 'label'})

In [None]:
# Write a local copy of each split, without the row index.
for split_frame, csv_name in (
    (train_data, 'train.csv'),
    (dev_data, 'dev.csv'),
    (test_data, 'test.csv'),
):
    split_frame.to_csv(csv_name, index=False)

In [None]:
# Assign every distinct training label an integer id, numbered in the order unique() returns them.
possible_labels = train_data.label.unique()
label_dict = {label_name: label_id for label_id, label_name in enumerate(possible_labels)}
label_dict

In [None]:
# Swap the label values for their integer ids in both labelled splits.
for labelled_split in (train_data, dev_data):
    labelled_split['label'] = labelled_split['label'].replace(label_dict)

In [None]:
from collections import Counter

# Show the encoded labels and how often each occurs, per split.
for split_name, split_frame in (('train', train_data), ('dev', dev_data)):
    label_counts = Counter(split_frame['label'])
    print(split_name)
    print(label_counts.keys())    # unique elements
    print(label_counts.values())  # counts the elements' frequency

In [None]:
# Preview the dev split after the label column has been integer-encoded.
dev_data.head()

# Preprocessing

In [None]:
def preprocess(text):
  """Normalize one raw text before it is embedded.

  Steps, in order: expand contractions, lowercase, strip punctuation,
  drop digit characters, remove English stop words, lemmatize each token.

  Args:
    text: The raw text. A missing value (None / NaN) yields an empty string;
      any other non-string value is converted with str() first.

  Returns:
    The cleaned tokens joined by single spaces.
  """
  if not isinstance(text, str):
    # An empty CSV cell arrives as NaN/None, which the string-only steps below cannot process.
    if pd.isna(text):
      return ""
    text = str(text)

  text = contractions.fix(text)  # expanding the contractions
  text = text.lower()  # lowering the text
  text = re.sub(r'[^\w\s]', '', text)  # removing the punctuation (underscores count as \w and stay)
  # Removing digit characters only: "abc123" becomes "abc", the rest of the word is kept.
  text = ''.join(ch for ch in text if not ch.isdigit())
  # Removing the stopwords and lemmatizing the remaining words in a single pass.
  return " ".join(
      lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words
  )

In [None]:
# Clean every training text; the function is passed directly, no lambda wrapper needed.
train_data['text'] = train_data['text'].apply(preprocess)

In [None]:
# Clean every test text; the function is passed directly, no lambda wrapper needed.
test_data['text'] = test_data['text'].apply(preprocess)

In [None]:
# Clean every dev text; the function is passed directly, no lambda wrapper needed.
dev_data['text'] = dev_data['text'].apply(preprocess)

In [None]:
# train_data.iloc[0]['text']

In [None]:
# Persist the cleaned splits. index=False matches the raw-CSV export earlier in the notebook
# and keeps the row index from coming back as a stray "Unnamed: 0" column on re-read.
train_data.to_csv('train-preprocess.csv', index=False)
test_data.to_csv('test-preprocess.csv', index=False)
dev_data.to_csv('dev-preprocess.csv', index=False)

In [None]:
# Preview the training split after preprocessing.
train_data.head()

# Generate embeddings

In [None]:
from transformers import AutoTokenizer, AutoModel

### Mental-bert
https://huggingface.co/mental/mental-bert-base-uncased?text=The+goal+of+life+is+%5BMASK%5D.

In [None]:
# Model 1: tokenizer and encoder loaded from the same Hugging Face checkpoint.
MENTAL_BERT_CHECKPOINT = "mental/mental-bert-base-uncased"
tokenizer1 = AutoTokenizer.from_pretrained(MENTAL_BERT_CHECKPOINT)
model1 = AutoModel.from_pretrained(MENTAL_BERT_CHECKPOINT)

### MentalRoBERTa
https://huggingface.co/mental/mental-roberta-base?text=The+goal+of+life+is+%3Cmask%3E.

In [None]:
# Model 2: tokenizer and encoder loaded from the same Hugging Face checkpoint.
MENTAL_ROBERTA_CHECKPOINT = "mental/mental-roberta-base"
tokenizer2 = AutoTokenizer.from_pretrained(MENTAL_ROBERTA_CHECKPOINT)
model2 = AutoModel.from_pretrained(MENTAL_ROBERTA_CHECKPOINT)

### Bio+Clinical BERT
https://huggingface.co/emilyalsentzer/Bio_ClinicalBERT?text=Paris+is+the+%5BMASK%5D+of+France.

In [None]:
# Model 3: tokenizer and encoder loaded from the same Hugging Face checkpoint.
BIO_CLINICAL_BERT_CHECKPOINT = "emilyalsentzer/Bio_ClinicalBERT"
tokenizer3 = AutoTokenizer.from_pretrained(BIO_CLINICAL_BERT_CHECKPOINT)
model3 = AutoModel.from_pretrained(BIO_CLINICAL_BERT_CHECKPOINT)

Looping for embeddings:


* embedding1-label1: mental-bert
* embedding2-label2: mental-roberta
* embedding3-label3: bioclinical-bert



In [None]:
# Empty per-model accumulators (1: mental-bert, 2: mental-roberta, 3: bioclinical-bert);
# the embedding loop appends one vector and one label per sample to each.
embedding1, embedding2, embedding3 = [], [], []
label1, label2, label3 = [], [], []

In [None]:
import torch

counter = 0  # samples visited so far
skipped = 0  # samples dropped because tokenization or a forward pass failed

# Embed every dev text with all three models.
# NOTE: results are appended to the module-level lists, so running this cell twice
# without re-creating those lists duplicates every entry.
for text, label in zip(dev_data['text'], dev_data['label']):
  if counter % 500 == 0:
    print(counter)  # coarse progress indicator
  counter += 1
  try:
    # Inference only: no_grad avoids building an autograd graph for every sample.
    with torch.no_grad():
      output1 = model1(**tokenizer1(text, return_tensors="pt", padding=True, truncation=True))
      output2 = model2(**tokenizer2(text, return_tensors='pt', padding=True, truncation=True))
      output3 = model3(**tokenizer3(text, return_tensors="pt", padding=True, truncation=True))

    # The hidden state at the first token position (the [CLS] / <s> slot for these
    # tokenizers) serves as the vector for the whole text.
    vector1 = output1.last_hidden_state[0][0].detach().numpy()
    vector2 = output2.last_hidden_state[0][0].detach().numpy()
    vector3 = output3.last_hidden_state[0][0].detach().numpy()
  except Exception as err:
    # Catch Exception (not everything) so a kernel interrupt still stops the loop,
    # and report the failure instead of silently shrinking the dataset.
    skipped += 1
    print(f"skipping sample {counter - 1}: {err!r}")
    continue

  # Append only once all three models succeeded, so the six lists stay aligned.
  embedding1.append(vector1)
  embedding2.append(vector2)
  embedding3.append(vector3)
  label1.append(label)
  label2.append(label)
  label3.append(label)

print(f"embedded {counter - skipped} samples, skipped {skipped}")

In [None]:
# The three label lists are filled together, so these counts should match.
print(*map(len, (label1, label2, label3)))

In [None]:
# Number of vectors collected for model 1 (mental-bert).
print(len(embedding1))

In [None]:
# Length of a single mental-bert vector.
print(len(embedding1[0]))

In [None]:
# Save the dev embeddings, one file per model. Passing the path (instead of an open
# file object) lets np.save open and close the file itself, so no handle is leaked.
np.save('dev-mental-bert-emb.npy', np.array(embedding1))
np.save('dev-mental-roberta-emb.npy', np.array(embedding2))
np.save('dev-clinical-bert-emb.npy', np.array(embedding3))

In [None]:
# Save the dev labels next to each model's embeddings. Passing the path (instead of an
# open file object) lets np.save open and close the file itself, so no handle is leaked.
np.save('dev-mental-bert-label.npy', np.array(label1))
np.save('dev-mental-roberta-label.npy', np.array(label2))
np.save('dev-clinical-bert-label.npy', np.array(label3))

In [None]:
# Dump the full list of collected dev labels for a visual check.
print(label1)

In [None]:
# Labels for the stacked embeddings (same order as label1). Passing the path lets
# np.save open and close the file itself, so no handle is leaked.
np.save('dev-stack-label.npy', np.array(label1))

In [None]:
# "Stacked" embedding = element-wise mean of the three models' vectors for each sample
# (an average, not a concatenation, so the vector length is unchanged).
stack = [
    (vec1 + vec2 + vec3) / 3
    for vec1, vec2, vec3 in zip(embedding1, embedding2, embedding3)
]

In [None]:
# Sanity check on the averaged embeddings: container type and number of samples.
print(type(stack))
print(len(stack))

In [None]:
# Save the averaged embeddings. Passing the path lets np.save open and close the
# file itself, so no handle is leaked.
np.save('dev-stack-emb.npy', np.array(stack))

### Checking occurrences

In [None]:
# Count each label once, then show the distinct labels and their frequencies.
dev_label_counts = Counter(label1)
print(dev_label_counts.keys())    # unique elements
print(dev_label_counts.values())  # counts the elements' frequency

### Some data modifications

In [None]:
# train_emb = np.load(path+'emb/train/train-stack-emb.npy')
# train_label = np.load(path+'emb/train/train-clinical-bert-label.npy')

In [None]:
# dev_emb = np.load(path+'emb/dev/dev-stack-emb.npy')
# dev_label = np.load(path+'emb/dev/dev-stack-label.npy')

In [None]:
# np.unique(train_label)

In [None]:
# type(train_label)

In [None]:
# train_label = np.load(path+'emb/train/train-mental-roberta-label.npy')

In [None]:
'''
def convert_labels(arr):
    for i in range(len(arr)):
        if arr[i]=='moderate':
            arr[i]=0
        elif arr[i]=='not depression':
            arr[i]=1
        elif arr[i]=='severe':
            arr[i]=2

convert_labels(train_label)
'''


In [None]:
# np.unique(train_label)

In [None]:
# train_label = train_label.astype(np.int64)

In [None]:
# np.unique(train_label)

In [None]:
# np.save('train-mental-roberta-label.npy', np.array(train_label))

In [None]:
# np.unique(dev_label)

# SVM

### For stacked embeddings

In [None]:
# Stacked-embedding training features and labels, loaded from Drive.
train_stack_emb_file = path + 'emb/train/train-stack-emb.npy'
train_stack_label_file = path + 'emb/train/train-stack-label.npy'
train_emb = np.load(train_stack_emb_file)
train_label = np.load(train_stack_label_file)

In [None]:
# Stacked-embedding dev features and labels, loaded from Drive.
dev_stack_emb_file = path + 'emb/dev/dev-stack-emb.npy'
dev_stack_label_file = path + 'emb/dev/dev-stack-label.npy'
dev_emb = np.load(dev_stack_emb_file)
dev_label = np.load(dev_stack_label_file)

In [None]:
# One-vs-rest SVM with default SVC settings, trained on the stacked embeddings.
svm_classifier = OneVsRestClassifier(estimator=SVC())
svm_classifier.fit(X=train_emb, y=train_label)

In [None]:
# Predict dev labels with the classifier trained on the unbalanced stacked embeddings.
pred = svm_classifier.predict(dev_emb)

In [None]:
print("Accuracy for stacked embeddings (without balancing)")
# Model Accuracy: how often is the classifier correct?
# scikit-learn metrics take (y_true, y_pred). With the arguments reversed, the report's
# precision and recall columns are swapped and "support" counts predictions, not true labels.
print("Accuracy:", metrics.accuracy_score(dev_label, pred))
print(classification_report(dev_label, pred))

# Balancing using SMOTE

In [None]:
from imblearn.over_sampling import SMOTE

# Fixed seed so the synthetic minority samples, and every score computed from them,
# are the same on each run.
smt = SMOTE(random_state=42)

In [None]:
# Oversample the stacked training embeddings with SMOTE; only the training data is resampled.
train_emb_smt, train_label_smt = smt.fit_resample(X=train_emb, y=train_label)

In [None]:
# Compare the class counts before and after SMOTE resampling.
print('Before:', Counter(train_label))
print('After:', Counter(train_label_smt))

In [None]:
# Refit the same classifier object on the SMOTE-balanced data; its earlier unbalanced fit is replaced.
svm_classifier.fit(train_emb_smt, train_label_smt)

In [None]:
# Predict dev labels with the classifier refit on the balanced stacked embeddings.
pred_bal = svm_classifier.predict(dev_emb)

In [None]:
print("Accuracy for stacked embeddings (after balancing)")
# Model Accuracy: how often is the classifier correct?
# scikit-learn metrics take (y_true, y_pred). With the arguments reversed, the report's
# precision and recall columns are swapped and "support" counts predictions, not true labels.
print("Accuracy:", metrics.accuracy_score(dev_label, pred_bal))
print(classification_report(dev_label, pred_bal))

### For mental-bert embeddings

In [None]:
# mental-bert training embeddings and labels, loaded from Drive.
train_mental_bert_emb_file = path + 'emb/train/train-mental-bert-emb.npy'
train_mental_bert_label_file = path + 'emb/train/train-mental-bert-label.npy'
train_emb1 = np.load(train_mental_bert_emb_file)
train_label1 = np.load(train_mental_bert_label_file)

In [None]:
# mental-bert dev embeddings and labels, loaded from Drive.
dev_mental_bert_emb_file = path + 'emb/dev/dev-mental-bert-emb.npy'
dev_mental_bert_label_file = path + 'emb/dev/dev-mental-bert-label.npy'
dev_emb1 = np.load(dev_mental_bert_emb_file)
dev_label1 = np.load(dev_mental_bert_label_file)

In [None]:
# One-vs-rest SVM with default SVC settings, trained on the mental-bert embeddings.
svm_classifier1 = OneVsRestClassifier(estimator=SVC())
svm_classifier1.fit(X=train_emb1, y=train_label1)

In [None]:
# Predict dev labels with the classifier trained on the unbalanced mental-bert embeddings.
pred1 = svm_classifier1.predict(dev_emb1)

In [None]:
print("Accuracy for mental-bert embeddings (without balancing)")
# Model Accuracy: how often is the classifier correct?
# scikit-learn metrics take (y_true, y_pred). With the arguments reversed, the report's
# precision and recall columns are swapped and "support" counts predictions, not true labels.
print("Accuracy:", metrics.accuracy_score(dev_label1, pred1))
print(classification_report(dev_label1, pred1))

In [None]:
# Oversample the mental-bert training embeddings with SMOTE; only the training data is resampled.
train_emb_smt1, train_label_smt1 = smt.fit_resample(X=train_emb1, y=train_label1)

In [None]:
svm_classifier.fit(train_emb_smt1, train_label_smt1)

In [None]:
pred_bal1 = svm_classifier.predict(dev_emb1)

In [None]:
# These scores belong to the mental-bert run, so the heading names it.
print("Accuracy for mental-bert embeddings (after balancing)")
# Model Accuracy: how often is the classifier correct?
# scikit-learn metrics take (y_true, y_pred). With the arguments reversed, the report's
# precision and recall columns are swapped and "support" counts predictions, not true labels.
print("Accuracy:", metrics.accuracy_score(dev_label1, pred_bal1))
print(classification_report(dev_label1, pred_bal1))