## NLP models

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt') # takes abbreviations into account
nltk.download('stopwords')
import string
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC, SVC
import tensorflow
import keras
import matplotlib.pyplot as plt


In [None]:
# Read balanced_df.csv into `df` and preview the first rows.
# Note: a later cell rebinds `df` to the contents of drugs_clean_train.csv.
df = pd.read_csv(filepath_or_buffer='balanced_df.csv')
df.head(n=5)

Split train/test data

In [None]:
# Rebind `df` to the contents of drugs_clean_train.csv; the cells below use this frame.
df = pd.read_csv(filepath_or_buffer='drugs_clean_train.csv')


In [None]:
import ast


def _stemmed_to_text(value):
    """Return one space-joined string for a value of the 'stemmed' column.

    A string that looks like a list literal (starts with '[') is parsed with
    ast.literal_eval and its items are joined with single spaces. Any other
    string is returned unchanged, which makes this cell safe to re-run: on a
    second run the column already holds joined text and must not be parsed
    again. Non-string values are joined directly.
    """
    if isinstance(value, str):
        if not value.lstrip().startswith('['):
            # Already joined by an earlier run of this cell.
            return value
        value = ast.literal_eval(value)  # safe parse of the stored list literal
    return ' '.join(value)


df['stemmed'] = df['stemmed'].apply(_stemmed_to_text)

In [None]:
# Display the frame after the 'stemmed' column has been converted to joined text.
df

In [None]:
# Features are the joined stemmed text; the target is the 'class' column.
x, y = df['stemmed'], df['class']

# Hold out 30% of the rows, stratified on the target, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    stratify=y,
    random_state=1000,
)

In [None]:
# Display the frame again; the split above does not modify `df`.
df

Encode variables

In [None]:
# Fit the label encoder on the training labels only, then reuse that same
# mapping for the test labels so both splits share identical integer codes.
# (Refitting on the test labels could assign different codes if the set of
# classes present in the two splits ever differs.)
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.transform(y_test)

Vectorization

In [None]:
# Build TF-IDF features over 1- to 8-grams, capped at 2000 features.
tfidf_vec = TfidfVectorizer(ngram_range=(1,8),max_features=2000)

# Learn the vocabulary and IDF weights from the training texts only, so that
# nothing about the held-out texts leaks into the features.
x_train_tfidf = tfidf_vec.fit_transform(x_train)

# Project the test texts onto the vocabulary learned from the training texts.
x_test_tfidf = tfidf_vec.transform(x_test)

In [None]:
# Print the fitted vectorizer's vocabulary_ attribute (scikit-learn documents it
# as a mapping of terms to feature indices). With max_features=2000 this
# output is long.
print(tfidf_vec.vocabulary_)

### Naive Bayes

In [None]:
# Multinomial Naive Bayes trained on the TF-IDF training matrix.
nbc = naive_bayes.MultinomialNB()
nbc.fit(X=x_train_tfidf, y=y_train)

Predict

In [None]:
# Predict on the held-out TF-IDF matrix and report accuracy.
predict_nbc = nbc.predict(x_test_tfidf)

# accuracy_score expects (y_true, y_pred). Accuracy is symmetric, so the number
# is unchanged, but the correct order matters if this line is adapted for
# precision/recall/F1.
nbc_accuracy = accuracy_score(y_test, predict_nbc)
print(f'NBC accuracy: {nbc_accuracy*100:.2f}%')

### Support Vector Machine

In [None]:
# Linear support-vector classifier trained on the TF-IDF training matrix.
clf = LinearSVC()
clf.fit(X=x_train_tfidf, y=y_train)

Predict

In [None]:
# Predict on the held-out TF-IDF matrix and report accuracy.
predict_svc = clf.predict(x_test_tfidf)

# accuracy_score expects (y_true, y_pred). Accuracy is symmetric, so the number
# is unchanged, but the correct order matters if this line is adapted for
# precision/recall/F1.
svc_accuracy = accuracy_score(y_test, predict_svc)
print(f'SVC accuracy: {svc_accuracy*100:.2f}%')

### Neural Network

Keras/Tensorflow

In [None]:
# Text column and class column used for the neural-network section.
a, b = df['stemmed'], df['class']

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

vocab_size = 2000  # passed to the tokenizer as num_words
max_len = 50       # every sequence is padded/truncated to this length

# Fit a tokenizer on the stemmed texts, with an explicit out-of-vocabulary token.
stemmed_texts = df['stemmed']
tokenizer = Tokenizer(oov_token='<OOV>', num_words=vocab_size)
tokenizer.fit_on_texts(stemmed_texts)

# Convert texts to integer sequences, then pad/truncate at the end to max_len.
a_sequences = tokenizer.texts_to_sequences(stemmed_texts)
a_padded = pad_sequences(
    a_sequences,
    maxlen=max_len,
    padding='post',
    truncating='post',
)

# Shift the class values down by one (assumes 'class' is numeric and 1-based — TODO confirm).
b_encoded = df['class'].values - 1

# 80/20 split with a fixed seed.
a_train, a_test, b_train, b_test = train_test_split(
    a_padded,
    b_encoded,
    test_size=0.2,
    random_state=42,
)

In [None]:
from tensorflow.keras import Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.optimizers import Adam

num_feature = a_padded.shape[1] # padded sequence length
num_classes = len(np.unique(b_encoded)) # number of distinct labels

# The inputs are integer token ids, not numeric measurements. Feeding them
# straight into a Dense layer would treat arbitrary vocabulary indices as
# magnitudes. An Embedding layer maps each id to a learned vector first, and
# the vectors are averaged over the sequence before classification.
model = Sequential([
    Input(shape=(num_feature,)),
    Embedding(input_dim=vocab_size, output_dim=64),  # ids produced with num_words=vocab_size
    GlobalAveragePooling1D(),
    Dense(128, activation='relu'),
    Dense(num_classes, activation='softmax')  # Softmax for multi-class classification
])

# compile model; sparse loss because the labels are integer class ids
model.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Train for 10 epochs; the held-out split doubles as the validation set.
hist = model.fit(
    x=a_train,
    y=b_train,
    batch_size=32,
    epochs=10,
    validation_data=(a_test, b_test),
)

In [None]:
# Evaluate on the held-out split (the same data used as validation during fit).
test_loss, test_accuracy = model.evaluate(x=a_test, y=b_test)
print("Validation Accuracy:", test_accuracy)

In [None]:
# One figure per metric: training curve first, validation curve second.
# Each tuple is (train key, validation key, figure title, y-axis label).
curve_specs = (
    ('accuracy', 'val_accuracy', 'Model Accuracy', 'Accuracy'),
    ('loss', 'val_loss', 'Model Loss', 'Loss'),
)
for train_key, val_key, figure_title, y_label in curve_specs:
    plt.plot(hist.history[train_key])
    plt.plot(hist.history[val_key])
    plt.title(figure_title)
    plt.ylabel(y_label)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')
    plt.show()

### BERT model

BERT-style models account for context and nuance in language.

In [None]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch
from joblib import Parallel, delayed

# Load the DistilBERT tokenizer and sequence-classification model.
# Note: this rebinds `tokenizer` and `model`, which earlier cells used for the
# Keras tokenizer and Keras network.
# NOTE(review): 'distilbert-base-uncased' is a base checkpoint; the
# classification head is presumably newly initialised rather than fine-tuned,
# so confirm that its logits are meaningful as downstream features.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()  # inference mode

def batch(start_index,end_index,input_ids,attention_mask,model):
    """Run `model` on rows [start_index, end_index) and return the logits on CPU.

    Args:
        start_index: first row of the slice (inclusive).
        end_index: last row of the slice (exclusive).
        input_ids: tensor of token ids for the whole corpus.
        attention_mask: tensor of attention masks matching `input_ids`.
        model: the model to call; it is invoked with `input_ids` and
            `attention_mask` keyword arguments.

    Returns:
        The `logits` of the model output for the slice, moved to the CPU.
    """
    # Only the current slice is moved to the module-level `device`.
    batch_input_ids = input_ids[start_index:end_index].to(device)
    batch_attention_mask = attention_mask[start_index:end_index].to(device)
    batch_inputs = {'input_ids': batch_input_ids, 'attention_mask': batch_attention_mask}

    with torch.no_grad():  # no gradients are needed for scoring
        output = model(**batch_inputs)
    return output.logits.cpu()

# Tokenize every review up front, padded to a common length and truncated to 512 tokens.
text = df['review']
encoded_input = tokenizer(text.tolist(), return_tensors='pt', padding=True, truncation=True, max_length=512)

batchsize = 10
input_ids = encoded_input['input_ids']
attention_mask = encoded_input['attention_mask']

# Ceiling division so a final partial batch is included.
total_batch = (len(input_ids) + batchsize - 1)//batchsize

# Run the batches one after another in this process. Dispatching them through
# joblib worker processes would serialise the model and the full input tensors
# for every task and does not work with a model that lives on a CUDA device;
# torch already parallelises the computation inside each forward pass.
results = [
    batch(i * batchsize, min((i + 1) * batchsize, len(input_ids)), input_ids, attention_mask, model)
    for i in range(total_batch)
]

# Stack the per-batch logits back into one tensor, in row order.
all_scores = torch.cat(results,dim=0)
print(all_scores)

The cell above took 483.46 minutes (about 8 hours) to run.

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise the full logits matrix and pair it with the 'rating' column.
# Note: X_train and Y_train are reassigned by the train/test split further down.
scaler = StandardScaler()
bert_logits = all_scores.numpy()
X_train = scaler.fit_transform(X=bert_logits)
Y_train = df['rating']

In [None]:
# Persist the logits tensor to disk so the long inference cell need not be re-run.
torch.save(all_scores, f='bert_scores.pt')

In [None]:
# Store each row's logits on the dataframe as a plain Python list.
scores_list = all_scores.tolist()
df['distilbert_scores'] = scores_list

In [None]:
from sklearn.preprocessing import StandardScaler

# Tensor -> NumPy array, as required by scikit-learn.
logits_numpy = all_scores.numpy()

# Standardise each logit column using statistics computed from all rows.
scaler = StandardScaler().fit(logits_numpy)
X_scaled = scaler.transform(logits_numpy)

In [None]:
# Raw logits tensor and the 'rating' target column.
X, Y = all_scores, df['rating']

In [None]:
# Split the raw logits first (80/20, fixed seed), then fit the scaler on the
# training rows only, so that no test-set statistics leak into the features
# the models are trained on. The test rows are scaled with the training
# statistics.
X_train_raw, X_test_raw, Y_train, Y_test = train_test_split(
    all_scores.numpy(), Y, test_size=0.2, random_state=42
)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

#### Naive Bayes Model with BERT

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes trained on the scaled logit features.
gnb = GaussianNB()
gnb.fit(X=X_train, y=Y_train)

In [None]:
# Score the Gaussian NB model on the held-out rows.
predictions = gnb.predict(X=X_test)
accuracy = accuracy_score(y_true=Y_test, y_pred=predictions)
print(f"Accuracy: {accuracy * 100:.2f}%")

#### SVM model with BERT

In [None]:
# RBF-kernel support-vector classifier trained on the scaled logit features.
rbf_kernel = 'rbf'
svc = SVC(kernel=rbf_kernel)
svc.fit(X=X_train, y=Y_train)

In [None]:
# Score the SVC on the held-out rows (rebinds `predictions` and `accuracy`).
predictions = svc.predict(X=X_test)
accuracy = accuracy_score(y_true=Y_test, y_pred=predictions)
print(f'Accuracy: {accuracy * 100:.2f}%')

#### Neural Network Model with BERT

In [None]:
# Unscaled logits as features; the 'class' column as the target.
a, b = all_scores.numpy(), df['class']

# 80/20 split with a fixed seed.
a_train, a_test, b_train, b_test = train_test_split(
    a,
    b,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Shift both label splits down by one, in place (presumably to turn 1-based
# classes into 0-based ids for the sparse loss — TODO confirm).
# NOTE: not idempotent — every re-run of this cell subtracts one again.
for labels in (b_train, b_test):
    labels -= 1

In [None]:
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

# Input width is the number of logit columns produced by DistilBERT.
num_feat = a.shape[1]
# Number of distinct labels seen across both splits.
num_class = np.unique(np.concatenate((b_train, b_test))).size

# Small feed-forward classifier: one hidden ReLU layer, softmax output.
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(num_feat,)))
model.add(Dense(num_class, activation='softmax'))

# Sparse categorical loss because the labels are integer class ids.
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train for 10 epochs; the held-out split doubles as the validation set.
history = model.fit(
    x=a_train,
    y=b_train,
    batch_size=32,
    epochs=10,
    validation_data=(a_test, b_test),
)

In [None]:
# Evaluate on the held-out split (the same data used as validation during fit).
test_loss, test_accuracy = model.evaluate(x=a_test, y=b_test)
print("Validation Accuracy:", test_accuracy)

In [None]:
import matplotlib.pyplot as plt

# One figure per metric: training curve first, validation curve second.
# Each tuple is (train key, validation key, figure title, y-axis label).
curve_specs = (
    ('accuracy', 'val_accuracy', 'Model Accuracy', 'Accuracy'),
    ('loss', 'val_loss', 'Model Loss', 'Loss'),
)
for train_key, val_key, figure_title, y_label in curve_specs:
    plt.plot(history.history[train_key])
    plt.plot(history.history[val_key])
    plt.title(figure_title)
    plt.ylabel(y_label)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')
    plt.show()