In [None]:
!pip install transformers datasets evaluate tensorflow keras

In [18]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split 
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import numpy as np
from sklearn.linear_model import LogisticRegression
import tensorflow as tf
from keras.models import Sequential
from keras import layers
from keras.backend import clear_session
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate
from google.colab import files
import io
import warnings

In [19]:
warnings.filterwarnings('ignore')

In [6]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [10]:
uploaded = files.upload()

Saving dev.csv to dev.csv
Saving test.csv to test.csv
Saving train.csv to train.csv


In [11]:
train_df = pd.read_csv(io.BytesIO(uploaded['train.csv']))
dev_df = pd.read_csv(io.BytesIO(uploaded['dev.csv']))
test_df = pd.read_csv(io.BytesIO(uploaded['test.csv']))

In [12]:
def get_vectorized_data(reviews_train, reviews_test):
    vectorizer = CountVectorizer()
    vectorizer.fit(reviews_train)
    x_train = vectorizer.transform(reviews_train)
    x_test = vectorizer.transform(reviews_test)
    return x_train, x_test

In [49]:
all_df = pd.concat([train_df, dev_df, test_df]).reset_index(drop=True)
all_df = all_df.sample(frac=1).reset_index(drop=True)
all_df = all_df[:20000]
reviews = all_df['text'].values
labels = all_df['label'].values


reviews_train, reviews_test, y_train, y_test = train_test_split(
    reviews, labels, test_size=0.15, random_state=1000)

**Logistic Regression and Simple Neural Network**

In [24]:
x_train, x_test = get_vectorized_data(reviews_train, reviews_test)

classifier = LogisticRegression()
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)
acc = accuracy_score(y_test, y_pred)
prec, rec, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')
print("Logistic Regression results: ")
print(f"accuracy: {acc}, precision: {prec}, recall: {rec}, f1 score: {f1}")

input_dim = x_train.shape[1]
model = Sequential()
model.add(layers.Dense(10, input_dim=input_dim, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

model.fit(x_train, y_train, epochs=3, verbose='auto',batch_size=100)
clear_session()
y_pred = model.predict(x_test, batch_size=1250)
y_pred = np.where(y_pred > 0.5, 1, 0)
acc = accuracy_score(y_test, y_pred)
prec, rec, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')
print("Simple Neural Network results: ")
print(f"accuracy: {acc}, precision: {prec}, recall: {rec}, f1 score: {f1}")

Logistic Regression results: 
accuracy: 0.896, precision: 0.8978919631093544, recall: 0.8967105263157895, f1 score: 0.8973008558262016
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 10)                748070    
                                                                 
 dense_1 (Dense)             (None, 1)                 11        
                                                                 
Total params: 748,081
Trainable params: 748,081
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/3
Epoch 2/3
Epoch 3/3




Simple Neural Network results: 
accuracy: 0.9076666666666666, precision: 0.9001931745009659, recall: 0.9197368421052632, f1 score: 0.909860071591279


**Simple Neural Networks with Embedding Layer**

In [25]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(reviews_train)
x_train = tokenizer.texts_to_sequences(reviews_train)
x_test = tokenizer.texts_to_sequences(reviews_test)
vocab_size = len(tokenizer.word_index) + 1 
maxlen = 100
x_train = pad_sequences(x_train, padding='post', maxlen=maxlen)
x_test = pad_sequences(x_test, padding='post', maxlen=maxlen)

embedding_dim = 128

model = Sequential()
model.add(layers.Embedding(input_dim=vocab_size, 
                           output_dim=embedding_dim, 
                           input_length=maxlen))
model.add(layers.Flatten())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
print(model.summary())
model.fit(x_train, y_train, epochs=5, verbose='auto',batch_size=100, validation_split=0.0)
clear_session()
y_pred = model.predict(x_test, batch_size=1250)
y_pred = np.where(y_pred > 0.5, 1, 0)
acc = accuracy_score(y_test, y_pred)
prec, rec, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')
print("Simple Neural Networks with Embedding Layer results: ")
print(f"accuracy: {acc}, precision: {prec}, recall: {rec}, f1 score: {f1}")

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 128)          10370432  
                                                                 
 flatten (Flatten)           (None, 12800)             0         
                                                                 
 dense (Dense)               (None, 10)                128010    
                                                                 
 dense_1 (Dense)             (None, 1)                 11        
                                                                 
Total params: 10,498,453
Trainable params: 10,498,453
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Simple Neural Networks with Embedding Layer results: 
accuracy: 0.892, precision: 0.888816644993498, recall: 0.8993421052

**Convolutional Neural Network with Simple Neural Network and Embedding Layer**




In [26]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(reviews_train)
x_train = tokenizer.texts_to_sequences(reviews_train)
x_test = tokenizer.texts_to_sequences(reviews_test)
vocab_size = len(tokenizer.word_index) + 1 
maxlen = 100
x_train = pad_sequences(x_train, padding='post', maxlen=maxlen)
x_test = pad_sequences(x_test, padding='post', maxlen=maxlen)

embedding_dim = 128
model = Sequential()
model.add(layers.Embedding(input_dim=vocab_size, 
                           output_dim=embedding_dim, 
                           input_length=maxlen))
model.add(layers.Conv1D(128, 5, activation='relu'))
model.add(layers.Dropout(0.05))
model.add(layers.GlobalMaxPooling1D())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()
model.fit(x_train, y_train, epochs=6, verbose='auto',batch_size=100, validation_split=0.1)
clear_session()
y_pred = model.predict(x_test, batch_size=1250)
y_pred = np.where(y_pred > 0.5, 1, 0)
acc = accuracy_score(y_test, y_pred)
prec, rec, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')
print("Convolutional Neural Network with Simple Neural Network and Embedding Layer results: ")
print(f"accuracy: {acc}, precision: {prec}, recall: {rec}, f1 score: {f1}")

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 128)          10370432  
                                                                 
 conv1d (Conv1D)             (None, 96, 128)           82048     
                                                                 
 dropout (Dropout)           (None, 96, 128)           0         
                                                                 
 global_max_pooling1d (Globa  (None, 128)              0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 10)                1290      
                                                                 
 dense_1 (Dense)             (None, 1)                 11        
                                                        

**LSTM Network with Simple Neural Network and Embedding Layer**

In [27]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(reviews_train)
x_train = tokenizer.texts_to_sequences(reviews_train)
x_test = tokenizer.texts_to_sequences(reviews_test)
vocab_size = len(tokenizer.word_index) + 1 
maxlen = 100
x_train = pad_sequences(x_train, padding='post', maxlen=maxlen)
x_test = pad_sequences(x_test, padding='post', maxlen=maxlen)

embedding_dim = 128
model = Sequential()
model.add(layers.Embedding(input_dim=vocab_size, 
                           output_dim=embedding_dim, 
                           mask_zero=True))
#model.add(layers.Bidirectional(layers.LSTM(128, kernel_regularizer='l2')))
model.add(layers.LSTM(128, kernel_regularizer='l2', return_sequences=True))
model.add(layers.Dropout(0.05))
model.add(layers.LSTM(128, kernel_regularizer='l2'))
model.add(layers.Dense(10, activation='relu', kernel_regularizer='l2'))
model.add(layers.Dense(1))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()
model.fit(x_train, y_train, epochs=8, verbose='auto',batch_size=100, validation_split=0.1)
clear_session()
y_pred = model.predict(x_test, batch_size=1250)
y_pred = np.where(y_pred > 0.5, 1, 0)
acc = accuracy_score(y_test, y_pred)
prec, rec, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')
print("LSTM Network with Simple Neural Network and Embedding Layer results: ")
print(f"accuracy: {acc}, precision: {prec}, recall: {rec}, f1 score: {f1}")

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 128)         10370432  
                                                                 
 lstm (LSTM)                 (None, None, 128)         131584    
                                                                 
 dropout (Dropout)           (None, None, 128)         0         
                                                                 
 lstm_1 (LSTM)               (None, 128)               131584    
                                                                 
 dense (Dense)               (None, 10)                1290      
                                                                 
 dense_1 (Dense)             (None, 1)                 11        
                                                                 
Total params: 10,634,901
Trainable params: 10,634,901
No

**Pre-trained Language Model**



In [50]:
all_dataset = {}
train_dict = {"text": reviews_train, "label":y_train}
test_dict = {"text": reviews_test, "label":y_test}
all_dataset['train'] = Dataset.from_dict(train_dict)
all_dataset['test'] = Dataset.from_dict(test_dict)
all_dataset = DatasetDict(all_dataset)
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

tokenized_dataset = all_dataset.map(preprocess_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/043235d6088ecd3dd5fb5ca3592b6913fd516027/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.23.1",
  "vocab_size": 30522
}

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/043235d6088ecd3dd5fb5ca3592b6913fd516027/vocab.txt
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapsh

  0%|          | 0/17 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

In [51]:
accuracy_metric = evaluate.load('accuracy')
recall_metric = evaluate.load('recall')
precision_metric = evaluate.load('precision')
f1_metric = evaluate.load('f1')
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return accuracy_metric.compute(predictions=predictions, references=labels)

In [52]:
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
training_args = TrainingArguments(
    output_dir="./transformers-results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/043235d6088ecd3dd5fb5ca3592b6913fd516027/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.23.1",
  "vocab_size": 30522
}

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/043235d6088ecd3dd5fb5ca3592b6913fd516027/pytorch_model.bin
Some weights of the model checkpoint at distilbert-base-uncased were not used when in

Step,Training Loss
500,0.3865
1000,0.2857
1500,0.2128
2000,0.1886
2500,0.1373
3000,0.1146


Saving model checkpoint to ./transformers-results/checkpoint-500
Configuration saved in ./transformers-results/checkpoint-500/config.json
Model weights saved in ./transformers-results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./transformers-results/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./transformers-results/checkpoint-500/special_tokens_map.json
Saving model checkpoint to ./transformers-results/checkpoint-1000
Configuration saved in ./transformers-results/checkpoint-1000/config.json
Model weights saved in ./transformers-results/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./transformers-results/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./transformers-results/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to ./transformers-results/checkpoint-1500
Configuration saved in ./transformers-results/checkpoint-1500/config.json
Model weights saved in ./transformers-results/checkpoint-

TrainOutput(global_step=3189, training_loss=0.21404326107854116, metrics={'train_runtime': 2100.3589, 'train_samples_per_second': 24.282, 'train_steps_per_second': 1.518, 'total_flos': 5951563590319200.0, 'train_loss': 0.21404326107854116, 'epoch': 3.0})

In [53]:
predictions = trainer.predict(tokenized_dataset["test"])
preds = np.argmax(predictions.predictions, axis=-1)

acc = accuracy_metric.compute(predictions=preds, references=predictions.label_ids)['accuracy']
prec = precision_metric.compute(predictions=preds, references=predictions.label_ids)['precision']
rec = recall_metric.compute(predictions=preds, references=predictions.label_ids)['recall']
f1 = f1_metric.compute(predictions=preds, references=predictions.label_ids)['f1']

print("Pre-trained Language Model results: ")
print(f"accuracy: {acc}, precision: {prec}, recall: {rec}, f1 score: {f1}")

The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 3000
  Batch size = 16


Pre-trained Language Model results: 
accuracy: 0.9126666666666666, precision: 0.89, recall: 0.9322625698324022, f1 score: 0.9106412005457025
