In [5]:
import os
import pandas as pd
import numpy as np
from catboost.text_processing import Tokenizer
import nltk
from nltk.corpus import stopwords, wordnet
from sklearn.model_selection import train_test_split
from catboost import Pool, CatBoostClassifier, CatBoostRegressor
from catboost.utils import eval_metric
from tqdm.auto import tqdm
import pickle
SEED = 42
np.random.seed(SEED)
DATA_DIR = "../datasets"
MODEL_DIR = "../models"

In [6]:
source = os.path.join(DATA_DIR, "aclImdb_v1.tar.gz")

if not os.path.exists(source):
    !wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz -P $DATA_DIR

if not os.path.exists("/tmp/aclImdb"):
    !tar -xf $source -C /tmp

train_dir = "/tmp/aclImdb/train"
test_dir = "/tmp/aclImdb/test"

In [7]:
def extract_data(path):
    review = []
    for label in ["pos", "neg"]:
        files = os.listdir(os.path.join(path, label))
        for file in files:
            with open(os.path.join(path, label, file)) as f:
                text = f.readline()
                rating = file.split(".")[0].split("_")[1]
                review.append([text, label, int(rating)])
    df = pd.DataFrame(review, columns=["text", "label", "rating"])
    return df

train_df = extract_data(train_dir)
test_df = extract_data(test_dir)

In [8]:
class ImdbDataset():
    def __init__(self, vocab=None):
        super().__init__()
        self.tokenizer = Tokenizer(lowercasing=True, separator_type='BySense', token_types=['Word', 'Number'])
        nltk.download(["stopwords", "wordnet"], quiet=True)
        self.stop_words = stopwords.words("english")
        self.lemmatizer = nltk.stem.WordNetLemmatizer()

    def _stop_words_filter(self, tokens):
        return list(filter(lambda x: x not in self.stop_words, tokens))

    def _lemmatize(self, tokens):
        return list(map(lambda t: self.lemmatizer.lemmatize(t), tokens))

    def preprocess_text(self, texts):
        tokenized_text = [self.tokenizer.tokenize(text) for text in texts]
        tokenized_no_stop = [self._stop_words_filter(tokens) for tokens in tokenized_text]
        lemmatized_text = [" ".join(self._lemmatize(tokens)) for tokens in tokenized_no_stop]
        return lemmatized_text

    def from_df(self, data):
        return pd.DataFrame({
            "text": self.preprocess_text(data.text),
            "label": data.label,
            "rating": data.rating,
        })

dataset = ImdbDataset()

train_df = dataset.from_df(train_df)
test_df = dataset.from_df(test_df)

train_df, \
val_df = train_test_split(train_df, stratify=train_df.label, test_size=0.2, random_state=SEED)
train_df.shape, val_df.shape, test_df.shape

((20000, 3), (5000, 3), (25000, 3))

## Classifier
### CatBoost

In [5]:
train_pool = Pool(
    train_df[["text"]], 
    train_df.label, 
    text_features=["text"],
)
val_pool = Pool(
    val_df[["text"]],
    val_df.label, 
    text_features=["text"],
)
test_pool = Pool(
    test_df[["text"]],
    test_df.label, 
    text_features=["text"],
)
clf = CatBoostClassifier(
    eval_metric="Accuracy", 
    task_type="GPU",
    random_seed=SEED, 
)
clf.fit(train_pool, eval_set=val_pool, verbose=100)    
clf.score(test_pool)

Learning rate set to 0.0552
0:	learn: 0.8524500	test: 0.8596000	best: 0.8596000 (0)	total: 143ms	remaining: 2m 22s
100:	learn: 0.8738000	test: 0.8760000	best: 0.8760000 (98)	total: 3.55s	remaining: 31.6s
200:	learn: 0.8847000	test: 0.8826000	best: 0.8826000 (200)	total: 7.19s	remaining: 28.6s
300:	learn: 0.8935500	test: 0.8878000	best: 0.8878000 (300)	total: 10.4s	remaining: 24.1s
400:	learn: 0.9005000	test: 0.8920000	best: 0.8922000 (391)	total: 13.5s	remaining: 20.2s
500:	learn: 0.9065500	test: 0.8942000	best: 0.8944000 (483)	total: 16.5s	remaining: 16.5s
600:	learn: 0.9119000	test: 0.8978000	best: 0.8978000 (599)	total: 19.5s	remaining: 13s
700:	learn: 0.9153500	test: 0.8988000	best: 0.8988000 (658)	total: 22.5s	remaining: 9.61s
800:	learn: 0.9189500	test: 0.9000000	best: 0.9000000 (797)	total: 25.5s	remaining: 6.34s
900:	learn: 0.9231500	test: 0.8994000	best: 0.9004000 (860)	total: 28.6s	remaining: 3.14s
999:	learn: 0.9265500	test: 0.8994000	best: 0.9004000 (860)	total: 31.5s	remai

0.87556

### Bert

In [6]:
import datasets
from datasets import Dataset, concatenate_datasets
from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, TrainingArguments, Trainer
import evaluate
import torch

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [7]:
checkpoint = "lvwerra/distilbert-imdb"
tokenizer = AutoTokenizer.from_pretrained(checkpoint, clean_up_tokenization_spaces=True)

def preprocess_function(examples):
    label_mapping = {"neg": 0, "pos": 1}
    inputs = tokenizer(examples["text"], truncation=True, padding=True, return_tensors="pt")
    label = [label_mapping[label] for label in examples["label"]]
    return Dataset.from_dict({
        "text": examples["text"],
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "label": label,
        "rating": examples["rating"],
    })
train_ds = preprocess_function(Dataset.from_dict(train_df))
val_ds = preprocess_function(Dataset.from_dict(val_df))
test_ds = preprocess_function(Dataset.from_dict(test_df))

tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [8]:
def predict(model, data, bs=4, device=DEVICE):
    model.to(device)
    model.eval()
    input_ids = torch.tensor(data["input_ids"]).to(device)
    attention_mask = torch.tensor(data["attention_mask"]).to(device)
    logits = []
    for i in tqdm(range(0, len(data), bs)):
        batch = range(i, i+bs)
        out = model(input_ids[batch], attention_mask=attention_mask[batch])
        logits.append(out.logits.data.cpu().detach().numpy())
    return np.vstack(logits), data["label"]


def compute_metrics(eval_pred):
    accuracy = evaluate.load("accuracy")
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [9]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
compute_metrics(predict(model, test_ds))

config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

  0%|          | 0/6250 [00:00<?, ?it/s]

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

{'accuracy': 0.84876}

In [10]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
modules = [
    model.classifier,
    model.pre_classifier,
    model.distilbert.transformer.layer[-1:],
]
for param in model.parameters():
    param.requires_grad = False
for module in modules:
    for param in module.parameters():
        param.requires_grad = True

training_args = TrainingArguments(
        output_dir="bert_results",
        learning_rate=1e-4,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        num_train_epochs=2,
        weight_decay=0.01,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
)
trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        compute_metrics=compute_metrics,
)
trainer.train()
trainer.evaluate(test_ds)

Epoch,Training Loss,Validation Loss,Accuracy
1,0.3132,0.301227,0.8954
2,0.2814,0.299281,0.9084


{'eval_loss': 0.34711530804634094,
 'eval_accuracy': 0.89792,
 'eval_runtime': 571.2856,
 'eval_samples_per_second': 43.761,
 'eval_steps_per_second': 10.94,
 'epoch': 2.0}

In [11]:
trainer.save_model(os.path.join(MODEL_DIR, "hf_clf"))

### Ensemble

In [12]:
train = train_df.assign(logits=trainer.predict(train_ds).predictions[:,0])
val = val_df.assign(logits=trainer.predict(val_ds).predictions[:,0])
test = test_df.assign(logits=trainer.predict(test_ds).predictions[:,0])

train_pool = Pool(
    train[["text", "logits"]], 
    train.label, 
    text_features=["text"],
)
val_pool = Pool(
    val[["text", "logits"]],
    val.label, 
    text_features=["text"],
)
test_pool = Pool(
    test[["text", "logits"]],
    test.label, 
    text_features=["text"],
)
clf = CatBoostClassifier(
    eval_metric="Accuracy", 
    task_type="GPU",
    random_seed=SEED, 
)
clf.fit(train_pool, eval_set=val_pool, verbose=100)    
clf.score(test_pool)

Learning rate set to 0.0552
0:	learn: 0.9197000	test: 0.9072000	best: 0.9072000 (0)	total: 35.6ms	remaining: 35.5s
100:	learn: 0.9280500	test: 0.9154000	best: 0.9158000 (78)	total: 3.08s	remaining: 27.5s
200:	learn: 0.9317000	test: 0.9150000	best: 0.9162000 (106)	total: 6.23s	remaining: 24.8s
300:	learn: 0.9351000	test: 0.9158000	best: 0.9162000 (106)	total: 9.24s	remaining: 21.5s
400:	learn: 0.9374000	test: 0.9160000	best: 0.9166000 (380)	total: 12.3s	remaining: 18.3s
500:	learn: 0.9406000	test: 0.9168000	best: 0.9170000 (490)	total: 15.2s	remaining: 15.2s
600:	learn: 0.9431500	test: 0.9172000	best: 0.9174000 (589)	total: 18.2s	remaining: 12.1s
700:	learn: 0.9454000	test: 0.9170000	best: 0.9178000 (632)	total: 21.2s	remaining: 9.05s
800:	learn: 0.9470500	test: 0.9170000	best: 0.9178000 (632)	total: 24.2s	remaining: 6.02s
900:	learn: 0.9492500	test: 0.9176000	best: 0.9178000 (632)	total: 27.2s	remaining: 2.99s
999:	learn: 0.9509000	test: 0.9172000	best: 0.9178000 (632)	total: 30.2s	rem

0.90568

In [13]:
with open(os.path.join(MODEL_DIR, "cb_clf.pkl"), "wb") as f:
    pickle.dump(clf, f)

## Regressor

In [14]:
checkpoint = os.path.join(MODEL_DIR, "hf_clf")
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

with open(os.path.join(MODEL_DIR, "cb_clf.pkl"), "rb") as f:
    clf = pickle.load(f)

def predict_proba(ds):
    return ds.to_pandas().drop(["input_ids", "attention_mask"], axis=1) \
    .assign(logits=predict(model, ds)[0][:,0]) \
    .assign(proba=lambda x: clf.predict_proba(x)[:,0])

train = predict_proba(train_ds)
val = predict_proba(val_ds)
test = predict_proba(test_ds)

train_pool = Pool(
    train.drop(["label", "rating"], axis=1), 
    train.rating, 
    text_features=["text"]
)
val_pool = Pool(
    val.drop(["label", "rating"], axis=1), 
    val.rating, 
    text_features=["text"]
)
test_pool = Pool(
    test.drop(["label", "rating"], axis=1), 
    test.rating, 
    text_features=["text"]
)
reg = CatBoostRegressor(
    objective='RMSE', 
    task_type="GPU",
    random_seed=SEED, 
)
reg.fit(train_pool, eval_set=val_pool, verbose=100)    
reg.score(test_pool)

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

  0%|          | 0/6250 [00:00<?, ?it/s]



Learning rate set to 0.085827
0:	learn: 3.2400829	test: 3.2300987	best: 3.2300987 (0)	total: 20.6ms	remaining: 20.6s
100:	learn: 1.3818951	test: 1.6833442	best: 1.6833442 (100)	total: 1.69s	remaining: 15s
200:	learn: 1.3375295	test: 1.6800193	best: 1.6798837 (197)	total: 3.3s	remaining: 13.1s
300:	learn: 1.3110882	test: 1.6783720	best: 1.6780603 (275)	total: 4.85s	remaining: 11.3s
400:	learn: 1.2901631	test: 1.6789310	best: 1.6779108 (313)	total: 6.4s	remaining: 9.56s
500:	learn: 1.2687693	test: 1.6798937	best: 1.6779108 (313)	total: 7.98s	remaining: 7.94s
600:	learn: 1.2492149	test: 1.6804701	best: 1.6779108 (313)	total: 9.52s	remaining: 6.32s
700:	learn: 1.2317313	test: 1.6816909	best: 1.6779108 (313)	total: 11.1s	remaining: 4.74s
800:	learn: 1.2173295	test: 1.6808316	best: 1.6779108 (313)	total: 12.7s	remaining: 3.14s
900:	learn: 1.2034216	test: 1.6829679	best: 1.6779108 (313)	total: 14.2s	remaining: 1.56s
999:	learn: 1.1914413	test: 1.6824874	best: 1.6779108 (313)	total: 15.7s	rema

0.7144155279856985

In [15]:
with open(os.path.join(MODEL_DIR, "cb_reg.pkl"), "wb") as f:
    pickle.dump(reg, f)

## Prediction

In [1]:
def predict(text):
    import os
    from catboost.text_processing import Tokenizer
    import nltk
    from nltk.corpus import stopwords, wordnet
    from transformers import AutoTokenizer, AutoModelForSequenceClassification
    import pickle
    import pandas as pd

    MODEL_DIR = "../models"
    
    cb_tokenizer = Tokenizer(lowercasing=True, separator_type='BySense', token_types=['Word', 'Number'])
    stop_words = stopwords.words("english")
    lemmatizer = nltk.stem.WordNetLemmatizer()
    text = " ".join([lemmatizer.lemmatize(token) for token in cb_tokenizer.tokenize(text) if token not in stop_words])
    
    hf_tokenizer = AutoTokenizer.from_pretrained("lvwerra/distilbert-imdb", clean_up_tokenization_spaces=True)
    inputs = hf_tokenizer(text, truncation=True, padding=True, return_tensors="pt")
    
    checkpoint = os.path.join(MODEL_DIR, "hf_clf")
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
    model.eval()
    logits = model(inputs["input_ids"], inputs["attention_mask"]).logits[:,0].detach().numpy()
    
    with open(os.path.join(MODEL_DIR, "cb_clf.pkl"), "rb") as f:
        clf = pickle.load(f)
    with open(os.path.join(MODEL_DIR, "cb_reg.pkl"), "rb") as f:
        reg = pickle.load(f)

    df = pd.DataFrame({"text": text, "logits": logits}).assign(proba=lambda x: clf.predict_proba(x)[:,0])
    
    return clf.predict(df)[0], round(reg.predict(df)[0], 0).astype("int")

In [2]:
predict("""
I went and saw this movie last night after being coaxed to by a few friends of mine. 
I'll admit that I was reluctant to see it because from what I knew of Ashton Kutcher he was only able to do comedy. I was wrong. 
Kutcher played the character of Jake Fischer very well, and Kevin Costner played Ben Randall with such professionalism. 
The sign of a good movie is that it can toy with our emotions. This one did exactly that. 
The entire theater (which was sold out) was overcome by laughter during the first half of the movie, and were moved to tears during the second half. 
While exiting the theater I not only saw many women in tears, but many full grown men as well, trying desperately not to let anyone see them crying. 
This movie was great, and I suggest that you go see it before you judge.
""")

('pos', 8)