In [2]:
import datetime
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.datasets import fetch_20newsgroups
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from transformers import BertTokenizerFast,TFAutoModel

In [2]:
# Ask TensorFlow which physical GPU devices it can see and report the result.
gpu_devices = tf.config.list_physical_devices('GPU')
gpu = len(gpu_devices) > 0
if gpu:
    print("GPU is", "available")
else:
    print("GPU is", "NOT AVAILABLE")

GPU is available


In [3]:
# Fixed seed so the train / cv / test partition is the same on every run;
# without it each run produces a different split and results cannot be reproduced.
SEED = 42

# Headers, footers and quoted replies are stripped so only the message body is kept.
dataset = fetch_20newsgroups(
    subset="all",
    shuffle=True,
    random_state=SEED,
    remove=("headers","footers", "quotes"),
)
news_text = dataset.data
target_names = dataset.target_names
labels = dataset.target

# First hold out 20% as the test split, then 20% of the remainder as the cv split.
(train_texts, test_texts, train_labels, test_labels) = train_test_split(
    news_text, labels, test_size=0.2, random_state=SEED
)
(train_texts, cv_texts, train_labels, cv_labels) = train_test_split(
    train_texts, train_labels, test_size=0.2, random_state=SEED
)


In [4]:
# Build each frame column-by-column. Stacking texts and labels into one
# np.array forces a single fixed-width unicode dtype: the integer labels are
# turned into strings and every cell is allocated at the width of the longest
# document.
train_df = pd.DataFrame({'text': train_texts, 'labels': train_labels})
cv_df = pd.DataFrame({'text': cv_texts, 'labels': cv_labels})
test_df = pd.DataFrame({'text': test_texts, 'labels': test_labels})

In [5]:
# Tokenizer and encoder are loaded from the same pretrained checkpoint.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizerFast.from_pretrained(BERT_CHECKPOINT)
# Pre-trained BERT encoder whose hidden states are used as text features.
model = TFAutoModel.from_pretrained(BERT_CHECKPOINT)

2023-03-01 23:31:35.793340: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-03-01 23:31:35.793548: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Metal device set to: Apple M1 Max

systemMemory: 32.00 GB
maxCacheSize: 10.67 GB



Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [6]:
def bert_feature_exrtaction(text, max_length=512, batch_size=512):
    """Embed each document as the mean of its final-layer BERT token vectors.

    Uses the notebook-level ``tokenizer`` and ``model``.

    Parameters
    ----------
    text : iterable of str
        Documents to embed.
    max_length : int
        Maximum number of tokens per document, special tokens included.
        Longer documents are truncated by the tokenizer, which keeps the
        closing [SEP] token.
    batch_size : int
        Number of documents sent through the model per forward pass.

    Returns
    -------
    list of numpy.ndarray
        One 1-D embedding per input document, in input order.
    """
    texts = list(text)
    result = []
    print("Bert feature extraction")
    for start in range(0, len(texts), batch_size):
        print(f'{start} - {len(texts)}')
        # Let the tokenizer truncate and pad so that an attention mask is
        # produced; each batch is padded only to its own longest document.
        batch = tokenizer(
            texts[start:start + batch_size],
            add_special_tokens=True,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors='tf',
        )
        # Passing the mask stops real tokens from attending to padding.
        outputs = model(
            batch['input_ids'],
            attention_mask=batch['attention_mask'],
            output_hidden_states=False,
        )
        hidden = outputs[0].numpy()  # last hidden state: (batch, tokens, hidden)
        mask = batch['attention_mask'].numpy()[..., np.newaxis].astype(hidden.dtype)
        # Average over real tokens only; padded positions contribute nothing.
        pooled = (hidden * mask).sum(axis=1) / np.maximum(mask.sum(axis=1), 1.0)
        result.extend(pooled)
    return result

In [7]:
# Add a BERT embedding column to every split, in train / cv / test order.
for split_df in (train_df, cv_df, test_df):
    split_texts = split_df['text'].values
    split_df['bert_encoding'] = bert_feature_exrtaction(split_texts)

Token indices sequence length is longer than the specified maximum sequence length for this model (674 > 512). Running this sequence through the model will result in indexing errors


Bert feature extraction
0 - 12060
512 - 12060
1024 - 12060
1536 - 12060
2048 - 12060
2560 - 12060
3072 - 12060
3584 - 12060
4096 - 12060
4608 - 12060
5120 - 12060
5632 - 12060
6144 - 12060
6656 - 12060
7168 - 12060
7680 - 12060
8192 - 12060
8704 - 12060
9216 - 12060
9728 - 12060
10240 - 12060
10752 - 12060
11264 - 12060
11776 - 12060
Bert feature extraction
0 - 3016
512 - 3016
1024 - 3016
1536 - 3016
2048 - 3016
2560 - 3016
Bert feature extraction
0 - 3770
512 - 3770
1024 - 3770
1536 - 3770
2048 - 3770
2560 - 3770
3072 - 3770
3584 - 3770


In [3]:
def _load_or_cache_parquet(path, frame=None):
    """Return the DataFrame cached at ``path``, creating the cache if absent.

    Parameters
    ----------
    path : str
        Location of the parquet cache.
    frame : pandas.DataFrame or None
        In-memory frame to write when no cache exists yet.

    Returns
    -------
    pandas.DataFrame
        The cached frame when ``path`` exists, otherwise ``frame`` itself
        after it has been written to ``path``.

    Raises
    ------
    FileNotFoundError
        If there is neither a cache file nor an in-memory frame.
    """
    if os.path.exists(path):
        return pd.read_parquet(path)
    if frame is None:
        raise FileNotFoundError(
            f"No cached parquet at {path!r} and no in-memory frame to save"
        )
    os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
    frame.to_parquet(path)
    return frame


# An existing cache always wins (same as the previous unconditional read).
# globals().get(...) is None on a fresh kernel where the BERT cells were skipped.
train_df = _load_or_cache_parquet('../data/train_df_bert', globals().get('train_df'))
cv_df = _load_or_cache_parquet('../data/cv_df_bert', globals().get('cv_df'))
test_df = _load_or_cache_parquet('../data/test_df_bert', globals().get('test_df'))

In [4]:
# TF-IDF vocabulary is capped at 512 terms and learned from the training split only.
TFIDF_MAX_FEATURES = 512
vectorizer = TfidfVectorizer(max_features=TFIDF_MAX_FEATURES)
vectorizer.fit(train_df['text'])

In [5]:
# Store one dense TF-IDF row per document in every split (train / cv / test order).
for split_df in (train_df, cv_df, test_df):
    tfidf_matrix = vectorizer.transform(split_df['text'])
    split_df['tfidf_encoding'] = list(tfidf_matrix.toarray())

In [6]:
def gen_rf_hyperparameters(rng=None):
    """Draw one random hyperparameter set for a random-forest random search.

    Parameters
    ----------
    rng : numpy.random.Generator, numpy.random.RandomState or None
        Source of randomness. ``None`` uses NumPy's global state, so
        ``np.random.seed`` still controls the draws.

    Returns
    -------
    dict
        Plain Python values keyed by ``n_estimators`` (int in [10, 316],
        log-uniform), ``max_depth`` (one of 1, 2, 5, 10, 20),
        ``min_samples_leaf`` (int in [1, 99]) and ``max_features``
        (float in (0, 1]).
    """
    if rng is None:
        rng = np.random
    # Generator names its integer sampler `integers`; the legacy API uses `randint`.
    draw_int = rng.integers if hasattr(rng, "integers") else rng.randint

    # The exponent lies in [1, 2.5), so the result is always at least 10.
    n_estimators = int(10 ** (rng.random() * 1.5 + 1))
    max_depth = int(rng.choice([1, 2, 5, 10, 20]))
    min_samples_leaf = int(draw_int(1, 100))
    # random() is in [0, 1); flipping it gives (0, 1] so 0.0 can never be drawn.
    max_features = 1.0 - float(rng.random())
    return {
        "n_estimators": n_estimators,
        "max_depth": max_depth,
        "min_samples_leaf": min_samples_leaf,
        "max_features": max_features,
    }

In [7]:
# Random search over random-forest hyperparameters on the BERT embeddings,
# scored by mean 4-fold cross-validated accuracy on train + cv.
n_cv = 50
score = 0
final_params = None

# Loop-invariant: build the feature matrix and label vector once instead of
# re-concatenating the frames twice per iteration.
bert_search_df = pd.concat([train_df, cv_df])
bert_features = np.stack(bert_search_df['bert_encoding'].to_numpy())
bert_search_labels = bert_search_df['labels'].to_numpy()

for i in range(n_cv):
    if i % 5 == 0:
        print(
            f"Iteration {i+1} - {n_cv}: CV - {datetime.datetime.today().strftime('%H:%M:%S')}"
        )
    params = gen_rf_hyperparameters()
    # Named rf_model so the BERT encoder bound to `model` is not overwritten.
    rf_model = RandomForestClassifier(**params)
    cv_score = np.mean(
        cross_val_score(
            rf_model,
            bert_features,
            bert_search_labels,
            cv=4,
            n_jobs=-1,  # evaluate the folds in parallel
        )
    )
    if cv_score > score:
        score = cv_score
        final_params = params
        print(f"Iter {i+1} - Accuracy :{score}")

# Keep the result under feature-specific names; `score` / `final_params` are
# reused by the TF-IDF search further down.
bert_score, bert_params = score, final_params

Iteration 1 - 50: CV - 08:26:41
Iter 0 - Accuracy :0.22983550013266119
Iter 2 - Accuracy :0.2630671265587689
Iteration 6 - 50: CV - 08:38:55
Iter 8 - Accuracy :0.2893340408596445
Iteration 11 - 50: CV - 08:56:38
Iteration 16 - 50: CV - 09:14:08
Iter 16 - Accuracy :0.3082382594852746
Iteration 21 - 50: CV - 09:52:43
Iter 24 - Accuracy :0.3165295834438843
Iteration 26 - 50: CV - 11:08:09
Iteration 31 - 50: CV - 11:19:43
Iteration 36 - 50: CV - 12:15:44
Iteration 41 - 50: CV - 12:31:39
Iteration 46 - 50: CV - 13:55:08


In [8]:
# Random search over random-forest hyperparameters on the TF-IDF vectors,
# scored by mean 4-fold cross-validated accuracy on train + cv.
n_cv = 40
score = 0
final_params = None

# Loop-invariant: build the feature matrix and label vector once instead of
# re-concatenating the frames twice per iteration.
tfidf_search_df = pd.concat([train_df, cv_df])
tfidf_features = np.stack(tfidf_search_df['tfidf_encoding'].to_numpy())
tfidf_search_labels = tfidf_search_df['labels'].to_numpy()

for i in range(n_cv):
    if i % 5 == 0:
        print(
            f"Iteration {i+1} - {n_cv}: CV - {datetime.datetime.today().strftime('%H:%M:%S')}"
        )
    params = gen_rf_hyperparameters()
    # Named rf_model so the BERT encoder bound to `model` is not overwritten.
    rf_model = RandomForestClassifier(**params)
    cv_score = np.mean(
        cross_val_score(
            rf_model,
            tfidf_features,
            tfidf_search_labels,
            cv=4,
            n_jobs=-1,  # evaluate the folds in parallel
        )
    )
    if cv_score > score:
        score = cv_score
        final_params = params
        print(f"Iter {i+1} - Accuracy :{score}")

# Keep the result under feature-specific names so it can be compared with
# other searches that reuse `score` / `final_params`.
tfidf_score, tfidf_params = score, final_params

Iteration 1 - 40: CV - 15:38:39
Iter 1 - Accuracy :0.3111568055187052
Iter 3 - Accuracy :0.31573361634385777
Iter 5 - Accuracy :0.3202440965773415
Iteration 6 - 40: CV - 15:47:35
Iteration 11 - 40: CV - 15:49:10
Iteration 16 - 40: CV - 15:51:08
Iter 17 - Accuracy :0.32110639426903687
Iteration 21 - 40: CV - 15:54:04
Iteration 26 - 40: CV - 16:01:12
Iter 27 - Accuracy :0.33722472804457415
Iteration 31 - 40: CV - 16:05:36
Iter 31 - Accuracy :0.35321040063677367
Iter 34 - Accuracy :0.3570575749535686
Iteration 36 - 40: CV - 16:07:53
Iter 37 - Accuracy :0.35845051737861505


# OpenAI

In [25]:
import yaml
import openai
import time

In [6]:

# Credentials are read from a YAML file kept outside the notebook.
secrets_path = "../secret.yml"
with open(secrets_path, mode='r') as secrets_file:
    data_loaded = yaml.safe_load(secrets_file)

In [8]:
# Configure the openai client with the key read from the secrets file;
# a missing 'openai_api_key' entry raises KeyError here.
openai.api_key = data_loaded['openai_api_key']

In [26]:
def embed_ada_2(text, max_retries=5, request_interval=1.0):
    """Return the text-embedding-ada-002 embedding of ``text``.

    Parameters
    ----------
    text : str
        Text to embed.
    max_retries : int
        How many times to retry after a rate-limit error before giving up.
    request_interval : float
        Seconds to pause after a successful request. The default of 1.0
        keeps sequential callers at or below 60 requests per minute.

    Returns
    -------
    list of float
        The embedding vector of the first (only) input.

    Raises
    ------
    openai.error.RateLimitError
        If the request is still rate limited after ``max_retries`` retries.
    """
    retries = max(max_retries, 0)
    backoff = max(2 * request_interval, 1.0)
    for attempt in range(retries + 1):
        try:
            response = openai.Embedding.create(
                input=text,
                model="text-embedding-ada-002"
            )
        except openai.error.RateLimitError:
            if attempt == retries:
                raise
            # Wait progressively longer so the per-minute window can drain.
            time.sleep(backoff)
            backoff *= 2
            continue
        # Throttle after success so a tight loop of calls stays under the limit.
        time.sleep(request_interval)
        return response['data'][0]['embedding']

In [27]:
# Embed the first 50 training documents. Newlines and tabs are replaced with
# spaces rather than deleted, so words on adjacent lines are not glued together.
embeddings_train = [
    embed_ada_2(t.replace('\n', ' ').replace('\t', ' '))
    for t in train_df['text'].iloc[:50]
]

RateLimitError: Rate limit reached for default-global-with-image-limits in organization org-ZB6cyYEai12iCDeIyVPJqBcV on requests per min. Limit: 60 / min. Current: 70 / min. Contact support@openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method.

In [None]:
# Bare expression: the cell output is the repr of the whole embeddings list.
embeddings_train