In [1]:
from nltk.corpus import reuters
import openai
from tqdm import tqdm
import numpy as np
import nltk
from openai.embeddings_utils import get_embeddings
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import pickle
import librosa

with open('/users/jasper/oai.txt', 'r') as f:
    openai.api_key = f.read()

# nltk.download('reuters')

In [2]:
# perform train/test split
train_docs_id = reuters.fileids(categories='trade')
test_docs_id = reuters.fileids(categories='crude')

# get train/test docs
train_docs = [reuters.raw(doc_id) for doc_id in train_docs_id]
test_docs = [reuters.raw(doc_id) for doc_id in test_docs_id]

# get train/test labels
train_labels = [reuters.categories(doc_id)[0] for doc_id in train_docs_id]
test_labels = [reuters.categories(doc_id)[0] for doc_id in test_docs_id]

all_docs = train_docs + test_docs
all_labels = train_labels + test_labels


In [15]:
# get embeddings for train/test docs
print('Getting embeddings...')
embeddings_engine = "text-embedding-ada-002"
train_embeddings = get_embeddings(train_docs, engine=embeddings_engine)
test_embeddings = get_embeddings(test_docs, engine=embeddings_engine)

# pickle embeddings
with open('embeddings/train_embeddings.pkl', 'wb') as f:
    pickle.dump(train_embeddings, f)
with open('embeddings/test_embeddings.pkl', 'wb') as f:
    pickle.dump(test_embeddings, f)

Getting embeddings...


In [3]:
# load embeddings
with open('embeddings/train_embeddings.pkl', 'rb') as f:
    train_embeddings = pickle.load(f)
with open('embeddings/test_embeddings.pkl', 'rb') as f:
    test_embeddings = pickle.load(f)


In [17]:
# vanilla classification

# train/test split
X_train, X_test, y_train, y_test = train_test_split(train_embeddings, train_labels, test_size=0.2, random_state=42)

# train classifier
clf = MLPClassifier(hidden_layer_sizes=(100, 100), max_iter=1000, alpha=1e-4,
                    solver='sgd', verbose=10, tol=1e-4, random_state=1,
                    learning_rate_init=.1)
clf.fit(X_train, y_train)

# predict on test set
y_pred = clf.predict(X_test)

# evaluate
print(accuracy_score(y_test, y_pred))

Iteration 1, loss = 3.35987216
Iteration 2, loss = 2.81982825
Iteration 3, loss = 2.09059460
Iteration 4, loss = 1.59987339
Iteration 5, loss = 1.69334903
Iteration 6, loss = 1.53592467
Iteration 7, loss = 1.42689260
Iteration 8, loss = 1.43523758
Iteration 9, loss = 1.42703349
Iteration 10, loss = 1.40735614
Iteration 11, loss = 1.40058891
Iteration 12, loss = 1.39625255
Iteration 13, loss = 1.39380517
Iteration 14, loss = 1.39057487
Iteration 15, loss = 1.38843592
Iteration 16, loss = 1.38688206
Iteration 17, loss = 1.38468859
Iteration 18, loss = 1.38221461
Iteration 19, loss = 1.38144501
Iteration 20, loss = 1.37866268
Iteration 21, loss = 1.37681112
Iteration 22, loss = 1.37417329
Iteration 23, loss = 1.37216689
Iteration 24, loss = 1.37007989
Iteration 25, loss = 1.36798076
Iteration 26, loss = 1.36531528
Iteration 27, loss = 1.36218793
Iteration 28, loss = 1.35850787
Iteration 29, loss = 1.35423144
Iteration 30, loss = 1.35206544
Iteration 31, loss = 1.34485081
Iteration 32, los

In [10]:
# fft classification with sliding windows

from split_utils import split_text

all_docs_paras = [split_text(doc, segment_length=40) for doc in all_docs]

# remove any empty paragraphs
all_docs_paras = [[para for para in paras if para] for paras in all_docs_paras]
# remove any '' paragraphs
all_docs_paras = [[para for para in paras if para != ''] for paras in all_docs_paras]

In [13]:
# get embeddings for each paragraph
print('Getting embeddings...')
embeddings_engine = "text-embedding-ada-002"
train_embeddings_paras = [get_embeddings(paras, engine=embeddings_engine) for paras in tqdm(all_docs_paras)]

# pickle embeddings
with open('embeddings/all_embeddings_paras.pkl', 'wb') as f:
    pickle.dump(train_embeddings_paras, f)

Getting embeddings...


100%|██████████| 1063/1063 [11:05<00:00,  1.60it/s] 


In [5]:
# load embeddings
with open('embeddings/all_embeddings_paras.pkl', 'rb') as f:
    all_embeddings_paras = pickle.load(f)

# convert to numpy arrays
all_embeddings_paras = [np.array(doc) for doc in all_embeddings_paras]

# get FFTs
def get_fft(embedding):
    return np.abs(librosa.stft(embedding, n_fft=32, win_length=4))

# lowpass filter
def lowpass_filter(fft, cutoff=0.5):
    """
    Lowpass filter for FFTs
    """
    fft = fft.copy()
    fft[:, int(cutoff*fft.shape[1]):] = 0
    return fft

# convert back to embeddings
def fft_to_embedding(fft):
    return librosa.istft(fft, win_length=4)

# get FFTs
all_embeddings_paras_fft = [get_fft(embedding) for embedding in tqdm(all_embeddings_paras)]

# lowpass filter
all_embeddings_paras_fft_lowpass = [lowpass_filter(fft) for fft in tqdm(all_embeddings_paras_fft)]

# convert back to embeddings
all_embeddings_paras_lowpass = [fft_to_embedding(fft) for fft in tqdm(all_embeddings_paras_fft_lowpass)]

# average embeddings
train_embeddings_lowpass_avg = [np.mean(embeddings, axis=0) for embeddings in all_embeddings_paras_lowpass]


100%|██████████| 1063/1063 [00:03<00:00, 273.26it/s]
100%|██████████| 1063/1063 [00:04<00:00, 229.58it/s]


In [6]:
# train/test split

X_train, X_test, y_train, y_test = train_test_split(train_embeddings_lowpass_avg, all_labels, test_size=0.2, random_state=42)

# train classifier
clf2 = MLPClassifier(hidden_layer_sizes=(100, 100), max_iter=1000, alpha=1e-4,
                    solver='sgd', verbose=10, tol=1e-4, random_state=1,
                    learning_rate_init=.1)
clf2.fit(X_train, y_train)

# predict on test set
y_pred = clf2.predict(X_test)

# evaluate
print(accuracy_score(y_test, y_pred))

Iteration 1, loss = 2.77268064
Iteration 2, loss = 1.64942340
Iteration 3, loss = 1.51929278
Iteration 4, loss = 1.42771212
Iteration 5, loss = 1.39061975
Iteration 6, loss = 1.38293971
Iteration 7, loss = 1.38056410
Iteration 8, loss = 1.38080339
Iteration 9, loss = 1.37944224
Iteration 10, loss = 1.37860835
Iteration 11, loss = 1.37678588
Iteration 12, loss = 1.37513812
Iteration 13, loss = 1.37406435
Iteration 14, loss = 1.37327178
Iteration 15, loss = 1.37371383
Iteration 16, loss = 1.37490806
Iteration 17, loss = 1.37811615
Iteration 18, loss = 1.37555660
Iteration 19, loss = 1.37564448
Iteration 20, loss = 1.37436271
Iteration 21, loss = 1.37639790
Iteration 22, loss = 1.37664942
Iteration 23, loss = 1.37654497
Iteration 24, loss = 1.37377315
Iteration 25, loss = 1.37302011
Iteration 26, loss = 1.37718225
Iteration 27, loss = 1.37292278
Iteration 28, loss = 1.37401421
Iteration 29, loss = 1.37251498
Iteration 30, loss = 1.37645613
Iteration 31, loss = 1.37729861
Iteration 32, los