In [None]:
!pip install nltk openai tqdm pandas numpy matplotlib librosa soundfile plotly

In [1]:
import os
import pickle

import librosa
import nltk
import numpy as np
import openai
from nltk.corpus import reuters
from openai.embeddings_utils import get_embeddings
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from tqdm import tqdm

# Read the OpenAI API key from a local text file.
# The contents are stripped because a key file normally ends with a newline,
# which would otherwise become part of the key string.
# NOTE(review): absolute, user-specific path -- consider an environment variable.
with open('/users/jasper/oai.txt', 'r') as f:
    openai.api_key = f.read().strip()

# Uncomment on first use to fetch the Reuters corpus:
# nltk.download('reuters')

# file ids of every Reuters document in each of the two categories
trade_docs = reuters.fileids(categories='trade')
crude_docs = reuters.fileids(categories='crude')

# Raw text and label per document; both lists follow the order trade + crude.
# NOTE(review): a file id listed under both categories would appear twice, once
# with each label -- confirm whether such documents exist and should be kept.
all_docs = [reuters.raw(doc_id) for doc_id in trade_docs + crude_docs]
all_labels = ['trade' for _ in trade_docs] + ['crude' for _ in crude_docs]

# shuffle docs and labels together (fixed seed so the order is repeatable);
# after unzipping, all_docs and all_labels are tuples
np.random.seed(42)
combined = list(zip(all_docs, all_labels))
np.random.shuffle(combined)
all_docs, all_labels = zip(*combined)

In [9]:
# get embeddings for train/test docs
print('Getting embeddings...')

# Make sure the cache directory exists *before* requesting embeddings, so a
# missing folder cannot cause the results to be lost when they are written.
os.makedirs('embeddings', exist_ok=True)

embeddings_engine = "text-embedding-ada-002"
# one get_embeddings call covering every document
all_embeddings = get_embeddings(all_docs, engine=embeddings_engine)

# pickle embeddings so they can be reloaded without repeating the request
with open('embeddings/all_embeddings.pkl', 'wb') as f:
    pickle.dump(all_embeddings, f)

Getting embeddings...


In [10]:
# Reload the per-document embeddings from the pickle cache.
# The file is produced by this notebook; do not unpickle files from untrusted sources.
with open('embeddings/all_embeddings.pkl', 'rb') as cache_file:
    all_embeddings = pickle.load(cache_file)


In [11]:
# vanilla classification: one embedding per document, fed to an MLP

# hold out 20% of the documents for evaluation (fixed seed -> repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    all_embeddings,
    all_labels,
    test_size=0.2,
    random_state=42,
)

# two hidden layers of 100 units each, trained with plain SGD;
# fit() returns the estimator itself, so clf is the fitted classifier
clf = MLPClassifier(
    hidden_layer_sizes=(100, 100),
    solver='sgd',
    learning_rate_init=.1,
    alpha=1e-4,
    tol=1e-4,
    max_iter=1000,
    random_state=1,
    verbose=10,
).fit(X_train, y_train)

# predictions for the held-out documents
y_pred = clf.predict(X_test)

# fraction of held-out documents labelled correctly
print(accuracy_score(y_test, y_pred))

Iteration 1, loss = 0.70101367
Iteration 2, loss = 0.68367065
Iteration 3, loss = 0.68158052
Iteration 4, loss = 0.66672913
Iteration 5, loss = 0.64261986
Iteration 6, loss = 0.59763393
Iteration 7, loss = 0.50344354
Iteration 8, loss = 0.34982035
Iteration 9, loss = 0.18737424
Iteration 10, loss = 0.09429573
Iteration 11, loss = 0.06038294
Iteration 12, loss = 0.04866964
Iteration 13, loss = 0.04410997
Iteration 14, loss = 0.04180204
Iteration 15, loss = 0.04022787
Iteration 16, loss = 0.03893421
Iteration 17, loss = 0.03733961
Iteration 18, loss = 0.03657173
Iteration 19, loss = 0.03535737
Iteration 20, loss = 0.03205988
Iteration 21, loss = 0.03007697
Iteration 22, loss = 0.03237719
Iteration 23, loss = 0.03248144
Iteration 24, loss = 0.03481693
Iteration 25, loss = 0.02949732
Iteration 26, loss = 0.02617889
Iteration 27, loss = 0.02856872
Iteration 28, loss = 0.02560701
Iteration 29, loss = 0.02881651
Iteration 30, loss = 0.02308987
Iteration 31, loss = 0.02350713
Iteration 32, los

In [3]:
# fft classification with sliding windows

from split_utils import split_text

# Split every document with split_text(segment_length=40) and drop empty pieces.
# A single truthiness test is enough: it already rejects '' along with any other
# falsy entry, so no separate "!= ''" pass is required.
all_docs_paras = [
    [piece for piece in split_text(doc, segment_length=40) if piece]
    for doc in all_docs
]

In [4]:
# get embeddings for each paragraph
print('Getting embeddings...')

# Make sure the cache directory exists *before* the embedding loop runs, so a
# missing folder cannot cause the results to be lost when they are written.
os.makedirs('embeddings', exist_ok=True)

embeddings_engine = "text-embedding-ada-002"
# one get_embeddings call per document; each call embeds that document's paragraphs
train_embeddings_paras = [get_embeddings(paras, engine=embeddings_engine) for paras in tqdm(all_docs_paras)]

# pickle embeddings so they can be reloaded without repeating the requests
with open('embeddings/all_embeddings_paras.pkl', 'wb') as f:
    pickle.dump(train_embeddings_paras, f)

Getting embeddings...


100%|██████████| 1063/1063 [02:51<00:00,  6.19it/s]


In [10]:
# Reload the per-paragraph embeddings from the pickle cache.
# The file is produced by this notebook; do not unpickle files from untrusted sources.
with open('embeddings/all_embeddings_paras.pkl', 'rb') as cache_file:
    all_embeddings_paras = pickle.load(cache_file)

# turn each document's collection of paragraph embeddings into one numpy array
all_embeddings_paras = [np.array(doc_embeddings) for doc_embeddings in all_embeddings_paras]

# get FFTs
def get_fft(embedding):
    """
    Short-time Fourier transform of an embedding array.

    Thin wrapper around ``librosa.stft`` called with ``n_fft=32`` and
    ``win_length=4``; every other STFT setting is left at librosa's default.

    NOTE(review): librosa documents the STFT as running along the last axis of
    its input, so for a (paragraphs, embedding_dim) array this presumably
    transforms along the embedding dimension, not across paragraphs -- confirm
    that this is the intended axis.
    """
    spectrum = librosa.stft(embedding, n_fft=32, win_length=4)
    return spectrum

# lowpass filter
def lowpass_filter(fft, cutoff=0.5):
    """
    Lowpass filter for FFTs

    Zeroes the upper frequency bins of a spectrum laid out as
    ``(..., n_freq_bins, n_frames)``, i.e. with frequency on the second-to-last
    axis. Bins with index >= ``int(cutoff * n_freq_bins)`` are set to 0.

    The frequency axis is addressed relative to the end of the shape so that
    both a single 2-D spectrum and a stacked 3-D one are filtered along
    frequency. For 3-D input this is the same axis as indexing position 1.

    Parameters
    ----------
    fft : np.ndarray
        Spectrum with at least two dimensions. It is not modified.
    cutoff : float
        Fraction of the frequency bins to keep, counted from the lowest bin.

    Returns
    -------
    np.ndarray
        A filtered copy with the same shape and dtype as ``fft``.
    """
    filtered = fft.copy()
    first_removed_bin = int(cutoff * filtered.shape[-2])
    filtered[..., first_removed_bin:, :] = 0
    return filtered

# convert back to embeddings
def fft_to_embedding(fft):
    """
    Inverse short-time Fourier transform: map a spectrum back to an embedding array.

    Thin wrapper around ``librosa.istft`` called with ``win_length=4`` (the same
    window length ``get_fft`` uses); all other settings are librosa's defaults.
    """
    reconstructed = librosa.istft(fft, win_length=4)
    return reconstructed

# switch: set to False to skip filtering and only check the transform round trip
apply_lowpass = True

# forward transform, one spectrum per document
all_embeddings_paras_fft = [get_fft(doc_embeddings) for doc_embeddings in tqdm(all_embeddings_paras)]

# optional filtering step (lowpass_filter with its default cutoff)
if apply_lowpass:
    all_embeddings_paras_fft = [lowpass_filter(spectrum) for spectrum in tqdm(all_embeddings_paras_fft)]

# inverse transform, back from spectra to embedding arrays
all_embeddings_paras_lowpass = [fft_to_embedding(spectrum) for spectrum in tqdm(all_embeddings_paras_fft)]

# sanity check: with filtering disabled, the forward + inverse transform of the
# first document must reproduce its original embeddings
if not apply_lowpass:
    assert np.allclose(all_embeddings_paras_lowpass[0], all_embeddings_paras[0])

# collapse each document to a single vector by averaging over axis 0
train_embeddings_lowpass_avg = [np.mean(doc_embeddings, axis=0) for doc_embeddings in all_embeddings_paras_lowpass]


100%|██████████| 1063/1063 [00:03<00:00, 294.35it/s]
100%|██████████| 1063/1063 [00:21<00:00, 49.71it/s]
100%|██████████| 1063/1063 [00:08<00:00, 123.04it/s]


In [11]:
# classification on the averaged sliding-window embeddings

# hold out 20% of the documents for evaluation (fixed seed -> repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    train_embeddings_lowpass_avg,
    all_labels,
    test_size=0.2,
    random_state=42,
)

# two hidden layers of 100 units each, trained with plain SGD;
# fit() returns the estimator itself, so clf2 is the fitted classifier
clf2 = MLPClassifier(
    hidden_layer_sizes=(100, 100),
    solver='sgd',
    learning_rate_init=.1,
    alpha=1e-4,
    tol=1e-4,
    max_iter=1000,
    random_state=1,
    verbose=10,
).fit(X_train, y_train)

# predictions for the held-out documents
y_pred = clf2.predict(X_test)

# fraction of held-out documents labelled correctly
print(accuracy_score(y_test, y_pred))

Iteration 1, loss = 0.70160264
Iteration 2, loss = 0.68771710
Iteration 3, loss = 0.69109112
Iteration 4, loss = 0.68691152
Iteration 5, loss = 0.68390527
Iteration 6, loss = 0.68228892
Iteration 7, loss = 0.67890381
Iteration 8, loss = 0.67186771
Iteration 9, loss = 0.66549705
Iteration 10, loss = 0.65580546
Iteration 11, loss = 0.63752351
Iteration 12, loss = 0.61125691
Iteration 13, loss = 0.57271323
Iteration 14, loss = 0.51937137
Iteration 15, loss = 0.42569202
Iteration 16, loss = 0.33487200
Iteration 17, loss = 0.58026144
Iteration 18, loss = 0.59603981
Iteration 19, loss = 0.15659298
Iteration 20, loss = 0.12407522
Iteration 21, loss = 0.09675120
Iteration 22, loss = 0.08256976
Iteration 23, loss = 0.08335760
Iteration 24, loss = 0.09011859
Iteration 25, loss = 0.07379848
Iteration 26, loss = 0.05833198
Iteration 27, loss = 0.05842540
Iteration 28, loss = 0.05186314
Iteration 29, loss = 0.05113259
Iteration 30, loss = 0.05009799
Iteration 31, loss = 0.04764648
Iteration 32, los

# results

- Whole-document embeddings (one embedding per document): 97.1% accuracy
- Sliding-window embeddings, averaged per document, without lowpass filter: 96% accuracy
- Sliding-window embeddings, averaged per document, with lowpass filter: 97.6% accuracy