In [1]:
from random import shuffle

import numpy as np
import pandas as pd
from gensim.models.word2vec import Word2Vec
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor

# Relative directories used by the cells below; file names are appended to them
# by plain string concatenation.
path_to_data = "data//"  # prefix of the input files (location and spike data)
path_to_NLP = "NLP//"  # prefix of the output file (most-similar-neurons CSV)

ModuleNotFoundError: No module named 'gensim'

In [4]:
# Load data:

def read_data(fname):
    """Read a space-separated text file into a numeric DataFrame.

    Every line is stripped of trailing whitespace and split on single spaces.

    Parameters
    ----------
    fname : str
        Path of the UTF-8 encoded file to read.

    Returns
    -------
    pandas.DataFrame
        If the file has exactly two columns they are named 'x' and 'y' and cast
        to float32; otherwise the default integer column labels are kept and
        every column is cast to int64.
    """
    with open(fname, "r", encoding="UTF-8") as handle:
        rows = [raw.rstrip().split(' ') for raw in handle]

    frame = pd.DataFrame(data=rows)

    # Anything that is not a two-column file is treated as integer data.
    if len(frame.columns) != 2:
        return frame.astype('int64')

    frame.columns = ['x', 'y']
    return frame.astype('float32')


# Locations (parsed with read_data):
# fn = "R2192_1x200_at5_step200_bin100-RAW_pos.dat"
# locations = read_data("R2192_pos.txt")
locations = read_data(
    path_to_data + "R2192_1x200_at5_step200_bin100-RAW_pos.dat"
)

# Spikes:
fn = "R2192_20ms_63_neurons.csv"
# The file holds 63 rows and 54100 cols; it is transposed right after reading so
# that the 63 entries end up as the columns of df.
df = pd.read_csv(path_to_data + fn, encoding="UTF-8", header=None).T

## Converting spikes to sentences

In [7]:
# Letters: one string label ("0" .. "62") per column of df.
letters = list(map(str, range(63)))
df.columns = letters

# Largest value anywhere in df (kept as the last expression so it is displayed):
df.max().max()  # max spikes in 20ms = 4

4

In [8]:
# Convert spikes in each timestep into a word, do this for all timesteps:

N_SENTS = 5410  # number of sentences to build
WORDS_PER_SENT = 10  # consecutive timesteps (rows of df) grouped into one sentence
MAX_PAUSES = 2  # at most this many '_' tokens are emitted for a run of silent timesteps

# Fail with a clear message up front rather than with an IndexError inside the loop:
assert N_SENTS * WORDS_PER_SENT <= len(df), "df has too few rows for N_SENTS sentences"

sents = []

for i in range(N_SENTS):
    # Row range of this sentence, derived from the sentence index:
    start = i * WORDS_PER_SENT
    end = start + WORDS_PER_SENT
    sent_words = []
    prev_pauses = 0  # '_' tokens emitted since the last timestep with spikes (reset per sentence)

    # One sentence (10 words):
    for j in range(start, end):

        # all spikes in 20ms:
        row = df.iloc[j]

        # No spikes:
        if row.sum() == 0:
            # Long silences are compressed: only the first MAX_PAUSES silent timesteps add a token.
            if prev_pauses < MAX_PAUSES:
                prev_pauses += 1
                sent_words.append('_')

        # Spikes:
        else:
            # Each column label is repeated once per spike counted in this timestep.
            # Any positive count is handled, not only the values 1 to 4.
            word = [
                neuron
                for neuron, n_spikes in row.items()
                if n_spikes > 0
                for _ in range(int(n_spikes))
            ]
            # Randomise the token order within the timestep (no seed is set in this cell,
            # so the order differs between runs):
            shuffle(word)
            sent_words += word
            prev_pauses = 0

    sents.append(sent_words)

In [None]:
# # MAYBE: Replace two pauses ('_') with one, and one with no pause.

# sents2 = []
# for sent in sents:
#     pauses = 0
#     new_sent = []
#     for word in sent:
#         if word=="_":
#             if pauses==0:
#                 pauses+=1
#             elif pauses==1:
#                 new_sent.append(word)
#                 pauses = 0
#         else:
#             new_sent.append(word)
#             pauses = 0
#     sents2.append(new_sent)

# sents = sents2  # result was pretty much the same as with sents

# TODO: (MAYBE) Remove all pauses ('_')

# TODO: (MAYBE): Also use sentences beginning at the midpoint of a 200ms interval and ending at the midpoint
# of the next 200ms interval. (Would help to use the data in a more uniform way, not discriminating
# the last / first 20ms intervals in a sentence - the splits into sentences are actually artificial...).

## Word2vec

In [9]:
# Hyperparameters:
VECTOR_LEN = 250  # passed as vector_size
WINDOW_SIZE = 5  # 5
skipgram = 0  # passed as sg
EPOCHS = 20
# batch_size= 100 # (No. of words)

# Train / test:
train_sents, test_sents = sents[:4400], sents[4400:]
sent_counts = len(train_sents)

model = Word2Vec(
    min_count=1,
    vector_size=VECTOR_LEN,
    window=WINDOW_SIZE,
    max_vocab_size=None,
    max_final_vocab=None,
    sg=skipgram,
    compute_loss=True,
)  # workers = workers, batch_words=batch_size

# Build vocabulary (from the training sentences only):
model.build_vocab(train_sents)

# Optional argument that is currently not passed to train():
# callbacks=[my_callback(eval_fragments, train_fragments, model,
# EPOCHS, log_string, model_path, model_name, path_to_w2v, s)]
# train() is the last expression so that its return value is shown as the cell output.
model.train(
    corpus_iterable=train_sents,
    total_examples=sent_counts,
    epochs=EPOCHS,
    compute_loss=True,
)

(236758, 903320)

### Most similar neurons

In [10]:
# Compare most similar neurons: for every label in letters, collect the keys of its
# 5 most similar entries in the model (see png plots of spike locations to check if
# receptive fields are indeed similar!)
similar_neurons = [
    [name for name, _similarity in model.wv.most_similar([neuron], topn=5)]
    for neuron in letters
]

# Transposed so that each column holds the 5 results of one label:
similar_df = pd.DataFrame(data=similar_neurons).T

# Save to csv:
similar_df.to_csv(path_to_NLP + "most_similar.csv", encoding="UTF-8", index=False)

# --> Plot with plotting_neurons.ipynb

# ->Seems like neurons with similar receptive fields indeed have similar vectors.
# (Compared a few that have quite well defined receptive fields.)

### Make sentence vectors for 200ms

In [11]:
# Average the neuron vectors to get sentence vectors:
averaged_vecs = []  # both train and test
for sent in sents:
    # The model's vocabulary may not contain every token of every sentence (e.g. a token
    # that only occurs in the test part); such tokens are skipped instead of raising KeyError.
    vecs = [model.wv[code] for code in sent if code in model.wv]
    if vecs:
        # summed_vecs.append(np.sum(a, axis = 0))
        averaged_vecs.append(np.mean(np.array(vecs), axis=0))
    else:
        # No known token at all: fall back to a zero vector of the model's dimensionality
        # so that every sentence still yields a row of the same width.
        averaged_vecs.append(np.zeros(model.wv.vector_size, dtype=model.wv.vectors.dtype))

# Same split point as used for the sentences:
train_vecs = averaged_vecs[:4400]
test_vecs = averaged_vecs[4400:]

df_train = pd.DataFrame(data=train_vecs)
df_test = pd.DataFrame(data=test_vecs)

train_y = locations[:4400]
test_y = locations[4400:]

# Features and targets are paired row by row, so their lengths must agree:
assert len(df_train) == len(train_y), "train features and locations differ in length"
assert len(df_test) == len(test_y), "test features and locations differ in length"

## Regression models using word vectors

To predict the two output variables (the x and y coordinates of the location), we use multi-output regression.

From the scikit-learn documentation (https://docs.w3cub.com/scikit_learn/modules/multiclass): _"Multioutput regression support can be added to any regressor with MultiOutputRegressor. This strategy consists of fitting one regressor per target."_

### Linear regression

In [12]:
# Fit one linear regressor per target column of train_y:
reg_model = MultiOutputRegressor(LinearRegression()).fit(df_train, train_y)
# reg_model = MultiOutputRegressor(RandomForestRegressor(min_samples_leaf=3))

# Transposed so that preds[0] / preds[1] can be compared with the 'x' / 'y' columns:
preds = reg_model.predict(df_test).T

# Distance between predicted and actual location:
err_x = test_y['x'] - preds[0]
err_y = test_y['y'] - preds[1]
dists = np.sqrt(err_x ** 2 + err_y ** 2)
avg_dist = np.mean(dists)
print(avg_dist)

29.050526117748234


### Random Forest Regressor

In [13]:
# Fit one random forest per target column of train_y:
forest = RandomForestRegressor(min_samples_leaf=3)
reg_model = MultiOutputRegressor(forest).fit(df_train, train_y)

# Transposed so that preds[0] / preds[1] can be compared with the 'x' / 'y' columns:
preds = reg_model.predict(df_test).T

# Distance between predicted and actual location:
x_err = test_y['x'] - preds[0]
y_err = test_y['y'] - preds[1]
dists = np.sqrt(x_err ** 2 + y_err ** 2)
avg_dist = np.mean(dists)
print(avg_dist)

26.94782955428862
