In [1]:
from random import shuffle

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor

from gensim.models.word2vec import Word2Vec  # version 4.0.1 surely works (the one preinstalled in colab might not)

# GENSIM API: https://radimrehurek.com/gensim/models/word2vec.html 

# Directory prefixes that file names are concatenated onto (relative to the notebook):
path_to_data = "data//"  # inputs: the position file and the spike-count CSV are read from here
path_to_NLP = "NLP//"  # outputs: the most-similar-neurons CSV is written here

In [3]:
# Load data:

def read_data(fname):
    """Read a whitespace-separated numeric text file into a DataFrame.

    Every non-blank line becomes one row and every whitespace-separated
    token one cell. Blank lines (for example a trailing newline at the end
    of the file) are skipped, and runs of spaces or tabs count as a single
    separator, so neither produces empty cells that would break the
    numeric conversion below.

    Parameters
    ----------
    fname : str or path-like
        Path of the UTF-8 encoded text file to read.

    Returns
    -------
    pandas.DataFrame
        If the file has exactly two columns they are named ``'x'`` and
        ``'y'`` and cast to ``float32``. Otherwise the default integer
        column labels are kept and every column is cast to ``int64``.

    Raises
    ------
    ValueError
        If a token cannot be converted to the target numeric dtype.
    """
    with open(fname, "r", encoding="UTF-8") as f:
        # Iterate the file lazily; `line.strip()` is falsy for blank lines.
        rows = [line.split() for line in f if line.strip()]

    table = pd.DataFrame(data=rows)
    if len(table.columns) == 2:
        table.columns = ['x', 'y']
        return table.astype('float32')
    return table.astype('int64')


# Positions: loaded through read_data (a two-column file comes back as float32 'x' / 'y').
locations = read_data(path_to_data + "R2192_1x200_at5_step200_bin100-RAW_pos.dat")

# Spike counts: the CSV holds 63 rows x 54100 columns, so it is transposed
# right after loading to get one row per 20ms bin and one column per neuron.
fn = "R2192_20ms_63_neurons.csv"
df = pd.read_csv(path_to_data + fn, encoding="UTF-8", header=None).T
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,53,54,55,56,57,58,59,60,61,62
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
locations.head()

Unnamed: 0,x,y
0,51.719799,50.123798
1,52.901798,49.8703
2,53.595901,47.411301
3,47.9832,41.520599
4,38.970699,34.953899


## Converting spikes to sentences

In [6]:
# Neurons: label every spike-count column with its neuron id as a string.
# These column labels are later used as the word tokens of the sentences.
# The count is taken from the data so the labels cannot fall out of sync with it.
neurons = [str(x) for x in range(df.shape[1])]
df.columns = neurons

# Largest spike count in any single 20ms bin, reduced over the whole table at once:
df.to_numpy().max()  # max spikes in 20ms = 4

4

In [8]:
# Convert spikes in each timestep into a word, do this for all timesteps:

ROWS_PER_SENTENCE = 10      # 10 consecutive 20ms rows form one 200ms sentence
MAX_CONSECUTIVE_PAUSES = 2  # longer runs of silent rows are capped at this many '_' tokens
SHUFFLE_SEED = 0            # fixed so that re-running the notebook rebuilds identical sentences


def spikes_to_sentences(spikes, rows_per_sentence=ROWS_PER_SENTENCE,
                        max_pauses=MAX_CONSECUTIVE_PAUSES, seed=SHUFFLE_SEED):
    """Turn a spike-count table into a list of token sentences.

    Each row of ``spikes`` is one time bin and each column one neuron; the
    column labels are used as tokens. Rows are grouped into consecutive,
    non-overlapping blocks of ``rows_per_sentence`` rows and every block
    becomes one sentence. A trailing partial block is dropped.

    Within a sentence:

    * a row without spikes adds the pause token ``'_'``, but only while
      fewer than ``max_pauses`` pauses have been emitted since the last
      spiking row (the counter restarts with every sentence);
    * a row with spikes adds the label of every spiking neuron, repeated
      once per spike, in shuffled order.

    Parameters
    ----------
    spikes : pandas.DataFrame
        Spike counts, one row per time bin and one column per neuron.
    rows_per_sentence : int
        Number of consecutive rows merged into one sentence.
    max_pauses : int
        Maximum number of consecutive ``'_'`` tokens.
    seed : int or None
        Seed of the NumPy generator used for shuffling; ``None`` gives a
        different order on every call.

    Returns
    -------
    list of list
        One token list per sentence.
    """
    rng = np.random.default_rng(seed)
    labels = spikes.columns.tolist()
    counts = spikes.to_numpy()  # read the table once instead of one .iloc per row
    n_sentences = len(counts) // rows_per_sentence

    sentences = []
    for k in range(n_sentences):
        first = k * rows_per_sentence
        sent_words = []
        prev_pauses = 0

        # One sentence, i.e. activity in rows_per_sentence * 20ms:
        for row in counts[first:first + rows_per_sentence]:

            # If there were no spikes in 20ms, mark pause with "_":
            if row.sum() == 0:
                if prev_pauses < max_pauses:
                    prev_pauses += 1
                    sent_words.append('_')
                continue

            prev_pauses = 0
            word = []
            for s in range(1, int(row.max()) + 1):
                # if same neuron spiked e.g. 3 times, we add its id 3 times.
                word += s * [labels[c] for c in np.flatnonzero(row == s)]
            rng.shuffle(word)  # shuffle ids of neurons that spiked in same 20ms interval
            sent_words += word

        sentences.append(sent_words)
    return sentences


sents = spikes_to_sentences(df)

In [None]:
# TODO: (MAYBE) Try replacing two pauses ('_') with one, and one with no pause.
# Or try removing all pauses ('_'), or putting all pauses back.
# Or try a different maximum number of consecutive pauses.

# TODO: (MAYBE): Use longer sentences for training word2vec (so that the moving window could cover
# all text equally). Or also use sentences beginning at the midpoint of one 200ms interval and ending at the
# midpoint of the next 200ms interval for training word2vec. (This would help to use the data in a more uniform
# way, not discriminating the last / first 20ms intervals in a sentence - the splits into sentences are actually
# artificial...). If we use longer sentences, or also sentences starting at the midpoint of a 200ms interval, for
# training word2vec, we should still use only the current version of sentences (i.e. exactly corresponding to each
# 200ms interval) for making sentence vectors (see below) to be used as input to a regression model.

In [14]:
len(sents)

5410

In [18]:
sents[1]

['_', '_', '51', '51', '_', '_', '55', '49', '_', '35', '4', '35', '56', '4']

## Word2vec

In [9]:
# Hyperparameters:
VECTOR_LEN = 250  # dimensionality of each token vector
WINDOW_SIZE = 5   # context window size, in tokens
skipgram = 0      # passed as `sg`: 0 selects CBOW, 1 skip-gram (per the gensim API)
EPOCHS = 20
# batch_size= 100 # (No. of words)

# Train / test: the first 4400 sentences train the embedding, the rest are held out.
train_sents, test_sents = sents[:4400], sents[4400:]
sent_counts = len(train_sents)

model = Word2Vec(
    min_count=1,
    vector_size=VECTOR_LEN,
    window=WINDOW_SIZE,
    max_vocab_size=None,
    max_final_vocab=None,
    sg=skipgram,
    compute_loss=True,
)  # batch_words=batch_size

# Build vocabulary:
model.build_vocab(train_sents)

# The return value of train() is left as the last expression so the cell displays it.
model.train(
    corpus_iterable=train_sents,
    total_examples=sent_counts,
    epochs=EPOCHS,
    compute_loss=True,
)

(236623, 903320)

### Most similar neurons

In [10]:
# For every neuron, collect the ids of its 5 nearest neighbours in embedding space
# (see png plots of spike locations to check if receptive fields are indeed similar!)
similar_neurons = [
    [other for other, _score in model.wv.most_similar([neuron], topn=5)]
    for neuron in neurons
]

# After transposing, each column holds the neighbours of one neuron:
similar_df = pd.DataFrame(data=similar_neurons).T

# Save to csv:
similar_df.to_csv(path_to_NLP + "most_similar.csv", encoding="UTF-8", index=False)

# --> Plot with plotting_neurons.ipynb

# ->Seems like neurons with similar receptive fields indeed have similar vectors.
# (Compared a few that have quite well defined receptive fields.)

### Make sentence vectors for 200ms

In [11]:
# Sentence vector = mean of the vectors of all tokens in that sentence.
# Computed for every sentence, both train and test.
# (Summing instead of averaging was considered as an alternative.)
averaged_vecs = [
    np.array([model.wv[code] for code in sent]).mean(axis=0)
    for sent in sents
]

# Features: same 4400 boundary as the word2vec train / test split.
train_vecs, test_vecs = averaged_vecs[:4400], averaged_vecs[4400:]
df_train = pd.DataFrame(data=train_vecs)
df_test = pd.DataFrame(data=test_vecs)

# Targets: the location rows with the same positions as the sentences.
train_y, test_y = locations[:4400], locations[4400:]

## Regression models using word vectors

To predict the two output variables (the x and y coordinates of the location), we need to use multi-output regression.

From sklearn ( https://docs.w3cub.com/scikit_learn/modules/multiclass): _"Multioutput regression support can be added to any regressor with MultiOutputRegressor. This strategy consists of fitting one regressor per target."_

### Linear regression

In [12]:
# Fit one linear regressor per target coordinate, then predict the held-out locations.
reg_model = MultiOutputRegressor(LinearRegression()).fit(df_train, train_y)
preds = reg_model.predict(df_test).T  # row 0 = predicted x, row 1 = predicted y

# Euclidean distance between predicted and actual location:
x_err = test_y['x'] - preds[0]
y_err = test_y['y'] - preds[1]
dists = np.sqrt(x_err ** 2 + y_err ** 2)
avg_dist = dists.mean()
print(avg_dist)

28.83855141344368


### Random Forest Regressor

In [13]:
# One random forest per target coordinate. The seed is fixed because the forest
# is randomised; without it the reported error changes on every run of the notebook.
reg_model = MultiOutputRegressor(RandomForestRegressor(min_samples_leaf=3, random_state=0))

reg_model = reg_model.fit(df_train, train_y)
preds = reg_model.predict(df_test)
preds = preds.T  # row 0 = predicted x, row 1 = predicted y

# Distance between predicted and actual location:
dists = np.sqrt((test_y['x'] - preds[0]) ** 2 + (test_y['y'] - preds[1]) ** 2)
avg_dist = np.mean(dists)
print(avg_dist)

26.71159505567115
