In [16]:
from __future__ import print_function
from time import time

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import fetch_20newsgroups
import pandas as pd
import numpy as np

from tensor_lda.tensor_lda import TensorLDA

import scipy
import scipy.sparse as sparse
from sklearn.preprocessing import normalize 
import pickle

In [21]:
# Read the tab-separated TPM table and transpose it, so that what the file
# stored in its first column becomes the first row of `df.values`.
file_path = "../DATA/GSE131928_RAW/GSM3828672_Smartseq2_GBM_IDHwt_processed_TPM.tsv"
df = pd.read_csv(file_path, sep='\t').T

# Row 0 of the transposed table is used as the gene names; every remaining
# row is treated as one expression profile.
gene_names = df.values[0]

# Scale each column by its max norm (sklearn `normalize`, axis=0), stretch the
# result to a 0-100 range and round to whole numbers.
exp_cell_normalized = np.round(
    100 * normalize(df.values[1:], axis=0, norm='max')
)

In [42]:
# Per-column (per-gene) variance of the normalized matrix.
var = exp_cell_normalized.var(axis=0)

# Gene-set sizes to export. `num_topics` is only defined here; this cell does
# not read it.
num_genes_list = [500, 1000, 1500]
num_topics = [3, 5, 7, 9, 11]

for n_genes in num_genes_list:
    # Indices of the n_genes highest-variance columns (argpartition does not
    # sort within the selected group).
    ind = np.argpartition(var, -n_genes)[-n_genes:]

    # Keep only those columns as a float CSR matrix, plus the matching names.
    tf = sparse.csr_matrix(exp_cell_normalized[:, ind].astype(float))
    tf_feature_names = gene_names[ind]

    # One pickle per gene-set size, written with the highest pickle protocol.
    with open("results/filtered_gene%d_exp.pkl" % n_genes, 'wb') as f:
        pickle.dump([tf, tf_feature_names], f, protocol=pickle.HIGHEST_PROTOCOL)


In [22]:
# Keep the 1500 genes with the highest variance across rows.
# (The count is bound once so the comment, the partition index and the slice
# cannot drift apart.)
n_top_genes = 1500
var = exp_cell_normalized.var(axis=0)
# argpartition places the n_top_genes largest-variance indices in the tail of
# the result; their relative order inside that tail is not sorted.
ind = np.argpartition(var, -n_top_genes)[-n_top_genes:]
tf = sparse.csr_matrix(exp_cell_normalized[:, ind].astype(float))
tf_feature_names = gene_names[ind]

In [38]:
# Persist the filtered matrix and its gene names as a two-element list, using
# the highest pickle protocol available.
with open("filtered_1500_gene_dataset.pkl", 'wb') as f:
    pickle.dump([tf, tf_feature_names], f, protocol=pickle.HIGHEST_PROTOCOL)


In [40]:
# Read the two-element [matrix, gene names] list back from disk.
# NOTE(review): unpickling can execute code embedded in the file; only load
# pickles produced by this project.
with open('filtered_1500_gene_dataset.pkl', 'rb') as f:
    tf, tf_feature_names = pickle.load(f)

<7930x1500 sparse matrix of type '<class 'numpy.float64'>'
	with 6920086 stored elements in Compressed Sparse Row format>

In [None]:
# NOTE(review): `exp_test` is not assigned anywhere in the visible part of this
# notebook, so this cell presumably raises NameError on a fresh kernel.
# TODO confirm whether it is a leftover that can be removed.
exp_test

In [None]:

# Problem dimensions are taken from the matrix itself (rows, columns); the
# number of components and the number of names listed per component are
# chosen by hand.
n_samples, n_features = tf.shape
n_components = 4
n_top_words = 20


In [None]:
# Instantiate the model with the component count chosen above; alpha0 is
# passed through as 0.1.
lda = TensorLDA(n_components=n_components, alpha0=0.1)

In [None]:
# Fit the model on whatever matrix is currently bound to `tf`.
lda.fit(tf)

In [None]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted feature names for every component of a model.

    One line is printed per row of ``model.components_``. It shows the row
    index, the matching entry of ``model.alpha_`` and the ``n_top_words``
    entries of ``feature_names`` with the largest weights in that row, in
    descending order of weight. A single blank line follows the last row.

    Parameters
    ----------
    model : object exposing ``components_`` (iterable of 1-D arrays) and
        ``alpha_`` (indexable by row number).
    feature_names : sequence of str, indexed by column position.
    n_top_words : int, how many names to list per row.
    """
    for topic_idx, weights in enumerate(model.components_):
        # Column indices ordered from largest to smallest weight, truncated.
        top_indices = weights.argsort()[::-1][:n_top_words]
        top_names = [feature_names[i] for i in top_indices]
        header = "Topic #%d (prior: %.3f): " % (topic_idx, model.alpha_[topic_idx])
        print(header + " ".join(top_names))
    print()


# Load the 20 newsgroups dataset and vectorize it. We use a few heuristics
# to filter out useless terms early on: the posts are stripped of headers,
# footers and quoted replies, and common English words, words occurring in
# fewer than 5 documents or in more than 80% of the documents are removed.
#
# NOTE(review): this part of the cell rebinds `tf`, `tf_feature_names` and
# `lda`, names that other cells of this notebook use for the gene-expression
# data. Re-run those cells before returning to the expression analysis.

print("Loading dataset...")
t0 = time()
dataset = fetch_20newsgroups(shuffle=True, random_state=2,
                             remove=('headers', 'footers', 'quotes'))
# Use as many posts as there are rows in the matrix `n_samples` was read from.
data_samples = dataset.data[:n_samples]
print("done in %0.3fs." % (time() - t0))

# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.8, min_df=5,
                                max_features=n_features,
                                stop_words='english')
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))
print()

print("Fitting TensorLDA models with tf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))

lda = TensorLDA(n_components=n_components, alpha0=.1)

t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in LDA model:")
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out` when the installed version provides it and
# fall back to the old accessor otherwise.
if hasattr(tf_vectorizer, "get_feature_names_out"):
    tf_feature_names = tf_vectorizer.get_feature_names_out()
else:
    tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

# Show the transformed output for the first document next to its raw text.
doc_topics = lda.transform(tf[0:2, :])
print(doc_topics[0, :])
print(data_samples[0])

In [5]:
# Restore the model object stored in lda_model.pkl.
# NOTE(review): unpickling can execute code embedded in the file; only load
# pickles produced by this project.
with open('lda_model.pkl', 'rb') as f:
    lda = pickle.load(f)


In [23]:
# For every row of `tf`, the column index of the largest value in
# `lda.transform(tf)`. (A stray closing parenthesis made this line a
# SyntaxError; it has been removed.)
lda.transform(tf).argmax(axis=1)

In [44]:
# Hyperparameter grid: number of retained genes x number of components.
num_genes_list = [500, 1000, 1500]
num_topics = [3, 5, 7, 9, 11]

# One argmax label array per (n_genes, n_components) pair, appended with the
# gene count as the outer loop and the component count as the inner loop.
predictions = []
for n_genes in num_genes_list:
    # Expression matrix and gene names saved for this gene count.
    with open("results/filtered_gene%d_exp.pkl" % n_genes, 'rb') as f:
        tf, tf_feature_names = pickle.load(f)

    for n_components in num_topics:
        # Model pickled for this (component count, gene count) combination.
        with open("results/lda_model_topic%d_gene%d.pkl" % (n_components, n_genes), 'rb') as f:
            lda = pickle.load(f)

        # Keep only the index of the strongest component for every row.
        predictions.append(lda.transform(tf).argmax(axis=1))


In [46]:
# Save the whole list of label arrays in one pickle, using the highest pickle
# protocol available.
with open("results/predictions.pkl", 'wb') as f:
    pickle.dump(predictions, f, protocol=pickle.HIGHEST_PROTOCOL)


In [49]:
# Index 14 is the last entry appended by the grid loop (3 gene counts x 5
# component counts = 15 entries), i.e. n_genes=1500 with n_components=11.
print(predictions[14])


[ 9 10 10 ... 10 10 10]
