# Create test files
Need:
- 10 articles with permission, 5 arbitrary features
- abstracts.csv
- feature_counts.txt
- neurosynth_dataset.pkl
    - This means creating reduced versions of **database.txt** and **features.txt**
- dataset_files/pmids.txt
- dataset_files/peak_indices.txt
- dataset_files/word_labels.txt
- dataset_files/word_indices.txt
- gclda_dataset.pkl
- gclda_model.pkl

Have: 
- continuous.nii.gz
- roi.nii.gz

In [1]:
import os
from os.path import join

import numpy as np
import pandas as pd
import neurosynth

from gclda.dataset import Dataset
from gclda.dataset import import_neurosynth
from gclda.tests.utils import get_test_data_path
from gclda.model import Model

In [2]:
# Constants
# Directory holding the full database.txt and features.txt input files.
# Set the NS_DATASET_DIR environment variable to point at your own copy;
# the fallback is the original hard-coded location, so existing setups
# keep working unchanged.
in_dir = os.environ.get('NS_DATASET_DIR', '/Users/tsalo/Desktop/ns-dataset/')
# Directory that receives every generated test file.
out_dir = get_test_data_path()

In [3]:
# Load the full tab-separated tables, indexed by their identifier columns.
db_file = join(in_dir, 'database.txt')
feat_file = join(in_dir, 'features.txt')
df_db = pd.read_csv(db_file, index_col='id', sep='\t')
df_feat = pd.read_csv(feat_file, index_col='pmid', sep='\t')

# Keep only the first ten distinct ids and five chosen feature columns.
features = ['addition', 'analyzed', 'anterior', 'blood', 'conditions']
pmids = df_db.index.unique()[:10]
df_db = df_db.loc[pmids]
df_feat = df_feat.loc[pmids, features]

# Write the reduced tables alongside the originals in in_dir.
reduced_db_file = join(in_dir, 'database_reduced.txt')
reduced_feat_file = join(in_dir, 'features_reduced.txt')
df_db.to_csv(reduced_db_file, sep='\t', index_label='id')
df_feat.to_csv(reduced_feat_file, sep='\t', index_label='pmid')

In [4]:
# Neurosynth dataset
# Build the dataset from the reduced database/features files written to
# in_dir, then save it into the test-data directory.
reduced_db_file = join(in_dir, 'database_reduced.txt')
reduced_feat_file = join(in_dir, 'features_reduced.txt')
dataset = neurosynth.Dataset(reduced_db_file, reduced_feat_file)
dataset.save(join(out_dir, 'neurosynth_dataset.pkl'))

In [5]:
# Counts file
# Binarize the feature table (any positive value becomes 1) and write it as a
# tab-separated counts file.
counts_file = join(out_dir, 'feature_counts.txt')
# Work on an explicit copy: `.values` may hand back a view of df_feat's own
# storage, so assigning into it would silently binarize df_feat as well (and
# fails outright when pandas returns a read-only array under Copy-on-Write).
dat = df_feat.values.copy()
dat[dat > 0] = 1
dat = dat.astype(int)
df_counts = pd.DataFrame(data=dat, index=df_feat.index,
                         columns=df_feat.columns)
df_counts.to_csv(counts_file, sep='\t', index_label='pmid')

In [6]:
# Abstracts file
# One synthetic abstract per row of df_counts: the word 'test' followed by the
# names of every column whose count is 1, separated by single spaces.
abstracts = [
    ('test ' + ' '.join(df_counts.columns[present])).strip()
    for present in (df_counts.values == 1)
]
df_abstracts = pd.DataFrame({'abstract': abstracts}, index=df_counts.index)
df_abstracts.to_csv(join(out_dir, 'abstracts.csv'), index_label='pmid')

In [7]:
# Reload the Neurosynth dataset saved earlier in this notebook and pass it to
# gclda's import_neurosynth together with the counts file.
ns_dset_file = join(out_dir, 'neurosynth_dataset.pkl')
# NOTE(review): temp_dir is not referenced by any cell visible here —
# confirm whether it is still needed.
temp_dir = join(out_dir, 'temp')

ns_dset = neurosynth.Dataset.load(ns_dset_file)
# Presumably writes the dataset_files/*.txt inputs listed in the notebook
# header under out_dir — TODO confirm against import_neurosynth.
import_neurosynth(ns_dset, 'dataset_files', out_dir=out_dir,
                  counts_file=counts_file)

In [8]:
# Construct the gclda Dataset from the 'dataset_files' label and out_dir, then
# save it as gclda_dataset.pkl in the test-data directory.
# NOTE: this rebinds `dataset`, which an earlier cell used for the
# neurosynth.Dataset object.
dataset = Dataset('dataset_files', out_dir)
dataset.save(join(out_dir, 'gclda_dataset.pkl'))

In [9]:
# Build a small model on the reduced dataset, run a fixed number of complete
# iterations, and save the result as gclda_model.pkl.
n_iterations = 25
model_params = dict(
    n_topics=2, n_regions=1, symmetric=False,
    alpha=.1, beta=.01, gamma=.01, delta=1.0,
    dobs=25, roi_size=50.0, seed_init=1,
)
model = Model(dataset, **model_params)
model.initialize()
for iteration in range(n_iterations):
    model.run_complete_iteration()
model.save(join(out_dir, 'gclda_model.pkl'))

Constructing GC-LDA Model
Initializing GC-LDA Model
Iter 0001: Sampling z
Iter 0001: Sampling y|r
Iter 0001: Updating spatial params
Iter 0001: Computing log-likelihood
Iter 0001 Log-likely: x =   -11999.3, w =      -27.4, tot =   -12026.7
Iter 0002: Sampling z
Iter 0002: Sampling y|r
Iter 0002: Updating spatial params
Iter 0002: Computing log-likelihood
Iter 0002 Log-likely: x =   -12019.5, w =      -28.8, tot =   -12048.3
Iter 0003: Sampling z
Iter 0003: Sampling y|r
Iter 0003: Updating spatial params
Iter 0003: Computing log-likelihood
Iter 0003 Log-likely: x =   -12064.6, w =      -28.8, tot =   -12093.4
Iter 0004: Sampling z
Iter 0004: Sampling y|r
Iter 0004: Updating spatial params
Iter 0004: Computing log-likelihood
Iter 0004 Log-likely: x =   -12152.1, w =      -34.1, tot =   -12186.3
Iter 0005: Sampling z
Iter 0005: Sampling y|r
Iter 0005: Updating spatial params
Iter 0005: Computing log-likelihood
Iter 0005 Log-likely: x =   -12349.0, w =      -39.5, tot =   -12388.5
Iter 000