# Miscellaneous experiments
A place to try stuff out...

In [None]:
import sys

# Make modules under ../Src importable from this notebook.
sys.path.append('../Src')

In [None]:
import pandas as pd
import numpy as np

## Ensuring dataframes have identical gene sets

In [None]:
def read_pair_of_expressions_and_intersect(file_1, file_2):
    """Read two expression tables and restrict both to the genes they share.

    Each input is read as a tab-separated table whose first column holds the
    gene identifier and becomes the dataframe index.  The header of that
    column may be anything (it is not required to be 'GeneENSG'), and the two
    files need not use the same header.

    Args:
        file_1: Path or file-like object accepted by ``pd.read_csv``.
        file_2: Path or file-like object accepted by ``pd.read_csv``.

    Returns:
        A pair ``(df_1_pruned, df_2_pruned)`` holding only the rows whose
        gene identifier occurs in both inputs, each sorted by gene identifier
        so that the two indexes line up row for row.

    Raises:
        AssertionError: If the pruned indexes still differ, for example when
            a gene identifier is repeated a different number of times in the
            two inputs.
    """
    df_1 = pd.read_csv(file_1, sep='\t', index_col=0)
    df_2 = pd.read_csv(file_2, sep='\t', index_col=0)

    # Filter on index membership instead of merging on a hard-coded
    # 'GeneENSG' level name, so the index column header does not matter.
    df_1_pruned = df_1[df_1.index.isin(df_2.index)].sort_index()
    df_2_pruned = df_2[df_2.index.isin(df_1.index)].sort_index()

    # Sanity checks: both frames must now list exactly the same genes in the
    # same order.
    assert len(df_1_pruned) == len(df_2_pruned), \
        "pruned dataframes have different numbers of rows"
    assert all(df_1_pruned.index.values == df_2_pruned.index.values), \
        "pruned dataframes have different gene indexes"

    return df_1_pruned, df_2_pruned

In [None]:
# Paths of the two expression tables to compare.
expression_filename_1 = '../Data/AOCS_Protein/AOCS_Protein_PrunedExpression.tsv'
expression_filename_2 = '../Data/TCGA_OV_VST/TCGA_OV_VST_PrunedExpression.tsv'

# Load both tables, keeping only the genes that appear in both files.
df_1, df_2 = read_pair_of_expressions_and_intersect(expression_filename_1, expression_filename_2)

In [None]:
# Display the second pruned expression dataframe.
df_2

## Computing H from W and X

In [None]:
from sklearn.linear_model import LinearRegression
from factorizer_wrappers import ICA_Factorizer

In [None]:
# Random 50x10 test matrix: standard-normal samples shifted by 1.0.
X = np.random.randn(50, 10) + 1.0

In [None]:
# Build an ICA factorizer with 3 components (tol=0.01) and fit it to X.
facto = ICA_Factorizer(n_components=3,tol=0.01)
facto.fit(X)

In [None]:
# Fetch W from the fitted factorizer and show its shape.
W = facto.get_W()
W.shape

In [None]:
# Fetch H from the fitted factorizer and show its shape.
H = facto.get_H()
H.shape

In [None]:
# Shape of the pseudo-inverse of W.
np.linalg.pinv(W).shape

In [None]:
# Estimate H from W and X by multiplying X with the pseudo-inverse of W.
# NOTE(review): this should match facto.get_H() only if the factorizer models
# X as W.H without centring or other preprocessing -- confirm.
H_by_pinv = np.dot(np.linalg.pinv(W), X)

In [None]:
# Display the pseudo-inverse estimate of H.
H_by_pinv

In [None]:
# Display H as returned by the factorizer, for comparison with H_by_pinv.
H

## Reading and rotating metadata

In [None]:
# Read the tab-separated metadata table, indexed by its 'RNAID' column.
meta_df = pd.read_csv('../Data/TCGA_OV_VST/TCGA_OV_VST_Metadata.tsv', sep='\t', index_col='RNAID')

In [None]:
# Display the metadata dataframe.
meta_df