In [41]:
# Re-import edited modules automatically before each cell runs, so changes to the
# local `ccp` package are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [42]:
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import Markdown, display
from tqdm import tqdm

# local imports
from ccp.cca1 import Biclustering
from ccp.cca2 import CCA
from ccp.data_factory import DataFactory


## Importing the data 
In this notebook, we will be exploring the [Gene Expression Data](https://arep.med.harvard.edu/biclustering/yeast.matrix) dataset, along with the implementation of the biclustering algorithm presented in the paper [Biclustering of Expression Data](https://arep.med.harvard.edu/biclustering/).
The notebook is divided into three sections:
#### Processing & visualizing the data
In this section, we will be processing and visualizing the data from the yeast matrix dataset associated with the work on which the biclustering model was trained.
#### Running the biclustering model
In this section, we will be running the biclustering model on the chosen data. The training will be performed using cross-validation in order to keep track of the generalization error.
#### Visualizing the results & comparison to the original biclusters
This section will be dedicated to visualizing the results of the biclustering model and comparing them to the biclusters obtained in the previous work in order to evaluate its accuracy.

# Exploring the data set - Yeast matrix

In [34]:
data_factory = DataFactory()

# Both paths are resolved against the directory the notebook was launched from.
_cwd = os.getcwd()
READ_FILE_PATH = f"{_cwd}/data/raw/yeast_expression.txt"
WRITE_FILE_PATH = f"{_cwd}/data/yeast_expression.csv"

# Number of condition columns (Cond1..CondN) used when the CSV is loaded;
# fixed value from https://arep.med.harvard.edu/biclustering/
FEATURE_SIZE = 17

# Preprocess the data
# NOTE(review): presumably reads the raw text file and writes a cleaned CSV to
# WRITE_FILE_PATH — confirm against DataFactory.clean_file.
data_factory.clean_file(READ_FILE_PATH, WRITE_FILE_PATH)

### Evaluating the results of the previous experiments

Once the biclusters have been generated, we will evaluate the number of `rows` and `columns` and the `msr` value of each of them.

####

This cell is for evaluating the time taken to generate `x` biclusters.

### Preprocessing data & comparing results 
In the following cells, we will be evaluating the results of the algorithm for exactly `100` biclusters.

In [43]:
# Load the cleaned CSV into a data frame, labelling the columns Cond1..Cond<FEATURE_SIZE>.
columns = np.array([f"Cond{idx}" for idx in range(1, FEATURE_SIZE + 1)])
df = pd.read_csv(WRITE_FILE_PATH, names=columns)

column_names = list(df.columns)  # column labels as a plain Python list
data = df.to_numpy()  # expression matrix as a NumPy array; this is what the models are run on

## checking for any null values
nan_counts = df.isna().sum()  # per-column count of missing entries
display(Markdown('### Checking for NAN values - missing values in the data set'))
display(Markdown(nan_counts.to_markdown()))

### Checking for NAN values - missing values in the data set

|        |   0 |
|:-------|----:|
| Cond1  |   0 |
| Cond2  |   0 |
| Cond3  |   0 |
| Cond4  |   0 |
| Cond5  |   0 |
| Cond6  |   0 |
| Cond7  |   0 |
| Cond8  |   0 |
| Cond9  |   0 |
| Cond10 |   0 |
| Cond11 |   0 |
| Cond12 |   0 |
| Cond13 |   0 |
| Cond14 |   0 |
| Cond15 |   0 |
| Cond16 |   0 |
| Cond17 |   0 |

In [44]:
# Report the (rows, columns) dimensions of the loaded expression matrix.
print("Shape of Yeast Expression matrix : {}".format(data.shape))

Shape of Yeast Expression matrix : (2884, 17)


In [40]:
# Bicluster counts to benchmark: each entry triggers one timed run per implementation.
experiements = [100]

# Hyper-parameters shared by both implementations.
# NOTE(review): these look like the yeast settings from Cheng & Church
# (threshold 300, alpha 1.2) — confirm how `ccp` interprets `sigma` and `alpha`.
SIGMA = 300
ALPHA = 1.2


def run_timed(model_cls, nb_biclusters, matrix, sigma=SIGMA, alpha=ALPHA):
    """Build one biclustering model, run it on ``matrix`` and time the whole thing.

    Parameters
    ----------
    model_cls : class
        Model class accepting ``sigma``, ``alpha`` and ``nb_biclusters`` keyword
        arguments and exposing ``run(matrix)`` (``Biclustering`` or ``CCA`` here).
    nb_biclusters : int
        Number of biclusters requested from the model.
    matrix : array-like
        Data passed unchanged to ``model.run``.
    sigma, alpha :
        Forwarded unchanged to the model constructor.

    Returns
    -------
    tuple
        ``(model, elapsed_seconds)``; the elapsed time covers construction and ``run``.
    """
    # perf_counter is monotonic, so the measurement cannot be skewed by wall-clock adjustments.
    start_time = time.perf_counter()
    model = model_cls(sigma=sigma, alpha=alpha, nb_biclusters=nb_biclusters)
    model.run(matrix)
    return model, time.perf_counter() - start_time


# `exp` / `exp2` keep the model from the last entry of `experiements`; later cells read
# their `.biclusters`. The timing is reported inside the loop so that every run gets its
# own line (tqdm.write keeps the progress bar intact).
for nb_biclusters in tqdm(experiements, desc="cca1"):
    exp, elapsed = run_timed(Biclustering, nb_biclusters, data)
    tqdm.write(f"Time taken to generate {nb_biclusters} biclusters with cca1 : {elapsed:.2f}s")

for nb_biclusters in tqdm(experiements, desc="cca2"):
    exp2, elapsed = run_timed(CCA, nb_biclusters, data)
    tqdm.write(f"Time taken to generate {nb_biclusters} biclusters with cca2 : {elapsed:.2f}s")

  0%|          | 0/1 [00:00<?, ?it/s]


ValueError: operands could not be broadcast together with shapes (995,6) (995,) 

In [26]:
# evaluating the results
# `os` and `DataFactory` come from the notebook's import cell; they are not re-imported here.
EXP_INDEX = 14
RESULTS_DIR = f'{os.getcwd()}/experiments/yeast-data'

# (sub-directory, fitted model, output file name) for each implementation.
runs = [
    ("cca1", exp, f"exp-after-debug{EXP_INDEX}.csv"),
    ("cca2", exp2, f"exp-{EXP_INDEX}.csv"),
]

df_fact = DataFactory()
for impl_dir, model, file_name in runs:
    out_dir = f"{RESULTS_DIR}/{impl_dir}"
    # Create the target folder up front so the export does not depend on it already existing.
    os.makedirs(out_dir, exist_ok=True)
    # NOTE(review): presumably writes one summary row per bicluster — confirm against
    # DataFactory.write_into_csv.
    df_fact.write_into_csv(EXP_INDEX, model.biclusters, data.shape, f"{out_dir}/{file_name}")



In [16]:
# Load the benchmark biclusters and the ones produced by this project so they can be compared.
# The benchmark file is read with explicit column names and ordered by `columns`, largest first.
benchmark_raw = pd.read_csv(
    './data/validation/benchmark_100_biclusters.csv',
    names=['rows', 'columns', 'msr', 'total_rows', 'total_columns'],
)
benchmark_yeast_bic = benchmark_raw.sort_values(by="columns", ascending=False)

# The project's own results are read with default column handling and ordered by `rows`,
# largest first.
ccplus_raw = pd.read_csv('./experiments/exp-1.csv')
ccplus_yeast_bic = ccplus_raw.sort_values(by='rows', ascending=False)

FileNotFoundError: [Errno 2] No such file or directory: './data/validation/benchmark_100_biclusters.csv'

In [8]:
# Show only the bicluster dimensions of the benchmark table.
mask = ['rows', 'columns']
benchmark_yeast_bic.loc[:, mask]

Unnamed: 0,rows,columns
0,689,17
62,4,17
36,4,17
40,8,17
43,6,17
...,...,...
79,66,6
70,167,6
97,413,6
86,28,5


In [9]:
# Show the same two columns (`mask`) for a second result table.
# NOTE(review): `bechmark_yeast_bic2` is not assigned in any cell visible here, and the
# name looks like a misspelling of `benchmark_...`; confirm where it is defined, otherwise
# this cell raises NameError on a fresh kernel.
bechmark_yeast_bic2[mask]

Unnamed: 0,rows,columns
0,694,17
44,2,17
25,3,17
28,4,17
34,3,17
...,...,...
64,67,3
77,146,3
81,52,3
99,641,3
