# MicroArray Data Analysis

```python
# If you have any questions, please run the following command to send me a message :)
from teilab.question import ask
ask(text="I have a question about...")
```

In [1]:
import numpy as np
import pandas as pd

# Allow up to 200 columns to be shown when a wide DataFrame is displayed.
pd.options.display.max_columns = 200

### 1. Prepare Required Data

Some of the data are **unpublished**, so please handle them carefully.

In [2]:
from teilab.datasets import TeiLabDataSets

In [3]:
# Create the dataset accessor that the following cells use to fetch, list and read samples.
datasets = TeiLabDataSets(verbose=False)

In [5]:
# Ask for each dataset password at run time instead of writing it into the
# notebook, so the secret is never saved with the file or shown in an output.
from getpass import getpass

data_dir1 = datasets.get_data(password=getpass("Password for the first dataset: "))
data_dir2 = datasets.get_data(password=getpass("Password for the second dataset: "))

In [6]:
# Print the sample table (idx, gn, GroupName, FileName) for the fetched data.
datasets.samples.show_groups()

  idx    gn  GroupName                              FileName
-----  ----  -------------------------------------  ---------------------------------------------------
    0     0  SG19378659_257236339458_S001_GE1_1200  SG19378659_257236339458_S001_GE1_1200_Jun14_1_1.txt
    1     0  SG19378659_257236339458_S001_GE1_1200  SG19378659_257236339458_S001_GE1_1200_Jun14_1_2.txt
    2     0  SG19378659_257236339458_S001_GE1_1200  SG19378659_257236339458_S001_GE1_1200_Jun14_1_3.txt
    3     0  SG19378659_257236339458_S001_GE1_1200  SG19378659_257236339458_S001_GE1_1200_Jun14_1_4.txt
    4     0  SG19378659_257236339458_S001_GE1_1200  SG19378659_257236339458_S001_GE1_1200_Jun14_2_1.txt
    5     1  US91503671_253949442637_S01_GE1_105    US91503671_253949442637_S01_GE1_105_Dec08_1_1.txt
    6     1  US91503671_253949442637_S01_GE1_105    US91503671_253949442637_S01_GE1_105_Dec08_2_3.txt
    7     1  US91503671_253949442637_S01_GE1_105    US91503671_253949442637_S01_GE1_105_Dec08_2_4.txt
    8    

### 2. Read Data & Merge

In [18]:
# Print the sample table again; the captured output of this run also lists each sample's Condition.
# NOTE(review): this cell's execution count (18) is higher than the cells that follow it —
# confirm the output still matches after Restart Kernel -> Run All.
datasets.samples.show_groups()

  idx  Condition              gn  GroupName                              FileName
-----  -------------------  ----  -------------------------------------  ---------------------------------------------------
    0  mock                    0  SG19378659_257236339458_S001_GE1_1200  SG19378659_257236339458_S001_GE1_1200_Jun14_1_1.txt
    1  siVIM-270               0  SG19378659_257236339458_S001_GE1_1200  SG19378659_257236339458_S001_GE1_1200_Jun14_1_2.txt
    2  siVIM-270(2'OMe2-5)     0  SG19378659_257236339458_S001_GE1_1200  SG19378659_257236339458_S001_GE1_1200_Jun14_1_3.txt
    3  siVIM-270(2'OMe6-8)     0  SG19378659_257236339458_S001_GE1_1200  SG19378659_257236339458_S001_GE1_1200_Jun14_1_4.txt
    4  siVIM-270(2'OMe2-8)     0  SG19378659_257236339458_S001_GE1_1200  SG19378659_257236339458_S001_GE1_1200_Jun14_2_1.txt
    5  mock(1)                 1  US91503671_253949442637_S01_GE1_105    US91503671_253949442637_S01_GE1_105_Dec08_1_1.txt
    6  siVIM-270(LNA3-5)       1  US91503671_

In [7]:
# Sample indices that belong to group number 1 (the `gn` column of the sample table).
sample_numbers = datasets.samples.get_group_numbers(group_no=1)
print(sample_numbers)

[5, 6, 7, 8, 9, 10, 11, 12]


In [8]:
# Annotation columns are read once, from the first sample of the group.
df_anno = datasets.read_data(no=sample_numbers[0], usecols=datasets.ANNO_COLNAMES)
reliable_index = set(df_anno.index)
print(f"[Before] The number of data: {len(df_anno)}")

# Collect one signal column per sample (renamed to the sample's condition) and
# concatenate them in a single call after the loop, instead of re-copying the
# growing frame on every iteration.
signal_columns = []
for no in sample_numbers:
    df_data = datasets.read_data(no=no)
    # Keep only the rows that pass the reliability filter in every sample.
    reliable_index &= set(datasets.reliable_filter(df=df_data))
    signal_columns.append(
        df_data[[datasets.TARGET_COLNAME]].rename(
            columns={datasets.TARGET_COLNAME: datasets.samples.Condition[no]}
        )
    )

df_combined = pd.concat([df_anno, *signal_columns], axis=1)

# A set is not a valid `.loc` indexer in pandas >= 2.0 and has no defined
# order; a boolean mask selects the same rows and keeps the original row order.
is_reliable = df_combined.index.isin(reliable_index)
df_combined = df_combined.loc[is_reliable, :].reset_index(drop=True)
print(f"[After] The number of data: {len(df_combined)}")

[Before] The number of data: 62976
[After] The number of data: 20947


### 3. Normalization

In [9]:
# Condition names of the selected samples, listed with their position.
target_colnames = datasets.samples.Condition[sample_numbers]
for position, condition in enumerate(target_colnames):
    print(position, condition)

0 mock(1)
1 siVIM-270(LNA3-5)
2 siVIM-270(LNA2-8)
3 mock(2)
4 siVIM-270
5 siVIM-270(2'OMe4-6)
6 siVIM-270(2'OMe3-7)
7 siVIM-270(2'OMe2-8)


In [10]:
# Transpose the selected condition columns so that each row of `data` is one sample.
data = np.transpose(df_combined[target_colnames].values)
SystematicNames = df_combined["SystematicName"].values
n_samples, n_features = data.shape
shape_report = [
    f"Number of Samples : {n_samples}",
    f"Number of Features: {n_features}",
]
print("\n".join(shape_report))

Number of Samples : 8
Number of Features: 20947


In [19]:
# Normalization
# These two matrices are required by the summarization cell further down, so
# this cell must actually run (it was previously commented out).
# NOTE(review): assumes `percentile` and `quantile` live in `teilab.normalizations`,
# next to `median_polish` — confirm against the installed package.
from teilab.normalizations import percentile, quantile

data_75percentiled = percentile(data=data, percent=75)
data_quantiled = quantile(data=data)

### 4. Summarization

In [None]:
from teilab.normalizations import median_polish

In [None]:
# Summarization (Median Polish)
# Run the same median-polish call on the raw matrix and on both normalized
# matrices, each time passing SystematicNames as the labels.
(
    data_median_polished,
    data_75percentiled_median_polished,
    data_quantiled_median_polished,
) = [
    median_polish(data=matrix, labels=SystematicNames)
    for matrix in (data, data_75percentiled, data_quantiled)
]

```python
# Use 'groupby' method for DataFrame.
g = df_combined[target_colnames.tolist() + ["SystematicName"]].groupby("SystematicName")
g.mean()
g.apply(median_polish_group_wise)
```

In [None]:
# Summarized matrices and their display names, paired position by position.
# The entries use the `*_median_polished` names that the summarization cell
# defines; the earlier `*_sumarized` names were never assigned anywhere.
all_data = [
    data_median_polished,
    data_75percentiled_median_polished,
    data_quantiled_median_polished,
]
data_descriptions = [
    "Raw + Summarization",
    "75%tile + Summarization",
    "quantile + Summarization",
]

### 5. Analysis

In [None]:
from teilab.utils import subplots_create
from teilab.plot.plotly import density_plot

#### Density Plot

In [None]:
# Density plot of the log2-transformed `data` matrix, labelled with the condition names.
density_plot(np.log2(data), labels=target_colnames)

#### XY plot

In [None]:
# NOTE(review): this cell sits under the "XY plot" heading but repeats the
# density plot call from the previous section unchanged — TODO replace it with
# the intended XY plot.
density_plot(np.log2(data), labels=target_colnames)

#### MA plot