In [1]:
%matplotlib inline
from typing import Tuple

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from source import load_avenio_files, categorical_columns_to_lower
from transform import combine_tsv_files, load_process_and_store_spreadsheets, merge_mutation_spreadsheet_t0_with_t1, clean_mutation_columns, clean_and_transpose_data_frame

# First make a spreadsheet with merged t0 and t1.

In [13]:
# Input files, judging by their extensions: an Excel variant list and an SPSS
# phenotype table.
spread_sheet_filename = "variant_list_20200406.xlsx"
spss_filename = "phenotypes_20200406.sav"

patient_mutations, phenotypes = load_avenio_files(spread_sheet_filename, spss_filename)

# Columns handed to the T0/T1 merge below.
columns = ["Allele Fraction", "No, Mutant Molecules per mL", "CNV Score"]

# Combine the T0 and T1 measurements in a single record and export the result.
spread_sheet = merge_mutation_spreadsheet_t0_with_t1(patient_mutations, columns)
spread_sheet.to_excel("output/all__t0_t1__separate.xlsx")

In [2]:
def harmonic_mean(t0, t1):
    """
    Harmonic mean of the paired measurements `t0` and `t1`.

    Computes 2 * t0 * t1 / (t0 + t1). The leading factor 2 is part of the
    definition of the harmonic mean of two values; without it the result is
    only half the harmonic mean (e.g. the "mean" of 1 and 1 would be 0.5).

    The operands only need to support `*`, `+` and `/`, so scalars and
    element-wise containers (e.g. numpy arrays) are both accepted.

    Note: the division is not guarded against t0 + t1 == 0.
    """
    return 2 * t0 * t1 / (t0 + t1)

# Run the spreadsheet pipeline with `harmonic_mean` as the T0/T1 transformation.
# The three prefixes name the outputs for the "all", "train" and "test" sets
# (presumably one file per column is written under each prefix -- confirm in
# the `transform` module).
load_process_and_store_spreadsheets(
    all_filename_prefix="output/all__harmonic_mean_",
    train_filename_prefix="output/train__harmonic_mean_",
    test_filename_prefix="output/test__harmonic_mean_",
    transformation=harmonic_mean,
)

In [3]:
def difference(t0, t1):
    """
    Change from `t0` to `t1`, i.e. t1 - t0.

    Positive when the value went up, negative when it went down.
    """
    change = t1 - t0
    return change

# Run the spreadsheet pipeline with `difference` as the T0/T1 transformation.
# The three prefixes name the outputs for the "all", "train" and "test" sets
# (presumably one file per column is written under each prefix -- confirm in
# the `transform` module).
load_process_and_store_spreadsheets(
    all_filename_prefix="output/all__difference_",
    train_filename_prefix="output/train__difference_",
    test_filename_prefix="output/test__difference_",
    transformation=difference,
)

In [4]:
def relative_difference(t0, t1):
    """
    Change from `t0` to `t1` expressed as a fraction of `t0`: (t1 - t0) / t0.

    Note: the division by `t0` is not guarded, so t0 == 0 is not handled here.
    """
    change = t1 - t0
    return change / t0

# Run the spreadsheet pipeline with `relative_difference` as the T0/T1
# transformation. The three prefixes name the outputs for the "all", "train"
# and "test" sets (presumably one file per column is written under each
# prefix -- confirm in the `transform` module).
load_process_and_store_spreadsheets(
    all_filename_prefix="output/all__relative_difference_",
    train_filename_prefix="output/train__relative_difference_",
    test_filename_prefix="output/test__relative_difference_",
    transformation=relative_difference,
)

In [5]:
def up_or_down(t0, t1):
    """
    Direction of the change from `t0` to `t1`.

    Returns the sign of t1 - t0 as computed by `np.sign`: 1 for an increase,
    -1 for a decrease and 0 when both values are equal.
    """
    change = t1 - t0
    return np.sign(change)


# Run the spreadsheet pipeline with `up_or_down` (sign of the change) as the
# T0/T1 transformation. The three prefixes name the outputs for the "all",
# "train" and "test" sets (presumably one file per column is written under
# each prefix -- confirm in the `transform` module).
load_process_and_store_spreadsheets(
    all_filename_prefix="output/all__sign_",
    train_filename_prefix="output/train__sign_",
    test_filename_prefix="output/test__sign_",
    transformation=up_or_down,
)

In [6]:
# Harmonic-mean variables: allele fraction combined with CNV score.
X_hm_af, y_hm_af = combine_tsv_files(
    "output/all__harmonic_mean__Allele Fraction.tsv",
    "output/all__harmonic_mean__CNV Score.tsv",
)

# Merge the two returned tables on their index and export the result to Excel.
pd.merge(X_hm_af, y_hm_af, left_index=True, right_index=True).to_excel(
    "output/all__harmonic_mean__af_cnv.xlsx"
)

In [8]:
# Harmonic-mean variables: mutant molecules per mL combined with CNV score.
X_hm_molecules, y_hm_molecules = combine_tsv_files(
    "output/all__harmonic_mean__No, Mutant Molecules per mL.tsv",
    "output/all__harmonic_mean__CNV Score.tsv",
)

# Merge the two returned tables on their index and export the result to Excel.
pd.merge(X_hm_molecules, y_hm_molecules, left_index=True, right_index=True).to_excel(
    "output/all__harmonic_mean__molecules_cnv.xlsx"
)

In [9]:
# Difference (t1 - t0) variables: mutant molecules per mL combined with CNV score.
# NOTE(review): the `hm` in the names below is a leftover from the harmonic-mean
# cell; this assignment overwrites X_hm_molecules / y_hm_molecules with the
# *difference* data. Names are kept as-is in case later cells rely on them.
X_hm_molecules, y_hm_molecules = combine_tsv_files(
    "output/all__difference__No, Mutant Molecules per mL.tsv",
    "output/all__difference__CNV Score.tsv",
)

# Merge the two returned tables on their index and export the result to Excel.
pd.merge(X_hm_molecules, y_hm_molecules, left_index=True, right_index=True).to_excel(
    "output/all__difference__molecules_cnv.xlsx"
)

In [10]:
# Relative-difference variables: mutant molecules per mL combined with CNV score.
# NOTE(review): the `hm` in the names below is a leftover from the harmonic-mean
# cell; this assignment overwrites X_hm_molecules / y_hm_molecules with the
# *relative difference* data. Names are kept as-is in case later cells rely on them.
X_hm_molecules, y_hm_molecules = combine_tsv_files(
    "output/all__relative_difference__No, Mutant Molecules per mL.tsv",
    "output/all__relative_difference__CNV Score.tsv",
)

# Merge the two returned tables on their index and export the result to Excel.
pd.merge(X_hm_molecules, y_hm_molecules, left_index=True, right_index=True).to_excel(
    "output/all__relative_difference__molecules_cnv.xlsx"
)