In [1]:
import pandas as pd
from os import chdir
import re
import sys

In [2]:
# Work from the inputs folder so the relative file names used by the
# following cells (input CSV, '../outputs/...') resolve from there.
directory = r'../inputs'
chdir(directory)

In [3]:
# Load the full table; the first CSV column becomes the DataFrame index.
full_table_df = pd.read_csv(
    r'full_table.csv',
    index_col=0,
)

In [4]:
# Keep only the first row for every ORF value.
full_table_df = full_table_df.drop_duplicates(subset='ORF')

In [5]:
### columns that are neither numeric features nor locations
ignore_columns = ['Description', 'gene_sequence', 'protein_sequence', 'Gene Name', 'ORF']

### fluorescence read-outs (intensity / localization pairs); excluded from the main table
flourescence_columns = [
    "N' TEF2pr-mCherry in SD Intensity",
    "N' TEF2pr-mCherry in SD Localization",
    "N' TEF2pr-VC and Cyto-VN in SD Intensity",
    "N' TEF2pr-VC and Cyto-VN in SD Localization",
    "N' NOP1pr-GFP in SD Intensity",
    "N' NOP1pr-GFP in SD Localization",
    "N' NATIVEpr-GFP in SD Intensity",
    "N' NATIVEpr-GFP in SD Localization",
]

# A column is kept unless it is listed above or starts with one of these prefixes.
excluded_columns = set(ignore_columns).union(flourescence_columns)
excluded_prefixes = ('Significance of', 'Unnamed')
relevant_columns = [
    col
    for col in full_table_df.columns
    if col not in excluded_columns and not col.startswith(excluded_prefixes)
]

# Independent copy, so later edits do not touch full_table_df.
table_to_normalize = full_table_df[relevant_columns].copy()


In [6]:
### Cells holding more than one location are written with inconsistent separators.
### This function rewrites any such value into one canonical form.

def take_care_of_duo_location(x):
    """Return the location tokens of ``x`` sorted and joined with commas.

    ``x`` is converted with ``str`` first, then split on every character that
    is not an ASCII letter; empty pieces are dropped. The remaining tokens are
    sorted with the default string ordering (uppercase before lowercase) and
    joined with ','.

    Because of the ``str`` conversion, a missing value (float NaN) comes out
    as the token 'nan', and a value without any letters comes out as ''.
    """
    tokens = filter(None, re.split('[^a-zA-Z]', str(x)))  # drop the empty pieces
    return ','.join(sorted(tokens))

In [7]:
### the other localization columns are compared against this one, so bring it to the standard format
control_location_raw = table_to_normalize['control_location']
table_to_normalize['control_location'] = control_location_raw.apply(take_care_of_duo_location)


In [8]:
### these columns will be referenced continually
localization_cols = [col for col in relevant_columns if col.split(' ')[-1]=='Localization']

# Every localization column is replaced by
#   * a float flag under the ORIGINAL column name: 1.0 where the standardized
#     location differs from 'control_location', 0.0 otherwise, and
#   * one float one-hot column per standardized location value ('<col>_<value>').
# The pieces are collected first and concatenated once, instead of rebuilding
# the whole frame inside the loop and going through a scratch 'temp' column.
# The resulting column order is unchanged: all other columns in their original
# order, then for each localization column its flag followed by its one-hot
# columns.
control_location = table_to_normalize['control_location']
column_blocks = [table_to_normalize.drop(columns=localization_cols)]

for col in localization_cols:
    # standardize column
    standardized = table_to_normalize[col].apply(take_care_of_duo_location)
    # boolean 'has changed' flag, stored under the original column name
    changed = (standardized != control_location).astype(float).rename(col)
    # one hot encoding of the standardized locations
    one_hot = pd.get_dummies(standardized, prefix = col).astype(float)
    column_blocks.extend([changed, one_hot])

table_to_normalize = pd.concat(column_blocks, axis = 1)

In [9]:
# 'control_location' is not among localization_cols, so it is one-hot encoded
# here; the original text column is replaced by its float indicator columns.
control_one_hot = pd.get_dummies(
    table_to_normalize['control_location'],
    prefix = 'control_location',
).astype(float)
table_to_normalize = pd.concat(
    [table_to_normalize.drop(columns='control_location'), control_one_hot],
    axis = 1,
)



In [10]:
# Independent snapshot of the un-normalized table. A plain assignment would
# only alias table_to_normalize, so the later in-place edits of
# df_without_normalization (adding 'ORF', set_index) would also change
# table_to_normalize.
df_without_normalization = table_to_normalize.copy()

In [11]:
# Gaussian (z-score) normalization of every column of table_to_normalize
# (the fluorescence columns are not part of this table).
column_means = table_to_normalize.mean()
column_stds = table_to_normalize.std()

# A constant column has std == 0; dividing by it gives 0/0 = NaN for every row,
# and the mean-fill below cannot repair an all-NaN column. Dividing by 1
# instead leaves such a column at 0 (its centered value).
safe_stds = column_stds.replace(0, 1)

table_normalized = (table_to_normalize - column_means) / safe_stds

### fill missing entries with the column mean of the normalized table
table_normalized = table_normalized.fillna(table_normalized.mean())


In [12]:
# Builds a separate table for one fluorescence type, so these columns stay out of the main table.
def creating_flourescence_table (flourescence_intensity, flourescence_localization):
    """Return a table with ORF, one intensity column and one-hot localization columns.

    Parameters
    ----------
    flourescence_intensity : name of the intensity column in ``full_table_df``;
        copied as-is (not normalized).
    flourescence_localization : name of the localization column in
        ``full_table_df``; standardized with ``take_care_of_duo_location`` and
        then one-hot encoded.

    Returns
    -------
    DataFrame with the columns 'ORF', the intensity column, and one float
    column 'one_hot_<location>' per standardized location value. The
    localization column itself is not included.
    """
    standardized_locations = full_table_df[flourescence_localization].apply(take_care_of_duo_location)
    one_hot = pd.get_dummies(standardized_locations, prefix="one_hot").astype(float)
    base_columns = full_table_df[['ORF', flourescence_intensity]].copy()
    return pd.concat([base_columns, one_hot], axis = 1)


In [13]:
# Create the 4 fluorescence tables (ORF + un-normalized fluorescence columns).
# flourescence_columns alternates intensity, localization, intensity, ... so
# the even positions pair up with the odd positions.
(
    flourescence_table_TEF2pr_mCherry,
    flourescence_table_TEF2pr_VC,
    flourescence_table_NOP1pr_GFP,
    flourescence_table_NATIVEpr_GFP,
) = [
    creating_flourescence_table(intensity_col, localization_col)
    for intensity_col, localization_col in zip(flourescence_columns[0:8:2], flourescence_columns[1:8:2])
]


In [14]:
# Use ORF as the index of both main tables.
# The ORF Series is passed straight to set_index, which takes its values row by
# row (both tables are derived from full_table_df without reordering or
# dropping rows; set_index raises if the lengths differ). Unlike assigning an
# 'ORF' column first, this gives the same result when the cell is re-run, and
# because nothing is modified in place the source frames are left untouched.
orf_index = full_table_df['ORF']
table_normalized = table_normalized.set_index(orf_index)
df_without_normalization = df_without_normalization.set_index(orf_index)

In [15]:
# Export the two main tables and the 4 fluorescence tables to CSV.
tables_to_export = {
    '../outputs/normalized_table.csv': table_normalized,
    '../outputs/Not_normalized_table.csv': df_without_normalization,
    '../outputs/flourescence_table_TEF2pr_mCherry.csv': flourescence_table_TEF2pr_mCherry,
    '../outputs/flourescence_table_TEF2pr_VC.csv': flourescence_table_TEF2pr_VC,
    '../outputs/flourescence_table_NOP1pr_GFP.csv': flourescence_table_NOP1pr_GFP,
    '../outputs/flourescence_table_NATIVEpr_GFP.csv': flourescence_table_NATIVEpr_GFP,
}
for output_path, table in tables_to_export.items():
    table.to_csv(output_path)