In [2]:
from data_retrievel_and_feature_extraction import uniprot_info as uni
import pandas as pd

In [3]:
def transofrm_go_terms_to_df(go_terms: dict) -> pd.DataFrame:
    """Build a DataFrame with one row per entry of ``go_terms``.

    The dictionary keys are discarded: every value becomes one row and the rows
    are renumbered 0..n-1.

    Args:
        go_terms (dict): Mapping whose values hold the row contents, in any form
            accepted by ``pd.DataFrame.from_dict(..., orient='index')``.

    Returns:
        pd.DataFrame: The values of ``go_terms`` as rows under a fresh integer index.
    """
    # drop=True throws the key index away directly instead of first turning it
    # into an 'index' column and dropping that afterwards, so a genuine data
    # column that happens to be called 'index' is never removed by accident.
    return pd.DataFrame.from_dict(go_terms, orient='index').reset_index(drop=True)


# Correctly spelled alias; the original (misspelled) name stays for existing callers.
transform_go_terms_to_df = transofrm_go_terms_to_df

def transform_go_terms_to_lists(go_terms: dict) -> tuple:
    """Group the first element of every ``go_terms`` value by its leading letter.

    Args:
        go_terms (dict): Mapping whose values are sequences. Only element 0 of each
            value is used, and it must be a string. Empty values are skipped.

    Returns:
        tuple: ``(C_list, P_list, F_list)`` - the strings that start with 'C', 'P'
        and 'F' respectively, each list in dictionary order. Strings starting with
        anything else are left out.
    """
    # An empty value has no element 0; skip it instead of raising IndexError.
    first_terms = [value[0] for value in go_terms.values() if value]

    # One pass over the terms; the dict keeps the C / P / F order of the result.
    buckets = {'C': [], 'P': [], 'F': []}
    for term in first_terms:
        for prefix, bucket in buckets.items():
            if term.startswith(prefix):
                bucket.append(term)
                break
    return buckets['C'], buckets['P'], buckets['F']

def transform_lists_to_string(lists: tuple) -> tuple:
    """Join each of the three term lists into one space-separated string.

    Args:
        lists (tuple): Exactly three iterables of strings, in (C, P, F) order.

    Returns:
        tuple: Three strings in the same order; an empty list yields ''.
    """
    # Unpacking first keeps the "exactly three items" requirement explicit.
    c_terms, p_terms, f_terms = lists
    return tuple(' '.join(terms) for terms in (c_terms, p_terms, f_terms))

def split_go_terms_to_categories(df: pd.DataFrame) -> tuple:
    """Split the GO terms into 3 dataframes, according to the 3 GO categories.

    Rows are selected by the prefix of the string stored in column ``0``
    ('C:', 'P:' or 'F:'). Nothing is written to disk by this function.

    Args:
        df (pd.DataFrame): A dataframe containing the GO terms for the given UniProt
            accession ID; column ``0`` must hold the prefixed term strings.
    Returns:
        tuple: A tuple containing the 3 dataframes, each containing the GO terms for the specific gene in the df.
        first dataframe: cellular component
        second dataframe: biological process
        third dataframe: molecular function
    """
    def rows_with_prefix(prefix: str) -> pd.DataFrame:
        # na=False counts missing / non-string cells as "no match"; without it the
        # mask can contain NaN and boolean indexing raises ValueError.
        return df[df[0].str.startswith(prefix, na=False)]

    return rows_with_prefix('C:'), rows_with_prefix('P:'), rows_with_prefix('F:')

In [4]:
### test
# gene_name = 'BRCA1'
# go_terms = uni.get_go_terms(gene_name)
# go_terms_lists = transform_go_terms_to_lists(go_terms)
# # strip the two-character prefix (e.g. 'C:') from each go term
# go_terms_lists = ([x[2:] for x in go_terms_lists[0]], [x[2:] for x in go_terms_lists[1]], [x[2:] for x in go_terms_lists[2]])
# go_terms_strings = transform_lists_to_string(go_terms_lists)
# print(go_terms_strings)

In [6]:
def update_go_terms(row):
    """Fill the three GO-term columns of ``row`` from its ``'Gene'`` entry.

    The gene name is passed to ``uni.get_go_terms``, the result is grouped with
    ``transform_go_terms_to_lists``, and the first two characters of every term
    are cut off (presumably the 'C:' / 'P:' / 'F:' prefix - confirm against
    what ``uni.get_go_terms`` returns).

    Args:
        row: Row object supporting item access; must contain ``'Gene'``.

    Returns:
        The same ``row`` with 'Cellular Component', 'Biological Process' and
        'Molecular Function' set to lists of the trimmed terms.
    """
    fetched_terms = uni.get_go_terms(row['Gene'])
    grouped = transform_go_terms_to_lists(fetched_terms)

    # Trim all three groups before touching the row.
    cellular, biological, molecular = (
        [term[2:] for term in group]
        for group in (grouped[0], grouped[1], grouped[2])
    )

    row['Cellular Component'] = cellular
    row['Biological Process'] = biological
    row['Molecular Function'] = molecular
    return row

# Location of the input gene table (it must contain a 'Gene' column, which
# update_go_terms reads). Kept in one named constant so the notebook can be
# pointed at another file without editing the processing code below.
HL_GENES_CSV = 'C:\\Users\\InbarBlech\\PycharmProjects\\Thesis\\hl_genes.csv'

# Read the DataFrame
df = pd.read_csv(HL_GENES_CSV)

# Create a new DataFrame by applying the update_go_terms function to each row
# (this performs one uni.get_go_terms lookup per row).
updated_df = df.apply(update_go_terms, axis=1)

# Print the updated DataFrame
print(updated_df.head())

       Gene                                 Cellular Component  \
0  ATP6V1B2  [apical plasma membrane, clathrin-coated vesic...   
1     KITLG  [cytoplasm, cytoskeleton, extracellular region...   
2     TCOF1  [cytosol, fibrillar center, nucleolus, nucleop...   
3     PTPRQ                       [membrane, receptor complex]   
4    COL2A1  [basement membrane, collagen type II trimer, c...   

                                  Biological Process  \
0  [ATP metabolic process, proton transmembrane t...   
1  [cell adhesion, ectopic germ cell programmed c...   
2  [neural crest cell development, neural crest f...   
3  [dephosphorylation, regulation of fat cell dif...   
4  [anterior head development, cartilage condensa...   

                                  Molecular Function  
0  [ATP binding, proton transmembrane transporter...  
1  [cytokine activity, growth factor activity, st...  
2  [protein heterodimerization activity, protein-...  
3            [protein tyrosine phosphatase act

In [7]:
# Destination of the annotated gene table, kept in one named constant so the
# output location can be changed in a single place.
HL_GENES_WITH_GO_TERMS_CSV = 'C:\\Users\\InbarBlech\\PycharmProjects\\Thesis\\hl_genes_with_go_terms_lists.csv'

# save df to csv (index=False: the row index is not written to the file).
# NOTE(review): the GO-term columns hold Python lists, so they are presumably
# stored as their text representation and need parsing when read back - confirm.
updated_df.to_csv(HL_GENES_WITH_GO_TERMS_CSV, index=False)