In [4]:
import pandas as pd

# Show every row/column whenever a DataFrame is printed in this session.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# --- Configuration -----------------------------------------------------------
# Wide association table: the node identifier lives in 'Unnamed: 0', the
# per-node annotation in 'Domain type', and every remaining column is treated
# as a disease/trait whose cell values are association scores.
INPUT_CSV = '/mnt/c/Users/jj515/Desktop/PhD/07-Review/01-data/assoc_disease_target_gen_assoc_kzfps_clust_dom.csv'
ID_COLUMN = 'Unnamed: 0'
DOMAIN_COLUMN = 'Domain type'
MIN_WEIGHT = 0.5  # a score becomes an edge only when strictly greater than this
EDGE_COLUMNS = ['Source', 'Target', 'Weight']

# --- Load --------------------------------------------------------------------
df = pd.read_csv(INPUT_CSV)

# --- Build the edge list (wide -> long) --------------------------------------
edges_data = []
for _, row in df.iterrows():
    source_node = row[ID_COLUMN]
    # Everything except the identifier and the annotation is a score column.
    for target_node, weight in row.drop([ID_COLUMN, DOMAIN_COLUMN]).items():
        # Missing scores are skipped explicitly; "> MIN_WEIGHT" already rules
        # out zeros, so no separate zero check is needed.
        if pd.notna(weight) and weight > MIN_WEIGHT:
            edges_data.append({'Source': source_node, 'Target': target_node, 'Weight': weight})

# Passing the column names explicitly keeps the frame well-formed even when no
# score passes the threshold (otherwise the frame has no columns and the
# lookups below raise KeyError).
edges_df = pd.DataFrame(edges_data, columns=EDGE_COLUMNS)

# Gephi reads the edge direction from a 'Type' column.
edges_df['Type'] = 'Undirected'

# --- Build the node table ----------------------------------------------------
# Every distinct endpoint, sources first, in order of first appearance.
nodes = pd.concat([edges_df['Source'], edges_df['Target']]).unique()

# 'Label' is a plain copy of 'Id'.
nodes_df = pd.DataFrame({'Id': nodes, 'Label': nodes})

# Node kind: row identifiers are 'ZNF', score-column names are 'Disease'.
# Anything matching neither keeps the 'Unknown' default; a name present in
# both sets ends up as 'Disease' because that assignment runs last.
cluster_node_ids = df[ID_COLUMN].unique()
disease_trait_node_ids = df.columns.drop([ID_COLUMN, DOMAIN_COLUMN]).unique()

nodes_df['Type'] = 'Unknown'
nodes_df.loc[nodes_df['Id'].isin(cluster_node_ids), 'Type'] = 'ZNF'
nodes_df.loc[nodes_df['Id'].isin(disease_trait_node_ids), 'Type'] = 'Disease'

# Domain_Type: ZNF nodes inherit the 'Domain type' value of their row in the
# input table, Disease nodes get the literal 'Disease', and anything still
# unset is written out as 'N/A'.
domain_type_mapping = df.set_index(ID_COLUMN)[DOMAIN_COLUMN].to_dict()

nodes_df['Domain_Type'] = pd.NA
znf_mask = nodes_df['Type'] == 'ZNF'
nodes_df.loc[znf_mask, 'Domain_Type'] = nodes_df.loc[znf_mask, 'Id'].map(domain_type_mapping)

disease_mask = nodes_df['Type'] == 'Disease'
nodes_df.loc[disease_mask, 'Domain_Type'] = 'Disease'

nodes_df['Domain_Type'] = nodes_df['Domain_Type'].fillna('N/A')

# --- Write the Gephi import files --------------------------------------------
edges_output_path = 'KZFPs_OpTar_Gephi_edges.csv'
edges_df.to_csv(edges_output_path, index=False)

nodes_output_path = 'KZFPs_OpTar_Gephi_nodes.csv'
nodes_df.to_csv(nodes_output_path, index=False)

print(f"Modified edges file saved to: {edges_output_path}")
print(f"Modified nodes file saved to: {nodes_output_path}")

# Quick visual sanity check of both tables.
print("\nEdges head:")
print(edges_df.head())
print("\nNodes head:")
print(nodes_df.head())

Modified edges file saved to: KZFPs_OpTar_Gephi_edges.csv
Modified nodes file saved to: KZFPs_OpTar_Gephi_nodes.csv

Edges head:
   Source                           Target    Weight        Type
0  ZNF736  adolescent idiopathic scoliosis  0.506792  Undirected
1  ZNF736              PHF-tau measurement  0.508074  Undirected
2  ZNF100          systolic blood pressure  0.664878  Undirected
3  ZNF100         diastolic blood pressure  0.726817  Undirected
4  ZNF808       neonatal diabetes mellitus  0.607931  Undirected

Nodes head:
       Id   Label Type Domain_Type
0  ZNF736  ZNF736  ZNF        KRAB
1  ZNF100  ZNF100  ZNF        KRAB
2  ZNF808  ZNF808  ZNF        KRAB
3  ZNF251  ZNF251  ZNF        KRAB
4  ZNF268  ZNF268  ZNF        KRAB


In [5]:
import pandas as pd

# Read the wide association table into a DataFrame.
df = pd.read_csv('/mnt/c/Users/jj515/Desktop/PhD/07-Review/01-data/assoc_disease_target_gen_assoc_kzfps_clust_dom.csv')

# Pick columns by name rather than by position: the first column holds the
# node identifier, 'Domain type' holds the annotation, and every other column
# is a disease/trait score. A positional slice such as columns[2:-1] silently
# drops one score column regardless of where 'Domain type' is located.
id_column = df.columns[0]
domain_column = 'Domain type'
score_columns = df.columns.drop([id_column, domain_column])
min_weight = 0.5  # a score becomes an edge only when strictly greater than this

# Build the edge list (wide -> long).
edges_data = []
for _, row in df.iterrows():
    source_node = row[id_column]
    for target_node, weight in row[score_columns].items():
        # Missing scores are skipped explicitly; "> min_weight" already rules
        # out zeros, so no separate zero check is needed.
        if pd.notna(weight) and weight > min_weight:
            edges_data.append({'Source': source_node, 'Target': target_node, 'Weight': weight})

# Explicit column names keep the frame well-formed even when no score passes
# the threshold (otherwise the column lookups below raise KeyError).
edges_df = pd.DataFrame(edges_data, columns=['Source', 'Target', 'Weight'])

# Cosmograph expects lower-case column names in the edge file.
edges_cosmograph = edges_df.rename(columns={'Source': 'source', 'Target': 'target', 'Weight': 'weight'})
edges_cosmograph.to_csv('cosmograph_edges.csv', index=False)

# Every distinct endpoint, sources first, in order of first appearance.
# Concatenating the two columns makes that order explicit instead of relying
# on the memory layout of the underlying array.
nodes = pd.concat([edges_df['Source'], edges_df['Target']]).unique()

# 'Label' is a plain copy of 'id'.
nodes_df = pd.DataFrame({'id': nodes, 'Label': nodes})

# Node kind: row identifiers are 'Cluster', score-column names are
# 'Disease/trait'; anything matching neither keeps the 'Unknown' default.
nodes_df['Type'] = 'Unknown'

cluster_nodes = df[id_column].unique()
nodes_df.loc[nodes_df['id'].isin(cluster_nodes), 'Type'] = 'Cluster'

disease_trait_nodes = score_columns
nodes_df.loc[nodes_df['id'].isin(disease_trait_nodes), 'Type'] = 'Disease/trait'

# Domain_Type comes from the input table for nodes that appear in the
# identifier column; every other node (the disease/trait nodes) gets 'N/A'.
domain_type_mapping = df.set_index(id_column)[domain_column].to_dict()
nodes_df['Domain_Type'] = nodes_df['id'].map(domain_type_mapping).fillna('N/A')

# Save the node file with lower-case column names ('id' is already lower-case).
nodes_cosmograph = nodes_df.rename(columns={'Label': 'label', 'Type': 'type', 'Domain_Type': 'domain_type'})
nodes_cosmograph.to_csv('cosmograph_nodes.csv', index=False)

print("Edges file saved to: cosmograph_edges.csv")
print("Nodes file saved to: cosmograph_nodes.csv")


Edges file saved to: cosmograph_edges.csv
Nodes file saved to: cosmograph_nodes.csv
