In [None]:
from pathlib import Path
from pyspark.sql import DataFrame, SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *

# Start (or reuse) a local Spark session that runs on all available cores
# and gives the driver 15 GB of memory.
spark = (
    SparkSession.builder
    .appName('spark')
    .master('local[*]')
    .config("spark.driver.memory", "15g")
    .getOrCreate()
)

In [2]:
# Parent of the current working directory; data paths below are built from it.
cwd = Path.cwd().parents[0]

# Convert table with features to wide format

**Starting point:**
- Right now all data share the same features across datasources: effectDirection_up, effectDirection_down, effectSize_up, effectSize_down.
- Each row is a disease/target (d/t) association for a single data source, together with its features.

**Goal:**
- Convert to wide format so that each row represents one association, with each data source's set of features as separate columns.

In [10]:
# Load the harmonised data (one row per disease/target association and data source).
all_data = spark.read.parquet(str(cwd / 'outputs/data_harmonisation'))

# `DataFrame.show` prints the table itself and returns None, so it must not be
# wrapped in print() -- that only appends a stray "None" to the cell output.
all_data.show(10)

# Number of rows contributed by each data source.
all_data.groupBy('datasourceId').count().show()

+---------------+-------------+------------+--------------------+------------------+---------------+-------------+
|       targetId|    diseaseId|datasourceId|effectDirection_down|effectDirection_up|effectSize_down|effectSize_up|
+---------------+-------------+------------+--------------------+------------------+---------------+-------------+
|ENSG00000178057|MONDO_0005277|      chembl|                   0|                 1|              0|            1|
|ENSG00000137267|MONDO_0021117|      chembl|                   0|                20|              0|           20|
|ENSG00000168522|  EFO_0002618|      chembl|                   0|                 3|              0|            3|
|ENSG00000131747|MONDO_0001056|      chembl|                   0|                 9|              0|            9|
|ENSG00000185633|  EFO_0003047|      chembl|                   0|                 1|              0|            1|
|ENSG00000157388|  EFO_0007148|      chembl|                   0|               

In [39]:
import pandas as pd

# Collect the whole Spark DataFrame onto the driver as a pandas DataFrame.
pdf = all_data.toPandas()

# One row per (diseaseId, targetId); each feature is spread over one column per
# datasource, and combinations with no data are filled with 0.
# The aggregation is passed by name ("sum"): handing pandas the builtin `sum`
# is deprecated and raises a FutureWarning since pandas 2.1.
pdf_wide = (
    pd.pivot_table(
        pdf,
        index=['diseaseId', 'targetId'],
        columns=['datasourceId'],
        aggfunc='sum',
    )
    .reset_index()
    .fillna(0)
)

# Flatten the (feature, datasource) column MultiIndex to "<feature>_<datasource>".
# Empty levels (the second level of the former index columns) are skipped when
# joining, instead of stripping underscores afterwards, which would also remove
# leading/trailing underscores that are genuinely part of a name.
pdf_wide.columns = ['_'.join(filter(None, levels)) for levels in pdf_wide.columns]

pdf_wide.head()


Unnamed: 0,diseaseId,targetId,effectDirection_down_chembl,effectDirection_down_clinvar,effectDirection_down_crispr,effectDirection_down_expression_atlas,effectDirection_down_gene2phenotype,effectDirection_down_gene_burden,effectDirection_down_impc,effectDirection_down_orphanet,...,effectSize_down_ot_genetics_portal,effectSize_up_chembl,effectSize_up_clinvar,effectSize_up_crispr,effectSize_up_expression_atlas,effectSize_up_gene2phenotype,effectSize_up_gene_burden,effectSize_up_impc,effectSize_up_orphanet,effectSize_up_ot_genetics_portal
0,DOID_10113,ENSG00000113578,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,DOID_10113,ENSG00000115758,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,DOID_13406,ENSG00000065989,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,DOID_13406,ENSG00000073417,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,DOID_13406,ENSG00000095464,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:

# Show the first row as a Series so every column is listed; the head() display
# elides the middle columns with "...".
pdf_wide.iloc[0]

diseaseId                                       DOID_10113
targetId                                   ENSG00000113578
effectDirection_down_chembl                            0.0
effectDirection_down_clinvar                           0.0
effectDirection_down_crispr                            0.0
effectDirection_down_expression_atlas                  0.0
effectDirection_down_gene2phenotype                    0.0
effectDirection_down_gene_burden                       0.0
effectDirection_down_impc                              0.0
effectDirection_down_orphanet                          0.0
effectDirection_down_ot_genetics_portal                0.0
effectDirection_up_chembl                              2.0
effectDirection_up_clinvar                             0.0
effectDirection_up_crispr                              0.0
effectDirection_up_expression_atlas                    0.0
effectDirection_up_gene2phenotype                      0.0
effectDirection_up_gene_burden                         0