In [None]:
# Install the notebook's dependencies into the environment of the running kernel.
# Invoking pip through `sys.executable -m pip` targets the interpreter that runs this kernel.
import sys
!{sys.executable} -m pip install -r requirements.txt
# The two PIC-SURE packages are installed straight from GitHub; no tag or commit is pinned,
# so `--upgrade` pulls whatever the default branch currently contains.
!{sys.executable} -m pip install --upgrade git+https://github.com/hms-dbmi/pic-sure-python-adapter-hpds.git 
!{sys.executable} -m pip install --upgrade git+https://github.com/hms-dbmi/pic-sure-python-client.git 

In [2]:
from pprint import pprint

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import PicSureHpdsLib
import PicSureClient

from python_lib.utils import get_multiIndex_variablesDict,\
                             joining_variablesDict_onCol

from python_lib.wrappers import get_one_study


In [3]:
# Pandas DataFrame display options
# Use the fully qualified option name: abbreviated names are matched as regular
# expressions against every registered option, and "max.rows" can match more than
# one option (e.g. "styler.render.max_rows" in recent pandas), which raises OptionError.
pd.set_option("display.max_rows", 435)

# Matplotlib display parameters
# Default figure size as (width, height); matplotlib interprets figsize in inches.
plt.rcParams["figure.figsize"] = (14,8)
# Font settings applied to every figure through the "font" rc group.
font = {'weight' : 'bold',
        'size'   : 12}
plt.rc('font', **font)

In [10]:
# Connection settings used by the cells below.
# PICSURE_network_URL: endpoint passed to client.connect().
PICSURE_network_URL = "https://picsure.biodatacatalyst.nhlbi.nih.gov/picsure"
# resource_id: UUID passed to adapter.useResource().
resource_id = "02e23f52-f354-4e8b-992c-d37c8b9ba140"
# token_file: path of the text file the security token is read from.
token_file = "token.txt"

In [11]:
# Read the security token from the local token file.
# Surrounding whitespace is stripped: text files usually end with a newline, which
# would otherwise be sent as part of the token when connecting.
with open(token_file, "r") as f:
    token = f.read().strip()

In [12]:
# Create a PIC-SURE client and connect to the network endpoint with the token read above.
client = PicSureClient.Client()
connection = client.connect(PICSURE_network_URL, token)

+--------------------------------------+------------------------------------------------------
|  Resource UUID                       |  Resource Name                                  
+--------------------------------------+------------------------------------------------------
| 02e23f52-f354-4e8b-992c-d37c8b9ba140
+--------------------------------------+------------------------------------------------------


In [13]:
# Wrap the connection in the HPDS adapter and select the resource identified by resource_id.
adapter = PicSureHpdsLib.Adapter(connection)
resource = adapter.useResource(resource_id)

## Get studies information

In [14]:
from ast import literal_eval

# Load the studies table from the working directory, using the first CSV column as index.
# Each "phs_list" cell is parsed from its text form into a Python literal
# (presumably a list, given the column name -- confirm against the file).
studies_info_converters = {"phs_list": literal_eval}
studies_info = pd.read_csv(
    "./studies_info.csv",
    index_col=0,
    converters=studies_info_converters,
)

FileNotFoundError: [Errno 2] File ./studies_info.csv does not exist: './studies_info.csv'

## Get individual studies statistics

In [15]:
# Fetch the resource's full data dictionary as a DataFrame (find() is called without a search term).
plain_variablesDict = resource.dictionary().find().DataFrame()
# Derive the multi-index version of the dictionary with the project helper.
variablesDict = get_multiIndex_variablesDict(plain_variablesDict)

In [16]:
# Flag the dictionary entries whose name contains the consent-code label,
# then keep the first matching entry as the consent variable.
consent_label = "Study Accession with Consent Code"
mask_consent_var = plain_variablesDict.index.str.contains(consent_label)
matching_variables = plain_variablesDict.index[mask_consent_var]
consent_var = matching_variables[0]

In [17]:
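# TODO: loop over all studies instead of a single one, e.g. `for phs in phs_index:`
# (`phs_index` is not defined in the cells shown here).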

In [18]:
# Pick the study at position 1 of the studies_info index (i.e. the second entry).
phs = studies_info.index[1]

NameError: name 'studies_info' is not defined

In [None]:
# Retrieve the data of the selected study through the project wrapper.
facts = get_one_study(
    phs, studies_info, consent_var, variablesDict, resource, low_memory=False
)

In [None]:
# (number of rows, number of columns) of the study table.
facts.shape

## Matplotlib palettes

In [None]:
# Install colorspacious into the environment of the running kernel (no version is pinned).
import sys
!{sys.executable} -m pip install colorspacious

In [None]:
# NOTE(review): `cm` and `cspace_converter` are not used by any cell shown here -- confirm they are needed.
from matplotlib import cm
from colorspacious import cspace_converter
from collections import OrderedDict

# Ordered mapping from a colormap category name to the list of colormap names in it.
cmaps = OrderedDict()

In [None]:
# Names of the colormaps registered under the 'Qualitative' category.
cmaps['Qualitative'] = ['Pastel1', 'Pastel2', 'Paired', 'Accent',
                        'Dark2', 'Set1', 'Set2', 'Set3',
                        'tab10', 'tab20', 'tab20b', 'tab20c']

In [None]:
# Seventh entry (position 6) of the qualitative list, i.e. 'Set2'.
cmaps["Qualitative"][6]

# Describe the study

## BASIC STATISTICS

In [None]:
# facts has one row per subject and one column per variable.
n_subjects, n_variables = facts.shape
print("Number of subjects: {0}\nNumber of variables: {1}".format(n_subjects, n_variables))

## Variable types description

In [None]:
# Number of columns of `facts` per pandas dtype.
var_dtypes = facts.dtypes.value_counts()
#eav_dic["var_dtypes"] = var_dtypes.to_dict()

In [None]:
# Bar chart of the number of variables per dtype, with the count written above each bar.
fig, ax = plt.subplots()
dtype_labels = var_dtypes.index.astype("str").tolist()
dtype_counts = var_dtypes.values.tolist()
ax.bar(x=dtype_labels, height=dtype_counts)
for position, count in enumerate(dtype_counts):
    # Bars sit at integer positions 0..n-1, so the bar position doubles as the text x-coordinate.
    ax.text(position, count, count, ha="center", va="bottom")
ax.set_title("Variable count per variable type")

### Number of subjects (non-null values) per variable type

In [None]:
# Per-variable number of non-null values, labelled by variable type.
# The counts are computed directly from `facts` so that this cell runs on a fresh
# kernel: it previously relied on `num_describe` (not defined in any cell shown here)
# and on `categorical_describe`, which is only created further down the notebook.
# NOTE(review): `num_describe` is assumed to have been the describe() table of the
# float/int columns, matching the selection used for `numerical_describe` -- confirm.
num = (
    facts.describe(include=["float", "int"])
    .loc["count", :]
    .to_frame()
    .assign(dtype="numerical")
)
categorical = (
    facts.describe(include=["object"])
    .loc["count", :]
    .to_frame()
    .assign(dtype="categorical")
)
# One row per variable: its non-null count (cast to int) and its type label.
long_df = pd.concat([num, categorical], axis=0).astype({"count": int})

In [None]:
# Box plot of the per-variable non-null counts ("count" column), one box per variable type.
ax = long_df.boxplot(column = "count", by="dtype", grid=False)
ax.set_xlabel("Variable type")
ax.set_ylabel("Variable count")
ax.set_title("Variable count per dtype")

In [None]:
# Side-by-side histograms of the per-variable non-null counts, one panel per
# variable type, sharing the y axis.
fig, (ax1, ax2) = plt.subplots(1, 2, sharey=True)
x_label = "Number of subjects"
y_label = "Variable count"
panels = (
    (ax1, "numerical", "tab:orange", "Numerical variables"),
    (ax2, "categorical", "tab:blue", "Categorical variables"),
)
for axis, dtype_name, colour, panel_title in panels:
    subset = long_df.loc[long_df["dtype"] == dtype_name, :]
    subset.hist("count", color=colour, ax=axis)
    # Set the title after hist(), which writes its own title on the axis.
    axis.set_title(panel_title)
    axis.set_xlabel(x_label)
    axis.set_ylabel(y_label)
fig.suptitle("Number of subjects per variable type")

## CATEGORICAL VARIABLES DESCRIPTION

In [None]:
# Summary statistics for the object-dtype (categorical) columns of `facts`.
categorical_describe = facts.describe(include=['object'])
categorical_describe

In [None]:
# "freq" is the number of occurrences of the most common value of each column.
# freq == 1 means no value is repeated, so the mask keeps only the columns in which
# at least one value occurs more than once (presumably dropping ID-like columns,
# given the name `mask_id` -- confirm).
mask_id = categorical_describe.loc["freq",:] != 1
categorical_describe.loc[:, mask_id]

### Number of modalities

In [None]:
# Largest number of non-null values observed for any variable.
max_nonnull = long_df["count"].max()
mask_categorical = facts.dtypes == "object"
# Number of distinct non-null values (modalities) per categorical variable.
# nunique() ignores missing values; counting NaN as a modality would make a
# single-valued variable with missing data look like it has 2 modalities and
# slip through the filter below.
number_modalities = facts.loc[:, mask_categorical].nunique()
# Upper bound on the number of modalities: 20% of the largest non-null count.
filter_max_cat = max_nonnull*0.2
# Keep variables with at least 2 modalities and at most `filter_max_cat` (both bounds inclusive).
mask_modalities = number_modalities.between(2, filter_max_cat)

In [None]:
# Bar chart: for each number of modalities (x), how many of the retained categorical variables have it (y).
ax = number_modalities[mask_modalities].value_counts().sort_index().plot(kind="bar")
ax.set_title("Number of modalities for categorical variable (filtering < {0} modalities)".format(filter_max_cat.round(0)))
ax.set_ylabel("Variable count")
ax.set_xlabel("Number of modalities")

## NUMERICAL VARIABLES DESCRIPTION

In [None]:
# Summary statistics for the float/int columns of `facts`, transposed so that each variable is a row.
numerical_describe = facts.describe(include=["float", "int"]).transpose()
numerical_describe

# Quality checking / data filtering

Discard variables that meet any of the following criteria:
- Based on the variable name:
    - Variables whose names include: ["ID", "identifiant"]
- Based on the variable distribution:
    - No non-null values
    - Only 1 unique value (i.e. all non-null values are identical)

Transforming data types:
- Numerical data that only possess 2-3 distinct integer values: recoded as categorical
- 

In [None]:
# Check how a missing value behaves with float.is_integer().
# `np.nan` is the supported spelling (the `np.NaN` alias was removed in NumPy 2.0);
# it is a plain Python float, and NaN is not an integer, so the call returns False.
var = np.nan
var.is_integer()

In [None]:
# Boolean mask over the columns of `facts` whose dtype compares equal to "int",
# then display those columns.
# NOTE(review): "int" presumably resolves to the platform default integer dtype, so
# integer columns of another width would not be selected -- confirm this is intended.
mask = facts.dtypes == "int"
facts.loc[:, mask]