In [3]:
import pandas as pd
import numpy as np

### Helper functions

In [4]:
def display_group_density_plot(df, groupby, on, palette, figsize):
    """
    Displays a density plot by group, given a continuous variable, and a group to split the data by.

    At most the 10 most frequent categories of ``groupby`` are drawn, in descending
    order of frequency, so the category-to-color assignment is the same on every run.
    Rows whose ``groupby`` value is missing do not form a group.

    :param df: DataFrame to display data from
    :param groupby: Column name by which plots would be grouped (Categorical, maximum 10 categories)
    :param on: Column name of the continuous variable whose density is drawn for each group
    :param palette: Sequence of colors, one per group. If falsy, seaborn's current default
                    palette is used. If it holds fewer colors than there are groups, only
                    as many groups as there are colors are drawn.
    :param figsize: Figure size, passed to ``plt.figure``
    :return: the matplotlib Axes the density curves were drawn on
    :raises ValueError: if ``df`` is not a DataFrame, if ``groupby`` or ``on`` is empty/None,
                        or if either column is absent from ``df``
    """
    max_groups = 10

    if not isinstance(df, pd.DataFrame):
        raise ValueError('df must be a pandas DataFrame')

    if not groupby:
        raise ValueError('groupby parameter must be provided')
    if groupby not in df.keys():
        # str.format instead of "+" so a non-string column label still yields ValueError
        raise ValueError('{} column does not exist in the given DataFrame'.format(groupby))

    if not on:
        raise ValueError('on parameter must be provided')
    if on not in df.keys():
        raise ValueError('{} column does not exist in the given DataFrame'.format(on))

    # value_counts() sorts by frequency and skips missing values, which gives a
    # stable group order (a plain set() has none) and caps the number of groups.
    groups = df[groupby].value_counts().index[:max_groups]

    # Get relevant palette
    if palette:
        colors = palette[:len(groups)]
    else:
        colors = sns.color_palette()[:len(groups)]

    # Plot
    fig = plt.figure(figsize=figsize)
    ax = fig.add_subplot(111)

    for value, color in zip(groups, colors):
        # Draw explicitly on `ax` rather than on whatever the current axes happens to be.
        # NOTE(review): recent seaborn releases rename `shade` to `fill`; `shade` is kept
        # for the seaborn version this notebook was written against - confirm on upgrade.
        sns.kdeplot(df.loc[df[groupby] == value, on], ax=ax,
                    shade=True, color=color, label=value)

    # The legend must be built after the curves exist, otherwise it has no entries.
    ax.legend(bbox_to_anchor=(1.04, 1), loc='upper left')

    ax.set_title('Distribution of {} per {} group'.format(on, groupby), fontsize=30)
    ax.set_xlabel(on, fontsize=20)
    return ax

## Looking at some genes data

In [None]:
# Density of the ENSG00000000971.14 column, one curve per tumor stage.
display_group_density_plot(
    df=df,
    groupby='tumor_stage',
    on='ENSG00000000971.14',
    palette=sns.color_palette('Set1'),
    figsize=(15, 6),
);

### Genes correlation

To find and remove highly correlated genes, I will separate the columns into chunks. Because my computer is not strong enough to compute the full correlation matrix, I will only compute correlations within disjoint subsets of 1000 columns for now:

In [None]:
# Walk through the gene columns in consecutive blocks of 1000 and, inside each
# block, collect the first column of every pair whose correlation is above 0.8
# or below -0.8. Pairs that straddle two blocks are never compared.
corr_to_remove = []
for start in tqdm(range(0, len(genes.columns), 1000)):
    subset = genes.iloc[:, start:start + 1000]
    corr_mat = subset.corr()
    too_high = corr_mat > 0.8
    too_low = corr_mat < -0.8
    rows, cols = np.where(too_high | too_low)
    # row < col keeps the upper triangle only: skips the diagonal and counts each pair once
    to_remove = [corr_mat.index[row] for row, col in zip(rows, cols) if row < col]
    corr_to_remove += to_remove

In [None]:
# A column can be collected once per correlated partner; keep each name once
# and display how many distinct columns are marked for removal.
corr_to_remove = {column for column in corr_to_remove}
len(corr_to_remove)

In [None]:
# Remove the columns marked as highly correlated. errors='ignore' keeps this cell
# safe to re-run: on a second execution the columns are already gone from `df`,
# and without it pandas raises KeyError.
df = df.drop(columns=list(corr_to_remove), errors='ignore')

## Looking at clinical data

In [None]:
sns.set(style="darkgrid")

# Number of rows per tumor stage.
ax = sns.countplot(x='tumor_stage', data=df)
plt.xticks(rotation=90)
plt.show()

# The same count, broken down by each categorical clinical column in turn.
cat_vars = [
    'morphology',
    'prior_malignancy',
    'site_of_resection_or_biopsy',
    'primary_diagnosis',
    'prior_treatment',
    'tissue_or_organ_of_origin',
    'race',
    'ethnicity',
]
for cat_var in cat_vars:
    ax = sns.countplot(x='tumor_stage', hue=cat_var, data=df)
    plt.xticks(rotation=90)
    plt.show()

In [None]:
# Tumor stage against prior malignancy, one point per row of `clinical`.
plt.scatter(x=clinical['prior_malignancy'], y=clinical['tumor_stage'])

In [None]:
# Tumor stage against site of resection or biopsy, one point per row of `clinical`.
plt.scatter(x=clinical['site_of_resection_or_biopsy'], y=clinical['tumor_stage'])


In [None]:
# Tumor stage against ICD-10 code, one point per row of `clinical`.
plt.scatter(x=clinical['icd_10_code'], y=clinical['tumor_stage'])

In [None]:
# NOTE(review): `df_house`, "SaleCondition" and "SalePrice" appear nowhere else in
# the visible notebook - confirm this cell belongs to this analysis.
# Draws the same strip plot twice: first without jitter, then with jitter.
sale_rows = df_house.dropna(subset=["SaleCondition"])
for use_jitter in (False, True):
    sns.stripplot(x="SaleCondition", y="SalePrice", data=sale_rows,
                  alpha=0.3, jitter=use_jitter)
    plt.title('Sale Price distribution by SaleCondition')
    plt.show()
