In [None]:
from __future__ import division, print_function # Imports from __future__ since we're running Python 2
import os
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import plotly.graph_objects as go #if you have not install plotly package, please run 'pip install jupyter-plotly-dash'
import warnings
warnings.filterwarnings("ignore")
from scipy.stats import skew, kurtosis
import plotly.express as px
from plotly.subplots import make_subplots

In [None]:
# Read the colon-cancer inputs from CSV files in the working directory.
expression_csv = 'colonCancerData.csv'
label_csv = 'label.csv'

# Gene expression table; the first CSV column becomes the row index.
colon = pd.read_csv(expression_csv, index_col=0)
# Labels table, read with pandas' default integer row index.
colon_label = pd.read_csv(label_csv)

In [None]:
def ConvertLabels(labels):
    '''
    Collapse the raw values in the 'label' column to the class codes 1 and -1.

    Positive raw values become 1 (normal tissue) and negative raw values
    become -1 (tumor tissue). A raw value of exactly 0 is left unchanged.

    Parameters
    ----------
    labels : pandas.DataFrame
        Frame that contains a numeric 'label' column.

    Returns
    -------
    pandas.DataFrame
        A copy of `labels` with the 'label' column recoded. Other columns,
        and the frame passed in by the caller, are left untouched.
    '''
    column_name = 'label'
    converted = labels.copy()

    # Build both masks from the argument (not from the notebook-level
    # `colon_label`) and before any assignment, so the first assignment
    # cannot influence the second mask.
    raw = labels[column_name]
    is_positive = raw > 0
    is_negative = raw < 0

    # Restrict the write to the label column: a bare `.loc[mask] = 1` would
    # overwrite every column of the matching rows.
    converted.loc[is_positive, column_name] = 1
    converted.loc[is_negative, column_name] = -1

    return converted

In [None]:
# Recode the labels, then append them to the expression table as an extra
# column (column-wise concat, aligned on the row index).
colon_label = ConvertLabels(colon_label)
data_label = pd.concat([colon, colon_label], axis = 1, sort = False)

# Split the samples into the two classes using named boolean masks.
is_normal = data_label['label'] == 1
is_tumor = data_label['label'] == -1
data_label_1 = data_label[is_normal]  # normal tissue (label == 1)
data_label_0 = data_label[is_tumor]   # tumor tissue (label == -1)

In [None]:
# Per-column summary statistics for the normal-tissue samples; the bare name on
# the last line is what the notebook renders as the cell output.
normal_tissue_summary = data_label_1.describe()
normal_tissue_summary

In [None]:
# Per-gene summary statistics across the normal-tissue samples (label == 1).
temp_value = data_label_1.describe()

# Look the statistics up by their describe() row labels rather than by row
# position, and drop the 'label' column once instead of once per statistic.
gene_stats = temp_value.drop("label", axis=1)
max_value = gene_stats.loc["max"].tolist()
min_value = gene_stats.loc["min"].tolist()
mean_value = gene_stats.loc["mean"].tolist()
median_value = gene_stats.loc["50%"].tolist()  # describe() reports the median as '50%'

# Table with one column per statistic and one row per gene.
fig12 = go.Figure(data=[go.Table(
    header=dict(
        values=["Max", "Min", "Mean", "Median"],
        font=dict(size=10),
        align="left"
    ),
    cells=dict(
        values=[max_value, min_value, mean_value, median_value],
        align="left"
    )
)])
fig12.update_layout(width=1000, height=500, title_text="Raw data")
fig12.show()

# 2x2 grid of marker plots, one panel per statistic. Only y is given, so the
# x axis is the position of each gene in the list.
fig = make_subplots(rows=2, cols=2)
stat_panels = [
    ("max", max_value, 1, 1),
    ("min", min_value, 1, 2),
    ("mean", mean_value, 2, 1),
    ("median", median_value, 2, 2),
]
for trace_name, stat_values, grid_row, grid_col in stat_panels:
    fig.add_trace(
        go.Scatter(y=stat_values, name=trace_name, mode='markers'),
        row=grid_row, col=grid_col
    )

fig.update_layout(height=1000, width=1000, title_text="Basic plot of data")
fig.show()

In [None]:
# Per-column summary statistics for the tumor-tissue samples; the bare name on
# the last line is what the notebook renders as the cell output.
tumor_tissue_summary = data_label_0.describe()
tumor_tissue_summary

In [None]:
# Per-gene summary statistics across the tumor-tissue samples (label == -1).
temp_value = data_label_0.describe()

# Look the statistics up by their describe() row labels rather than by row
# position, and drop the 'label' column once instead of once per statistic.
gene_stats = temp_value.drop("label", axis=1)
max_value = gene_stats.loc["max"].tolist()
min_value = gene_stats.loc["min"].tolist()
mean_value = gene_stats.loc["mean"].tolist()
median_value = gene_stats.loc["50%"].tolist()  # describe() reports the median as '50%'

# plotly (go, px, make_subplots) is already imported in the notebook's first
# cell, so it is not re-imported here.

# Table with one column per statistic and one row per gene.
fig12 = go.Figure(data=[go.Table(
    header=dict(
        values=["Max", "Min", "Mean", "Median"],
        font=dict(size=10),
        align="left"
    ),
    cells=dict(
        values=[max_value, min_value, mean_value, median_value],
        align="left"
    )
)])
fig12.update_layout(width=1000, height=500, title_text="Raw data")
fig12.show()

# 2x2 grid of marker plots, one panel per statistic. Only y is given, so the
# x axis is the position of each gene in the list.
fig = make_subplots(rows=2, cols=2)
stat_panels = [
    ("max", max_value, 1, 1),
    ("min", min_value, 1, 2),
    ("mean", mean_value, 2, 1),
    ("median", median_value, 2, 2),
]
for trace_name, stat_values, grid_row, grid_col in stat_panels:
    fig.add_trace(
        go.Scatter(y=stat_values, name=trace_name, mode='markers'),
        row=grid_row, col=grid_col
    )

fig.update_layout(height=1000, width=1000, title_text="Basic plot of data")
fig.show()

In [None]:
def _mad_outlier_scan(frame):
    '''
    Scan every column of `frame` for values that lie far from the column median.

    For each column: compute the median absolute deviation (MAD), divide every
    value's absolute deviation from the median by the MAD (rounded to 2
    decimals), and count the values whose score exceeds the cut-off.

    Parameters
    ----------
    frame : pandas.DataFrame
        Numeric columns to scan (every column is scanned).

    Returns
    -------
    tuple of three lists
        MAD per column (rounded to 2 decimals), number of flagged values per
        column, and the names of the columns with at least one flagged value.
    '''
    mad_values = []
    flagged_counts = []
    flagged_columns = []
    for column in frame.columns.tolist():
        samples = frame.loc[:, column]
        deviation = abs(samples - np.median(samples))
        mad = np.median(deviation)
        mad_values.append(round(mad, 2))
        score = (deviation / mad).round(2)
        # NOTE(review): the cut-off used here is the MAD itself, i.e. a
        # unitless score is compared with a value in data units. The linked
        # article compares the score with a fixed constant; confirm the
        # intended threshold. Kept unchanged so existing results do not shift.
        n_flagged = int((score > mad).sum())
        flagged_counts.append(n_flagged)
        if n_flagged > 0:
            flagged_columns.append(column)
    return mad_values, flagged_counts, flagged_columns


# --- Normal tissue (label == 1) ---
# This scan previously read its values from data_label_0, so it repeated the
# tumor scan below; it now scans data_label_1.
names = data_label_1.columns.tolist()
value = []
MAD, outlier_count, outlier_colomn_name = _mad_outlier_scan(data_label_1)

print(outlier_colomn_name)

# The median absolute deviation can roughly estimate the standard deviation:
# estimated std = 1.4826 * median absolute deviation.
# The median absolute deviation can be used to detect outliers as described in:
# https://eurekastatistics.com/using-the-median-absolute-deviation-to-find-outliers/
# Here the scaling constant is assumed to be 1.
# Columns with a count above 0 contain at least one flagged value.
# Earlier recorded result: features 558, 1240, 1704, 1809, 1860, 1883, 1900, 1969
# contain outliers; maybe they should be deleted.
# NOTE(review): that list was recorded while this scan was reading the tumor
# samples by mistake; re-check it against the output printed above.


# --- Tumor tissue (label == -1) ---
names = data_label_0.columns.tolist()
value = []
MAD, outlier_count, outlier_colomn_name = _mad_outlier_scan(data_label_0)

print(outlier_colomn_name)

# NOTE(review): the earlier observation that both classes flag exactly the same
# columns came from scanning data_label_0 twice; compare the two printed lists
# again before relying on it.

# Box plots of the columns flagged in the tumor scan (the list computed last),
# drawn for the normal samples (left panel) and the tumor samples (right panel).
outlier_find_table = data_label_1[outlier_colomn_name]
collection_1 = [outlier_find_table[column].tolist() for column in outlier_colomn_name]

outlier_find_table_1 = data_label_0[outlier_colomn_name]
collection_0 = [outlier_find_table_1[column].tolist() for column in outlier_colomn_name]

fig = make_subplots(rows=1, cols=2)
for column, normal_values, tumor_values in zip(outlier_colomn_name, collection_1, collection_0):
    # str() keeps the trace name working even if a column name is not a string.
    trace_name = "No." + str(column)
    fig.add_trace(
        go.Box(y=normal_values, name=trace_name),
        row=1, col=1
    )
    fig.add_trace(
        go.Box(y=tumor_values, name=trace_name),
        row=1, col=2
    )
fig.show()

In [None]:
def _galton_skewness(values):
    '''
    Galton's (quartile) measure of skewness, rounded to 2 decimals:
    ((Q3 - Q2) - (Q2 - Q1)) / (Q3 - Q1), where Q1, Q2, Q3 are the 25th, 50th
    and 75th percentiles of `values`.
    '''
    q1, q2, q3 = np.percentile(values, [25, 50, 75])
    return round(((q3 - q2) - (q2 - q1)) / (q3 - q1), 2)


# Sample skewness per column, computed once per class.
# [:-1] drops the last entry, which belongs to the 'label' column.
skew_normal = skew(data_label_1)[:-1]
skew_tumor = skew(data_label_0)[:-1]
ss_normal = int(np.sum(skew_normal > 0))
ss_tumor = int(np.sum(skew_tumor > 0))

# Skewness is printed for the normal samples, kurtosis for the tumor samples.
print('Skewness:\n{}'.format(skew_normal))
print('Kurtosis:\n{}'.format(kurtosis(data_label_0)[:-1]))
print('The number of right skewness distribution in normal tissues from sample skewness is:{}'.format(ss_normal))
print('The number of right skewness distribution in tumor tissues from sample skewness is:{}'.format(ss_tumor))

# Galton's measure for every column of each class; a value above 0 is counted
# as right-skewed.
names = data_label_1.columns.tolist()
GMS_1 = [_galton_skewness(data_label_1.loc[:, column]) for column in names]
GMS_0 = [_galton_skewness(data_label_0.loc[:, column]) for column in names]
flag_1 = sum(1 for measure in GMS_1 if measure > 0)
flag_0 = sum(1 for measure in GMS_0 if measure > 0)
print("The number of right skewness distribution in normal tissues from Galton's measure of skewness is:{}".format(flag_1))
print("The number of right skewness distribution in tumor tissues from Galton's measure of skewness is:{}".format(flag_0))

# Plot the counts computed above rather than a hand-copied snapshot of them
# (the bars were previously hard-coded as [1872, 2000, 1241, 1643]).
number_list = [ss_normal, ss_tumor, flag_1, flag_0]
name_list = ["SS_Normal","SS_Tumor","GMS_Normal","GMS_Tumor"]

# Define the layout here: it was previously only created in a later cell, so
# this cell raised NameError when the notebook was run top to bottom.
layout = go.Layout(
    plot_bgcolor='white')

fig = go.Figure([go.Bar(x=name_list, y=number_list,text=number_list,textposition='auto')],layout=layout)
fig.show()


In [None]:
# The full gene-by-gene correlation matrix is too large to keep in the notebook
# output, so only a per-gene count of strongly correlated partners is reported.
data_label_1n = data_label_1.drop(['label'],axis = 1)
data_label_0n = data_label_0.drop(['label'],axis = 1)

# Gene names come from the frame that is actually correlated, so the number of
# genes is no longer hard-coded.
names = data_label_1n.columns.tolist()

# One vectorised correlation over the normal-tissue samples replaces the
# per-pair Series.corr calls. Comparisons with NaN are False, so undefined
# correlations are not counted.
corr_threshold = 0.8
corr_matrix = data_label_1n.corr().values
# Keep only pairs (i, j) with j > i, so each pair is counted once and is
# attributed to the gene that comes first in column order.
strong_pairs = np.triu(corr_matrix > corr_threshold, k=1)
partners_per_gene = strong_pairs.sum(axis=1)

# Report only the genes that have at least one strongly correlated partner.
count_number = [int(n_partners) for n_partners in partners_per_gene if n_partners > 0]
flag_number = [gene for gene, n_partners in zip(names, partners_per_gene) if n_partners > 0]

print(count_number)
print(flag_number)

layout = go.Layout(
    plot_bgcolor='white')

fig = go.Figure([go.Bar(x=flag_number, y=count_number,marker=dict(color='black'))],layout=layout)


fig.show()

In [None]:

def _pair_plot(frame, columns):
    '''Build a scatter-matrix figure of `columns` from `frame` with the diagonal panels hidden.'''
    figure = px.scatter_matrix(frame, dimensions = columns)
    figure.update_traces(diagonal_visible=False)
    figure.update_layout(
        dragmode='select',
        width=1000,
        height=1000,
        hovermode='closest')
    return figure


# Keep the columns that come before the one named '10'
# (every column if no column has that name).
names = data_label_1.columns.tolist()
names_10 = names[:names.index('10')] if '10' in names else names[:]

data_label_1_10 = data_label_1.loc[:,names_10]
data_label_0_10 = data_label_0.loc[:,names_10]

# Normal tissue first, then tumor tissue.
fig1 = _pair_plot(data_label_1_10, names_10)
fig1.show()

fig2 = _pair_plot(data_label_0_10, names_10)
fig2.show()