In [84]:
from __future__ import division, print_function # Imports from __future__ since we're running Python 2
import os
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import plotly.graph_objects as go #if you have not install plotly package, please run 'pip install jupyter-plotly-dash'
import warnings
warnings.filterwarnings("ignore")
from scipy.stats import skew, kurtosis

In [85]:
# Download colon data and label
colon = pd.read_csv('colonCancerData.csv', index_col=0) # Gene expression
colon_label = pd.read_csv('label.csv') # labels 

In [86]:
# Display the basic information about the colon dataframe.
colon.info() 
print("There are {} entries and {} columns in the landsat_train DataFrame"\
      .format(colon.shape[0], colon.shape[1]))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 62 entries, 0 to 61
Columns: 2000 entries, 0 to 1999
dtypes: float64(2000)
memory usage: 969.2 KB
There are 62 entries and 2000 columns in the landsat_train DataFrame


In [87]:
def ConvertLabels(labels):
    '''
    This function is used to convert the label to 0 and 1 range.
    Label 1 means normal tissue, label 0 means tumor tissue
    '''
    
    column_name = 'label'
    labels.loc[colon_label[column_name] > 0] = 1
    labels.loc[colon_label[column_name] < 0] = 0
    
    return labels

In [88]:
#Convert the colon label and concatenate data and label.
colon_label = ConvertLabels(colon_label)
data_label = pd.concat([colon, colon_label], axis = 1, sort= False)

# Seperate the data into two class. One is with label 1 and another is label 0
data_label_1 = data_label[data_label['label']==1] # normal tissue
data_label_0 = data_label[data_label['label']==0] # tumor tissue

# Location

In [89]:
data_label_1.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1991,1992,1993,1994,1995,1996,1997,1998,1999,label
count,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,...,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0
mean,6179.299432,4476.053,3510.628405,3682.527118,2569.478732,4151.533445,3053.431927,2619.057905,3885.339723,3602.810381,...,80.95267,198.693239,115.928523,111.610909,156.129816,66.504545,91.717443,52.947727,38.20858,1.0
std,2749.895979,1770.275365,1357.981567,1705.196135,1013.273525,2133.940634,1256.356111,727.950035,2027.810311,1482.817217,...,47.98538,116.869028,64.617184,75.386501,134.800399,70.542112,79.549543,37.226403,25.527596,0.0
min,1914.6775,1752.3955,1280.3238,1186.0304,1467.0393,1087.75,1248.1638,1026.4775,1179.935,974.81548,...,26.24,5.925,5.92375,5.91625,12.063095,5.87875,5.84875,5.8425,5.81625,1.0
25%,4525.662225,3408.95115,2571.221875,2679.3326,1885.637825,2068.598225,1958.012825,2083.799075,2677.6413,2911.516325,...,46.889063,114.776563,67.019063,63.616563,66.399107,26.515937,33.155625,37.355,21.694375,1.0
50%,5985.7094,4431.64545,3477.10435,3290.85625,2375.64105,4296.1268,3158.80625,2731.41,3639.66815,3293.48985,...,72.63,181.52375,130.6025,101.3675,113.179165,45.4575,72.1825,47.1675,36.9325,1.0
75%,6964.138125,5172.2142,4216.174375,4029.5268,3031.986575,5736.1821,3836.42565,3233.11125,4327.75535,4753.30595,...,96.963125,258.459375,175.393125,141.478125,175.058933,76.497812,124.30625,63.11125,44.05375,1.0
max,14173.054,8411.8614,6042.84,8766.0464,5967.0857,7736.4679,5059.195,3507.045,10012.81,6400.2274,...,242.845,438.51375,214.44625,372.5175,463.85238,317.15375,334.36875,176.5675,126.82625,1.0


In [90]:
data_label_0.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1991,1992,1993,1994,1995,1996,1997,1998,1999,label
count,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,...,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0
mean,7475.854712,5236.958872,4415.98259,4155.683475,3139.332172,5009.591523,3883.25294,3011.56553,5117.35945,4279.930317,...,110.829281,345.213937,129.452063,145.05375,199.540386,93.806,127.696906,53.418156,45.582313,0.0
std,3206.13938,2365.546571,1969.447091,2174.680926,1486.235804,2509.113756,2093.156689,1276.28397,2524.391093,2240.314498,...,89.201054,187.316361,81.298223,112.277141,171.83752,93.093341,90.758459,39.593325,29.840126,0.0
min,3230.3287,1383.4886,1269.6487,1359.7911,1166.5536,1304.5571,1062.6975,1077.6162,995.79,1025.3345,...,5.935,31.1025,20.47125,23.96875,5.888095,7.02625,28.81875,6.0,6.73125,0.0
25%,4957.57435,3583.341475,3025.925,2693.5402,1927.280975,3330.80355,2160.368125,1995.99935,3301.7662,2773.472575,...,54.56125,200.083438,75.251562,75.995937,95.359226,31.215625,58.79875,22.485312,25.755625,0.0
50%,6928.0775,5191.05225,4276.01185,3532.96875,3117.59105,4490.61965,3568.7125,2895.69315,4560.94685,3891.2958,...,84.3525,335.894375,109.496875,104.271875,150.396425,68.38125,106.424375,45.358125,34.03375,0.0
75%,9116.309075,6709.954575,5461.2766,4914.5915,4027.961875,6247.41605,4649.00535,3728.463125,6576.4684,5045.731225,...,144.302188,441.225937,178.142812,180.65,215.381845,106.222813,151.769375,72.73,60.384062,0.0
max,14876.407,10152.273,8605.0438,11248.68,8093.875,11222.682,9939.2462,5917.0263,14144.835,12307.913,...,438.38375,902.5725,333.41875,464.93,702.13095,405.6,390.89,197.22,117.19625,0.0


# Scale

In [91]:
arr_var_0 = np.var(data_label_0)
print(arr_var_0)

0        1.002235e+07
1        5.455915e+06
2        3.781754e+06
3        4.611006e+06
4        2.153674e+06
             ...     
1996     8.449711e+03
1997     8.031170e+03
1998     1.528441e+03
1999     8.681723e+02
label    0.000000e+00
Length: 2001, dtype: float64


In [92]:
arr_var_1 = np.var(data_label_1)
print(arr_var_1)

0        7.218204e+06
1        2.991426e+06
2        1.760291e+06
3        2.775526e+06
4        9.800540e+05
             ...     
1996     4.749999e+03
1997     6.040488e+03
1998     1.322814e+03
1999     6.220373e+02
label    0.000000e+00
Length: 2001, dtype: float64


median absolute deviation

In [93]:
names = data_label_1.columns.tolist()
MAD = []
value = []
outlier_count = []
outlier_colomn_name = []
for i in names:
    temp = data_label_0.loc[:,i]
    temp_1 = abs(temp - np.median(temp))
    temp_2 = np.median(temp_1)
    MAD.append(round(temp_2,2))
    value_temp = round(temp_1 / temp_2,2)
    flag = 0
    
    for j in value_temp:
        if j > temp_2:
            flag += 1
    outlier_count.append(flag)
        
    if flag > 0:
        outlier_colomn_name.append(i)
            
print(outlier_colomn_name)
# Median absolute deviation can roughly estimate std. Estimated std = 1.4826 * median abolute deviation.
# Median absolute deviation can detect the outlier following this instruction.
# The link is: https://eurekastatistics.com/using-the-median-absolute-deviation-to-find-outliers/
# Assump the constant is 1.
# It can be found that some count number has over 0, which means there are outlier in the dataset.
# We can find that in feature 558, 1240, 1704, 1809, 1860, 1883, 1900, 1969 have the outlier. Maybe we should delete them.




['558', '1240', '1704', '1809', '1860', '1883', '1900', '1969']


In [94]:
names = data_label_0.columns.tolist()
MAD = []
value = []
outlier_count = []
outlier_colomn_name = []
for i in names:
    temp = data_label_0.loc[:,i]
    temp_1 = abs(temp - np.median(temp))
    temp_2 = np.median(temp_1)
    MAD.append(round(temp_2,2))
    value_temp = round(temp_1 / temp_2,2)
    flag = 0
    
    for j in value_temp:
        if j > temp_2:
            flag += 1
    outlier_count.append(flag)
        
    if flag > 0:
        outlier_colomn_name.append(i)
            
print(outlier_colomn_name)

# We can also find that in label_0, having the same outlier in the same colomn in label_1

['558', '1240', '1704', '1809', '1860', '1883', '1900', '1969']


Range Xn - X1

In [95]:
names = data_label_1.columns.tolist()
collection = []
for i in names:
    temp = data_label_1.loc[:,i]
    collection.append(round(max(temp) - min(temp),2))

In [96]:
names = data_label_0.columns.tolist()
collection = []
for i in names:
    temp = data_label_0.loc[:,i]
    collection.append(round(max(temp) - min(temp),2))

interquartile range（IQR）

In [147]:
names = data_label_0.columns.tolist()
IQR = []
for i in names:
    temp = data_label_0.loc[:,i]
    IQR.append(round(np.percentile(temp, 75) - np.percentile(temp, 25),2))

In [98]:
names = data_label_1.columns.tolist()
IQR = []
for i in names:
    temp = data_label_1.loc[:,i]
    IQR.append(round(np.percentile(temp, 75) - np.percentile(temp, 25),2))

# Shape

sample skewness and sample kurtosis

In [99]:
print('Skewness:\n{}'.format(skew(data_label_1)[:-1])) # Get rid of label column 
print('Kurtosis:\n{}'.format(kurtosis(data_label_0)[:-1])) # Get rid of label column

Skewness:
[1.20995419 0.56073958 0.21065872 ... 1.43448216 1.68050172 1.84446222]
Kurtosis:
[-0.0733059  -0.63324882 -0.63758575 ...  1.19712934  2.51481495
 -0.2151123 ]


In [100]:
print('The number of right skewness distribution in data_label_1 is:{}'.format(sum(skew(data_label_1)[:-1] > 0)))
print('The number of right skewness distribution in data_label_0 is:{}'.format(sum(skew(data_label_0)[:-1] > 0)))
print('This means the dataset has a larger number of right skewed distribution, which illustrates that in large part of samples, there are lots of large positive outlier.')

The number of right skewness distribution in data_label_1 is:1827
The number of right skewness distribution in data_label_0 is:2000
This means the dataset has a larger number of right skewed distribution, which illustrates that in large part of samples, there are lots of large positive outlier.


In [101]:
print('The number of kurtosis in data_label_1 is:{}'.format(sum(kurtosis(data_label_1)[:-1] > 3)))
print('The number of kurtosis in data_label_0 is:{}'.format(sum(kurtosis(data_label_0)[:-1] > 3)))
print('This means there is a small number of datasets which have a higher peak than standard distribution')

The number of kurtosis in data_label_1 is:192
The number of kurtosis in data_label_0 is:538
This means there is a small number of datasets which have a higher peak than standard distribution


Galton's measure of skewness and robust kurtosis

In [102]:
names = data_label_1.columns.tolist()
GMS_1 = []
GMS_0 = []
flag_1 = 0
flag_0 = 0
for i in names:
    temp_1 = data_label_1.loc[:,i]
    temp_0 = data_label_0.loc[:,i]
    temp_3 = round((((np.percentile(temp_1, 75) - np.percentile(temp_1, 25)) - (np.percentile(temp_1, 50) - np.percentile(temp_1, 25))) / (np.percentile(temp_1, 75) - np.percentile(temp_1, 25))),2)
    temp_4 = round((((np.percentile(temp_0, 75) - np.percentile(temp_0, 25)) - (np.percentile(temp_0, 50) - np.percentile(temp_0, 25))) / (np.percentile(temp_0, 75) - np.percentile(temp_0, 25))),2)
    GMS_1.append(temp_3)
    GMS_0.append(temp_4)
    if temp_3 > 0:
        flag_1 += 1
    if temp_4 > 0:
        flag_0 += 1
print(flag_1)
print(flag_0)
# This means both label 1 or 0 is shown that the third quarter is larger than the range of the first quarter.
    

2000
2000


In [103]:
names = data_label_1.columns.tolist()
GMS_1 = []
GMS_0 = []
flag_1 = 0
flag_0 = 0
for i in names:
    temp_1 = data_label_1.loc[:,i]
    temp_0 = data_label_0.loc[:,i]
    temp_3 = round((((np.percentile(temp_1, 87.5) - np.percentile(temp_1, 62.5)) + (np.percentile(temp_1, 37.5) - np.percentile(temp_1, 12.5))) / (np.percentile(temp_1, 75) - np.percentile(temp_1, 25))),2)
    temp_4 = round((((np.percentile(temp_0, 87.5) - np.percentile(temp_0, 62.5)) + (np.percentile(temp_0, 37.5) - np.percentile(temp_0, 12.5))) / (np.percentile(temp_0, 75) - np.percentile(temp_0, 25))),2)
    GMS_1.append(temp_3)
    GMS_0.append(temp_4)
    if temp_3 > 3:
        flag_1 += 1
    if temp_4 > 3:
        flag_0 += 1
print(flag_1)
print(flag_0)

#From the link said if the kurtosis is over the standard value, that means it would contain the outlier.
#https://baike.baidu.com/item/%E5%B3%B0%E5%BA%A6/10840865?fr=aladdin

12
8


# Multivariate Measures

In [138]:
#correlation matrix is too large to store in here
data_label_1n = data_label_1.drop(['label'],axis = 1)
data_label_0n = data_label_0.drop(['label'],axis = 1)

names = data_label_1.columns.tolist()
del names[-1]
corr80 = []

for i in range(1999):
    
    for j in range(1999-i):
        
        if data_label_1n[names[i]].corr(data_label_1n[names[i+j+1]]) > 0.8:
            corr80.append([names[i],names[i+j+1]])
        
    
print(corr80)


[['0', '20'], ['0', '22'], ['0', '35'], ['0', '45'], ['0', '62'], ['0', '64'], ['0', '74'], ['0', '97'], ['0', '101'], ['0', '120'], ['0', '133'], ['0', '156'], ['0', '197'], ['0', '207'], ['0', '218'], ['0', '224'], ['0', '269'], ['0', '272'], ['0', '280'], ['0', '281'], ['0', '283'], ['0', '294'], ['0', '304'], ['0', '317'], ['0', '320'], ['0', '343'], ['0', '345'], ['0', '360'], ['0', '382'], ['0', '390'], ['0', '427'], ['0', '445'], ['0', '446'], ['0', '474'], ['0', '482'], ['0', '488'], ['0', '498'], ['0', '502'], ['0', '507'], ['0', '519'], ['0', '526'], ['0', '528'], ['0', '620'], ['0', '624'], ['0', '628'], ['0', '665'], ['0', '668'], ['0', '673'], ['0', '698'], ['0', '721'], ['0', '738'], ['0', '739'], ['0', '747'], ['0', '757'], ['0', '767'], ['0', '783'], ['0', '795'], ['0', '817'], ['0', '822'], ['0', '833'], ['0', '845'], ['0', '885'], ['0', '938'], ['0', '947'], ['0', '959'], ['0', '978'], ['0', '981'], ['0', '992'], ['0', '997'], ['0', '1012'], ['0', '1025'], ['0', '1048

# Outlier Detection and Removal

In [165]:
data_label_1n = data_label_1.drop(['label'],axis = 1)
data_label_0n = data_label_0.drop(['label'],axis = 1)

names = data_label_1n.columns.tolist()

count_num_1 = 0
count_num_0 = 0
for i in names:
    temp_1 = data_label_1n.loc[:,i]
    temp_0 = data_label_0n.loc[:,i]

    Lower_1 = np.percentile(temp_1,25) - 1.5 * (np.percentile(temp_1,75) - np.percentile(temp_1,25))
    Upper_1 = np.percentile(temp_1,75) + 1.5 * (np.percentile(temp_1,75) - np.percentile(temp_1,25))
    
    Lower_0 = np.percentile(temp_0,25) - 1.5 * (np.percentile(temp_0,75) - np.percentile(temp_0,25))
    Upper_0 = np.percentile(temp_0,75) + 1.5 * (np.percentile(temp_0,75) - np.percentile(temp_0,25))
    
    for j in temp_1:
        if (j < Lower_1) | (j > Upper_1):
            count_num_1 += 1
            break
    
    for z in temp_0:
        if (z < Lower_0) | (z > Upper_0):
            count_num_0 += 1
            break
    
print(count_num_1)
print(count_num_0)

983
1740
