In [15]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine
from scipy.stats.stats import pearsonr

In [16]:
# Load the training set from train.csv in the working directory.
# The trailing expression displays (n_rows, n_columns) as the cell output.
trainDF = pd.read_csv("train.csv")
trainDF.shape

(217, 2603)

In [17]:
# Load the test set from test.csv in the working directory.
# The trailing expression displays (n_rows, n_columns) as the cell output.
testDF = pd.read_csv("test.csv")
testDF.shape

(55, 2603)

In [18]:
# Split the training frame by column position:
#   X_train -> every column except the last one (the candidate descriptors)
#   Class   -> the last column (the target / response variable)
X_train = trainDF.iloc[:, :-1]
Class = trainDF.iloc[:, -1]

In [19]:
# Drop constant descriptors: keep only the columns whose sample variance
# is not exactly zero, then show the resulting shape.
X_train = X_train.loc[:, X_train.var().ne(0.0)]
X_train.shape

(217, 2153)

In [20]:
# Drop low-spread descriptors: keep only the columns whose sample standard
# deviation is strictly greater than 0.95, then show the resulting shape.
# NOTE(review): the threshold is compared against the values as loaded, so it
# depends on each descriptor's scale - confirm this is intended.
X_train = X_train.loc[:, X_train.std().gt(0.95)]
X_train.shape

(217, 950)

In [21]:
# REMOVE DESCRIPTOR WITH SIMILAR VALUE MORE THAN 50%
# A descriptor is dropped when any single value occurs in more than `red_lim`
# rows, i.e. when one value dominates more than `lim_` of the training set.
num_desc = X_train.shape[1]
lim_ = 0.50
red_lim = int(lim_ * X_train.shape[0])  # largest allowed count for one value
label_ = X_train.columns.values
# value_counts() gives the frequency of every distinct (non-NaN) value, so its
# maximum is the count of the most common value. An all-NaN column yields NaN,
# which compares False against the limit and is therefore kept.
rem_idx = [
    i for i in range(num_desc)
    if not (X_train.iloc[:, i].value_counts().max() > red_lim)
]
label_idx = label_[rem_idx]
X_train = X_train.loc[:,label_idx]
print("descriptor number after removing descriptor with similar value more than {}%: {}".format((lim_*100),X_train.shape[1]))

descriptor number after removing descriptor with similar value more than 50.0%: 854


In [22]:
# Absolute Pearson correlation coefficient between each remaining descriptor
# (taken by column position) and the target; pearsonr returns (r, p-value),
# only r is used.
corr_y = [
    np.abs(pearsonr(X_train.iloc[:, col], Class)[0])
    for col in range(X_train.shape[1])
]

In [23]:
# Column positions whose absolute correlation with the target is
# strictly greater than 0.20.
idx_ = [pos for pos, r_abs in enumerate(corr_y) if r_abs > 0.20]

In [24]:
# Keep only the columns at the positions collected in idx_, then display
# the resulting (n_rows, n_columns).
X_train = X_train.iloc[:,idx_]
X_train.shape

(217, 22)

In [25]:
# -----------REMOVE FEATURE WITH LOW CORRELATION TO Y RESPONSE (Pearson < 0.1)---------------------
# Keeps the descriptors whose absolute Pearson r with the target is strictly
# greater than corr_lim. In the recorded run the descriptors had already been
# cut at |r| > 0.20 beforehand, so this step left the count unchanged (22).
label_ = X_train.columns.values
pic50 = Class
corr_y = [
    np.abs(pearsonr(X_train.iloc[:, col], pic50)[0])
    for col in range(X_train.shape[1])
]
corr_lim = 0.10
hi_corr = [pos for pos, r_abs in enumerate(corr_y) if r_abs > corr_lim]
label_idx = label_[hi_corr]
X_train = X_train.loc[:, label_idx]
print("descriptor number after removing low correlation with target: {}".format(X_train.shape[1]))

descriptor number after removing low correlation with target: 22


In [26]:
# -----------REMOVE FEATURE WITH HIGH CORRELATION TO OTHER FEATURE (Pearson > 0.9)---------------------
# For every pair of descriptors whose absolute mutual correlation reaches
# corr_lim, discard the one that is less correlated with the target
# (on a tie the first descriptor of the pair is discarded).
label_ = X_train.columns.values
desc_num = X_train.shape[1]
# absolute correlation of each descriptor with the target (pic50)
corr_y = [
    np.abs(pearsonr(X_train.iloc[:, col], Class)[0])
    for col in range(desc_num)
]
# descriptor-vs-descriptor correlation matrix (descriptors as rows after .T)
corr_matrix = np.corrcoef(X_train.T)
corr_lim = 0.90
# NOTE(review): a descriptor that has already been discarded still takes part
# in later pair comparisons and can therefore eliminate other descriptors -
# confirm this greedy behaviour is intended.
discarded = set()
for i in range(desc_num):
    for j in range(i + 1, desc_num):  # each unordered pair is visited once
        if np.abs(corr_matrix[i, j]) >= corr_lim:
            discarded.add(j if corr_y[i] > corr_y[j] else i)
# surviving positions, in original column order
low_corr = [pos for pos in range(desc_num) if pos not in discarded]
label_idx = label_[low_corr]
X_train = X_train.loc[:, label_idx]
print("descriptor number after removing high correlation descriptor: {}".format(X_train.shape[1]))

descriptor number after removing high correlation descriptor: 11


In [27]:
# Column labels of the descriptors still present in X_train after the
# filtering steps above; the trailing expression displays them.
label = X_train.columns
label

Index(['ATSC2m', 'AATSC0v', 'SM1_Dzs', 'n6Ring', 'AATS1dv', 'AATS2dv',
       'C1SP2.1', 'C2SP2.1', 'SaasC.1', 'SMR_VSA5', 'SRW07'],
      dtype='object')

In [28]:
# Store the selected descriptor labels in label_stat.pkl.
# The context manager closes the file even if pickle.dump raises, which the
# previous manual open()/close() pair did not guarantee.
with open('label_stat.pkl', 'wb') as f:
    pickle.dump((label), f)