In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

In [2]:
class Data:
    """
    Load the data categorize the columns to categorical and numerical
    ColumnInfo
    Missing Values
    """
    def __init__(self, filepath: str = None, targetcol: str = None):
        self.data = pd.read_csv(filepath)
        self.data = self.data.drop_duplicates().reset_index()
        self.targetcol = targetcol

    def get_numericals(self):
        self.num_cols = list(self.data.dtypes[
            (self.data.dtypes == 'int64') | (self.data.dtypes == 'float64')| 
            (self.data.dtypes == 'int32') | (self.data.dtypes == 'float32')|
            (self.data.dtypes == 'int16') | (self.data.dtypes == 'float16')|
            (self.data.dtypes == 'int8')].keys())
        if self.targetcol in self.num_cols:
            self.num_cols.remove(self.targetcol)
        self.date_cols = list(self.data.dtypes[
            (self.data.dtypes == 'datetime64[ns]') | (self.data.dtypes == 'timedelta64[ns]')].keys())
        
        self.cat_cols = list(set(self.data.columns).difference(self.num_cols))
        self.cat_cols = list(set(self.cat_cols).difference(self.date_cols)) 
        if self.targetcol in self.cat_cols:
            self.cat_cols.remove(self.targetcol)
        return self.num_cols, self.cat_cols, self.date_cols

    def get_inputs(self):
        num_cols, cat_cols, date_cols = self.get_numericals()
        return num_cols, self.targetcol, self.data, cat_cols, date_cols
    
    def get_missing_percent(self):
        null_percentage = {}
        for i in self.data.columns:
            c = 0
            indices = []
            for j in ['np.nan', np.nan, 'nan', 'Null','NULL', None, 'None', '?', '-', 'Nan']:
                c += len(self.data[self.data[i]== j])
                indices += list(self.data[self.data[i]== j].index)
                c += len(set(self.data[self.data[i].isnull()].index).difference(indices))
                null_percentage[i] = str(round((c/len(self.data))*100,2))+'%'
        return null_percentage

In [3]:
def correlation_ratio(data:pd.DataFrame=None, cat_cols:list=None, numeric_col:str=None):
    """
    Correlation ratio is a symmetric grouping based method that describe the level of correlation between
    a numeric variable and a categorical variable. returns a value in [0,1].
    args:
        data: whole data DataFrame
        cat_cols:  List
        numeric_col: str
    return:
        corr_ratio: correlation ratio
    The limit corr_ratio=0 represents no dispersion among the means of the different categories 
    corr_ratio =1 refers to no dispersion within the respective categories.
    """
    cat_corr_numeric = {}
    for i in cat_cols:
        try:
            overall_mean = np.mean(data[numeric_col])
            temp_df = data[[i,numeric_col]].groupby([i]).agg(['count','mean'])
            temp_df.columns = ["_".join(x) for x in temp_df.columns.ravel()]
            numerator = np.sum(temp_df[numeric_col+'_count']*(temp_df[numeric_col+'_mean']-overall_mean)**2)
            denominator = np.var(data[numeric_col])*len(data)
            corr_ratio = np.sqrt(numerator/denominator)
        except:
            corr_ratio = 0
        cat_corr_numeric[i+'_&_'+numeric_col] = corr_ratio
    return cat_corr_numeric

In [4]:
dataobj = Data("/Users/lingrajsvannur/Desktop/DrugDiscovery/propertyprediction/mordred1826_caco2_wang.tab.csv",'property')
numericals, categoricals, dates = dataobj.get_numericals()
num_cols, targetcol, data, cat_cols, date_cols = dataobj.get_inputs()
column_info = {"numericals":num_cols,"categoricals":cat_cols,"date":date_cols}
print(column_info)
dataobj.get_missing_percent()

num_corr = data.corr()
num_corr

{'numericals': ['index', 'ABC', 'ABCGG', 'nAcid', 'nBase', 'SpAbs_A', 'SpMax_A', 'SpDiam_A', 'SpAD_A', 'SpMAD_A', 'LogEE_A', 'VE1_A', 'VE2_A', 'VE3_A', 'VR1_A', 'VR2_A', 'VR3_A', 'nAromAtom', 'nAromBond', 'nAtom', 'nHeavyAtom', 'nSpiro', 'nBridgehead', 'nHetero', 'nH', 'nB', 'nC', 'nN', 'nO', 'nS', 'nP', 'nF', 'nCl', 'nBr', 'nI', 'nX', 'ATS0dv', 'ATS1dv', 'ATS2dv', 'ATS3dv', 'ATS4dv', 'ATS5dv', 'ATS6dv', 'ATS7dv', 'ATS8dv', 'ATS0d', 'ATS1d', 'ATS2d', 'ATS3d', 'ATS4d', 'ATS5d', 'ATS6d', 'ATS7d', 'ATS8d', 'ATS0s', 'ATS1s', 'ATS2s', 'ATS3s', 'ATS4s', 'ATS5s', 'ATS6s', 'ATS7s', 'ATS8s', 'ATS0Z', 'ATS1Z', 'ATS2Z', 'ATS3Z', 'ATS4Z', 'ATS5Z', 'ATS6Z', 'ATS7Z', 'ATS8Z', 'ATS0m', 'ATS1m', 'ATS2m', 'ATS3m', 'ATS4m', 'ATS5m', 'ATS6m', 'ATS7m', 'ATS8m', 'ATS0v', 'ATS1v', 'ATS2v', 'ATS3v', 'ATS4v', 'ATS5v', 'ATS6v', 'ATS7v', 'ATS8v', 'ATS0se', 'ATS1se', 'ATS2se', 'ATS3se', 'ATS4se', 'ATS5se', 'ATS6se', 'ATS7se', 'ATS8se', 'ATS0pe', 'ATS1pe', 'ATS2pe', 'ATS3pe', 'ATS4pe', 'ATS5pe', 'ATS6pe', 'ATS7pe

Unnamed: 0,index,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,...,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2,property
index,1.000000,0.122221,0.100485,-0.014273,-0.062070,0.126356,0.037530,0.031524,0.126356,0.142498,...,0.135438,0.099546,-0.055725,-0.103764,0.098032,0.125016,0.125442,-0.007616,0.109986,-0.009426
ABC,0.122221,1.000000,0.983307,0.036261,0.028097,0.995252,0.495895,0.512840,0.995252,0.229744,...,0.903180,0.991173,-0.109691,0.001536,0.952253,0.994940,0.981044,0.873340,0.988105,-0.355469
ABCGG,0.100485,0.983307,1.000000,0.027176,-0.014263,0.973473,0.507460,0.521184,0.973473,0.141403,...,0.901674,0.984824,-0.095805,0.002761,0.944737,0.978165,0.964585,0.917222,0.977815,-0.375327
nAcid,-0.014273,0.036261,0.027176,1.000000,0.065946,0.035752,0.033329,0.032666,0.035752,-0.049325,...,0.028658,0.033362,0.078371,0.086682,0.027115,0.034601,0.034109,0.023000,0.027777,-0.251916
nBase,-0.062070,0.028097,-0.014263,0.065946,1.000000,0.044432,-0.072970,-0.096614,0.044432,0.108822,...,0.019597,0.021439,-0.076074,0.007219,-0.025991,0.012223,-0.002797,-0.076319,0.036344,-0.214312
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zagreb1,0.125016,0.994940,0.978165,0.034601,0.012223,0.987061,0.557963,0.572101,0.987061,0.255008,...,0.914635,0.980584,-0.094094,0.004450,0.971871,1.000000,0.995436,0.861619,0.970589,-0.345345
Zagreb2,0.125442,0.981044,0.964585,0.034109,-0.002797,0.971085,0.611562,0.623341,0.971085,0.280297,...,0.915512,0.961346,-0.085211,0.007285,0.983371,0.995436,1.000000,0.843460,0.946507,-0.332678
mZagreb1,-0.007616,0.873340,0.917222,0.023000,-0.076319,0.855239,0.370745,0.404250,0.855239,-0.168892,...,0.735299,0.903841,-0.125242,-0.008610,0.870099,0.861619,0.843460,1.000000,0.891217,-0.380565
mZagreb2,0.109986,0.988105,0.977815,0.027777,0.036344,0.989022,0.403484,0.425372,0.989022,0.175150,...,0.866005,0.989760,-0.153569,-0.003175,0.920789,0.970589,0.946507,0.891217,1.000000,-0.359777


In [5]:
corr_matrix = data.corr()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.98)]
print("columns having high correlation:",to_drop)

columns having high correlation: ['ABCGG', 'SpAbs_A', 'SpAD_A', 'VE3_A', 'VR2_A', 'nAromBond', 'nHeavyAtom', 'ATS6dv', 'ATS7dv', 'ATS0d', 'ATS1d', 'ATS2d', 'ATS3d', 'ATS4d', 'ATS5d', 'ATS6d', 'ATS7d', 'ATS8d', 'ATS1Z', 'ATS2Z', 'ATS3Z', 'ATS4Z', 'ATS5Z', 'ATS6Z', 'ATS7Z', 'ATS8Z', 'ATS0m', 'ATS1m', 'ATS2m', 'ATS3m', 'ATS4m', 'ATS5m', 'ATS6m', 'ATS7m', 'ATS8m', 'ATS0v', 'ATS1v', 'ATS2v', 'ATS3v', 'ATS4v', 'ATS5v', 'ATS6v', 'ATS7v', 'ATS8v', 'ATS0se', 'ATS1se', 'ATS2se', 'ATS3se', 'ATS4se', 'ATS5se', 'ATS6se', 'ATS7se', 'ATS8se', 'ATS0pe', 'ATS1pe', 'ATS2pe', 'ATS3pe', 'ATS4pe', 'ATS5pe', 'ATS6pe', 'ATS7pe', 'ATS8pe', 'ATS0are', 'ATS1are', 'ATS2are', 'ATS3are', 'ATS4are', 'ATS5are', 'ATS6are', 'ATS7are', 'ATS8are', 'ATS1p', 'ATS2p', 'ATS3p', 'ATS4p', 'ATS5p', 'ATS6p', 'ATS7p', 'ATS8p', 'ATS0i', 'ATS1i', 'ATS2i', 'ATS3i', 'ATS4i', 'ATS5i', 'ATS6i', 'ATS7i', 'ATS8i', 'AATS0m', 'AATS1m', 'AATS2m', 'AATS3m', 'AATS4m', 'AATS5m', 'AATS6m', 'AATS7m', 'AATS8m', 'AATS0pe', 'AATS1pe', 'AATS3pe', '

In [9]:
len(to_drop)

507

In [6]:
dispersion_cat = correlation_ratio(data,cat_cols,targetcol)
dispersion_cat

{}

In [None]:
def uncertainty_coeff():
    