In [2]:
import pandas as pd
import numpy as np
import scipy.stats as stats
from scipy.stats import chi2_contingency

In [20]:
class ChiSquare:
    """Chi-square test of independence between two columns of a DataFrame.

    The most recent test's results are kept on the instance:

    * ``p``          -- p-value of the test
    * ``chi2``       -- chi-square test statistic
    * ``dof``        -- degrees of freedom
    * ``dfObserved`` -- observed contingency table (rows: colY, columns: colX)
    * ``dfExpected`` -- expected frequencies, labelled like ``dfObserved``

    All of them are ``None`` until ``TestIndependence`` has been called.
    """

    def __init__(self, dataframe):
        """Store the frame whose columns will be tested."""
        self.df = dataframe

        # Outputs of the last test run.
        self.p = None
        self.chi2 = None
        self.dof = None

        # Contingency tables of the last test run.
        self.dfObserved = None
        self.dfExpected = None

    def _print_chisquare_result(self, colX, alpha):
        """Print a one-line verdict for ``colX`` by comparing ``self.p`` with ``alpha``."""
        if self.p < alpha:
            template = "{0} is IMPORTANT for Prediction"
        else:
            template = "{0} is NOT an important predictor. (Discard {0} from model)"
        print(template.format(colX))

    def TestIndependence(self, colX, colY, alpha=0.01):
        """Run the chi-square test of ``colX`` against ``colY`` and print the verdict.

        Args:
            colX: name of the candidate feature column.
            colY: name of the target column.
            alpha: significance threshold the p-value is compared against.

        Both columns are cast to ``str`` before cross-tabulation, so every
        distinct value (including the string form of a missing value, e.g.
        ``"nan"``) is treated as its own category.
        """
        feature = self.df[colX].astype(str)
        target = self.df[colY].astype(str)

        # Observed counts: one row per target label, one column per feature label.
        observed = pd.crosstab(target, feature)
        self.dfObserved = observed

        statistic, p_value, degrees, expected_counts = stats.chi2_contingency(observed.values)
        self.p = p_value
        self.chi2 = statistic
        self.dof = degrees

        # Re-attach the crosstab's labels to the raw expected-frequency array.
        self.dfExpected = pd.DataFrame(
            expected_counts,
            columns=observed.columns,
            index=observed.index,
        )

        self._print_chisquare_result(colX, alpha)

In [6]:
# Raise pandas' display limits so long/wide frames render in full in the notebook.
pd.set_option("display.max_rows", 150)
pd.set_option("display.max_columns", 250)

# Load the training data (path is relative to the notebook's working directory).
train = pd.read_csv("train.csv")

In [16]:
# Names of the columns treated as categorical features, grouped by family.
CATEGORICAL_COLUMNS = [
    # Product_Info_*
    "Product_Info_1", "Product_Info_2", "Product_Info_3",
    "Product_Info_5", "Product_Info_6", "Product_Info_7",
    # Employment_Info_*
    "Employment_Info_2", "Employment_Info_3", "Employment_Info_5",
    # InsuredInfo_*
    "InsuredInfo_1", "InsuredInfo_2", "InsuredInfo_3", "InsuredInfo_4",
    "InsuredInfo_5", "InsuredInfo_6", "InsuredInfo_7",
    # Insurance_History_*
    "Insurance_History_1", "Insurance_History_2", "Insurance_History_3",
    "Insurance_History_4", "Insurance_History_7", "Insurance_History_8",
    "Insurance_History_9",
    # Family_Hist_*
    "Family_Hist_1",
    # Medical_History_*
    "Medical_History_2", "Medical_History_3", "Medical_History_4",
    "Medical_History_5", "Medical_History_6", "Medical_History_7",
    "Medical_History_8", "Medical_History_9", "Medical_History_11",
    "Medical_History_12", "Medical_History_13", "Medical_History_14",
    "Medical_History_16", "Medical_History_17", "Medical_History_18",
    "Medical_History_19", "Medical_History_20", "Medical_History_21",
    "Medical_History_22", "Medical_History_23", "Medical_History_25",
    "Medical_History_26", "Medical_History_27", "Medical_History_28",
    "Medical_History_29", "Medical_History_30", "Medical_History_31",
    "Medical_History_33", "Medical_History_34", "Medical_History_35",
    "Medical_History_36", "Medical_History_37", "Medical_History_38",
    "Medical_History_39", "Medical_History_40", "Medical_History_41",
]

# Medical_Keyword_1 .. Medical_Keyword_48
DUMMY_COLUMNS = ["Medical_Keyword_" + str(number) for number in range(1, 49)]

# Column subsets of the training frame: the columns named in
# CATEGORICAL_COLUMNS and the Medical_Keyword_* columns in DUMMY_COLUMNS.
# NOTE(review): neither subset is referenced by the cells visible in this
# part of the notebook -- confirm they are used further down.
categorical_data = train[CATEGORICAL_COLUMNS]
dummy_data = train[DUMMY_COLUMNS]


In [17]:
# Build the chi-square tester on top of the full training frame.
cT = ChiSquare(dataframe=train)

In [18]:
# Feature selection: test every categorical column for independence from the
# target column "Response". TestIndependence prints one verdict per column,
# using its default significance level.
for var in CATEGORICAL_COLUMNS:
    cT.TestIndependence(colX=var, colY="Response")

Product_Info_1 is IMPORTANT for Prediction
Product_Info_2 is IMPORTANT for Prediction
Product_Info_3 is IMPORTANT for Prediction
Product_Info_5 is IMPORTANT for Prediction
Product_Info_6 is IMPORTANT for Prediction
Product_Info_7 is IMPORTANT for Prediction
Employment_Info_2 is IMPORTANT for Prediction
Employment_Info_3 is IMPORTANT for Prediction
Employment_Info_5 is IMPORTANT for Prediction
InsuredInfo_1 is IMPORTANT for Prediction
InsuredInfo_2 is IMPORTANT for Prediction
InsuredInfo_3 is IMPORTANT for Prediction
InsuredInfo_4 is IMPORTANT for Prediction
InsuredInfo_5 is IMPORTANT for Prediction
InsuredInfo_6 is IMPORTANT for Prediction
InsuredInfo_7 is IMPORTANT for Prediction
Insurance_History_1 is IMPORTANT for Prediction
Insurance_History_2 is IMPORTANT for Prediction
Insurance_History_3 is IMPORTANT for Prediction
Insurance_History_4 is IMPORTANT for Prediction
Insurance_History_7 is IMPORTANT for Prediction
Insurance_History_8 is IMPORTANT for Prediction
Insurance_History_9 is

In [19]:
# Feature selection: test every Medical_Keyword_* column for independence
# from the target column "Response". TestIndependence prints one verdict per
# column, using its default significance level.
for var in DUMMY_COLUMNS:
    cT.TestIndependence(colX=var, colY="Response")

Medical_Keyword_1 is IMPORTANT for Prediction
Medical_Keyword_2 is IMPORTANT for Prediction
Medical_Keyword_3 is IMPORTANT for Prediction
Medical_Keyword_4 is IMPORTANT for Prediction
Medical_Keyword_5 is IMPORTANT for Prediction
Medical_Keyword_6 is IMPORTANT for Prediction
Medical_Keyword_7 is IMPORTANT for Prediction
Medical_Keyword_8 is IMPORTANT for Prediction
Medical_Keyword_9 is IMPORTANT for Prediction
Medical_Keyword_10 is IMPORTANT for Prediction
Medical_Keyword_11 is IMPORTANT for Prediction
Medical_Keyword_12 is IMPORTANT for Prediction
Medical_Keyword_13 is IMPORTANT for Prediction
Medical_Keyword_14 is IMPORTANT for Prediction
Medical_Keyword_15 is IMPORTANT for Prediction
Medical_Keyword_16 is IMPORTANT for Prediction
Medical_Keyword_17 is IMPORTANT for Prediction
Medical_Keyword_18 is IMPORTANT for Prediction
Medical_Keyword_19 is IMPORTANT for Prediction
Medical_Keyword_20 is IMPORTANT for Prediction
Medical_Keyword_21 is IMPORTANT for Prediction
Medical_Keyword_22 is 