In [39]:
import pandas as pd
import numpy as np
np.random.seed(42)

data = {
    'NumericalFeature1': np.random.randint(0, 100, 100),
    'NumericalFeature2': np.random.randn(100) * 10 + 50,
    'CategoricalFeature1': np.random.choice(['A', 'B', 'C'], 100),
    'CategoricalFeature2': np.random.choice(['X', 'Y', 'Z'], 100),
    'TargetVariable': np.random.choice([0, 1], 100)
}
df = pd.DataFrame(data)
df

Unnamed: 0,NumericalFeature1,NumericalFeature2,CategoricalFeature1,CategoricalFeature2,TargetVariable
0,51,36.533219,B,Y,0
1,92,41.194087,A,Z,0
2,14,38.694477,A,Y,0
3,71,51.344289,B,Z,1
4,60,55.821228,A,Z,1
...,...,...,...,...,...
95,84,48.390626,B,X,1
96,79,46.117356,C,Z,0
97,81,41.144876,B,Z,1
98,52,46.432550,B,X,1


In [40]:
from tableone import TableOne
cat = ['CategoricalFeature1', 'CategoricalFeature2']
cont = ['NumericalFeature1', 'NumericalFeature2']
tableone = TableOne(df, categorical=cat, continuous=cont, groupby='TargetVariable', sort='P-Value', pval=True)
print(tableone.tabulate(tablefmt="fancy_grid"))

╒══════════════════════════════╤════╤═══════════╤═════════════╤═════════════╤═════════════╤═══════════╕
│                              │    │ Missing   │ Overall     │ 0           │ 1           │ P-Value   │
╞══════════════════════════════╪════╪═══════════╪═════════════╪═════════════╪═════════════╪═══════════╡
│ n                            │    │           │ 100         │ 45          │ 55          │           │
├──────────────────────────────┼────┼───────────┼─────────────┼─────────────┼─────────────┼───────────┤
│ NumericalFeature2, mean (SD) │    │ 0         │ 50.0 (9.3)  │ 46.6 (8.9)  │ 52.8 (8.8)  │ 0.001     │
├──────────────────────────────┼────┼───────────┼─────────────┼─────────────┼─────────────┼───────────┤
│ NumericalFeature1, mean (SD) │    │ 0         │ 50.5 (29.4) │ 47.6 (31.2) │ 53.0 (28.0) │ 0.367     │
├──────────────────────────────┼────┼───────────┼─────────────┼─────────────┼─────────────┼───────────┤
│ CategoricalFeature1, n (%)   │ A  │           │ 39 (39.0)   │ 

In [41]:
# label encode the categorical features and create a new df
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df_encoded = df.copy()
for col in cat:
    df_encoded[col] = le.fit_transform(df[col])
tableone = TableOne(df_encoded, categorical=cat, continuous=cont, groupby='TargetVariable', sort='P-Value', pval=True)
print(tableone.tabulate(tablefmt="fancy_grid"))

╒══════════════════════════════╤════╤═══════════╤═════════════╤═════════════╤═════════════╤═══════════╕
│                              │    │ Missing   │ Overall     │ 0           │ 1           │ P-Value   │
╞══════════════════════════════╪════╪═══════════╪═════════════╪═════════════╪═════════════╪═══════════╡
│ n                            │    │           │ 100         │ 45          │ 55          │           │
├──────────────────────────────┼────┼───────────┼─────────────┼─────────────┼─────────────┼───────────┤
│ NumericalFeature2, mean (SD) │    │ 0         │ 50.0 (9.3)  │ 46.6 (8.9)  │ 52.8 (8.8)  │ 0.001     │
├──────────────────────────────┼────┼───────────┼─────────────┼─────────────┼─────────────┼───────────┤
│ NumericalFeature1, mean (SD) │    │ 0         │ 50.5 (29.4) │ 47.6 (31.2) │ 53.0 (28.0) │ 0.367     │
├──────────────────────────────┼────┼───────────┼─────────────┼─────────────┼─────────────┼───────────┤
│ CategoricalFeature1, n (%)   │ 0  │           │ 39 (39.0)   │ 

# label encoding does not change p-value

In [43]:
tableone = TableOne(df_encoded, groupby='TargetVariable', sort='P-Value', pval=True)
print(tableone.tabulate(tablefmt="fancy_grid"))

╒════════════════════════════════╤════╤═══════════╤═════════════╤═════════════╤═════════════╤═══════════╕
│                                │    │ Missing   │ Overall     │ 0           │ 1           │ P-Value   │
╞════════════════════════════════╪════╪═══════════╪═════════════╪═════════════╪═════════════╪═══════════╡
│ n                              │    │           │ 100         │ 45          │ 55          │           │
├────────────────────────────────┼────┼───────────┼─────────────┼─────────────┼─────────────┼───────────┤
│ NumericalFeature2, mean (SD)   │    │ 0         │ 50.0 (9.3)  │ 46.6 (8.9)  │ 52.8 (8.8)  │ 0.001     │
├────────────────────────────────┼────┼───────────┼─────────────┼─────────────┼─────────────┼───────────┤
│ NumericalFeature1, mean (SD)   │    │ 0         │ 50.5 (29.4) │ 47.6 (31.2) │ 53.0 (28.0) │ 0.367     │
├────────────────────────────────┼────┼───────────┼─────────────┼─────────────┼─────────────┼───────────┤
│ CategoricalFeature1, mean (SD) │    │ 0     

# not specificying cat/cont variables *does* change p-value (if encoding)

In [32]:
import statsmodels.api as sm
sm_cols = ['C(' + col + ')' for col in cat] + cont
formula = 'TargetVariable ~ ' + ' + '.join(sm_cols)
print(sm.formula.logit(formula, data=df).fit().summary())
print(sm.formula.logit(formula, data=df_encoded).fit().summary())

Optimization terminated successfully.
         Current function value: 0.619175
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:         TargetVariable   No. Observations:                  100
Model:                          Logit   Df Residuals:                       93
Method:                           MLE   Df Model:                            6
Date:                Thu, 29 Aug 2024   Pseudo R-squ.:                  0.1002
Time:                        16:25:15   Log-Likelihood:                -61.917
converged:                       True   LL-Null:                       -68.814
Covariance Type:            nonrobust   LLR p-value:                   0.03204
                                  coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
Intercept                      -4.8439      1.512     -3.204      0.001   

# using sm formula on label encoded vs non-encoded does not change p-value

In [36]:
X = df_encoded.drop('TargetVariable', axis=1)
Xsm = sm.add_constant(X)
y = df_encoded['TargetVariable']
sm.Logit(y, Xsm).fit().summary()

Optimization terminated successfully.
         Current function value: 0.619794
         Iterations 5


0,1,2,3
Dep. Variable:,TargetVariable,No. Observations:,100.0
Model:,Logit,Df Residuals:,95.0
Method:,MLE,Df Model:,4.0
Date:,"Thu, 29 Aug 2024",Pseudo R-squ.:,0.09932
Time:,16:28:15,Log-Likelihood:,-61.979
converged:,True,LL-Null:,-68.814
Covariance Type:,nonrobust,LLR p-value:,0.00843

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-4.7639,1.490,-3.197,0.001,-7.684,-1.843
NumericalFeature1,0.0066,0.007,0.886,0.376,-0.008,0.021
NumericalFeature2,0.0847,0.027,3.176,0.001,0.032,0.137
CategoricalFeature1,0.1842,0.269,0.686,0.493,-0.342,0.711
CategoricalFeature2,0.2631,0.285,0.924,0.356,-0.295,0.821


# using sm formula vs non-formula does slightly alter p-value

============================================================
# TEST WITH ONE HOT ENCODING
============================================================

In [None]:
import pandas as pd
import numpy as np
np.random.seed(42)

data = {
    'NumericalFeature1': np.random.randint(0, 100, 100),
    'NumericalFeature2': np.random.randn(100) * 10 + 50,
    'CategoricalFeature1': np.random.choice(['A', 'B', 'C'], 100),
    'CategoricalFeature2': np.random.choice(['X', 'Y', 'Z'], 100),
    'TargetVariable': np.random.choice([0, 1], 100)
}
df = pd.DataFrame(data)
df

Unnamed: 0,NumericalFeature1,NumericalFeature2,CategoricalFeature1,CategoricalFeature2,TargetVariable
0,51,36.533219,B,Y,0
1,92,41.194087,A,Z,0
2,14,38.694477,A,Y,0
3,71,51.344289,B,Z,1
4,60,55.821228,A,Z,1
...,...,...,...,...,...
95,84,48.390626,B,X,1
96,79,46.117356,C,Z,0
97,81,41.144876,B,Z,1
98,52,46.432550,B,X,1


In [None]:
from tableone import TableOne
cat = ['CategoricalFeature1', 'CategoricalFeature2']
cont = ['NumericalFeature1', 'NumericalFeature2']
tableone = TableOne(df, categorical=cat, continuous=cont, groupby='TargetVariable', sort='P-Value', pval=True)
print(tableone.tabulate(tablefmt="fancy_grid"))

╒══════════════════════════════╤════╤═══════════╤═════════════╤═════════════╤═════════════╤═══════════╕
│                              │    │ Missing   │ Overall     │ 0           │ 1           │ P-Value   │
╞══════════════════════════════╪════╪═══════════╪═════════════╪═════════════╪═════════════╪═══════════╡
│ n                            │    │           │ 100         │ 45          │ 55          │           │
├──────────────────────────────┼────┼───────────┼─────────────┼─────────────┼─────────────┼───────────┤
│ NumericalFeature2, mean (SD) │    │ 0         │ 50.0 (9.3)  │ 46.6 (8.9)  │ 52.8 (8.8)  │ 0.001     │
├──────────────────────────────┼────┼───────────┼─────────────┼─────────────┼─────────────┼───────────┤
│ NumericalFeature1, mean (SD) │    │ 0         │ 50.5 (29.4) │ 47.6 (31.2) │ 53.0 (28.0) │ 0.367     │
├──────────────────────────────┼────┼───────────┼─────────────┼─────────────┼─────────────┼───────────┤
│ CategoricalFeature1, n (%)   │ A  │           │ 39 (39.0)   │ 

In [None]:
# one hot encode the categorical features (as integers) and create a new df
df_encoded = pd.get_dummies(df, columns=cat)
cat_encoded = [c for c in df_encoded.columns if c not in cont + ['TargetVariable']]
df_encoded[cat_encoded] = df_encoded[cat_encoded].astype(int)
tableone = TableOne(df_encoded, categorical=cat_encoded, continuous=cont, groupby='TargetVariable', sort='P-Value', pval=True)
print(tableone.tabulate(tablefmt="fancy_grid"))

╒══════════════════════════════╤════╤═══════════╤═════════════╤═════════════╤═════════════╤═══════════╕
│                              │    │ Missing   │ Overall     │ 0           │ 1           │ P-Value   │
╞══════════════════════════════╪════╪═══════════╪═════════════╪═════════════╪═════════════╪═══════════╡
│ n                            │    │           │ 100         │ 45          │ 55          │           │
├──────────────────────────────┼────┼───────────┼─────────────┼─────────────┼─────────────┼───────────┤
│ NumericalFeature2, mean (SD) │    │ 0         │ 50.0 (9.3)  │ 46.6 (8.9)  │ 52.8 (8.8)  │ 0.001     │
├──────────────────────────────┼────┼───────────┼─────────────┼─────────────┼─────────────┼───────────┤
│ NumericalFeature1, mean (SD) │    │ 0         │ 50.5 (29.4) │ 47.6 (31.2) │ 53.0 (28.0) │ 0.367     │
├──────────────────────────────┼────┼───────────┼─────────────┼─────────────┼─────────────┼───────────┤
│ CategoricalFeature1_A, n (%) │ 0  │           │ 61 (61.0)   │ 

# one hot encoding *does* change p-value

In [None]:
tableone = TableOne(df_encoded, groupby='TargetVariable', sort='P-Value', pval=True)
print(tableone.tabulate(tablefmt="fancy_grid"))

╒══════════════════════════════════╤════╤═══════════╤═════════════╤═════════════╤═════════════╤═══════════╕
│                                  │    │ Missing   │ Overall     │ 0           │ 1           │ P-Value   │
╞══════════════════════════════════╪════╪═══════════╪═════════════╪═════════════╪═════════════╪═══════════╡
│ n                                │    │           │ 100         │ 45          │ 55          │           │
├──────────────────────────────────┼────┼───────────┼─────────────┼─────────────┼─────────────┼───────────┤
│ NumericalFeature2, mean (SD)     │    │ 0         │ 50.0 (9.3)  │ 46.6 (8.9)  │ 52.8 (8.8)  │ 0.001     │
├──────────────────────────────────┼────┼───────────┼─────────────┼─────────────┼─────────────┼───────────┤
│ NumericalFeature1, mean (SD)     │    │ 0         │ 50.5 (29.4) │ 47.6 (31.2) │ 53.0 (28.0) │ 0.367     │
├──────────────────────────────────┼────┼───────────┼─────────────┼─────────────┼─────────────┼───────────┤
│ CategoricalFeature1_A, mea

# not specificying cat/cont variables *does* change p-value (if encoding)

In [None]:
import statsmodels.api as sm
sm_cols = ['C(' + col + ')' for col in cat] + cont
formula = 'TargetVariable ~ ' + ' + '.join(sm_cols)
print(sm.formula.logit(formula, data=df).fit().summary())

sm_cols = ['C(' + col + ')' for col in cat_encoded] + cont
formula = 'TargetVariable ~ ' + ' + '.join(sm_cols)
print(sm.formula.logit(formula, data=df_encoded).fit().summary())

Optimization terminated successfully.
         Current function value: 0.619175
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:         TargetVariable   No. Observations:                  100
Model:                          Logit   Df Residuals:                       93
Method:                           MLE   Df Model:                            6
Date:                Thu, 29 Aug 2024   Pseudo R-squ.:                  0.1002
Time:                        16:51:33   Log-Likelihood:                -61.917
converged:                       True   LL-Null:                       -68.814
Covariance Type:            nonrobust   LLR p-value:                   0.03204
                                  coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
Intercept                      -4.8439      1.512     -3.204      0.001   

# using sm formula on label encoded vs non-encoded does not change p-value (of numerical variables)

In [None]:
X = df_encoded.drop('TargetVariable', axis=1)
Xsm = sm.add_constant(X)
y = df_encoded['TargetVariable']
sm.Logit(y, Xsm).fit().summary()

Optimization terminated successfully.
         Current function value: 0.619175
         Iterations 10


0,1,2,3
Dep. Variable:,TargetVariable,No. Observations:,100.0
Model:,Logit,Df Residuals:,93.0
Method:,MLE,Df Model:,6.0
Date:,"Thu, 29 Aug 2024",Pseudo R-squ.:,0.1002
Time:,16:51:45,Log-Likelihood:,-61.917
converged:,True,LL-Null:,-68.814
Covariance Type:,nonrobust,LLR p-value:,0.03204

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-2.6098,8.72e+06,-2.99e-07,1.000,-1.71e+07,1.71e+07
NumericalFeature1,0.0070,0.008,0.919,0.358,-0.008,0.022
NumericalFeature2,0.0848,0.027,3.175,0.001,0.032,0.137
CategoricalFeature1_A,-1.0530,,,,,
CategoricalFeature1_B,-0.8572,,,,,
CategoricalFeature1_C,-0.6995,,,,,
CategoricalFeature2_X,-1.1811,,,,,
CategoricalFeature2_Y,-0.7658,,,,,
CategoricalFeature2_Z,-0.6629,,,,,


# using sm formula vs non-formula does slightly alter p-value