### Code that counts the number of different combinations of features in the discretized Intervention dataset
<ul>
<li> Counterfactual inference is performed separately on 4 different groups of subjects: females &lt; 65 years, males &lt; 65 years, females &gt; 65 years and males &gt; 65 years. </li>
<li> Since we use a discretized dataset, more than one patient may have the same combination of input features (i.e., the same categorical values). </li>
<li> Patients with the same combination of features yield the same T2D onset probability; hence, a dataset containing only the unique combinations of features is used during counterfactual inference to speed up computation. </li>
</ul>


In [None]:
from pathlib import Path

import numpy as np
import pandas as pd

In [None]:
# Read the intervention set with the reduced set of features.
# The path uses a forward slash so it resolves on every platform: with the
# Windows-style '.\' prefix the backslash is treated as part of the file name
# on Linux/macOS and the file is not found.
df = pd.read_csv('./test_data_intervention.csv', delimiter=',')
df

In [None]:
# Display the column labels of the loaded intervention set.
df.columns

In [None]:
# Binarize all features except for FPG and BMI.
# Each rule is (column, test selecting class 0, test selecting class 1, cut-off).
# The tests are pandas Series comparison methods:
# lt is <, le is <=, eq is ==, ge is >=, gt is >.
binarization_rules = (
    ('age', 'lt', 'eq', 2),
    ('Pressure', 'lt', 'ge', 2),
    ('TG', 'lt', 'ge', 2),
    ('LDL', 'le', 'gt', 2),
    ('HDL', 'lt', 'ge', 1),
)
for column, zero_test, one_test, cutoff in binarization_rules:
    # Class 0 is written first, so the class-1 test is evaluated on the
    # already-updated column (same order as two consecutive assignments).
    df.loc[getattr(df[column], zero_test)(cutoff), column] = 0
    df.loc[getattr(df[column], one_test)(cutoff), column] = 1

# BMI passes from 6 to 5 levels as the 'Underweight' category is excluded due to
# DPP trial exclusion criteria: shifting by one makes BMI and BMI_final range
# from 0 (normal weight) to 4.
# NOTE: this cell modifies df in place; running it twice shifts BMI twice.
for column in ('BMI', 'BMI_final'):
    df[column] = df[column] - 1

## Value counts for each subgroup and each category 

#### Whole intervention set (N=7730)

In [None]:
# Print the frequency table of every column of the whole intervention set.
header = "Value counts for column '{}':"
for name, values in df.items():
    print(header.format(name))
    print(values.value_counts())

In [None]:
# Rows whose BMI class differs between baseline (BMI) and final (BMI_final).
# Computed once and reused as the denominator of every transition share.
bmi_changed = df.loc[df['BMI'] != df['BMI_final'], :]
n_changed = bmi_changed.shape[0]
print('BMI Transition (total): ', bmi_changed.shape)

# One-class-down transitions (4->3, 3->2, 2->1, 1->0): shape of the matching
# rows and their share among all rows whose BMI class changed.
for start in (4, 3, 2, 1):
    end = start - 1
    moved = df.loc[(df['BMI'] == start) & (df['BMI_final'] == end), :]
    # Guard against a dataset with no BMI transition at all, which would
    # otherwise raise ZeroDivisionError.
    share = moved.shape[0] / n_changed if n_changed else float('nan')
    print('BMI Transition {}->{}: '.format(start, end), moved.shape, share)

print('FPG Transition 1->0: ', df.loc[(df['FPG'] == 1) & (df['FPG_final'] == 0), :].shape)

#### Subgroup 1 (age = 0; sex =0): women under 65 years of age (N=2455)

In [None]:
# Subgroup 1: rows with age == 0 and sex == 0.
is_age0 = df['age'] == 0
is_sex0 = df['sex'] == 0
g1 = df.loc[is_age0 & is_sex0, :]
g1

In [None]:
# Print the frequency table of every column of subgroup 1.
header = "Value counts for column '{}':"
for name, values in g1.items():
    print(header.format(name))
    print(values.value_counts())

<u> Count the occurrences of each row in Subgroup 1: </u>

In [None]:
# Work on a copy so the helper column is not added to g1 itself.
keyed_rows = g1.copy()
# One hashable key per patient: the tuple of all feature values of the row.
keyed_rows['row_tuple'] = keyed_rows.apply(tuple, axis=1)

# Frequency of each distinct feature combination.
combo_counts = keyed_rows['row_tuple'].value_counts().reset_index()
combo_counts.columns = ['row_tuple', 'count']

# Expand the tuples back into one column per feature ...
combo_features = pd.DataFrame(list(combo_counts['row_tuple']), columns=g1.columns)
# ... and put the number of occurrences next to each combination.
g1_unique = pd.concat([combo_features, combo_counts['count']], axis=1)
g1_unique

In [None]:
# Report the subgroup size (sum of the per-combination counts) and the number
# of distinct feature combinations.
print('Total number of rows in subgroup 1:', g1_unique['count'].sum())
print('Total number of UNIQUE rows in subgroup 1:', g1_unique.shape[0])

# Build the output path with pathlib so it is valid on every platform (a
# hard-coded '\' separator only works on Windows), and create the output
# folder if needed because to_csv does not create missing directories.
out_dir = Path('combinations')
out_dir.mkdir(parents=True, exist_ok=True)
g1_unique.to_csv(out_dir / 'age0sex0_comb_5BMI.csv')
#g1_unique.loc[g1_unique['BMI']==0,:].shape

#### Subgroup 2 (age = 0; sex =1): men under 65 years of age (N=2460)

In [None]:
# Subgroup 2: rows with age == 0 and sex == 1.
is_age0 = df['age'] == 0
is_sex1 = df['sex'] == 1
g2 = df.loc[is_age0 & is_sex1, :]
g2

In [None]:
# Print the frequency table of every column of subgroup 2.
header = "Value counts for column '{}':"
for name, values in g2.items():
    print(header.format(name))
    print(values.value_counts())

<u> Count the occurrences of each row in Subgroup 2: </u>

In [None]:
# Convert each row to a tuple to make it hashable
# (done on a copy so the helper column is not added to g2 itself).
gx = g2.copy()
gx['row_tuple'] = gx.apply(tuple, axis=1)

# Count occurrences of each distinct row (feature combination)
row_counts = gx['row_tuple'].value_counts().reset_index()
row_counts.columns = ['row_tuple', 'count']
# Convert row_tuple back to a DataFrame with one column per feature.
# The labels come from g2 itself, so this cell does not depend on the
# subgroup-1 cell having been run.
tmp = pd.DataFrame(row_counts['row_tuple'].tolist(), columns=g2.columns)
# Put the number of occurrences next to each unique combination
g2_unique = pd.concat([tmp, row_counts['count']], axis=1)
g2_unique

In [None]:
# Report the subgroup size (sum of the per-combination counts) and the number
# of distinct feature combinations.
print('Total number of rows in subgroup 2:', g2_unique['count'].sum())
print('Total number of UNIQUE rows in subgroup 2:', g2_unique.shape[0])

# Build the output path with pathlib so it is valid on every platform (a
# hard-coded '\' separator only works on Windows), and create the output
# folder if needed because to_csv does not create missing directories.
out_dir = Path('combinations')
out_dir.mkdir(parents=True, exist_ok=True)
g2_unique.to_csv(out_dir / 'age0sex1_comb_5BMI.csv')

#### Subgroup 3 (age = 1; sex =0): women above 65 years of age (N=1478)

In [None]:
# Subgroup 3: rows with age == 1 and sex == 0.
is_age1 = df['age'] == 1
is_sex0 = df['sex'] == 0
g3 = df.loc[is_age1 & is_sex0, :]
g3

In [None]:
# Print the frequency table of every column of subgroup 3.
header = "Value counts for column '{}':"
for name, values in g3.items():
    print(header.format(name))
    print(values.value_counts())

<u> Count the occurrences of each row in Subgroup 3: </u>

In [None]:
# Convert each row to a tuple to make it hashable
# (done on a copy so the helper column is not added to g3 itself).
gx = g3.copy()
gx['row_tuple'] = gx.apply(tuple, axis=1)

# Count occurrences of each distinct row (feature combination)
row_counts = gx['row_tuple'].value_counts().reset_index()
row_counts.columns = ['row_tuple', 'count']
# Convert row_tuple back to a DataFrame with one column per feature.
# The labels come from g3 itself, so this cell does not depend on the
# subgroup-1 cell having been run.
tmp = pd.DataFrame(row_counts['row_tuple'].tolist(), columns=g3.columns)
# Put the number of occurrences next to each unique combination
g3_unique = pd.concat([tmp, row_counts['count']], axis=1)
g3_unique

In [None]:
# Report the subgroup size (sum of the per-combination counts) and the number
# of distinct feature combinations.
print('Total number of rows in subgroup 3:', g3_unique['count'].sum())
print('Total number of UNIQUE rows in subgroup 3:', g3_unique.shape[0])

# Build the output path with pathlib so it is valid on every platform (a
# hard-coded '\' separator only works on Windows), and create the output
# folder if needed because to_csv does not create missing directories.
out_dir = Path('combinations')
out_dir.mkdir(parents=True, exist_ok=True)
g3_unique.to_csv(out_dir / 'age1sex0_comb_5BMI.csv')

#### Subgroup 4 (age = 1; sex =1): men above 65 years of age (N=1337)

In [None]:
# Subgroup 4: rows with age == 1 and sex == 1.
is_age1 = df['age'] == 1
is_sex1 = df['sex'] == 1
g4 = df.loc[is_age1 & is_sex1, :]
g4

In [None]:
# Print the frequency table of every column of subgroup 4.
header = "Value counts for column '{}':"
for name, values in g4.items():
    print(header.format(name))
    print(values.value_counts())

<u> Count the occurrences of each row in Subgroup 4: </u>

In [None]:
# Convert each row to a tuple to make it hashable
# (done on a copy so the helper column is not added to g4 itself).
gx = g4.copy()
gx['row_tuple'] = gx.apply(tuple, axis=1)

# Count occurrences of each distinct row (feature combination)
row_counts = gx['row_tuple'].value_counts().reset_index()
row_counts.columns = ['row_tuple', 'count']
# Convert row_tuple back to a DataFrame with one column per feature.
# The labels come from g4 itself, so this cell does not depend on the
# subgroup-1 cell having been run.
tmp = pd.DataFrame(row_counts['row_tuple'].tolist(), columns=g4.columns)
# Put the number of occurrences next to each unique combination
g4_unique = pd.concat([tmp, row_counts['count']], axis=1)
g4_unique

In [None]:
# Report the subgroup size (sum of the per-combination counts) and the number
# of distinct feature combinations.
print('Total number of rows in subgroup 4:', g4_unique['count'].sum())
print('Total number of UNIQUE rows in subgroup 4:', g4_unique.shape[0])

# Build the output path with pathlib so it is valid on every platform (a
# hard-coded '\' separator only works on Windows), and create the output
# folder if needed because to_csv does not create missing directories.
out_dir = Path('combinations')
out_dir.mkdir(parents=True, exist_ok=True)
g4_unique.to_csv(out_dir / 'age1sex1_comb_5BMI.csv')

In [None]:
# Summarize, for each subgroup, the total number of rows and the number of
# unique feature combinations.
unique_tables = [g1_unique, g2_unique, g3_unique, g4_unique]

group_names = ['Subgroup1', 'Subgroup2', 'Subgroup3', 'Subgroup4']
# Sum of the 'count' column = number of patients in the subgroup.
row_counts = [table['count'].sum() for table in unique_tables]
# Number of rows of the table = number of distinct combinations.
combination_counts = [table.shape[0] for table in unique_tables]

# Column label -> values, one entry per subgroup
data = dict(zip(
    ['Group Name', 'Total rows count', 'Total unique combinations count'],
    [group_names, row_counts, combination_counts],
))

# NOTE: this rebinds `df`, so the intervention set loaded at the top of the
# notebook is no longer reachable under that name after this cell has run.
df = pd.DataFrame(data)
print(df)

In [None]:
# Save the table currently bound to `df`.
# The path is built with pathlib so it is valid on every platform (a
# hard-coded '\' separator only works on Windows), and the output folder is
# created if needed because to_csv does not create missing directories.
out_dir = Path('combinations')
out_dir.mkdir(parents=True, exist_ok=True)
df.to_csv(out_dir / 'counts.csv')