In [2]:
from pathlib import Path

import pandas as pd
from sklearn.preprocessing import StandardScaler

In [4]:
# Build the modelling table from the Stata file: keep two status values,
# encode the categorical columns numerically, standardise the covariates,
# save the result to data.csv, and finally shuffle the rows.
raw = pd.read_stata('colon.dta').set_index('id').loc[:, ['sex', 'age', 'surv_mm', 'status', 'stage', 'subsite']]

# Keep only rows whose status is 'Dead: cancer' or 'Alive'. The .copy() gives
# an independent frame, so the column assignments below cannot touch `raw`.
df = raw[raw['status'].isin(['Dead: cancer', 'Alive'])].copy()

# read_stata turns value-labelled columns into Categorical by default, and
# Series.replace on a Categorical that changes its categories is deprecated
# (pandas >= 2.2). Converting to plain objects and mapping the labels works
# for both categorical and string columns.
# NOTE: a label missing from a mapping becomes NaN instead of being kept.
df['sex'] = df['sex'].astype(object).map({'Female': 0, 'Male': 1})
df['status'] = df['status'].astype(object).map({'Dead: cancer': 1, 'Alive': 0})
df = df.rename(columns={"status": "event", "surv_mm": "tte"})

# One-hot encode stage and subsite, dropping one level of each
# ('Unknown' and 'Other and NOS'). dtype=float keeps the covariate frame
# purely numeric for the scaler.
stage_dummies = pd.get_dummies(df['stage'], prefix='stage', dtype=float).drop(columns='stage_Unknown')
subsite_dummies = pd.get_dummies(df['subsite'], prefix='subsite', dtype=float).drop(columns='subsite_Other and NOS')
df = pd.concat([df.drop(columns=['stage', 'subsite']), stage_dummies, subsite_dummies], axis=1)

# Standardise every covariate; the outcome columns (event, tte) are left as they are.
X = df.drop(columns=["event", "tte"])
y = df.loc[:, ["event", "tte"]]
X = pd.DataFrame(StandardScaler().fit_transform(X), index=X.index, columns=X.columns)

# data.csv receives the rows in their original order (and without the id
# index); the shuffle with a fixed seed happens only afterwards.
df = pd.concat([X, y], axis=1).round(4)
df.to_csv('data.csv', index=False)
df = df.sample(frac=1, random_state=42)
df

Unnamed: 0_level_0,sex,age,stage_Localised,stage_Regional,stage_Distant,subsite_Coecum and ascending,subsite_Transverse,subsite_Descending and sigmoid,event,tte
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
6000,-0.8157,0.3685,-0.7536,-0.3652,1.287,-0.7439,2.1427,-0.8204,1,2.5
3370,-0.8157,1.6210,-0.7536,-0.3652,1.287,-0.7439,-0.4667,1.2190,1,12.5
7668,-0.8157,-0.4143,-0.7536,-0.3652,1.287,-0.7439,-0.4667,1.2190,1,0.5
2601,1.2260,-0.7274,1.3269,-0.3652,-0.777,-0.7439,-0.4667,1.2190,0,173.5
14387,-0.8157,-0.5709,-0.7536,-0.3652,1.287,1.3443,-0.4667,-0.8204,1,15.5
...,...,...,...,...,...,...,...,...,...,...
14469,1.2260,-1.0406,-0.7536,-0.3652,-0.777,-0.7439,-0.4667,1.2190,0,23.5
6647,-0.8157,-1.5885,1.3269,-0.3652,-0.777,-0.7439,-0.4667,-0.8204,0,135.5
6894,1.2260,0.2120,1.3269,-0.3652,-0.777,1.3443,-0.4667,-0.8204,0,128.5
1206,1.2260,0.3685,1.3269,-0.3652,-0.777,-0.7439,-0.4667,1.2190,0,171.5


In [5]:
# Split df into three consecutive parts holding 20%, 50%, and 30% of the rows
# and write each part to its own client folder.
num_rows = len(df)
rows_20_percent = int(0.2 * num_rows)
rows_50_percent = int(0.5 * num_rows)
# The last part takes the remainder, so no row is lost to int() truncation.
rows_30_percent = num_rows - rows_20_percent - rows_50_percent

client1 = df.iloc[:rows_20_percent]
client2 = df.iloc[rows_20_percent:rows_20_percent + rows_50_percent]
client3 = df.iloc[rows_20_percent + rows_50_percent:]

# The slices must have exactly the planned sizes (which sum to num_rows).
assert (len(client1), len(client2), len(client3)) == (rows_20_percent, rows_50_percent, rows_30_percent)

output_root = Path('../../federated-analysis/synthetic/3_clients')
for client_number, client_frame in enumerate((client1, client2, client3), start=1):
    client_dir = output_root / f'client_{client_number}'
    # to_csv does not create missing folders, so make sure the target exists.
    client_dir.mkdir(parents=True, exist_ok=True)
    client_frame.to_csv(client_dir / 'data.csv', index=False)

# Sanity output: full shape, total rows across clients, and the three shapes
# concatenated into one flat tuple.
print(df.shape)
print(client1.shape[0] + client2.shape[0] + client3.shape[0])
print(client1.shape + client2.shape + client3.shape)

(13011, 10)
13011
(2602, 10, 6505, 10, 3904, 10)


In [6]:
# Split df into five consecutive parts of 20% each and write each part to its
# own client folder.
num_rows = len(df)
rows_20_percent = int(0.2 * num_rows)

# Four interior cut points; the fifth part runs to the end of df and therefore
# absorbs the rows left over when num_rows is not a multiple of five.
cut_points = [0] + [k * rows_20_percent for k in range(1, 5)] + [num_rows]
client1, client2, client3, client4, client5 = (
    df.iloc[start:stop] for start, stop in zip(cut_points[:-1], cut_points[1:])
)
clients = (client1, client2, client3, client4, client5)

# Together the five slices must cover every row of df exactly once.
assert sum(len(part) for part in clients) == num_rows

output_root = Path('../../federated-analysis/synthetic/5_clients')
for client_number, client_frame in enumerate(clients, start=1):
    client_dir = output_root / f'client_{client_number}'
    # to_csv does not create missing folders, so make sure the target exists.
    client_dir.mkdir(parents=True, exist_ok=True)
    client_frame.to_csv(client_dir / 'data.csv', index=False)

# Sanity output: full shape, total rows across clients, and the five shapes
# concatenated into one flat tuple.
print(df.shape)
print(client1.shape[0] + client2.shape[0] + client3.shape[0] + client4.shape[0] + client5.shape[0])
print(client1.shape + client2.shape + client3.shape + client4.shape + client5.shape)

(13011, 10)
13011
(2602, 10, 2602, 10, 2602, 10, 2602, 10, 2603, 10)
