In [25]:
import pandas as pd
import numpy as np
from copy import deepcopy

In [26]:
df = pd.read_csv('titanic.csv')
df.head()

Unnamed: 0,Pclass,Sex,Age,FamilySize,Fare,Embarked
0,3,male,22.0,1,7.25,S
1,1,female,38.0,1,71.2833,C
2,3,female,26.0,0,7.925,S
3,1,female,35.0,1,53.1,S
4,3,male,35.0,0,8.05,S


## Preprocessing

- linear imputation on Age and Fare
- set embarkation from two missing rows to majority value
- convert numeric variables to categorical ones

In [27]:
df = df.interpolate()
df.loc[pd.isna(df['Embarked']), 'Embarked'] = 'S'

  df = df.interpolate()


In [28]:
bins = [-1, 2, 12, 18, 30, 60, np.inf]
labels = ['baby','child','teenager',"young adult",'adult','senior']
df['AgeGroup'] = pd.cut(df['Age'], bins=bins, labels=labels)

bins = [0, 20, 50, 100, np.inf]
labels = ['Cheap', 'Average', 'Above Average', 'Expensive']
df['FareGroup'] = pd.cut(df['Fare'], bins=bins, labels=labels)

# when fare is zero FareGroup is null so we manually fill it as cheap
df.loc[pd.isna(df['FareGroup']), 'FareGroup'] = 'Cheap'
df

Unnamed: 0,Pclass,Sex,Age,FamilySize,Fare,Embarked,AgeGroup,FareGroup
0,3,male,22.0,1,7.2500,S,young adult,Cheap
1,1,female,38.0,1,71.2833,C,adult,Above Average
2,3,female,26.0,0,7.9250,S,young adult,Cheap
3,1,female,35.0,1,53.1000,S,adult,Above Average
4,3,male,35.0,0,8.0500,S,adult,Cheap
...,...,...,...,...,...,...,...,...
1304,3,male,33.5,0,8.0500,S,adult,Cheap
1305,1,female,39.0,0,108.9000,C,adult,Expensive
1306,3,male,38.5,0,7.2500,S,adult,Cheap
1307,3,male,38.5,0,8.0500,S,adult,Cheap


In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   Pclass      1309 non-null   int64   
 1   Sex         1309 non-null   object  
 2   Age         1309 non-null   float64 
 3   FamilySize  1309 non-null   int64   
 4   Fare        1309 non-null   float64 
 5   Embarked    1309 non-null   object  
 6   AgeGroup    1309 non-null   category
 7   FareGroup   1309 non-null   category
dtypes: category(2), float64(2), int64(2), object(2)
memory usage: 64.5+ KB


## Helper functions

In [30]:
def get_distributions(df):
    target_columns = ['Pclass', 'Sex', 'AgeGroup', 'FareGroup', 'Embarked', 'FamilySize']
    # return {c: (df[c].value_counts()/df.shape[0]).round(3).to_dict() for c in target_columns}
    return {c: df[c].value_counts(normalize=True).round(3).to_dict() for c in target_columns}

get_distributions(df)


{'Pclass': {3: 0.542, 1: 0.247, 2: 0.212},
 'Sex': {'male': 0.644, 'female': 0.356},
 'AgeGroup': {'young adult': 0.4,
  'adult': 0.4,
  'teenager': 0.093,
  'child': 0.053,
  'baby': 0.028,
  'senior': 0.026},
 'FareGroup': {'Cheap': 0.575,
  'Average': 0.241,
  'Above Average': 0.119,
  'Expensive': 0.064},
 'Embarked': {'S': 0.7, 'C': 0.206, 'Q': 0.094},
 'FamilySize': {0: 0.604,
  1: 0.18,
  2: 0.121,
  3: 0.033,
  5: 0.019,
  4: 0.017,
  6: 0.012,
  10: 0.008,
  7: 0.006}}

In [31]:
def fetch_sample_size_per_category(desired_distributions, sample_size=100):
    dd = deepcopy(desired_distributions)
    for category in dd.keys():
        for k in dd[category].keys():
            dd[category][k] = int(dd[category][k]*sample_size)
    return dd

desired_distributions = {
    'Sex':{'male': 0.5, 'female':0.5},
    'Pclass':{1: 1/3, 2:1/3,3:1/3}
}
fetch_sample_size_per_category(desired_distributions)

{'Sex': {'male': 50, 'female': 50}, 'Pclass': {1: 33, 2: 33, 3: 33}}

## Get stratified sample

In [39]:
sex_sample = df.groupby("Sex", group_keys=False).apply(lambda x:x.sample(50))
class_sample = df.groupby("Pclass", group_keys=False).apply(lambda x:x.sample(50))
sample_df = pd.concat([sex_sample, class_sample], ignore_index=True)

In [40]:
get_distributions(sample_df)

{'Pclass': {3: 0.4, 1: 0.308, 2: 0.292},
 'Sex': {'male': 0.564, 'female': 0.436},
 'AgeGroup': {'adult': 0.424,
  'young adult': 0.376,
  'teenager': 0.096,
  'child': 0.04,
  'senior': 0.036,
  'baby': 0.028},
 'FareGroup': {'Cheap': 0.5,
  'Average': 0.268,
  'Above Average': 0.16,
  'Expensive': 0.072},
 'Embarked': {'S': 0.648, 'C': 0.256, 'Q': 0.096},
 'FamilySize': {0: 0.596,
  1: 0.196,
  2: 0.144,
  3: 0.028,
  5: 0.016,
  4: 0.012,
  6: 0.008}}

# Apply raking

Assuming you know the actual distribution of your dataset for the variables `Pclass` and `Sex`, the goal is to estimate the uknown distribution of another uknown variable e.g. `FareGroup`


## Raking Algorithm



In [58]:
population_data = get_distributions(df)
# population_data = {k:population_data[k] for k in ['Sex', 'Pclass']}

observed_distribution = get_distributions(sample_df)
observed_distribution

sample_df["Weight"] = 1

# Calculate raking factors
raking_factors = {}
for column in ["Pclass", "Sex"]:
    raking_factors[column] = {
        category: population_data[column][category] / observed_distribution[column][category]
        for category in population_data[column]
    }

print(raking_factors)

# Apply raking factors to adjust sample weights
sample_df["Weight"] = sample_df.apply(
    lambda row: row["Weight"] * raking_factors["Pclass"][row["Pclass"]] * raking_factors["Sex"][row["Sex"]],
    axis=1
)
sample_df

{'Pclass': {3: 1.355, 1: 0.801948051948052, 2: 0.726027397260274}, 'Sex': {'male': 1.1418439716312059, 'female': 0.8165137614678899}}


Unnamed: 0,Pclass,Sex,Age,FamilySize,Fare,Embarked,AgeGroup,FareGroup,Weight
0,2,female,29.0,0,10.5000,S,young adult,Cheap,0.592811
1,3,female,19.5,0,7.8292,Q,young adult,Cheap,1.106376
2,1,female,17.0,1,108.9000,C,teenager,Expensive,0.654802
3,1,female,35.0,1,53.1000,S,adult,Above Average,0.654802
4,1,female,47.0,1,61.1750,S,adult,Above Average,0.654802
...,...,...,...,...,...,...,...,...,...
245,3,male,40.0,0,7.8958,S,adult,Cheap,1.547199
246,3,male,40.0,0,7.2250,C,adult,Cheap,1.547199
247,3,female,41.0,2,20.2125,S,adult,Average,1.106376
248,3,male,26.0,0,56.4958,S,young adult,Above Average,1.547199


In [53]:
observed_distribution

{'Pclass': {3: 0.4, 1: 0.308, 2: 0.292},
 'Sex': {'male': 0.564, 'female': 0.436},
 'AgeGroup': {'adult': 0.424,
  'young adult': 0.376,
  'teenager': 0.096,
  'child': 0.04,
  'senior': 0.036,
  'baby': 0.028},
 'FareGroup': {'Cheap': 0.5,
  'Average': 0.268,
  'Above Average': 0.16,
  'Expensive': 0.072},
 'Embarked': {'S': 0.648, 'C': 0.256, 'Q': 0.096},
 'FamilySize': {0: 0.596,
  1: 0.196,
  2: 0.144,
  3: 0.028,
  5: 0.016,
  4: 0.012,
  6: 0.008}}

In [54]:
population_data

{'Pclass': {3: 0.542, 1: 0.247, 2: 0.212},
 'Sex': {'male': 0.644, 'female': 0.356},
 'AgeGroup': {'young adult': 0.4,
  'adult': 0.4,
  'teenager': 0.093,
  'child': 0.053,
  'baby': 0.028,
  'senior': 0.026},
 'FareGroup': {'Cheap': 0.575,
  'Average': 0.241,
  'Above Average': 0.119,
  'Expensive': 0.064},
 'Embarked': {'S': 0.7, 'C': 0.206, 'Q': 0.094},
 'FamilySize': {0: 0.604,
  1: 0.18,
  2: 0.121,
  3: 0.033,
  5: 0.019,
  4: 0.017,
  6: 0.012,
  10: 0.008,
  7: 0.006}}

In [56]:
sample_df.loc[sample_df['Sex'] == 'male', 'Weight'].sum() / sample_df.shape[0]
sample_df.loc[sample_df['Sex'] == 'female', 'Weight'].sum() / sample_df.shape[0]
sample_df.loc[sample_df['Pclass'] == 1, 'Weight'].sum() / sample_df.shape[0]
sample_df.loc[sample_df['FareGroup'] == 'Cheap', 'Weight'].sum() / sample_df.shape[0]

0.5808357289402167