In [1]:
import pandas as pd
import numpy as np
from copy import deepcopy

In [2]:
df = pd.read_csv('titanic.csv')
df.head()

Unnamed: 0,Pclass,Sex,Age,FamilySize,Fare,Embarked
0,3,male,22.0,1,7.25,S
1,1,female,38.0,1,71.2833,C
2,3,female,26.0,0,7.925,S
3,1,female,35.0,1,53.1,S
4,3,male,35.0,0,8.05,S


## Preprocessing

- linear imputation on Age and Fare
- set embarkation from two missing rows to majority value
- convert numeric variables to categorical ones

In [3]:
df = df.interpolate()
df.loc[pd.isna(df['Embarked']), 'Embarked'] = 'S'

  df = df.interpolate()


In [4]:
bins = [-1, 2, 12, 18, 30, 60, np.inf]
labels = ['baby','child','teenager',"young adult",'adult','senior']
df['AgeGroup'] = pd.cut(df['Age'], bins=bins, labels=labels)

bins = [0, 20, 50, 100, np.inf]
labels = ['Cheap', 'Average', 'Above Average', 'Expensive']
df['FareGroup'] = pd.cut(df['Fare'], bins=bins, labels=labels)

# when fare is zero FareGroup is null so we manually fill it as cheap
df.loc[pd.isna(df['FareGroup']), 'FareGroup'] = 'Cheap'
df

Unnamed: 0,Pclass,Sex,Age,FamilySize,Fare,Embarked,AgeGroup,FareGroup
0,3,male,22.0,1,7.2500,S,young adult,Cheap
1,1,female,38.0,1,71.2833,C,adult,Above Average
2,3,female,26.0,0,7.9250,S,young adult,Cheap
3,1,female,35.0,1,53.1000,S,adult,Above Average
4,3,male,35.0,0,8.0500,S,adult,Cheap
...,...,...,...,...,...,...,...,...
1304,3,male,33.5,0,8.0500,S,adult,Cheap
1305,1,female,39.0,0,108.9000,C,adult,Expensive
1306,3,male,38.5,0,7.2500,S,adult,Cheap
1307,3,male,38.5,0,8.0500,S,adult,Cheap


## Helper functions

In [5]:
# return the actual distribution of each column
def get_distributions(df):
    target_columns = ['Pclass', 'Sex', 'AgeGroup', 'FareGroup', 'Embarked', 'FamilySize']
    return {c: df[c].value_counts(normalize=True).round(3).to_dict() for c in target_columns}

# returns the joint distribution 
def get_joint_population_data(df, groups=['Pclass', 'Sex']):
    return df[groups].value_counts(normalize=True).reset_index().sort_values(by=groups, ignore_index=True)

# getting the raw distribution
def fetch_sample_size_per_category(desired_distributions, sample_size=100):
    dd = deepcopy(desired_distributions)
    for category in dd.keys():
        for k in dd[category].keys():
            dd[category][k] = int(dd[category][k]*sample_size)
    return dd

desired_distributions = {
    'Sex':{'male': 0.5, 'female':0.5},
    'Pclass':{1: 1/3, 2:1/3,3:1/3}
}
fetch_sample_size_per_category(desired_distributions)


{'Sex': {'male': 50, 'female': 50}, 'Pclass': {1: 33, 2: 33, 3: 33}}

## Get stratified sample

In [6]:
sex_sample = df.groupby("Sex", group_keys=False).apply(lambda x:x.sample(50, random_state=42))
class_sample = df.groupby("Pclass", group_keys=False).apply(lambda x:x.sample(50, random_state=42))
sample_df = pd.concat([sex_sample, class_sample], ignore_index=True)

# Apply raking

Assuming you know the actual distribution of your dataset for the variables `Pclass` and `Sex`, the goal is to estimate the uknown distribution of another uknown variable e.g. `FareGroup`

The main idea by the following script is that each row gets a weight of importance, based of how underrepresented a category is.

- In the beginning each row gets a weight equal to 1 and then we adjust it accordingly.
    - For instance if we know that the actual male percentage is 65% but in our sample men are at 32.5% then every male row should get a doubled weight.
- We divide the actual ratio by the observed ratio for each category to get the raking factor.
- Then each row gets their weight multiplied by the corresponding factor of `Pclass` and `Sex`

At the end we get a weight for each row, which allows us to estimate the distribution of the uknown variables.


In [7]:
variables = ["Pclass", "Sex"]

population_data = get_distributions(df)

observed_distribution = get_distributions(sample_df)

sample_df["Weight"] = 1

# Calculate raking factors
# by dividing the observed current distribution with the actual share for each variable
# we get a degree of importance
raking_factors = {}
for var in variables:
    raking_factors[var] = {
        category: population_data[var][category] / observed_distribution[var][category]
        for category in population_data[var]
    }

# Apply raking factors to adjust sample weights
sample_df["Weight"] = sample_df["Weight"] * sample_df.apply(
    lambda row: np.prod([raking_factors[var][row[var]] for var in variables]),
    axis=1
)


# we get the estimation of the actual distribution
sample_df.groupby('FareGroup', observed=False)['Weight'].sum() / sample_df.shape[0]


FareGroup
Cheap            0.588861
Average          0.220911
Above Average    0.134534
Expensive        0.060532
Name: Weight, dtype: float64

In [8]:
population_data

{'Pclass': {3: 0.542, 1: 0.247, 2: 0.212},
 'Sex': {'male': 0.644, 'female': 0.356},
 'AgeGroup': {'young adult': 0.4,
  'adult': 0.4,
  'teenager': 0.093,
  'child': 0.053,
  'baby': 0.028,
  'senior': 0.026},
 'FareGroup': {'Cheap': 0.575,
  'Average': 0.241,
  'Above Average': 0.119,
  'Expensive': 0.064},
 'Embarked': {'S': 0.7, 'C': 0.206, 'Q': 0.094},
 'FamilySize': {0: 0.604,
  1: 0.18,
  2: 0.121,
  3: 0.033,
  5: 0.019,
  4: 0.017,
  6: 0.012,
  10: 0.008,
  7: 0.006}}

In [9]:
observed_distribution

{'Pclass': {3: 0.44, 1: 0.296, 2: 0.264},
 'Sex': {'male': 0.568, 'female': 0.432},
 'AgeGroup': {'young adult': 0.428,
  'adult': 0.4,
  'teenager': 0.076,
  'child': 0.044,
  'senior': 0.032,
  'baby': 0.02},
 'FareGroup': {'Cheap': 0.512,
  'Average': 0.252,
  'Above Average': 0.16,
  'Expensive': 0.076},
 'Embarked': {'S': 0.728, 'C': 0.196, 'Q': 0.076},
 'FamilySize': {0: 0.616,
  1: 0.168,
  2: 0.12,
  3: 0.052,
  4: 0.016,
  6: 0.012,
  10: 0.008,
  7: 0.004,
  5: 0.004}}