In [2]:
from pypsm import match
import pandas as pd
import seaborn as sns

In [3]:
titanic_data = pd.read_csv('./../data/titanic_data.csv')

In [4]:
titanic_data.pclass.value_counts()

3    491
1    216
2    184
Name: pclass, dtype: int64

### Clean Data
- Make sure to convert data to be all numeric (one-hot encode categorical data)
- Remove data that is unstructured/unique to each row
- Remove any rows that contain null values
- Make sure to not include columns named 'SCORE' or 'index'
- Make sure that treatment group is strictly larger than control group
- Make sure there are at least 10 samples in data

In [5]:
titanic_numerical = titanic_data[[ 'age', 'sibsp', 'parch', 'fare']]
titanic_categorical = titanic_data[['pclass', 'sex', 'embarked']]
titanic_categorical = pd.get_dummies(titanic_categorical)
titanic_categorical['first_class?'] = titanic_categorical.pclass.apply(lambda x: 1 if x == 1 else 0)

In [6]:
titanic_clean = pd.concat([titanic_numerical, titanic_categorical[['first_class?', 'sex_male', 'embarked_C', 'embarked_Q']]], axis=1)
titanic_clean.dropna(inplace=True)

In [7]:
titanic_clean['first_class?'].value_counts()

0    528
1    186
Name: first_class?, dtype: int64

## Match Dataset from Pandas Dataframe
Enter in cleaned dataset as dataframe and treatment column. Optionally incude output csv file.

In [8]:
titanic_final = match.match(titanic_clean, 'first_class?', output_csv='./../data/titanic_matched.csv')

Generating Logistic Regression Model...
Model Generated
Matching Propesensity Scores...
Matching Complete


## Match Dataset from csv file
Enter in cleaned dataset as csv, treatment column, and set is_csv to True. Optionally incude output csv file.

In [11]:
titanic_final_csv = match.match('./../data/titanic_clean_input.csv', 'first_class?', is_csv=True)

Generating Logistic Regression Model...
Model Generated
Matching Propesensity Scores...
Matching Complete


In [9]:
titanic_final

Unnamed: 0,age,sibsp,parch,fare,first_class?,sex_male,embarked_C,embarked_Q,SCORE
1,38.0,1,0,71.2833,1,0,1,0,0.997262
3,35.0,1,0,53.1000,1,0,0,0,0.897860
6,54.0,0,0,51.8625,1,1,0,0,0.995329
11,58.0,0,0,26.5500,1,0,0,0,0.848962
23,28.0,0,0,35.5000,1,1,0,0,0.754749
...,...,...,...,...,...,...,...,...,...
865,42.0,0,0,13.0000,0,0,0,0,0.196114
873,47.0,0,0,9.0000,0,1,0,0,0.163837
874,28.0,1,0,24.0000,0,0,1,0,0.122599
880,25.0,0,1,26.0000,0,0,0,0,0.187454


In [12]:
titanic_final_csv

Unnamed: 0.1,Unnamed: 0,age,sibsp,parch,fare,first_class?,sex_male,embarked_C,embarked_Q,SCORE
1,1,38.0,1,0,71.2833,1,0,1,0,0.989599
3,3,35.0,1,0,53.1000,1,0,0,0,0.822655
5,6,54.0,0,0,51.8625,1,1,0,0,0.984430
10,11,58.0,0,0,26.5500,1,0,0,0,0.734304
20,23,28.0,0,0,35.5000,1,1,0,0,0.615537
...,...,...,...,...,...,...,...,...,...,...
704,880,25.0,0,1,26.0000,0,0,0,0,0.235934
705,881,33.0,0,0,7.8958,0,1,0,0,0.096336
707,883,28.0,0,0,10.5000,0,1,0,0,0.099919
710,886,27.0,0,0,13.0000,0,1,0,0,0.127592
