# Ch3 Propencity Score

In [13]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import plotly.express as px
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors
import statsmodels.formula.api as smf

import warnings
warnings.filterwarnings('ignore')

In [3]:
class PropensityScoreMatching:
    def __init__(self, treatment, control, pscore_col='pscore'):
        self.treatment = treatment
        self.control = control
        self.pscore_col = pscore_col
        self.match_idx = {}

    def match(self):
        threshold = pd.concat([self.treatment, self.control])[self.pscore_col].std() * 0.2  # see: https://www.slideshare.net/okumurayasuyuki/ss-43780294
        match_pair_df = self.control.copy(deep=True)
        for i in range(self.treatment.shape[0]):
            matching_target = self.treatment.loc[i, self.pscore_col]
            matched_pair = ((match_pair_df[self.pscore_col] - matching_target)**2).loc[lambda x: x < threshold]
            if matched_pair.shape[0] != 0:  # マッチング対象が存在する場合
                matched_pair = matched_pair.idxmin()
                self.match_idx[i] = matched_pair

                if i % 5000 == 1:
                    print('Matching : [{}]. Propensity Score: {} Matched : [{}]. Propensity Score: {}'.format(
                        i, matching_target, matched_pair, match_pair_df.loc[matched_pair, self.pscore_col]
                    ))

                match_pair_df = match_pair_df.drop(matched_pair)
        print("Matched Ratio: {}".format(len(self.match_idx.keys())/self.treatment.shape[0]))

    def create_matched_df(self, target_col):
        if len(self.match_idx.keys()) == 0:
            raise AttributeError("マッチングが行われていません。match()を呼び出した後に使ってください。")

        columns = ['idx', 'pscore', target_col, 'matched_idx', 'matched_pscore', 'matched_{}'.format(target_col)]
        idx, pscore, target, matched_idx, matched_pscore, matched_target = [], [], [], [], [], []
        for i, v in self.match_idx.items():
            idx += [i]
            pscore += [self.treatment.loc[i, self.pscore_col]]
            target += [self.treatment.loc[i, target_col]]
            matched_idx += [v]
            matched_pscore += [self.control.loc[v, self.pscore_col]]
            matched_target += [self.control.loc[v, target_col]]

        matched_df = pd.DataFrame(data = {col: val for col, val in zip(columns, [idx, pscore, target, matched_idx, matched_pscore, matched_target])})
        matched_df['diff'] = matched_df[target_col] - matched_df['matched_{}'.format(target_col)]
        return matched_df

In [4]:
df = pd.read_csv('Kevin_Hillstrom_MineThatData_E-MailAnalytics_DataMiningChallenge_2008.03.20.csv')

In [5]:
df.head()

Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend
0,10,2) $100 - $200,142.44,1,0,Surburban,0,Phone,Womens E-Mail,0,0,0.0
1,6,3) $200 - $350,329.08,1,1,Rural,1,Web,No E-Mail,0,0,0.0
2,7,2) $100 - $200,180.65,0,1,Surburban,1,Web,Womens E-Mail,0,0,0.0
3,9,5) $500 - $750,675.83,1,0,Rural,1,Web,Mens E-Mail,0,0,0.0
4,2,1) $0 - $100,45.34,1,0,Urban,0,Web,Womens E-Mail,0,0,0.0


In [6]:
male_df = df[~(df['segment'] == 'Womens E-Mail')]
male_df['treatment'] = male_df['segment'].apply(lambda x: 1 if x == 'Mens E-Mail' else 0)

In [8]:
sample_rules = (male_df.history > 300) | (male_df.recency < 6) | (male_df.channel=='Multichannel')

In [9]:
treatment_data = male_df[male_df['treatment'] == 1].sample(frac=0.5, random_state=1)
control_data = male_df[male_df['treatment'] == 0].sample(frac=0.5, random_state=1)

In [14]:
ps_model = smf.glm(
    formula='treatment ~ recency + history + channel',
    data=male_df,
    family=sm.families.Binomial()
).fit()

In [15]:
ps_model

<statsmodels.genmod.generalized_linear_model.GLMResultsWrapper at 0x7fc32a5de390>