# Treatment Effect

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from scipy.stats import ttest_ind, f_oneway

from lib import data

pd.options.display.max_columns = None
sns.set()

In [61]:
df = data.read_train_data()
df['PositiveScore'] = df.filter(regex="P\d+").sum(axis=1)
df['NegativeScore'] = df.filter(regex="N\d+").sum(axis=1)
df['CompositeScore'] = df['PositiveScore'] - df['NegativeScore']

## Regression Approach

In [99]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [100]:
pvalues = []
studies = ('A', 'B', 'C', 'D', 'E')

for study in studies:
    df = data.read_study_data(study)
#     df.groupby('PatientID').filter(lambda g: g.VisitDay.max() > 42)

    clf = smf.ols("PANSS_Total ~ VisitDay*TxGroup", data=df).fit()
    pvalues.append(clf.pvalues)
    
pd.DataFrame(pvalues, index=studies)

Unnamed: 0,Intercept,TxGroup[T.Treatment],VisitDay,VisitDay:TxGroup[T.Treatment]
A,0.0,0.472252,4.684985e-71,0.181463
B,0.0,0.066408,1.4372230000000002e-81,0.900768
C,0.0,0.048763,0.0,0.036105
D,0.0,0.103616,4.372337e-122,0.255916
E,0.0,0.373611,5.643534e-08,0.964456


In [108]:
df = data.read_train_data()
df = df[df.Study.isin({'C', 'D'})]

clf = smf.ols("PANSS_Total ~ VisitDay*TxGroup", data=df).fit()
pd.DataFrame({
    'Coefficients': clf.params,
    'P-values': clf.pvalues,
})

Unnamed: 0,Coefficients,P-values
Intercept,79.156247,0.0
TxGroup[T.Treatment],0.936215,0.002817
VisitDay,-0.104724,0.0
VisitDay:TxGroup[T.Treatment],-0.004885,0.024209


## Additional Experiments

These were experiments that were less about trying to prove a hypothesis and more about exploring the data. This opens us up to Type I errors, but we weren't planning on using these experiments anyway.

In [12]:
df = data.read_study_data('C')
df['PositiveScore'] = df.filter(regex="P\d+").sum(axis=1)
df['NegativeScore'] = df.filter(regex="N\d+").sum(axis=1)
df['CompositeScore'] = df['PositiveScore'] - df['NegativeScore']
df = df[df.LeadStatus == "Passed"]

In [13]:
def get_group_delta(dfg):
    visit_days = dfg.VisitDay
    totals = dfg.PositiveScore
    
    return pd.DataFrame({'VisitDay': dfg.VisitDay.iloc[1:], 'ScoreDelta': totals.iloc[1:] - totals.iloc[0]})


def get_panss_delta(df):
    dg = df.groupby(['TxGroup', 'PatientID'])
    return dg.apply(get_group_delta)

In [14]:
deltas = get_panss_delta(df).reset_index()
deltas['VisitMonth'] = (deltas.VisitDay / 28).astype(int)

In [15]:
def ttest_group(g):
    cidx = g.TxGroup == 'Control'    
    return pd.DataFrame({
        'Population': [len(g)],
        'PValue': [f_oneway(g[cidx].ScoreDelta, g[~cidx].ScoreDelta).pvalue],
        'ControlMean': g[cidx].ScoreDelta.mean(),
        'TreatmentMean': g[~cidx].ScoreDelta.mean(),
    })


dg = deltas.groupby('VisitMonth').apply(ttest_group).reset_index().drop('level_1', axis=1)
dg

Unnamed: 0,VisitMonth,Population,PValue,ControlMean,TreatmentMean
0,0,2235,0.689245,-1.117166,-1.070547
1,1,1179,0.736311,-3.532095,-3.613288
2,2,1044,0.286072,-4.755859,-5.077068
3,3,1501,0.247777,-6.79561,-7.07772
4,4,1175,0.943672,-7.349315,-7.329949
5,5,746,0.222334,-7.815427,-8.232376
6,6,647,0.639652,-8.049689,-8.224615
7,7,535,0.729026,-8.441948,-8.589552
8,8,460,0.971101,-8.365639,-8.381974
9,9,386,0.108624,-8.359375,-9.128866


In [16]:
control = df.TxGroup == 'Control'
treatment = df.TxGroup == 'Treatment'

In [17]:
total_df = df.filter(regex="TxGroup|PANSS_Total")
total_df.head()

Unnamed: 0_level_0,TxGroup,PANSS_Total
AssessmentiD,Unnamed: 1_level_1,Unnamed: 2_level_1
301017,Control,84
308004,Control,80
304775,Control,70
310369,Control,68
313218,Control,65


In [18]:
ttest_results = ttest_ind(total_df.PANSS_Total[treatment], total_df.PANSS_Total[~treatment])
ttest_results.pvalue

0.5955753671304398

In [19]:
dg = df.groupby(['PatientID']).agg('last')
control = dg.TxGroup == 'Control'

var = 'PANSS_Total'
ttest_results = ttest_ind(dg[var][control], dg[var][~control])
ttest_results.pvalue

0.7354508165592581

We find that there is no statistical significance using the combined PANSS total. But is there an effect if we consider each variable separately?

In [163]:
dg = df.groupby(['TxGroup', 'PatientID']).agg('last')
dg.head()

# control_df = dg.get_group('Control')
# tx_df = dg.get_group('Treatment')

# dg = df.groupby('TxGroup').agg('last')
# control = dg.TxGroup == 'Control'

# dg = df.filter('TxGroup|(P|N|G)\d+').groupby(['PatientID']).agg('last')
# dg.head()
# control = dg.TxGroup == 'Control'


# ttest_results = ttest_ind(dg.drop(['TxGroup', 'PANSS_Total'], axis=1), dg.drop(['TxGroup', 'PANSS_Total'], axis=1))


Unnamed: 0_level_0,Unnamed: 1_level_0,Study,Country,SiteID,RaterID,VisitDay,P1,P2,P3,P4,P5,P6,P7,N1,N2,N3,N4,N5,N6,N7,G1,G2,G3,G4,G5,G6,G7,G8,G9,G10,G11,G12,G13,G14,G15,G16,PANSS_Total,LeadStatus
TxGroup,PatientID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1
Control,10001,A,USA,20035,30076,67,4,2,4,1,1,4,1,4,3,2,4,4,1,1,3,3,1,2,1,1,2,1,3,2,2,3,3,2,3,4,72,Flagged
Control,10002,A,USA,20011,30016,9,5,5,5,1,2,5,2,4,5,3,4,5,3,3,1,4,1,4,1,4,1,2,5,3,1,3,3,1,3,5,94,Passed
Control,10005,A,USA,20014,30021,93,3,2,3,1,2,3,1,3,3,1,3,4,2,3,1,1,1,1,1,1,3,1,4,1,1,2,1,1,1,3,58,Passed
Control,10010,A,USA,20009,30028,7,5,5,5,3,4,5,1,4,4,3,4,4,3,4,3,3,2,3,3,3,4,3,5,4,4,3,3,1,4,4,106,Passed
Control,10011,A,USA,20032,30038,95,3,5,5,1,1,3,1,3,3,3,4,4,2,4,2,1,2,1,3,1,1,2,3,4,3,6,3,3,5,3,85,Passed


In [60]:
dg = df.groupby('TxGroup')
control_df = dg.get_group('Control')
tx_df = dg.get_group('Treatment')

In [61]:
ttest_results = ttest_ind(control_df.drop(['TxGroup'], axis=1), tx_df.drop(['TxGroup'], axis=1))
ttest_results.pvalue

array([9.64966824e-01, 1.09916279e-02, 9.90728449e-01, 7.40144640e-01,
       1.24770077e-02, 8.16273681e-01, 6.32158292e-01, 1.29835645e-01,
       9.54533873e-01, 3.32178076e-01, 2.95747424e-01, 5.55397104e-01,
       2.92219690e-02, 1.73196594e-01, 2.34599517e-01, 2.82588236e-01,
       1.12584146e-02, 3.83202219e-02, 1.44301469e-04, 7.76533461e-01,
       9.48327442e-01, 2.74904773e-01, 9.25135396e-01, 9.79096428e-01,
       2.59727935e-02, 2.41420283e-01, 1.13304836e-02, 3.26710700e-01,
       2.40418575e-01, 1.39913741e-02])