In [1]:
import csv
import numpy as np
import pandas as pd
from scipy.stats import ttest_ind, chi2_contingency
import statsmodels.api as sm

## Read in data

In [2]:
df = pd.read_csv('../data/v2_BabyStudy2_16jan21.csv', sep=';')

In [3]:
print(f'n obs by start: {len(df)}')

n obs by start: 54


## prepare data

In [4]:
gender_column = ['gender']
brosis_column = ['siblings']
prosocial_columns = [
    'PV_protest_new_yn',
    'PV_tattling_new_yn',
    'PV_prosocial_new_yn',
]
needed_columns = gender_column + brosis_column + prosocial_columns

In [5]:
for c in needed_columns:
    df = df.loc[lambda x: ~x[c].isnull()]
    df = df.loc[lambda x: x[c]!='']
    df = df.loc[lambda x: x[c]!=' ']
    df[c] = df[c].astype(int)

In [6]:
df_select = df[needed_columns].copy()
df_select['constant'] = 1
df_select['siblings_yn'] = 0
df_select.loc[lambda x: x['siblings']>0, 'siblings_yn'] = 1

In [7]:
df.to_csv('../data/v2_BabyStudy2_16jan21_clean.csv', index=False, quoting=csv.QUOTE_NONNUMERIC, quotechar='"')

In [8]:
print(f'n obs after filer: {len(df_select)}')

n obs after filer: 53


### check 0 ==> general checks

In [9]:
df_select[:5]

Unnamed: 0,gender,siblings,PV_protest_new_yn,PV_tattling_new_yn,PV_prosocial_new_yn,constant,siblings_yn
1,1,3,0,0,0,1,1
2,2,1,0,0,0,1,1
3,1,2,0,0,0,1,1
4,2,1,0,0,0,1,1
5,1,1,0,0,0,1,1


### check 1 ==> gender x pro-social ==> indepedent sample T test

In [11]:
group1_male = df_select.loc[lambda x: x['gender']==1].copy()
group2_female = df_select.loc[lambda x: x['gender']==2].copy()

#### - t test

In [13]:
for p in prosocial_columns:
    print(f'-> {p}:')
    result = ttest_ind(group1_male[p], group2_female[p])
    print(f'==> t-statistic {round(result[1], 4)}')
    print(f'==> P-value {round(result[1], 4)}\n')

-> PV_protest_new_yn:
==> t-statistic 0.8085
==> P-value 0.8085

-> PV_tattling_new_yn:
==> t-statistic 0.2336
==> P-value 0.2336

-> PV_prosocial_new_yn:
==> t-statistic 0.8965
==> P-value 0.8965



#### - chi square test

In [14]:
df_select.groupby(['gender', 'PV_prosocial_new_yn'])['siblings'].count()

gender  PV_prosocial_new_yn
1       0                      27
        1                       5
2       0                      18
        1                       3
Name: siblings, dtype: int64

In [15]:
for p in prosocial_columns:
    print(f'-> {p}:')
    crosstable = pd.crosstab(df_select['gender'], df[p]) 
    c, p, dof, expected = chi2_contingency(crosstable)
    print(f'==> P-value {round(p, 4)}\n')

-> PV_protest_new_yn:
==> P-value 0.9219

-> PV_tattling_new_yn:
==> P-value 0.3682

-> PV_prosocial_new_yn:
==> P-value 0.7956



### check 2 ==> brosis x pro-social ==> logistic regression

In [16]:
dfcheck = df_select.groupby(['siblings', 'PV_prosocial_new_yn'], as_index=False)[['gender']].count()

In [17]:
for s in dfcheck['siblings'].unique():
    dfcheck.loc[lambda x: x['siblings']==s, 'share'] = dfcheck.loc[lambda x: x['siblings']==s]['gender'] / dfcheck.loc[lambda x: x['siblings']==s]['gender'].sum()

In [20]:
for p in prosocial_columns:
    print(f'===============> {p}: ===============> \n')

    X_train = df_select[brosis_column + ['constant']]
    y_train = df_select[[p]] 
    model = sm.Logit(y_train, X_train).fit() 

    print(model.summary())
    
    print('-- odds ratio:')
    df_or = pd.DataFrame(np.exp(model.params), columns=['OD'])
    print(df_or)
    print('')


Optimization terminated successfully.
         Current function value: 0.506217
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:      PV_protest_new_yn   No. Observations:                   53
Model:                          Logit   Df Residuals:                       51
Method:                           MLE   Df Model:                            1
Date:                Tue, 25 May 2021   Pseudo R-squ.:                0.008757
Time:                        14:58:22   Log-Likelihood:                -26.829
converged:                       True   LL-Null:                       -27.066
Covariance Type:            nonrobust   LLR p-value:                    0.4911
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
siblings       0.3041      0.440      0.692      0.489      -0.557       1.166
constant      -1.6992      0

### check 3 ==> moderation brosis -> gender x pro-social ==> PROCESS

In [21]:
from pyprocessmacro import Process

In [22]:
df_select['gender_01'] = 0
df_select.loc[lambda x: x['gender']==2, 'gender_01'] = 1

  and should_run_async(code)


In [23]:
df_select[:5]

  and should_run_async(code)


Unnamed: 0,gender,siblings,PV_protest_new_yn,PV_tattling_new_yn,PV_prosocial_new_yn,constant,siblings_yn,gender_01
1,1,3,0,0,0,1,1,0
2,2,1,0,0,0,1,1,1
3,1,2,0,0,0,1,1,0
4,2,1,0,0,0,1,1,1
5,1,1,0,0,0,1,1,0


In [24]:
for p in prosocial_columns:
    print(f'==> {p}: ==>\n')
    print(' ')
    model = Process(
                data=df_select,
                model=1,
                x="gender_01",
                y=p,
                m=["siblings"],
                logit=True,
                suppr_init=True,
                   )

    print(model.summary())
    print('\n')

==> PV_protest_new_yn: ==>

 

***************************** OUTCOME MODELS ****************************


**************************************************************************
Outcome = PV_protest_new_yn 
Logistic Regression Summary

    -2LL  Model LL  p-value  McFadden  Cox-Snell  Nagelkerke   n
 53.5305    0.6025   0.8958    0.0111     0.0113      0.0177  53

Coefficients

                     coeff     se       Z      p    LLCI    ULCI
Cons               -1.5704 0.7222 -2.1745 0.0297 -2.9858 -0.1550
gender_01          -0.5755 1.6897 -0.3406 0.7334 -3.8872  2.7361
siblings            0.2518 0.4697  0.5362 0.5918 -0.6687  1.1723
gender_01*siblings  0.3394 1.2556  0.2703 0.7869 -2.1215  2.8002

-------------------------------------------------------------------------


********************** CONDITIONAL EFFECTS **********************

Conditional effect(s) of gender_01 on PV_protest_new_yn at values of the moderator(s):

  siblings  Effect     SE       Z      p    LLCI   ULCI
  

  and should_run_async(code)


### check 4 (extra) ==> moderation gender -> brosis x pro-social ==> PROCESS

In [26]:
for p in prosocial_columns:
    print(f'==> {p}: ==>\n')
    print(' ')
    model = Process(data=df_select,
                    model=1,
                    x="siblings",
                    y=p,
                    m=["gender_01"],
                    logit=True,
                    suppr_init=True,)

    print(model.summary())
    print('\n')

==> PV_protest_new_yn: ==>

 

***************************** OUTCOME MODELS ****************************


**************************************************************************
Outcome = PV_protest_new_yn 
Logistic Regression Summary

    -2LL  Model LL  p-value  McFadden  Cox-Snell  Nagelkerke   n
 53.5305    0.6025   0.8958    0.0111     0.0113      0.0177  53

Coefficients

                     coeff     se       Z      p    LLCI    ULCI
Cons               -1.5704 0.7222 -2.1745 0.0297 -2.9858 -0.1550
siblings            0.2518 0.4697  0.5362 0.5918 -0.6687  1.1723
gender_01          -0.5755 1.6897 -0.3406 0.7334 -3.8872  2.7361
siblings*gender_01  0.3394 1.2556  0.2703 0.7869 -2.1215  2.8002

-------------------------------------------------------------------------


********************** CONDITIONAL EFFECTS **********************

Conditional effect(s) of siblings on PV_protest_new_yn at values of the moderator(s):

  gender_01  Effect     SE      Z      p    LLCI   ULCI
   

  and should_run_async(code)
