## Import Libraries, Define Paths

In [1]:
import pandas as pd
import numpy as np
import os

# Project root on the author's machine; the survey extract lives in its data/ subfolder
base = '/Users/jetcalz07/Desktop/MIDS/W241_Experiments/project/'
data = f'{base}data/'

## Load Data

In [2]:
# Column bookkeeping for the extract.
# keep_cols and rename_cols are parallel lists: entry k of keep_cols (the name in the
# raw export) is renamed to entry k of rename_cols.
_dog_nums = range(1, 5)  # per-dog columns are suffixed _1 .. _4

# Participant-level answers that are carried through unchanged to the final frame
_participant_cols = ['age_bin', 'gender', 'marital', 'income_bin', 'state',
                     'own_dog', 'dog_or_cat', 'adopt_or_shop', 'dog_size',
                     'msg_treat_ind', 'shelter_treat_bin']

keep_cols = (
    ['IPAddress', 'Random ID', 'Q26', 'Q2', 'Q27', 'Q28', 'Q3', 'Q4', 'Q7', 'Q8', 'Q9', 'Q10',
     'Message_Received', 'Shelter_Information']
    + [f'Shelter_Type_{n}' for n in _dog_nums]
    + [f'{size}_{n}' for size in ('Small', 'Medium', 'Large') for n in _dog_nums]
    + ['Duration (in seconds)']
)

rename_cols = (
    ['ip', 'rid', 'rid_pasted']
    + _participant_cols
    + [f'shelter_tp_{n}' for n in _dog_nums]
    + [f'{size}_{n}' for size in ('sm', 'med', 'lg') for n in _dog_nums]
    + ['duration']
)

# Participant-level columns repeated on every participant-dog row
trim_cols = ['ip', 'rid', 'rid_match_qualtrics'] + _participant_cols + ['duration']

# Final long-format layout: participant columns plus the three per-dog fields
desired_cols = trim_cols + ['dog_num', 'rating', 'shelter_tp']

In [3]:
# Read only the needed columns from the Qualtrics export and give them analysis names
fname = 'DOG ADOPTION SURVEY_November 19, 2022_09.38.csv'
column_map = dict(zip(keep_cols, rename_cols))

df_init = (
    pd.read_csv(data + fname, usecols=keep_cols)
    .rename(columns=column_map)
    .iloc[2:]                  # discard the first two rows after the header
    .reset_index(drop=True)    # renumber the remaining rows from 0
    .reset_index()             # expose that numbering as an 'index' column
)

print(f"No. participants: {len(df_init)}")
df_init.head()

No. participants: 1


Unnamed: 0,index,ip,duration,age_bin,gender,marital,income_bin,state,own_dog,dog_or_cat,...,lg_3,lg_4,rid_pasted,rid,msg_treat_ind,shelter_treat_bin,shelter_tp_1,shelter_tp_2,shelter_tp_3,shelter_tp_4
0,0,174.234.0.95,54,Under 18 years,Female,Single,"$25,000-$49,999",Hawaii,"No, but I own another pet",Cats,...,8,3,TEST,690005,Yes,Yes,Traditional Shelter,No-Kill Shelter,Traditional Shelter,No-Kill Shelter


In [4]:
## Participant-level features
# 1. Flag rows where the 'rid' and 'rid_pasted' columns agree (not essential but may help)
rid_matches = df_init['rid'].eq(df_init['rid_pasted'])
df_init['rid_match_qualtrics'] = rid_matches

## Wrangle Data

In [5]:
# Reshape df_init from one row per participant to one row per participant-dog.
# Every output row repeats the participant-level columns (trim_cols) and adds:
#   dog_num    - which of the four rated dogs this row describes (1-4)
#   rating     - the first non-null value among sm_i / med_i / lg_i, as an int
#   shelter_tp - shelter_tp_i, or None when shelter_treat_bin is 'No'
records = []

for _, row in df_init.iterrows():
    base_row = row[trim_cols].tolist()
    # Compared against 'No' exactly as before: any other value (including a
    # missing one) keeps the per-dog shelter type.
    no_shelter_info = row['shelter_treat_bin'] == 'No'

    for i in range(1, 5):
        sizes = row[[f'sm_{i}', f'med_{i}', f'lg_{i}']].dropna()
        # .iloc[0] is explicit positional access; a bare [0] on this label-indexed
        # Series is deprecated in pandas 2.x. If no size column holds a rating,
        # record NaN rather than raising IndexError so the row can be handled later.
        rating = int(sizes.iloc[0]) if len(sizes) else np.nan
        shelter_tp = None if no_shelter_info else row[f'shelter_tp_{i}']
        records.append(base_row + [i, rating, shelter_tp])

# Build the frame once; assigning to df.loc[len(df)] inside the loop re-grows the
# frame on every row, which is quadratic in the number of rows.
df = pd.DataFrame(records, columns=desired_cols)

df.head()

Unnamed: 0,ip,rid,rid_match_qualtrics,age_bin,gender,marital,income_bin,state,own_dog,dog_or_cat,adopt_or_shop,dog_size,msg_treat_ind,shelter_treat_bin,duration,dog_num,rating,shelter_tp
0,174.234.0.95,690005,False,Under 18 years,Female,Single,"$25,000-$49,999",Hawaii,"No, but I own another pet",Cats,I will adopt a dog,Large (Greater than 60 lbs.),Yes,Yes,54,1,7,Traditional Shelter
1,174.234.0.95,690005,False,Under 18 years,Female,Single,"$25,000-$49,999",Hawaii,"No, but I own another pet",Cats,I will adopt a dog,Large (Greater than 60 lbs.),Yes,Yes,54,2,8,No-Kill Shelter
2,174.234.0.95,690005,False,Under 18 years,Female,Single,"$25,000-$49,999",Hawaii,"No, but I own another pet",Cats,I will adopt a dog,Large (Greater than 60 lbs.),Yes,Yes,54,3,8,Traditional Shelter
3,174.234.0.95,690005,False,Under 18 years,Female,Single,"$25,000-$49,999",Hawaii,"No, but I own another pet",Cats,I will adopt a dog,Large (Greater than 60 lbs.),Yes,Yes,54,4,3,No-Kill Shelter


In [6]:
## Create feature at participant-dog level

# We have two treatments: msg_treat_ind and shelter_tp
# Need to make shelter_tp have 3 categories: (None, Trad, NK)
# implemented above but need to test on actual example

## Convert Data Types

In [7]:
# One-hot encode the categorical own_dog answers (displayed only; the result is not stored)
own_dog_answers = df_init['own_dog']
pd.get_dummies(own_dog_answers)

Unnamed: 0,"No, but I own another pet"
0,1


### Analysis Notes
- Run LR on the full dataset (Create interaction terms with treatments)
- Key areas to investigate
    - What had the greatest effect on ratings?
    - Did the message treatment alone increase ratings?
    - Did the shelter type treatment alone increase ratings?
    - Did the interaction of the two treatments increase ratings? Which effect was strongest?
    - Which covariates were most impactful?
        - Ex: did the survey completion time affect ratings?
- Run LR on subsets of features to get a set of models 
- Run F-Test across models to see which models are most predictive/significant