In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import pyreadstat
import re

In [2]:
fpath = 'data/ATP W42.sav'

df, meta = pyreadstat.read_sav(fpath)

In [3]:
# Used meta.column_names to find questionnaire items with subject matter grouping (e.g. starts with 'RQ')
# Added to lists to use later on

past_future = ['PAST_W42', 'FUTURE_W42']
society = ['SC1_W42']
policy = [i for i in df.columns if 'POLICY' in i]
confidence = [i for i in df.columns if 'CONF' in i]
rq_form1 = [i for i in df.columns if 'RQ' in i]
pw_form2 = [i for i in df.columns if 'PQ' in i]
scm = [i for i in df.columns if 'SCM' in i]
q = [i for i in df.columns if re.search("^Q[0-9]", i)]
pop = [i for i in df.columns if 'POP' in i]
knowledge = [i for i in df.columns if 'KNOW' in i]
demographics = [i for i in df.columns if 'F_' in i]
weight = ['WEIGHT_W42']

In [4]:
# Dictionary of column names (abbreviations) and corresponding column values (instruction on questionnaire)
# Example

meta_dict = dict(zip(meta.column_names, meta.column_labels))

##### Example: 
meta_dict['SC1_W42']
##### Output:
'SC1. Overall, would you say science has had a mostly positive effect on our society or a mostly negative effect on our society?'

In [5]:
df_copy = pyreadstat.pyreadstat.set_value_labels(df, meta)

In [6]:
df_copy['Q6F1_W42'].values

['Trust the research findings MORE', 'Makes NO DIFFERENCE either way', NaN, NaN, 'Refused', ..., NaN, 'Trust the research findings MORE', NaN, 'Trust the research findings MORE', NaN]
Length: 4464
Categories (4, object): ['Makes NO DIFFERENCE either way', 'Refused', 'Trust the research findings LESS', 'Trust the research findings MORE']

In [7]:
df['F_RACETHN'].map(meta.variable_value_labels['F_RACETHN']).value_counts(normalize=True)



White non-Hispanic    0.647625
Hispanic              0.160842
Black non-Hispanic    0.113351
Other                 0.064740
Refused               0.013441
Name: F_RACETHN, dtype: float64

In [8]:
demographics = df.columns[-25:-1]
labels = meta.column_labels[-25:-1]

col_dict = zip(demographics, labels)

for k, v in col_dict:
    print(k + ': ' + v)


F_METRO: Metropolitan area indicator coded from FIPS
F_CREGION: Census region based on self-reported zipcode
F_AGECAT: Age category
F_SEX: Sex
F_EDUCCAT: Education level category
F_EDUCCAT2: Education level category 2
F_HISP: HISP.Are you of Hispanic, Latino, or Spanish origin, such as Mexican, Puerto Rican or Cuban? - Includes RACE backcodes
F_RACECMB: Combining race
F_RACETHN: Race-Ethnicity
F_NATIVITY: NATIVITY. Where were you born?
F_CITIZEN: Citizenship
F_MARITAL: Marital status
F_RELIG: Religion
F_BORN: BORN. Would you describe yourself as a born-again or evangelical Christian, or not?
F_ATTEND: Religious service attendance
F_PARTY_FINAL: Party
F_PARTYLN_FINAL: Party Lean
F_PARTYSUM_FINAL: Party summary
F_INCOME: Family income
F_INCOME_RECODE: Family income recoded
F_REG: Registered voter
F_IDEO: Ideology
F_ACSWEB: Household internet status
F_VOLSUM: Volunteerism status


In [9]:
meta.variable_value_labels['SC1_W42'].values()

dict_values(['Mostly positive', 'Mostly negative', 'Equal positive and negative effects', 'Refused'])

In [10]:
f = lambda x: [print(f'{y} \n') for y in x]

f(meta.column_labels)

QKEY: QKEY 

W42 Device Type 

Language of survey materials 

Wave 42 Form value 

PAST. Compared with twenty years ago, do you think developments in science have made people's lives… 

FUTURE. Looking ahead to the next twenty years, do you think developments in science will make people's lives… 

DOV_FUTURE. Assignment for FUTURE_BOE and FUTURE_WOE questions 

Open-end codes for FUTURE_BOE (first mention) 

Open-end codes for FUTURE_BOE (second mention) 

Open-end codes for FUTURE_BOE (third mention) 

Open-end codes for FUTURE_WOE (first mention) 

Open-end codes for FUTURE_WOE (second mention) 

Open-end codes for FUTURE_WOE (third mention) 

SC1. Overall, would you say science has had a mostly positive effect on our society or a mostly negative effect on our society? 

Open-end codes for SC1POS (first mention) 

Open-end codes for SC1POS (second mention) 

Open-end codes for SC1POS (third mention) 

Open-end codes for SC1NEG (first mention) 

Open-end codes for SC1NEG (second menti

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

In [11]:
# Accounting for weight in determing response to Q6F1 from FORM 2

df[['Q6F1_W42', 'WEIGHT_W42']].groupby('Q6F1_W42') \
                              .sum() / df.loc[df['FORM_W42'] == 2.0, ['WEIGHT_W42']].sum() * 100

Unnamed: 0_level_0,WEIGHT_W42
Q6F1_W42,Unnamed: 1_level_1
1.0,50.731415
2.0,10.13704
3.0,36.231776
99.0,0.71529


In [12]:
test = pd.crosstab(df['F_IDEO'] \
        .map(meta.variable_value_labels['F_IDEO']), \
        df['SC1_W42'].map(meta.variable_value_labels['SC1_W42']), \
        dropna=True, normalize='index') \
        .loc[meta.variable_value_labels['F_IDEO'].values()] \
        .loc[:,meta.variable_value_labels['SC1_W42'].values()]*100


test2 = pd.crosstab(df['F_AGECAT']. \
            map(meta.variable_value_labels['F_AGECAT']), \
            df['SC1_W42'].map(meta.variable_value_labels['SC1_W42']), 
            df.WEIGHT_W42, aggfunc = sum, dropna=True, \
            normalize='index'). \
            loc[meta.variable_value_labels['F_AGECAT'].values()]. \
            loc[:,meta.variable_value_labels['SC1_W42'].values()]*100

test2

SC1_W42,Mostly positive,Mostly negative,Equal positive and negative effects,Refused
F_AGECAT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
18-29,69.898944,3.619753,26.481303,0.0
30-49,74.765393,3.292587,21.942019,0.0
50-64,68.117834,4.831061,26.746482,0.304623
65+,79.321439,1.32652,18.60336,0.748681
DK/REF,100.0,0.0,0.0,0.0


In [13]:
import plotly.express as px

In [14]:
def crosstab_frequency(array1, array2):
    return pd.crosstab(df[array1]. \
                map(meta.variable_value_labels[array1]),
                df[array2].map(meta.variable_value_labels[array2]), 
                df.WEIGHT_W42, aggfunc = sum, dropna=True,
                normalize='index'). \
                loc[meta.variable_value_labels[array1].values()]. \
                loc[:, meta.variable_value_labels[array2].values()]*100

def crosstab_frequency2(array1, array2):
    return pd.crosstab(df_copy[array1],
                       df_copy[array2],
                       df_copy.WEIGHT_W42, aggfunc = sum, dropna=True,
                       normalize='index'). \
                       loc[meta.variable_value_labels[array1].values()]. \
                       loc[:, meta.variable_value_labels[array2].values()]*100

test = crosstab_frequency('F_AGECAT', 'CONFa_W42')
test2 = crosstab_frequency2('F_AGECAT', 'POLICY1_W42')

In [26]:
fig = px.bar(test, x=test.columns, y=test.index )
fig2 = px.bar(test2, x=test2.columns, y=test2.index)

fig.update_layout(legend=dict(
    orientation="h",
    yanchor="bottom",
    y=1.02,
    xanchor="right",
    x=1
))

In [16]:
fig2.update_layout(legend=dict(
    orientation="h",
    yanchor="bottom",
    y=1.02,
    xanchor="right",
    x=1
))