<h1>Data Exploration & Manipulation</h1>

Code to format & explore all data

Haley Johnson

In [5]:
import os 
import pandas as pd
import numpy as np 

In [70]:
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole session.
# NOTE(review): this silences the warning everywhere, including for genuine
# chained-assignment mistakes in later cells.
pd.options.mode.chained_assignment = None

<h2>Load Data</h2>

In [27]:
# List the files available in the raw-data directory.
os.listdir('../data/raw')

['.DS_Store',
 'best_post_2017_f2f_survey.spss',
 'ons_2021_by_parl_constituency.csv',
 'best_post_2019_rand_prob.spss']

In [37]:
# Load the ONS 2021 table broken down by parliamentary constituency.
demog = pd.read_csv('../data/raw/ons_2021_by_parl_constituency.csv')

# Replace the original headers with nine short snake_case names.
# NOTE(review): 'constiuency' is misspelled (should be 'constituency'); left
# unchanged here because other cells may already reference these names.
demog.columns = ['constiuency_code', 'constiuency_name', 'age_code', 'age_name', 
              'qual_code', 'qual_name', 'sex_code', 'sex_name', 'n']

In [23]:
# Load the 2017 post-election face-to-face survey (SPSS file).
survey = pd.read_spss('../data/raw/best_post_2017_f2f_survey.spss')

In [29]:
# Load the 2019 post-election file (SPSS); "rand_prob" presumably means a
# random-probability sample -- TODO confirm against the data documentation.
turnout = pd.read_spss('../data/raw/best_post_2019_rand_prob.spss')

<h2>Data Manipulation</h2>

In [38]:
# Age bands present in the demographic table.
demog['age_name'].unique()

array(['Aged 15 years and under', 'Aged 16 to 24 years',
       'Aged 25 to 34 years', 'Aged 35 to 49 years',
       'Aged 50 to 64 years', 'Aged 65 years and over'], dtype=object)

<h3>Turnout Data</h3>

In [76]:
# Columns of the turnout file used in this notebook, in the same order as the
# short names assigned below.
target_cols = ['b01', 'b02', 'b04', 'Age', 'y09', 'education', 'edlevel']

# Take an explicit copy: this frame is mutated afterwards (columns renamed here,
# new columns added in later cells), and doing that on a bare selection of
# `turnout` is what triggers pandas' SettingWithCopy warning.
turnout_subset = turnout[target_cols].copy()
turnout_subset.columns = ['voted', 'voter_choice', 'non_voter_choice', 'age', 'sex', 'qual', 'qual_level']

<h4>Recode Demographic Variables</h4>

Recode to match levels in stratification table

In [59]:
# Categories recorded for sex in the turnout data.
turnout_subset['sex'].unique()

['Female', 'Male', 'Prefer not to say', 'Not stated', 'In another way']
Categories (5, object): ['Female', 'In another way', 'Male', 'Not stated', 'Prefer not to say']

In [61]:
# Categories recorded for sex in the demographic table, for comparison with the turnout data.
demog['sex_name'].unique()

array(['Female', 'Male'], dtype=object)

In [64]:
# Shape of the turnout rows whose sex is neither 'Female' nor 'Male',
# i.e. rows with no matching sex level in the demographic table (only 37 rows).
turnout_subset[~turnout_subset['sex'].isin(['Female', 'Male'])].shape

(37, 8)

<h4>Recode Age Data</h4>

In [66]:
# Distinct age values in the turnout data (numeric ages plus string non-response codes).
turnout_subset['age'].unique()

[34.0, 'Refusal', 33.0, 22.0, 35.0, ..., 96.0, 98.0, 99.0, 95.0, 'Not stated']
Length: 84
Categories (84, object): [18.0, 19.0, 20.0, 21.0, ..., 98.0, 99.0, 'Not stated', 'Refusal']

In [67]:
# Age bands in the demographic table that the turnout ages are recoded to match.
demog['age_name'].unique()

array(['Aged 15 years and under', 'Aged 16 to 24 years',
       'Aged 25 to 34 years', 'Aged 35 to 49 years',
       'Aged 50 to 64 years', 'Aged 65 years and over'], dtype=object)

In [68]:
def recode_age(s):
    """Map a single age value onto the age-band labels used in ``demog['age_name']``.

    Parameters
    ----------
    s : number, str, or missing value
        Age in years. Strings (e.g. ``'Refusal'``, ``'Not stated'``) are treated
        as non-response codes rather than ages.

    Returns
    -------
    object
        The age-band label for a numeric age. Strings and missing values
        (``None``, ``NaN``, ``pd.NA``) are returned unchanged.
    """
    # Non-response codes pass straight through.
    if isinstance(s, str):
        return s
    # A missing value compares False against every bound below, so without this
    # guard NaN would reach the final branch and be labelled as 65 and over
    # (and None / pd.NA would raise on the first comparison).
    if pd.isna(s):
        return s
    if s <= 15:
        return 'Aged 15 years and under'
    if s <= 24:
        return 'Aged 16 to 24 years'
    if s <= 34:
        return 'Aged 25 to 34 years'
    if s <= 49:
        return 'Aged 35 to 49 years'
    if s <= 64:
        return 'Aged 50 to 64 years'
    return 'Aged 65 years and over'

In [71]:
# Add an age-band column alongside the raw age; string codes such as 'Refusal'
# are carried through unchanged by recode_age.
turnout_subset['recoded_age'] = turnout_subset['age'].apply(recode_age)

In [73]:
# Spot-check the recoding on 10 randomly drawn rows. No random_state is set,
# so a different sample is shown on each run.
turnout_subset[['age', 'recoded_age']].sample(10)

Unnamed: 0,age,recoded_age
1458,48.0,Aged 35 to 49 years
2138,55.0,Aged 50 to 64 years
3358,78.0,Aged 65 years and over
206,76.0,Aged 65 years and over
3610,76.0,Aged 65 years and over
1616,Refusal,Refusal
1411,Refusal,Refusal
1071,72.0,Aged 65 years and over
392,38.0,Aged 35 to 49 years
3676,71.0,Aged 65 years and over


<h4>Recode Qualification Data</h4>

In [75]:
# Qualification levels present in the demographic table.
demog['qual_name'].unique()

array(['Does not apply', 'No qualifications',
       'Level 1 and entry level qualifications: 1 to 4 GCSEs grade A* to C, Any GCSEs at other grades, O levels or CSEs (any grades), 1 AS level, NVQ level 1, Foundation GNVQ, Basic or Essential Skills',
       'Level 2 qualifications: 5 or more GCSEs (A* to C or 9 to 4), O levels (passes), CSEs (grade 1), School Certification, 1 A level, 2 to 3 AS levels, VCEs, Intermediate or Higher Diploma, Welsh Baccalaureate Intermediate Diploma, NVQ level 2, Intermediate GNVQ, City and Guilds Craft, BTEC First or General Diploma, RSA Diploma',
       'Level 3 qualifications: 2 or more A levels or VCEs, 4 or more AS levels, Higher School Certificate, Progression or Advanced Diploma, Welsh Baccalaureate Advance Diploma, NVQ level 3; Advanced GNVQ, City and Guilds Advanced Craft, ONC, OND, BTEC National, RSA Advanced Diploma',
       'Level 4 qualifications or above: degree (BA, BSc), higher degree (MA, PhD, PGCE), NVQ level 4 to 5, HNC, HND, RSA Higher 

In [79]:
# Qualification levels recorded in the turnout data, for comparison with demog['qual_name'].
turnout_subset['qual_level'].unique()

['Undergraduate', NaN, 'Postgrad', 'No qualifications', 'Below GCSE', 'GCSE', 'A-level']
Categories (6, object): ['A-level', 'Below GCSE', 'GCSE', 'No qualifications', 'Postgrad', 'Undergraduate']