In [None]:
# Dependencies
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import chi2_contingency, pearsonr

# Load the survey export from the Resources folder into a DataFrame
df_us_20208 = pd.read_csv(filepath_or_buffer='Resources/us202208.csv')

# Last expression of the cell: render the loaded frame
df_us_20208

## Part 1: Translating the DataFrame

In [None]:
# Give the coded survey columns used in this analysis readable names.
# rename() returns a new frame; columns not listed here keep their codes.
df_us_20208_summary = df_us_20208.rename(
    columns=dict(
        D1_A="AGE",
        D1_G="GENDER",
        VAXANY="VACCINE STATUS",
        L7="TRUST",
        Xp6="POS-CASES",
        CVAX4="TRUST GOV",
        D0="HEALTHY",
        D2="STATE",
        D3="RURAL",
        D4="INCOME",
        D5_E="HISPANIC",
        D5_M="RACE",
        D6="RELIGION",
        D6_E="RELIGIOUS",
        D7="HEALTH COVERAGE",
        D8="POLITICS",
        D9="WORK STATUS",
        D13="ILLNESS",
        D15="EDUCATION",
    )
)

df_us_20208_summary



In [None]:
# Organize the data to only show the columns needed.
# .copy() makes the subset an independent DataFrame: the decoding cell that
# follows assigns new values to these columns, and assigning into a plain
# column selection of df_us_20208_summary can trigger pandas'
# SettingWithCopyWarning.
df_organized_us_20208 = df_us_20208_summary[[
    "AGE", "GENDER", "VACCINE STATUS", "TRUST", "POS-CASES", "TRUST GOV",
    "HEALTHY", "STATE", "RURAL", "INCOME", "HISPANIC", "RACE", "RELIGION",
    "RELIGIOUS", "HEALTH COVERAGE", "POLITICS", "WORK STATUS", "ILLNESS",
    "EDUCATION",
]].copy()
df_organized_us_20208


In [None]:
# Dictionary mapping numerical values to gender labels
numerical_to_gender = {1: 'Male', 2: 'Female', 3: 'Other'}

# Dictionary mapping numerical values to vaccine status labels
numerical_to_vaccine_status = {1: 'Yes', 2: 'No'}

# Dictionary mapping numerical values to trust in vaccines labels
numerical_to_trust_vaccine = {
    '1': 'Strongly disagree',
    '2': 'Disagree',
    '3': 'Neither agree nor disagree',
    '4': 'Agree',
    '5': 'Strongly agree',
    '.': 'n/a',
}

# Dictionary mapping numerical values to positive cases labels
numerical_to_pos_cases = {'1': 'Once', '2': 'Twice', '3': 'Three or more', '.': 'None'}

# Replace the codes with their labels in the GENDER, VACCINE STATUS, TRUST and
# POS-CASES columns.
# - The codes are read from df_us_20208_summary (the renamed frame that still
#   holds the original codes), so re-running this cell gives the same result
#   instead of mapping already-decoded labels to NaN.
# - assign() builds the decoded columns on a new frame rather than writing
#   strings into the existing columns in place, which recent pandas versions
#   reject or warn about when the existing column is numeric.
# - Series.map() turns any code that is missing from the dictionary into NaN.
# - TRUST and POS-CASES are cast to str first because their dictionaries use
#   string keys (including '.' for a missing answer).
df_organized_us_20208 = df_organized_us_20208.assign(**{
    'GENDER': df_us_20208_summary['GENDER'].map(numerical_to_gender),
    'VACCINE STATUS': df_us_20208_summary['VACCINE STATUS'].map(numerical_to_vaccine_status),
    'TRUST': df_us_20208_summary['TRUST'].astype(str).map(numerical_to_trust_vaccine),
    'POS-CASES': df_us_20208_summary['POS-CASES'].astype(str).map(numerical_to_pos_cases),
})

# Dictionary mapping numerical values to trust in government labels.
# Every key is a string because the column is cast to str before mapping;
# an integer key would never match and its rows would silently become NaN.
numerical_to_trust_gov = {
    '1': 'I have not changed how much I trust them',
    '2': 'Now trust them more',
    '3': 'I now trust them less',
    '.': 'NA',
}

# Dictionary mapping numerical values to health labels
numerical_to_healthy = {'1': 'Very good', '2': 'Good', '3': 'Fair', '4': 'Poor', '5': 'Very Poor', '.': 'NA'}

# Replace the codes with their labels in the TRUST GOV and HEALTHY columns.
# The codes are read from df_us_20208_summary (the renamed frame that still
# holds the original codes) and the decoded columns are attached with
# assign(), so the cell can be re-run without turning the labels into NaN.
# NOTE(review): if a coded column is stored as float, astype(str) yields
# '1.0' rather than '1' and no key matches — confirm the column dtypes.
df_organized_us_20208 = df_organized_us_20208.assign(**{
    'TRUST GOV': df_us_20208_summary['TRUST GOV'].astype(str).map(numerical_to_trust_gov),
    'HEALTHY': df_us_20208_summary['HEALTHY'].astype(str).map(numerical_to_healthy),
})

# Dictionary mapping numerical values to state labels
# ('.' is the code used for a missing answer)
numerical_to_state = {
    '1': 'Alabama',
    '2': 'Alaska',
    '3': 'Arizona',
    '4': 'Arkansas',
    '5': 'California',
    '6': 'Colorado',
    '7': 'Connecticut',
    '8': 'Delaware',
    '9': 'District of Columbia',
    '10': 'Florida',
    '11': 'Georgia',
    '12': 'Hawaii',
    '13': 'Idaho',
    '14': 'Illinois',
    '15': 'Indiana',
    '16': 'Iowa',
    '17': 'Kansas',
    '18': 'Kentucky',
    '19': 'Louisiana',
    '20': 'Maine',
    '21': 'Maryland',
    '22': 'Massachusetts',
    '23': 'Michigan',
    '24': 'Minnesota',
    '25': 'Mississippi',
    '26': 'Missouri',
    '27': 'Montana',
    '28': 'Nebraska',
    '29': 'Nevada',
    '30': 'New Hampshire',
    '31': 'New Jersey',
    '32': 'New Mexico',
    '33': 'New York',
    '34': 'North Carolina',
    '35': 'North Dakota',
    '36': 'Ohio',
    '37': 'Oklahoma',
    '38': 'Oregon',
    '39': 'Pennsylvania',
    '40': 'Rhode Island',
    '41': 'South Carolina',
    '42': 'South Dakota',
    '43': 'Tennessee',
    '44': 'Texas',
    '45': 'Utah',
    '46': 'Vermont',
    '47': 'Virginia',
    '48': 'Washington',
    '49': 'West Virginia',
    '50': 'Wisconsin',
    '51': 'Wyoming',
    '52': 'American Samoa',
    '53': 'Guam',
    '54': 'Northern Mariana Islands',
    '55': 'Puerto Rico',
    '56': 'Virgin Islands',
    '.': 'NA',
}

# Replace the codes with state labels in the STATE column.
# The codes are read from df_us_20208_summary (the renamed frame that still
# holds the original codes) and attached with assign(), so re-running the
# cell reproduces the same labels instead of mapping labels to NaN.
df_organized_us_20208 = df_organized_us_20208.assign(
    STATE=df_us_20208_summary['STATE'].astype(str).map(numerical_to_state)
)

# Dictionary mapping numerical values to rural status labels
numerical_to_rural = {1: 'Yes', 2: 'No'}

# Dictionary mapping numerical values to income status labels
numerical_to_income = {
    '1': 'less than $12,000 a year',
    '2': '$12,000-$23,999 a year',
    '3': '$24,000-$35,999 a year',
    '4': '$36,000-$59,999 a year',
    '5': '$60,000-$95,999 a year',
    '6': '$96,000-$119,999 a year',
    '7': '$120,000-$155,999 a year',
    '8': '$156,000 a year or more',
}

# Dictionary mapping numerical values to Hispanic status labels
numerical_to_hispanic = {'1': 'Yes', '2': 'No'}

# Dictionary mapping numerical values to Race
numerical_to_race = {
    '1': 'White',
    '2': 'Black or African American',
    '3': 'American Indian or Alaska Native',
    '4': 'Asian',
    '5': 'Pacific Islander',
    '.': 'N/A',
}

# Replace the codes with their labels in the RURAL, INCOME, HISPANIC and RACE
# columns.
# - The codes are read from df_us_20208_summary (the renamed frame that still
#   holds the original codes), so the cell can be re-run safely.
# - assign() attaches the decoded columns to a new frame instead of writing
#   strings into the existing (possibly numeric) columns in place.
# - Codes without a dictionary entry become NaN.
df_organized_us_20208 = df_organized_us_20208.assign(**{
    'RURAL': df_us_20208_summary['RURAL'].map(numerical_to_rural),
    'INCOME': df_us_20208_summary['INCOME'].astype(str).map(numerical_to_income),
    'HISPANIC': df_us_20208_summary['HISPANIC'].astype(str).map(numerical_to_hispanic),
    'RACE': df_us_20208_summary['RACE'].astype(str).map(numerical_to_race),
})

# Dictionary mapping numerical values to Religion
numerical_to_religion = {
    '1': 'Protestant',
    '2': 'Roman Catholic',
    '3': 'Mormon',
    '4': 'Orthodox (such as Greek or Russian Orthodox)',
    '5': 'Other or nondenominational Christian',
    '6': 'Jewish',
    '7': 'Muslim',
    '8': 'Buddhist',
    '9': 'Hindu',
    '10': 'Atheist',
    '11': 'Agnostic',
    '12': 'Something else',
    '13': 'Nothing in particular',
}

# Dictionary mapping numerical values to religious status labels
numerical_to_religious = {'1': 'Yes', '2': 'No', '.': 'N/A'}

# Dictionary mapping numerical values to health coverage labels
numerical_to_health_coverage = {
    '1': 'A plan purchased through an employer or union',
    '2': 'A plan that you or another family member buys on your own',
    '3': 'Medicare',
    '4': 'Medicaid or other state program',
    '5': 'TRICARE (formerly CHAMPUS), VA, or Military',
    '6': 'Alaska Native, Indian Health Service, Tribal Health Services',
    '7': 'Some other source',
    '.': 'N/A',
}

# Replace the codes with their labels in the RELIGION, RELIGIOUS and
# HEALTH COVERAGE columns.
# The codes are read from df_us_20208_summary (the renamed frame that still
# holds the original codes) and attached with assign(), so re-running the
# cell gives the same labels and no strings are written in place into an
# existing numeric column. Codes without a dictionary entry become NaN.
df_organized_us_20208 = df_organized_us_20208.assign(**{
    'RELIGION': df_us_20208_summary['RELIGION'].astype(str).map(numerical_to_religion),
    'RELIGIOUS': df_us_20208_summary['RELIGIOUS'].astype(str).map(numerical_to_religious),
    'HEALTH COVERAGE': df_us_20208_summary['HEALTH COVERAGE'].astype(str).map(numerical_to_health_coverage),
})

# Dictionary mapping numerical values to politics labels
numerical_to_politics = {1: 'Democrat', 2: 'Independent', 3: 'Republican'}

# Dictionary mapping numerical values to work status labels
numerical_to_work_status = {
    1: 'Employed for wages',
    2: 'Self-employed',
    3: 'Out of work for 1 yr or more',
    4: 'Out of work for less than 1 yr',
    5: 'A Homemaker',
    6: 'A Student',
    7: 'Retired',
    8: 'Unable to work',
}

# Dictionary mapping numerical values to illness labels
numerical_to_illness = {1: 'Yes', 2: 'No', 3: 'Dont know'}

# Dictionary mapping numerical values to education labels
numerical_to_education = {
    '1': 'Never attended school',
    '2': 'Grades 1 through 8',
    '3': 'Grades 9 through 11',
    '4': 'Grade 12 or GED',
    '5': 'Some college, Associate’s, or Technical Degree',
    '6': 'Bachelor’s Degree',
    '7': 'Any post graduate studies',
}

# Replace the codes with their labels in the POLITICS, WORK STATUS, ILLNESS
# and EDUCATION columns.
# - The codes are read from df_us_20208_summary (the renamed frame that still
#   holds the original codes), so the cell can be re-run safely.
# - assign() attaches the decoded columns to a new frame instead of writing
#   strings into the existing numeric columns in place.
# - EDUCATION is cast to str because its dictionary uses string keys; the
#   other three dictionaries use integer keys and map the codes directly.
df_organized_us_20208 = df_organized_us_20208.assign(**{
    'POLITICS': df_us_20208_summary['POLITICS'].map(numerical_to_politics),
    'WORK STATUS': df_us_20208_summary['WORK STATUS'].map(numerical_to_work_status),
    'ILLNESS': df_us_20208_summary['ILLNESS'].map(numerical_to_illness),
    'EDUCATION': df_us_20208_summary['EDUCATION'].astype(str).map(numerical_to_education),
})

df_organized_us_20208

## Part 2: Filtering the DataFrame to determine which variables matter most in the decision to take a vaccine

In [None]:
# Number of survey respondents recorded for each state
STATE_C = df_organized_us_20208.loc[:, 'STATE'].value_counts()
STATE_C

In [None]:
# Respondent counts per category for six demographic columns,
# computed in one pass and unpacked into one Series per column
POLITICS, RELIGION, TRUS, EDUCATION, GENDER, RACE = (
    df_organized_us_20208[column].value_counts()
    for column in ('POLITICS', 'RELIGION', 'TRUST GOV', 'EDUCATION', 'GENDER', 'RACE')
)

# Show all six distributions as the cell output
GENDER, POLITICS, TRUS, RELIGION, RACE, EDUCATION


In [None]:
# Show the dataframe with only the vaccinated.
# An exact comparison is used instead of .str.contains(): for rows whose
# vaccine status is NaN the comparison yields False, whereas .str.contains()
# yields NaN and a mask containing NaN cannot be used for row selection.
vaccinated_people_filtered = df_organized_us_20208.loc[
    df_organized_us_20208['VACCINE STATUS'] == 'Yes'
]

vaccinated_people_filtered

In [None]:
# Vaccinated respondents per state, largest count first
STATE_Y = (
    vaccinated_people_filtered
    .loc[:, 'STATE']
    .value_counts()
    .sort_values(ascending=False)
)
STATE_Y


In [None]:
# Respondents per education category (all interviewed, vaccinated or not)
EDUCATION_T = df_organized_us_20208.loc[:, 'EDUCATION'].value_counts()

print(EDUCATION_T)

In [None]:
# Vaccinated respondents per education category, largest count first
(
    vaccinated_people_filtered
    .loc[:, 'EDUCATION']
    .value_counts()
    .sort_values(ascending=False)
)

In [None]:
# Vaccinated respondents per category for four columns,
# unpacked into one Series per column
POLITICS_Y, RELIGION_Y, TRUS_Y, RACE_Y = (
    vaccinated_people_filtered[column].value_counts()
    for column in ('POLITICS', 'RELIGION', 'TRUST GOV', 'RACE')
)

# Show the four distributions as the cell output
POLITICS_Y, TRUS_Y, RELIGION_Y, RACE_Y

In [None]:
# Vaccinated respondents per age value
AGE_Y = vaccinated_people_filtered.loc[:, 'AGE'].value_counts()
# Display only the first 40 entries of the count table
AGE_Y.iloc[:40]

In [None]:
# Vaccinated respondents per gender
# NOTE(review): this rebinds GENDER, which the distribution cell above
# assigned to the gender counts for all respondents — confirm that no later
# cell expects the all-respondent counts under this name.
GENDER = vaccinated_people_filtered.loc[:, 'GENDER'].value_counts()
GENDER

## Part 3: Plotting the most representative variables

In [None]:
# Turn the per-state respondent counts into a two-column table:
# the former index becomes STATE and the counts become COUNT
STATE_C_table = STATE_C.reset_index().set_axis(["STATE", "COUNT"], axis=1)

# Print the resulting table
print(STATE_C_table)

In [None]:
# Bar chart of the number of respondents per state
data_state_dist = STATE_C_table.plot.bar(
    x='STATE',
    y='COUNT',
    figsize=(10, 8),
    color='cadetblue',
    title='Data Distribution by State',
)
data_state_dist.set_xlabel('State')

# Tilt the state names and anchor them on their right edge
state_tick_labels = data_state_dist.get_xticklabels()
data_state_dist.set_xticklabels(state_tick_labels, rotation=45, ha='right')

# Render the figure
plt.show()


In [None]:
# Turn the per-race respondent counts into a two-column table:
# the former index becomes RACE and the counts become COUNT
RACE_table = RACE.reset_index().set_axis(["RACE", "COUNT"], axis=1)

# Print the resulting table
print(RACE_table)

In [None]:
# Bar chart of the number of respondents per race.
# NOTE(review): the axes handle keeps the name data_state_dist so any later
# reference to it still resolves; a race-specific name would be clearer.
data_state_dist = RACE_table.plot(
    kind='bar',
    x='RACE',
    y='COUNT',
    figsize=(6, 4),
    color='cadetblue',
    title='Data Distribution by RACE'
)
# The x axis shows race categories, so label it accordingly
data_state_dist.set_xlabel('Race')

# Tilt the category names and anchor them on their right edge
data_state_dist.set_xticklabels(data_state_dist.get_xticklabels(), rotation=45, ha='right')

# Display the plot
plt.show()

In [None]:
# Vaccine status for rural residence.
# Cross-tabulate rural residence against vaccine status and run a chi-square
# test of independence on the counts. chi2_contingency comes from the
# notebook's import cell.
contingency_table = pd.crosstab(df_organized_us_20208['RURAL'], df_organized_us_20208['VACCINE STATUS'])
chi2, p_value, dof, expected = chi2_contingency(contingency_table)

print(contingency_table)
# Report the test result alongside the table
print(f"Chi-square = {chi2:.3f}, dof = {dof}, p-value = {p_value:.4g}")

In [None]:
# Stacked bar chart of the rural-residence contingency table.
# contingency_table is not defined in this cell; it is read from the
# notebook namespace as left by the cell that ran before this one.
rural_ax = contingency_table.plot.bar(stacked=True)

# Write each count in the middle of its bar segment
for bar_position, (_, counts) in enumerate(contingency_table.iterrows()):
    segment_bottom = 0
    for status in contingency_table.columns:
        segment_height = counts[status]
        rural_ax.text(
            bar_position,
            segment_bottom + segment_height / 2,
            int(segment_height),
            ha='center',
            va='center',
        )
        # The next segment starts where this one ends
        segment_bottom += segment_height

rural_ax.set_title('Vaccine Status by Rural Residence')
rural_ax.set_xlabel('Rural Residence')
rural_ax.set_ylabel('Count')
rural_ax.legend(title='Vaccine Status')
plt.show()

In [None]:
# Vaccine status by political affiliation.
# Cross-tabulate political affiliation against vaccine status and run a
# chi-square test of independence on the counts. chi2_contingency comes from
# the notebook's import cell.
contingency_table = pd.crosstab(df_organized_us_20208['POLITICS'], df_organized_us_20208['VACCINE STATUS'])
chi2, p_value, dof, expected = chi2_contingency(contingency_table)

print(contingency_table)
# Report the test result alongside the table
print(f"Chi-square = {chi2:.3f}, dof = {dof}, p-value = {p_value:.4g}")

In [None]:
# Counts of each vaccine status within each political affiliation
politics_vaccine_table = pd.crosstab(
    index=df_organized_us_20208['POLITICS'],
    columns=df_organized_us_20208['VACCINE STATUS'],
)

# One stacked bar per affiliation, one segment per vaccine status
ax = politics_vaccine_table.plot.bar(stacked=True, figsize=(10, 7))

# Label each segment with its height; segments of height 10 or less get an
# empty label
for container in ax.containers:
    heights = [bar.get_height() for bar in container]
    segment_labels = [height if height > 10 else '' for height in heights]
    ax.bar_label(container, label_type='center', labels=segment_labels)

ax.set(
    title='Vaccine Status by Political Affiliation',
    xlabel='Political Affiliation',
    ylabel='Count',
)
ax.legend(title='Vaccine Status')
plt.show()

In [None]:
# Calculate the total number of respondents for each political affiliation
# (count() ignores rows whose vaccine status is missing)
total_responses = df_organized_us_20208.groupby('POLITICS')['VACCINE STATUS'].count()

# Calculate the number of "Yes" responses for each political affiliation.
# reindex(..., fill_value=0) keeps every affiliation present in
# total_responses, so a group without any "Yes" answer is reported as 0%
# instead of NaN.
yes_responses = (
    df_organized_us_20208[df_organized_us_20208['VACCINE STATUS'] == 'Yes']
    .groupby('POLITICS')['VACCINE STATUS']
    .count()
    .reindex(total_responses.index, fill_value=0)
)

# Calculate the percentages of "Yes" responses
percentages_yes = (yes_responses / total_responses) * 100

# Display the percentages of "Yes" responses
print("Percentages of Yes responses by political affiliation:")
print(percentages_yes)

In [None]:
# Vaccination status by gender.
# Cross-tabulate gender against vaccine status and run a chi-square test of
# independence on the counts. chi2_contingency comes from the notebook's
# import cell.
contingency_table = pd.crosstab(df_organized_us_20208['GENDER'], df_organized_us_20208['VACCINE STATUS'])
chi2, p_value, dof, expected = chi2_contingency(contingency_table)

print(contingency_table)
# Report the test result alongside the table
print(f"Chi-square = {chi2:.3f}, dof = {dof}, p-value = {p_value:.4g}")

In [None]:
# Counts of each vaccine status within each gender
gender_vaccine_table = pd.crosstab(
    index=df_organized_us_20208['GENDER'],
    columns=df_organized_us_20208['VACCINE STATUS'],
)

# One stacked bar per gender, one segment per vaccine status
ax = gender_vaccine_table.plot.bar(stacked=True, figsize=(10, 7))

# Label each segment with its height; segments of height 10 or less get an
# empty label
for container in ax.containers:
    heights = [bar.get_height() for bar in container]
    segment_labels = [height if height > 10 else '' for height in heights]
    ax.bar_label(container, label_type='center', labels=segment_labels)

ax.set(
    title='Vaccine Status by Gender',
    xlabel='Gender',
    ylabel='Count',
)
ax.legend(title='Vaccine Status')
plt.show()

In [None]:
# Vaccine status by education.
# Cross-tabulate education level against vaccine status and run a chi-square
# test of independence on the counts. chi2_contingency comes from the
# notebook's import cell.
contingency_table = pd.crosstab(df_organized_us_20208['EDUCATION'], df_organized_us_20208['VACCINE STATUS'])
chi2, p_value, dof, expected = chi2_contingency(contingency_table)

print(contingency_table)
# Report the test result alongside the table
print(f"Chi-square = {chi2:.3f}, dof = {dof}, p-value = {p_value:.4g}")

In [None]:
# Counts of each vaccine status within each education level
education_vaccine_table = pd.crosstab(
    index=df_organized_us_20208['EDUCATION'],
    columns=df_organized_us_20208['VACCINE STATUS'],
)

# One stacked bar per education level, one segment per vaccine status
ax = education_vaccine_table.plot.bar(stacked=True, figsize=(10, 7))

# Label each segment with its height; segments of height 10 or less get an
# empty label
for container in ax.containers:
    heights = [bar.get_height() for bar in container]
    segment_labels = [height if height > 10 else '' for height in heights]
    ax.bar_label(container, label_type='center', labels=segment_labels)

ax.set(
    title='Vaccine Status by Education',
    xlabel='Education Level',
    ylabel='Count',
)
ax.legend(title='Vaccine Status')
plt.show()

In [None]:
# Share of "Yes" answers within each education level, in percent.
# education_vaccine_table is read from the notebook namespace; it is not
# built in this cell.
yes_counts = education_vaccine_table['Yes']
row_totals = education_vaccine_table.sum(axis=1)
education_vaccine_percentages = (yes_counts / row_totals) * 100

# Bar chart of the percentages
percent_ax = education_vaccine_percentages.plot.bar(color='skyblue', figsize=(10, 6))

# Title and axis labels
percent_ax.set_title('Percentage of Vaccination Status by Education Level')
percent_ax.set_xlabel('Education Level')
percent_ax.set_ylabel('Percentage of "Yes" responses')

# Render the figure
plt.show()

In [None]:
# Vaccine status by religion.
# Cross-tabulate religion against vaccine status and run a chi-square test of
# independence on the counts. chi2_contingency comes from the notebook's
# import cell.
contingency_table = pd.crosstab(df_organized_us_20208['RELIGION'], df_organized_us_20208['VACCINE STATUS'])
chi2, p_value, dof, expected = chi2_contingency(contingency_table)

print(contingency_table)
# Report the test result alongside the table
print(f"Chi-square = {chi2:.3f}, dof = {dof}, p-value = {p_value:.4g}")

In [None]:
# Calculate the total number of respondents for each religion
# (count() ignores rows whose vaccine status is missing)
total_responses_rel = df_organized_us_20208.groupby('RELIGION')['VACCINE STATUS'].count()

# Calculate the number of "Yes" responses for each religion.
# reindex(..., fill_value=0) keeps every religion present in
# total_responses_rel, so a group without any "Yes" answer is reported as 0%
# instead of NaN.
yes_responses_rel = (
    df_organized_us_20208[df_organized_us_20208['VACCINE STATUS'] == 'Yes']
    .groupby('RELIGION')['VACCINE STATUS']
    .count()
    .reindex(total_responses_rel.index, fill_value=0)
)

# Calculate the percentages of "Yes" responses
percentages_yes_rel = (yes_responses_rel / total_responses_rel) * 100

# Display the percentages of "Yes" responses by religion
print("Percentages of Yes responses by religion:")
print(percentages_yes_rel)

In [None]:
# Counts of each vaccine status within each religion.
# The table gets its own name so it does not overwrite
# education_vaccine_table, which the education percentage cell reads.
religion_vaccine_table = pd.crosstab(df_organized_us_20208['RELIGION'], df_organized_us_20208['VACCINE STATUS'])

# Plot the stacked bar chart
ax = religion_vaccine_table.plot(kind='bar', stacked=True, figsize=(10, 7))

# Annotate value labels to each section of the stacked bars
for c in ax.containers:
    # Segments of height 10 or less get an empty label
    labels = [v.get_height() if v.get_height() > 10 else '' for v in c]
    ax.bar_label(c, label_type='center', labels=labels)

plt.title('Vaccine Status by Religion')
plt.xlabel('Religion')
plt.ylabel('Count')
plt.legend(title='Vaccine Status')
plt.show()

In [None]:
# Vaccination status by state.
# Cross-tabulate state against vaccine status and run a chi-square test of
# independence on the counts. chi2_contingency comes from the notebook's
# import cell.
contingency_table = pd.crosstab(df_organized_us_20208['STATE'], df_organized_us_20208['VACCINE STATUS'])
chi2, p_value, dof, expected = chi2_contingency(contingency_table)

print(contingency_table)
# Report the test result alongside the table
print(f"Chi-square = {chi2:.3f}, dof = {dof}, p-value = {p_value:.4g}")

In [None]:
# Counts of each vaccine status within each state
state_vaccine_table = pd.crosstab(
    index=df_organized_us_20208['STATE'],
    columns=df_organized_us_20208['VACCINE STATUS'],
)

# One stacked bar per state, one segment per vaccine status
ax = state_vaccine_table.plot.bar(stacked=True, figsize=(10, 7))

# Label each segment with its height; segments of height 10 or less get an
# empty label
for container in ax.containers:
    heights = [bar.get_height() for bar in container]
    segment_labels = [height if height > 10 else '' for height in heights]
    ax.bar_label(container, label_type='center', labels=segment_labels)

ax.set(
    title='Vaccine Status by State',
    xlabel='State',
    ylabel='Count',
)
ax.legend(title='Vaccine Status')
plt.show()