In [29]:
# Import packages
import pandas as pd
from scipy.stats import chi2_contingency

# Import data
data = pd.read_csv('candidates_original_preprocessed.csv')

# Chi-squared tests for Hired vs. Not hired

Chi-squared tests to test whether sex/age/region are related to being hired. 

In [30]:
# List of variables to analyze
variables = ['Sex', 'Age Range', 'Region']

# Initialize a dictionary to store results
chi2_results = {}

# Loop through each variable
for variable in variables:
    print(f"\n--- Analysis for {variable} ---")
    
    # Create the contingency table with totals
    contingency_table = pd.crosstab(
        data[variable],
        data['Candidate State'].apply(lambda x: 'Hired' if x == 'Hired' else 'Other'),
        margins=True  # Add totals for rows and columns
    )
    
    # Perform the Chi-square test
    chi2, p, dof, expected = chi2_contingency(contingency_table)
    
    # Store the results
    chi2_results[variable] = {
        'Contingency Table': contingency_table,
        'Chi2': chi2,
        'p-value': p,
        'Degrees of Freedom': dof,
        'Expected Frequencies': expected
    }
    
    # Display the contingency table
    print("Contingency Table:\n", contingency_table)
    
    # Display the Chi-square test results
    print("Chi-Square Test Statistic:", chi2)
    print("p-value:", p)
    print("Degrees of Freedom:", dof)
#     print("Expected Frequencies:\n", expected)
    
    # Determine significance
    alpha = 0.05  # Significance level
    if p < alpha:
        print(f"Conclusion: The variable '{variable}' and a person being hired or not, are likely to be dependent (there is a significant association).")
    else:
        print(f"Conclusion: The variable '{variable}' and a person being hired or not, are likely to be independent (no significant association).")



--- Analysis for Sex ---
Contingency Table:
 Candidate State  Hired  Other    All
Sex                                 
Female             130   2694   2824
Male               338   9070   9408
All                468  11764  12232
Chi-Square Test Statistic: 6.029950636346644
p-value: 0.19692267929101923
Degrees of Freedom: 4
Conclusion: The variable 'Sex' and a person being hired or not, are likely to be independent (no significant association).

--- Analysis for Age Range ---
Contingency Table:
 Candidate State  Hired  Other    All
Age Range                           
20 - 25 years       30   3284   3314
26 - 30 years      168   5550   5718
31 - 35 years       90   1021   1111
36 - 40 years       49    384    433
40 - 45 years       33    200    233
< 20 years          45    877    922
> 45 years          53    448    501
All                468  11764  12232
Chi-Square Test Statistic: 342.81644326571313
p-value: 1.3200582257954743e-64
Degrees of Freedom: 14
Conclusion: The variable 'A

For 'Region', the #observations is <5 in for some. This does not adhere with the rules of thumb of the Chi-squared test, this needs to be accounted for still. 

# Chi-squared tests for various candidates states

## Candidate state vs. Sex

In [26]:
# Order of selection
selection_order = ['Imported', 'In selection', 'First contact', 'QM', 'Vivier', 'Economic proposal', 'Hired']

# Initialize a dictionary
contingency_tables_sex = {}

# Loop through each state in the process order
for i, state in enumerate(selection_order):
    post_states = selection_order[i+1:]
    
    if post_states:   # Skips hired as it has no states after
        contingency_table_sex = pd.DataFrame({
            f'Post {state}': data[data['Candidate State'].isin(post_states)].groupby('Sex').size(),
            state: data[data['Candidate State'] == state].groupby('Sex').size()
        })
        contingency_tables_sex[state] = contingency_table_sex

In [20]:

# Dictionary to store Chi-square test results
chi2_results = {}

# Perform Chi-square test for each contingency table
for state, table in contingency_tables_sex.items():
    # Drop rows with missing values, if any, to ensure valid input
    table = table.fillna(0).astype(int)
    
    # Perform the Chi-square test
    chi2, p, dof, expected = chi2_contingency(table)
    
    # Store the results
    chi2_results[state] = {
        'Chi2': chi2,
        'p-value': p,
        'Degrees of Freedom': dof,
        'Expected Frequencies': expected
    }
    
    # Print results for the current state
    print(f"\n--- Analysis for {state} ---")
    print(f"Chi-Square Test Statistic: {chi2}")
    print(f"p-value: {p}")
    print(f"Degrees of Freedom: {dof}")
#     print(f"Expected Frequencies:\n{expected}")
    
    alpha = 0.05
    if p < alpha:
        print(f"Conclusion: For '{state}', the variables 'Sex' and 'Candidate State' are likely to be dependent (significant association).")
    else:
        print(f"Conclusion: For '{state}', the variables 'Sex' and 'Candidate State' are likely to be independent (no significant association).")


--- Analysis for Imported ---
Chi-Square Test Statistic: 63.66134178926101
p-value: 1.477558472913691e-15
Degrees of Freedom: 1
Conclusion: For 'Imported', the variables 'Sex' and 'Candidate State' are dependent (significant association).

--- Analysis for In selection ---
Chi-Square Test Statistic: 4.409181101166797
p-value: 0.03574599691749415
Degrees of Freedom: 1
Conclusion: For 'In selection', the variables 'Sex' and 'Candidate State' are dependent (significant association).

--- Analysis for First contact ---
Chi-Square Test Statistic: 4.834975066237263
p-value: 0.02788804410371374
Degrees of Freedom: 1
Conclusion: For 'First contact', the variables 'Sex' and 'Candidate State' are dependent (significant association).

--- Analysis for QM ---
Chi-Square Test Statistic: 10.565197261191749
p-value: 0.0011523673663921489
Degrees of Freedom: 1
Conclusion: For 'QM', the variables 'Sex' and 'Candidate State' are dependent (significant association).

--- Analysis for Vivier ---
Chi-Squa

## Candidate state vs. Age Range

In [22]:
# Order of selection
selection_order = ['Imported', 'In selection', 'First contact', 'QM', 'Vivier', 'Economic proposal', 'Hired']

# Initialize a dictionary
contingency_tables_age = {}

# Loop through each state in the process order
for i, state in enumerate(selection_order):
    post_states = selection_order[i+1:]
    
    if post_states:   # Skips hired as it has no states after
        contingency_table_age = pd.DataFrame({
            f'Post {state}': data[data['Candidate State'].isin(post_states)].groupby('Age Range').size(),
            state: data[data['Candidate State'] == state].groupby('Age Range').size()
        })
        contingency_tables_age[state] = contingency_table_age

In [23]:

# Dictionary to store Chi-square test results
chi2_results = {}

# Perform Chi-square test for each contingency table
for state, table in contingency_tables_age.items():
    # Drop rows with missing values, if any, to ensure valid input
    table = table.fillna(0).astype(int)
    
    # Perform the Chi-square test
    chi2, p, dof, expected = chi2_contingency(table)
    
    # Store the results
    chi2_results[state] = {
        'Chi2': chi2,
        'p-value': p,
        'Degrees of Freedom': dof,
        'Expected Frequencies': expected
    }
    
    # Print results for the current state
    print(f"\n--- Analysis for {state} ---")
    print(f"Chi-Square Test Statistic: {chi2}")
    print(f"p-value: {p}")
    print(f"Degrees of Freedom: {dof}")
#     print(f"Expected Frequencies:\n{expected}")
    
    alpha = 0.05
    if p < alpha:
        print(f"Conclusion: For '{state}', the variables 'Age Range' and 'Candidate State' are likely to be dependent (significant association).")
    else:
        print(f"Conclusion: For '{state}', the variables 'Age Range' and 'Candidate State' are likely to be independent (no significant association).")


--- Analysis for Imported ---
Chi-Square Test Statistic: 3259.7363634241924
p-value: 0.0
Degrees of Freedom: 6
Conclusion: For 'Imported', the variables 'Age Range' and 'Candidate State' are dependent (significant association).

--- Analysis for In selection ---
Chi-Square Test Statistic: 16.396566428031583
p-value: 0.011776528708639615
Degrees of Freedom: 6
Conclusion: For 'In selection', the variables 'Age Range' and 'Candidate State' are dependent (significant association).

--- Analysis for First contact ---
Chi-Square Test Statistic: 92.78203768768347
p-value: 8.001878080863548e-18
Degrees of Freedom: 6
Conclusion: For 'First contact', the variables 'Age Range' and 'Candidate State' are dependent (significant association).

--- Analysis for QM ---
Chi-Square Test Statistic: 15.833494766518513
p-value: 0.014676125266923598
Degrees of Freedom: 6
Conclusion: For 'QM', the variables 'Age Range' and 'Candidate State' are dependent (significant association).

--- Analysis for Vivier --

# Candidate state vs. Region

In [24]:
# Order of selection
selection_order = ['Imported', 'In selection', 'First contact', 'QM', 'Vivier', 'Economic proposal', 'Hired']

# Initialize a dictionary
contingency_tables_region = {}

# Loop through each state in the process order
for i, state in enumerate(selection_order):
    post_states = selection_order[i+1:]
    
    if post_states:   # Skips hired as it has no states after
        contingency_table_region = pd.DataFrame({
            f'Post {state}': data[data['Candidate State'].isin(post_states)].groupby('Region').size(),
            state: data[data['Candidate State'] == state].groupby('Region').size()
        })
        contingency_tables_region[state] = contingency_table_region

In [25]:

# Dictionary to store Chi-square test results
chi2_results = {}

# Perform Chi-square test for each contingency table
for state, table in contingency_tables_region.items():
    # Drop rows with missing values, if any, to ensure valid input
    table = table.fillna(0).astype(int)
    
    # Perform the Chi-square test
    chi2, p, dof, expected = chi2_contingency(table)
    
    # Store the results
    chi2_results[state] = {
        'Chi2': chi2,
        'p-value': p,
        'Degrees of Freedom': dof,
        'Expected Frequencies': expected
    }
    
    # Print results for the current state
    print(f"\n--- Analysis for {state} ---")
    print(f"Chi-Square Test Statistic: {chi2}")
    print(f"p-value: {p}")
    print(f"Degrees of Freedom: {dof}")
#     print(f"Expected Frequencies:\n{expected}")
    
    alpha = 0.05
    if p < alpha:
        print(f"Conclusion: For '{state}', the variables 'Region' and 'Candidate State' are likely to be dependent (significant association).")
    else:
        print(f"Conclusion: For '{state}', the variables 'Region' and 'Candidate State' are likely to be independent (no significant association).")


--- Analysis for Imported ---
Chi-Square Test Statistic: 3155.019021680986
p-value: 0.0
Degrees of Freedom: 22
Conclusion: For 'Imported', the variables 'Region' and 'Candidate State' are dependent (significant association).

--- Analysis for In selection ---
Chi-Square Test Statistic: 58.60831947120769
p-value: 3.5816827670575614e-05
Degrees of Freedom: 22
Conclusion: For 'In selection', the variables 'Region' and 'Candidate State' are dependent (significant association).

--- Analysis for First contact ---
Chi-Square Test Statistic: 53.10577079139694
p-value: 0.00021953020654831283
Degrees of Freedom: 22
Conclusion: For 'First contact', the variables 'Region' and 'Candidate State' are dependent (significant association).

--- Analysis for QM ---
Chi-Square Test Statistic: 30.40062341794713
p-value: 0.08422824252642458
Degrees of Freedom: 21
Conclusion: For 'QM', the variables 'Region' and 'Candidate State' are independent (no significant association).

--- Analysis for Vivier ---
Ch

For both 'Age Range' and 'Region' in some cells in the contingency table for various candidate states, the #observations is <5. This does not adhere with the rules of thumb of the Chi-squared test, this needs to be accounted for still. 