In [29]:
import sys
sys.path.append(r'C:\Users\Josh Ellis\OneDrive - University of Nebraska at Omaha\COURSES\FALL_2022\ISQA8156-820\course-project')

import pandas as pd
import scipy.stats as stats
import plotly.express as px
import plotly.graph_objects as go
from statsmodels.formula.api import ols
import statsmodels.api as sm

from src.statistical_methods import FishersLSD
from src.processing import prepost_transform

Transform Data

In [15]:
# Absolute location of the processed survey CSV on the author's machine.
# NOTE(review): this path is machine-specific; the notebook will not run
# elsewhere until it is made relative to the project root.
csv_path = (
    'C:/Users/Josh Ellis/OneDrive - University of Nebraska at Omaha/' +
    'COURSES/FALL_2022/ISQA8156-820/course-project/data/processed/girls_rock_data.csv'
)

# Read the CSV and pass it straight through the project helper
# `prepost_transform` (from src.processing), so `data` only ever refers to
# the transformed frame.
data = prepost_transform(pd.read_csv(csv_path))

In [17]:
# Preview the first rows of the frame returned by prepost_transform to check
# its columns before running the analyses.
data.head()

Unnamed: 0,client,age_group,year,age,race/ethnicity,question,score_pretest,score_posttest,delta
0,2018,older group,2018,14,caucasian,15,4.0,7.0,3.0
1,2018,older group,2018,14,caucasian,15,4.0,7.0,3.0
2,12018,older group,2018,14,caucasian,15,6.0,5.0,-1.0
3,12018,older group,2018,14,caucasian,15,6.0,5.0,-1.0
4,22018,older group,2018,16,caucasian,15,7.0,7.0,0.0


---

### Research Question 1
Are there significant differences in answer ratings for the various outcomes between the younger group and the older group? This question will help OGR better understand which questions improve significantly over time, and which show no significant change, within each age group of participants.

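***Hypotheses***  
Let $\mu_d$ denote the mean paired difference (post-test score minus pre-test score) for a given age group and question.
$$H_0: \mu_d \le 0$$
$$H_a: \mu_d > 0$$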

In [18]:
# Significance level used for every per-question decision in this cell.
ALPHA = 0.05

age_data = data[['age_group', 'question', 'score_pretest', 'score_posttest', 'delta']].copy()

# One-sided dependent (paired) t-test of post-test vs. pre-test scores for
# every (age group, question) combination. Each combination contributes one
# record; the records become the rows of `results`.
records = []
for group in age_data['age_group'].unique():
    for question in age_data['question'].unique():
        sample = age_data[
            (age_data['age_group'] == group) &
            (age_data['question'] == question)
        ]

        # A combination can have no rows (a question not asked of one group,
        # or a NaN age group, which never compares equal to itself). There is
        # nothing to test, so skip it instead of failing when reading the
        # labels from an empty frame.
        if sample.empty:
            continue

        pretest_scores = sample['score_pretest']
        posttest_scores = sample['score_posttest']

        # Post-test goes first, so alternative='greater' tests
        # H_a: mean(post - pre) > 0.
        test_statistic, pvalue = stats.ttest_rel(
            posttest_scores,
            pretest_scores,
            alternative='greater')

        records.append({
            'group': group,
            'question': question,
            'pre-test mean': round(pretest_scores.mean(), 4),
            'post-test mean': round(posttest_scores.mean(), 4),
            'test statistic': round(test_statistic, 4),
            # Decide on the unrounded p-value; round only for display.
            'p-value': round(pvalue, 4),
            'conclusion': 'Significant' if pvalue < ALPHA else 'Not Significant',
        })

# Passing `columns` keeps the table layout stable even if no combination
# produced a record.
results = pd.DataFrame(
    records,
    columns=[
        'group', 'question', 'pre-test mean', 'post-test mean',
        'test statistic', 'p-value', 'conclusion',
    ],
)

results

Unnamed: 0,group,question,pre-test mean,post-test mean,test statistic,p-value,conclusion
0,older group,15,5.8296,5.9926,1.7094,0.0448,Significant
1,older group,16,5.2963,5.6222,3.2889,0.0006,Significant
2,older group,17,5.6519,5.8593,1.915,0.0288,Significant
3,older group,18,5.8,6.037,2.8582,0.0025,Significant
4,older group,19,5.9407,6.0889,1.782,0.0385,Significant
5,older group,20,4.7037,5.2296,4.2917,0.0,Significant
6,older group,21,5.9259,5.9185,-0.0733,0.5291,Not Significant
7,older group,22,4.6815,5.0222,2.6805,0.0041,Significant
8,older group,23,3.6889,4.3259,5.0734,0.0,Significant
9,older group,24,5.0741,5.4963,3.5156,0.0003,Significant


In [19]:
import numpy as np

# For each question, compare the pre-to-post change (`delta`) of the younger
# group against the older group with a two-sided independent-samples t-test
# (scipy defaults), collecting one row per question.
comparison_rows = []
for q in age_data['question'].unique():
    is_question = age_data['question'] == q
    young_delta = age_data.loc[is_question & (age_data['age_group'] == 'younger group'), 'delta']
    old_delta = age_data.loc[is_question & (age_data['age_group'] == 'older group'), 'delta']

    outcome = stats.ttest_ind(young_delta, old_delta)
    statistic, p = outcome[0], outcome[1]

    comparison_rows.append({
        'question': q,
        'younger group mean': np.mean(young_delta),
        'older group mean': np.mean(old_delta),
        'test statistic': statistic,
        'p-value': p,
        'conclusion': 'Significant' if p < .05 else 'Not Significant',
    })

results = pd.DataFrame(
    comparison_rows,
    columns=[
        'question', 'younger group mean', 'older group mean',
        'test statistic', 'p-value', 'conclusion',
    ],
)

results

Unnamed: 0,question,younger group mean,older group mean,test statistic,p-value,conclusion
0,15,-0.021739,0.162963,-1.025061,0.306718,Not Significant
1,16,0.326087,0.325926,0.000757,0.999397,Not Significant
2,17,0.065217,0.207407,-0.700559,0.484488,Not Significant
3,18,0.23913,0.237037,0.011578,0.990775,Not Significant
4,19,0.021739,0.148148,-0.686852,0.493065,Not Significant
5,20,0.108696,0.525926,-1.663988,0.097865,Not Significant
6,21,0.413043,-0.007407,1.877029,0.062141,Not Significant
7,22,0.347826,0.340741,0.025756,0.979481,Not Significant
8,23,0.673913,0.637037,0.14249,0.886853,Not Significant
9,24,0.26087,0.422222,-0.652273,0.515062,Not Significant


---

### Research Question 2: One-Way ANOVA
Does the race/ethnicity of participants influence the change in scores for various questions over time?

In [30]:
# Significance level for the per-question ANOVA decisions.
ALPHA = 0.05

# Work on a renamed copy: the patsy formula below needs a plain identifier,
# and 'race/ethnicity' is not one.
race_data = (
    data[['race/ethnicity', 'question', 'score_pretest', 'score_posttest', 'delta']]
    .copy()
    .rename(columns={'race/ethnicity': 'race'})
)

# One-way ANOVA per question: treatment = race, response = delta.
anova_records = []
for question in race_data['question'].unique():
    anova_race = race_data.loc[race_data['question'] == question, ['race', 'delta']]

    model = ols('delta ~ C(race)', data=anova_race).fit()
    anova_table = sm.stats.anova_lm(model, typ=1)

    # Read the treatment p-value by label rather than by position, so the
    # lookup does not depend on the table's row/column order.
    raw_pvalue = anova_table.loc['C(race)', 'PR(>F)']

    anova_records.append({
        'question': question,
        # Rounded for display only.
        'pvalue': round(raw_pvalue, 3),
        # Decide on the unrounded p-value. Deciding on the rounded value would
        # label e.g. p = 0.0504 (displayed as 0.05) as significant. Uses the
        # same `p < alpha` rule as the t-test cells.
        'result': 'Significant' if raw_pvalue < ALPHA else 'Not Significant',
    })

results_df = pd.DataFrame(anova_records, columns=['question', 'pvalue', 'result'])

results_df

Unnamed: 0,question,pvalue,result
0,15,0.409,Not Significant
1,16,0.408,Not Significant
2,17,0.437,Not Significant
3,18,0.131,Not Significant
4,19,0.618,Not Significant
5,20,0.434,Not Significant
6,21,0.026,Significant
7,22,0.063,Not Significant
8,23,0.025,Significant
9,24,0.301,Not Significant


In [31]:
# Bar chart of the per-question ANOVA p-values, coloured by the significance
# verdict stored in the 'result' column.
anova_fig = px.bar(
    results_df,
    x='question',
    y='pvalue',
    color='result',
    text_auto=True,
    template='simple_white',
    title='One-Way Anova Results <br><i>Treatment: race/ethnicity | Response: Change between pre-test and post-test Scores<br><i>Level of Significance = 5%',
)
anova_fig

Fisher's LSD for Question 21

In [None]:
# Build the project's FishersLSD helper (src.statistical_methods) with race as
# the treatment and delta as the response; presumably groupby/groupby_value
# restrict the data to question 21 — confirm against the class.
# NOTE(review): groupby_value is the string '21', while the question values
# displayed earlier look numeric; confirm the dtype of race_data['question']
# matches, otherwise the filter may select no rows.
lsd = FishersLSD(df=race_data, treatment='race', response='delta', groupby='question', groupby_value='21', confidence=0.95)

In [None]:
# Show the table produced by FishersLSD.table() for the question 21 object
# (presumably the pairwise comparisons; see src.statistical_methods).
lsd.table()

In [None]:
# Show the figure produced by FishersLSD.plot() for the question 21 object.
lsd.plot()

Fisher's LSD for Question 23

In [None]:
# Same FishersLSD setup as for question 21, now for question 23; this rebinds
# `lsd`, so the table/plot cells below refer to question 23.
# NOTE(review): groupby_value is the string '23'; confirm it matches the dtype
# of race_data['question'].
lsd = FishersLSD(df=race_data, treatment='race', response='delta', groupby='question', groupby_value='23', confidence=0.95)

In [None]:
# Show the table produced by FishersLSD.table() for the question 23 object
# (presumably the pairwise comparisons; see src.statistical_methods).
lsd.table()

In [None]:
# Show the figure produced by FishersLSD.plot() for the question 23 object.
lsd.plot()