In [32]:
import polars as pl
import altair as alt

research_ethics_score = pl.read_csv("data/5_1_research_ethics_score.csv")

In [33]:
def draw_bar_plot(df, x, y, title, MAP, rotate=True, legend=True, scale=None):

    bar_x = alt.X(x, title=title, axis=None)
    bar_x = bar_x if MAP is None else bar_x.sort(MAP.values())

    bar_y = alt.Y(y, title=None, scale=scale)

    lgd_x = alt.Legend(orient='bottom', direction='horizontal')
    color = alt.Color(x, title=None, legend=lgd_x) if legend else alt.Color(x, title=None, legend=None)
    color = color.sort(MAP.values()) if MAP is not None else color

    bar = df.plot.bar().encode(
        x=bar_x,
        y=bar_y,
        color=color,
    ).properties(
        title=title,
        width=400,
    )

    txt_x = alt.X(x, sort=MAP.values()) if MAP is not None else alt.X(x)
    text = bar.mark_text(
        align="center",
        baseline="bottom",
    ).encode(
        x=txt_x,
        y=y,
        text=alt.Text(y, format=".2f"),
        color=alt.value("black"),
    )
    return bar + text

In [34]:
#
# 가. 문항별 기술 통계량
#
mean = research_ethics_score.mean().drop("ID").write_csv("figure/5/가_문항별_기술_통계량_평균.csv", include_bom=True)
standard_deviaiton = research_ethics_score.std().drop("ID").write_csv("figure/5/가_문항별_기술_통계량_표준편차.csv", include_bom=True)

In [35]:
# 
# 나. 영역별 기술 통계량
# 
# *Q39_1: 동물실험에 대한 연구윤리
#
공정 = (pl.col("Q38_1") + pl.col("Q38_2") + pl.col("Q38_3") + pl.col("Q38_4")) / 4
정직 = (pl.col("Q38_5") + pl.col("Q38_6") + pl.col("Q38_7") + pl.col("Q38_8")) / 4
존중 = pl.when(pl.col("Q39_1").is_null()).then(
    (pl.col("Q38_9") + pl.col("Q38_10") + pl.col("Q38_11")) / 3
).otherwise(
    (pl.col("Q38_9") + pl.col("Q38_10") + pl.col("Q38_11") + pl.col("Q39_1")) / 4
)

책임 = (pl.col("Q38_12") + pl.col("Q38_13") + pl.col("Q38_14") + pl.col("Q38_15")) / 4
투명성 = (pl.col("Q38_16") + pl.col("Q38_17") + pl.col("Q38_18") + pl.col("Q38_19")) / 4
all = (공정 + 정직 + 존중 + 책임 + 투명성) / 5

by_area = research_ethics_score.select([
    pl.col("ID"),
    공정.alias("공정"),
    정직.alias("정직"),
    존중.alias("존중"),
    책임.alias("책임"),
    투명성.alias("투명성"),
    all.alias("전체")
])

by_area.write_csv("figure/5/영역.csv", include_bom=True)

by_area_mean = by_area.mean()
by_area_mean.write_csv("figure/5/나_영역별_기술_통계량_평균.csv", include_bom=True)
by_area_std = by_area.std()
by_area_std.write_csv("figure/5/나_영역별_기술_통계량_표준편차.csv", include_bom=True)

print(by_area_mean)
print(by_area_std)

shape: (1, 7)
┌──────────────┬──────────┬──────────┬──────────┬──────────┬──────────┬──────────┐
│ ID           ┆ 공정     ┆ 정직     ┆ 존중     ┆ 책임     ┆ 투명성   ┆ 전체     │
│ ---          ┆ ---      ┆ ---      ┆ ---      ┆ ---      ┆ ---      ┆ ---      │
│ f64          ┆ f64      ┆ f64      ┆ f64      ┆ f64      ┆ f64      ┆ f64      │
╞══════════════╪══════════╪══════════╪══════════╪══════════╪══════════╪══════════╡
│ 19078.952096 ┆ 4.226048 ┆ 4.471557 ┆ 4.424651 ┆ 4.229042 ┆ 4.303892 ┆ 4.331038 │
└──────────────┴──────────┴──────────┴──────────┴──────────┴──────────┴──────────┘
shape: (1, 7)
┌─────────────┬─────────┬──────────┬──────────┬──────────┬──────────┬──────────┐
│ ID          ┆ 공정    ┆ 정직     ┆ 존중     ┆ 책임     ┆ 투명성   ┆ 전체     │
│ ---         ┆ ---     ┆ ---      ┆ ---      ┆ ---      ┆ ---      ┆ ---      │
│ f64         ┆ f64     ┆ f64      ┆ f64      ┆ f64      ┆ f64      ┆ f64      │
╞═════════════╪═════════╪══════════╪══════════╪══════════╪══════════╪══════════╡
│ 5316.37811

In [36]:
tp_by_area = by_area_mean.drop("ID").transpose(include_header=True)

bar = draw_bar_plot(tp_by_area, "column", "column_0", "영역별 기술 통계량", None, rotate=False, legend=True, scale=alt.Scale(domain=[4, 4.8]))
bar.show()

In [37]:
# 
# 다. 판정결과
# 

def make_grade(column, excellent, great, good, bad):
    return pl.when(column >= excellent).then(5).when(column >= great).then(4).when(column >= good).then(3).when(column >= bad).then(2).otherwise(1)

grading_all = make_grade(pl.col("전체"), 4.70, 3.85, 3.40, 2.95)
grading_공정 = make_grade(pl.col("공정"), 5.0, 3.75, 3.0, 2.50)
grading_정직 = make_grade(pl.col("정직"), 5.0, 4.0, 3.5, 3.25)
grading_존중 = make_grade(pl.col("존중"), 5.0, 3.75, 3.0, 2.25)
grading_책임 = make_grade(pl.col("책임"), 5.0, 3.75, 3.25, 2.75)
grading_투명성 = make_grade(pl.col("투명성"), 5.0, 3.75, 3.25, 2.75)

grading = by_area.select([
    grading_all.alias("전체_채점"),
    grading_공정.alias("공정_채점"),
    grading_정직.alias("정직_채점"),
    grading_존중.alias("존중_채점"),
    grading_책임.alias("책임_채점"),
    grading_투명성.alias("투명성_채점"),
])

grading.write_csv("figure/5/다_판정결과.csv", include_bom=True)
research_ethics_score.select(
    research_ethics_score["ID"],
    grading["전체_채점"], grading["공정_채점"], grading["정직_채점"], grading["존중_채점"], grading["책임_채점"], grading["투명성_채점"],
).write_csv("data/5_1_1_research_ethics_grading.csv", include_bom=True)
grading.mean()

전체_채점,공정_채점,정직_채점,존중_채점,책임_채점,투명성_채점
f64,f64,f64,f64,f64,f64
4.113772,4.023952,4.167665,4.197605,4.023952,4.113772


In [38]:
all_count = grading.group_by("전체_채점").agg(pl.count("전체_채점").alias("전체")).rename({"전체_채점": "등급"})
공정_count = grading.group_by("공정_채점").agg(pl.count("공정_채점").alias("공정")).rename({"공정_채점": "등급"})
정직_count = grading.group_by("정직_채점").agg(pl.count("정직_채점").alias("정직")).rename({"정직_채점": "등급"})
존중_count = grading.group_by("존중_채점").agg(pl.count("존중_채점").alias("존중")).rename({"존중_채점": "등급"})
책임_count = grading.group_by("책임_채점").agg(pl.count("책임_채점").alias("책임")).rename({"책임_채점": "등급"})
투명성_count = grading.group_by("투명성_채점").agg(pl.count("투명성_채점").alias("투명성")).rename({"투명성_채점": "등급"})

def check_zero_and_concat(dataframe, column_name, basis=5):
    if dataframe.shape[0] < basis:
        t = pl.DataFrame({"count": [1], column_name: [0]})
        t = t.with_columns(
            pl.col("count").cast(pl.Int32),
            pl.col(column_name).cast(pl.UInt32),
        )
        return dataframe.extend(t)
    return dataframe

all_count = all_count.join(공정_count, on="등급", how="left") \
    .join(정직_count, on="등급", how="left") \
    .join(존중_count, on="등급", how="left") \
    .join(책임_count, on="등급", how="left") \
    .join(투명성_count, on="등급", how="left") \
    .fill_null(0) \
    .sort("등급")

all_count.write_csv("figure/5/다_연구윤리_영역별_연구윤리_수준_판정_결과_총합.csv", include_bom=True)

all_count = all_count.drop("등급").transpose(include_header=True).select(
    매우미흡=pl.col("column_0") / (pl.col("column_0") + pl.col("column_1") + pl.col("column_2") + pl.col("column_3") + pl.col("column_4")),
    미흡=pl.col("column_1") / (pl.col("column_0") + pl.col("column_1") + pl.col("column_2") + pl.col("column_3") + pl.col("column_4")),
    보통=pl.col("column_2") / (pl.col("column_0") + pl.col("column_1") + pl.col("column_2") + pl.col("column_3") + pl.col("column_4")),
    우수=pl.col("column_3") / (pl.col("column_0") + pl.col("column_1") + pl.col("column_2") + pl.col("column_3") + pl.col("column_4")),
    매우우수=pl.col("column_4") / (pl.col("column_0") + pl.col("column_1") + pl.col("column_2") + pl.col("column_3") + pl.col("column_4")),
)

all_count.write_csv("figure/5/다_연구윤리_영역별_연구윤리_수준_판정_결과_비율.csv", include_bom=True)
all_count

매우미흡,미흡,보통,우수,매우우수
f64,f64,f64,f64,f64
0.011976,0.053892,0.071856,0.532934,0.329341
0.0,0.005988,0.137725,0.682635,0.173653
0.041916,0.02994,0.053892,0.467066,0.407186
0.0,0.005988,0.107784,0.568862,0.317365
0.0,0.05988,0.095808,0.60479,0.239521
0.0,0.071856,0.02994,0.610778,0.287425


In [39]:
# 
# 라. 조사 대상별 차이 검정
# 
from scipy import stats
personal_info = pl.read_csv("data/6_personal_info.csv")

def group_per_personal(column):
    personality = personal_info.select([pl.col("ID"), pl.col(column)])
    by_area_per_personality = by_area.join(personality, on="ID", how="inner")
    return by_area_per_personality

def compute_group_personality_by_callback(by_area_per_personality, column, callback):
    return by_area_per_personality.group_by([column]).agg( \
        callback("공정").round(2), \
        callback("정직").round(2), \
        callback("존중").round(2), \
        callback("책임").round(2), \
        callback("투명성").round(2), \
        callback("전체").round(2)
    ).sort(column).drop(column).transpose(include_header=True)

# target
components = ["공정", "정직", "존중", "책임", "투명성", "전체"]
pl_components = pl.col(components)


# 1) 성별
sex = "Q42"
by_area_per_sex = group_per_personal(sex)

count = compute_group_personality_by_callback(by_area_per_sex, sex, pl.count).rename({"column_0": "남자수", "column_1": "여자수"})
pct = count.select(
    pl.col("column"),
    pl.col("남자수") / (pl.col("남자수") + pl.col("여자수")),
    pl.col("여자수") / (pl.col("남자수") + pl.col("여자수")),
)
mean = compute_group_personality_by_callback(by_area_per_sex, sex, pl.mean).rename({"column_0": "남자평균", "column_1": "여자평균"})
std = compute_group_personality_by_callback(by_area_per_sex, sex, pl.std).rename({"column_0": "남자표준편차", "column_1": "여자펴준편차"})

all = count.join(
    mean, on="column", how="inner", 
).join(
    pct, on="column", how="inner"
).join(
    std, on="column", how="inner"
)
all.write_csv("figure/5/라_1_성별_조사_대상별_차이_검정_통계량.csv", include_bom=True)
all


column,남자수,여자수,남자평균,여자평균,남자수_right,여자수_right,남자표준편차,여자펴준편차
str,u32,u32,f64,f64,f64,f64,f64,f64
"""공정""",121,46,4.29,4.05,0.724551,0.275449,0.57,0.52
"""정직""",121,46,4.51,4.38,0.724551,0.275449,0.6,0.58
"""존중""",121,46,4.45,4.37,0.724551,0.275449,0.58,0.55
"""책임""",121,46,4.25,4.18,0.724551,0.275449,0.62,0.55
"""투명성""",121,46,4.33,4.24,0.724551,0.275449,0.64,0.49
"""전체""",121,46,4.36,4.24,0.724551,0.275449,0.55,0.47


In [40]:
male = by_area_per_sex.filter(pl.col(sex) == 1).select(pl_components)
female = by_area_per_sex.filter(pl.col(sex) == 2).select(pl_components)

ttest = stats.ttest_ind(male, female)
ttest_sex = pl.DataFrame({
    "component": components,
    "t-value": ttest[0],
    "p-value": ttest[1],
})

ttest_sex

component,t-value,p-value
str,f64,f64
"""공정""",2.445064,0.015534
"""정직""",1.291144,0.19846
"""존중""",0.793236,0.42878
"""책임""",0.654472,0.513719
"""투명성""",0.851303,0.395835
"""전체""",1.322251,0.187915


In [41]:
# 2) 연령
age = "Q43"
by_area_per_age = group_per_personal(age)

count = compute_group_personality_by_callback(by_area_per_age, age, pl.count).rename({"column_0": "20대수", "column_1": "30대수", "column_2": "40대수", "column_3": "50대수", "column_4": "60대수"})
pct = count.select(
    pl.col("column"),
    pl.col("20대수") / (pl.col("20대수") + pl.col("30대수") + pl.col("40대수") + pl.col("50대수") + pl.col("60대수")),
    pl.col("30대수") / (pl.col("20대수") + pl.col("30대수") + pl.col("40대수") + pl.col("50대수") + pl.col("60대수")),
    pl.col("40대수") / (pl.col("20대수") + pl.col("30대수") + pl.col("40대수") + pl.col("50대수") + pl.col("60대수")),
    pl.col("50대수") / (pl.col("20대수") + pl.col("30대수") + pl.col("40대수") + pl.col("50대수") + pl.col("60대수")),
    pl.col("60대수") / (pl.col("20대수") + pl.col("30대수") + pl.col("40대수") + pl.col("50대수") + pl.col("60대수")),
)
mean = compute_group_personality_by_callback(by_area_per_age, age, pl.mean).rename({"column_0": "20대평균", "column_1": "30대평균", "column_2": "40대평균", "column_3": "50대평균", "column_4": "60대평균"})
std = compute_group_personality_by_callback(by_area_per_age, age, pl.std).rename({"column_0": "20대표준편차", "column_1": "30대표준편차", "column_2": "40대표준편차", "column_3": "50대표준편차", "column_4": "60대표준편차"})

all = count.join(
    mean, on="column", how="inner", 
).join(
    pct, on="column", how="inner"
).join(
    std, on="column", how="inner"
)
all.write_csv("figure/5/라_2_연령대_조사_대상별_차이_검정_통계량.csv", include_bom=True)
all

column,20대수,30대수,40대수,50대수,60대수,20대평균,30대평균,40대평균,50대평균,60대평균,20대수_right,30대수_right,40대수_right,50대수_right,60대수_right,20대표준편차,30대표준편차,40대표준편차,50대표준편차,60대표준편차
str,u32,u32,u32,u32,u32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""공정""",23,54,48,33,9,4.25,4.15,4.24,4.32,4.17,0.137725,0.323353,0.287425,0.197605,0.053892,0.56,0.58,0.57,0.55,0.6
"""정직""",23,54,48,33,9,4.46,4.4,4.52,4.51,4.53,0.137725,0.323353,0.287425,0.197605,0.053892,0.59,0.58,0.62,0.65,0.51
"""존중""",23,54,48,33,9,4.42,4.44,4.4,4.43,4.41,0.137725,0.323353,0.287425,0.197605,0.053892,0.51,0.59,0.57,0.61,0.56
"""책임""",23,54,48,33,9,4.39,4.24,4.17,4.2,4.17,0.137725,0.323353,0.287425,0.197605,0.053892,0.57,0.61,0.63,0.63,0.5
"""투명성""",23,54,48,33,9,4.42,4.32,4.31,4.17,4.31,0.137725,0.323353,0.287425,0.197605,0.053892,0.55,0.6,0.58,0.68,0.69
"""전체""",23,54,48,33,9,4.39,4.31,4.33,4.33,4.31,0.137725,0.323353,0.287425,0.197605,0.053892,0.51,0.53,0.53,0.58,0.54


In [42]:

age_20 = by_area_per_age.filter(pl.col(age) == 1).select(pl_components)
age_30 = by_area_per_age.filter(pl.col(age) == 2).select(pl_components)
age_40 = by_area_per_age.filter(pl.col(age) == 3).select(pl_components)
age_50 = by_area_per_age.filter(pl.col(age) == 4).select(pl_components)
age_60 = by_area_per_age.filter(pl.col(age) == 5).select(pl_components)

f, p = stats.f_oneway(age_20, age_30, age_40, age_50, age_60)
anova_age = pl.DataFrame({
    "component": components,
    "f-value": f,
    "p-value":p,
})
anova_age.write_csv("figure/5/라_2_연령대_조사_대상별_차이_검정_anova.csv", include_bom=True)
anova_age

component,f-value,p-value
str,f64,f64
"""공정""",0.483742,0.747652
"""정직""",0.309263,0.871481
"""존중""",0.037862,0.997243
"""책임""",0.555954,0.694954
"""투명성""",0.615545,0.65205
"""전체""",0.087149,0.986335


In [43]:
#
# 3) 직위
#
role = "Q44"
by_area_per_role = group_per_personal(role)

count = compute_group_personality_by_callback(by_area_per_role, role, pl.count).rename({"column_0": "책임급수", "column_1": "선임급수", "column_2": "원급수", "column_3": "무기(선임급)수", "column_4": "무기(원급)수", "column_5": "기타수"})
pct = count.select(
    pl.col("column"),
    pl.col("책임급수") / (pl.col("책임급수") + pl.col("선임급수") + pl.col("원급수") + pl.col("무기(선임급)수") + pl.col("무기(원급)수") + pl.col("기타수")),
    pl.col("선임급수") / (pl.col("책임급수") + pl.col("선임급수") + pl.col("원급수") + pl.col("무기(선임급)수") + pl.col("무기(원급)수") + pl.col("기타수")),
    pl.col("원급수") / (pl.col("책임급수") + pl.col("선임급수") + pl.col("원급수") + pl.col("무기(선임급)수") + pl.col("무기(원급)수") + pl.col("기타수")),
    pl.col("무기(선임급)수") / (pl.col("책임급수") + pl.col("선임급수") + pl.col("원급수") + pl.col("무기(선임급)수") + pl.col("무기(원급)수") + pl.col("기타수")),
    pl.col("무기(원급)수") / (pl.col("책임급수") + pl.col("선임급수") + pl.col("원급수") + pl.col("무기(선임급)수") + pl.col("무기(원급)수") + pl.col("기타수")),
    pl.col("기타수") / (pl.col("책임급수") + pl.col("선임급수") + pl.col("원급수") + pl.col("무기(선임급)수") + pl.col("무기(원급)수") + pl.col("기타수")),
)
mean = compute_group_personality_by_callback(by_area_per_role, role, pl.mean).rename({"column_0": "책임급평균", "column_1": "선임급평균", "column_2": "원급평균", "column_3": "무기(선임급)평균", "column_4": "무기(원급)평균", "column_5": "기타평균"})
std = compute_group_personality_by_callback(by_area_per_role, role, pl.std).rename({"column_0": "책임급표준편차", "column_1": "선임급표준편차", "column_2": "원급표준편차", "column_3": "무기(선임급)표준편차", "column_4": "무기(원급)표준편차", "column_5": "기타표준편차"})

all = count.join(
    mean, on="column", how="inner", 
).join(
    pct, on="column", how="inner"
).join(
    std, on="column", how="inner"
)
all.write_csv("figure/5/라_3_직위_조사_대상별_차이_검정_통계량.csv", include_bom=True)
all

column,책임급수,선임급수,원급수,무기(선임급)수,무기(원급)수,기타수,책임급평균,선임급평균,원급평균,무기(선임급)평균,무기(원급)평균,기타평균,책임급수_right,선임급수_right,원급수_right,무기(선임급)수_right,무기(원급)수_right,기타수_right,책임급표준편차,선임급표준편차,원급표준편차,무기(선임급)표준편차,무기(원급)표준편차,기타표준편차
str,u32,u32,u32,u32,u32,u32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""공정""",58,50,15,5,29,10,4.36,4.22,4.15,3.8,4.16,4.0,0.347305,0.299401,0.08982,0.02994,0.173653,0.05988,0.49,0.55,0.5,0.57,0.71,0.6
"""정직""",58,50,15,5,29,10,4.57,4.46,4.28,4.2,4.47,4.38,0.347305,0.299401,0.08982,0.02994,0.173653,0.05988,0.54,0.57,0.55,0.89,0.72,0.58
"""존중""",58,50,15,5,29,10,4.5,4.41,4.4,4.03,4.44,4.29,0.347305,0.299401,0.08982,0.02994,0.173653,0.05988,0.52,0.57,0.42,0.74,0.66,0.69
"""책임""",58,50,15,5,29,10,4.25,4.22,4.35,3.8,4.24,4.2,0.347305,0.299401,0.08982,0.02994,0.173653,0.05988,0.56,0.61,0.49,0.54,0.7,0.72
"""투명성""",58,50,15,5,29,10,4.34,4.29,4.35,3.9,4.33,4.2,0.347305,0.299401,0.08982,0.02994,0.173653,0.05988,0.6,0.62,0.49,0.55,0.62,0.75
"""전체""",58,50,15,5,29,10,4.4,4.32,4.31,3.95,4.33,4.21,0.347305,0.299401,0.08982,0.02994,0.173653,0.05988,0.49,0.52,0.44,0.63,0.62,0.62


In [44]:
researcher = by_area_per_role.filter(pl.col(role) == 1).select(pl_components)
professor = by_area_per_role.filter(pl.col(role) == 2).select(pl_components)
manager = by_area_per_role.filter(pl.col(role) == 3).select(pl_components)
director = by_area_per_role.filter(pl.col(role) == 4).select(pl_components)
executive = by_area_per_role.filter(pl.col(role) == 5).select(pl_components)

anova = stats.f_oneway(researcher, professor, manager, director, executive)
anova_role = pl.DataFrame({
    "component": components,
    "f-value": anova[0],
    "p-value": anova[1],
})
anova_role.write_csv("figure/5/라_3_직위_조사_대상별_차이_검정_anova.csv", include_bom=True)
anova_role

component,f-value,p-value
str,f64,f64
"""공정""",1.686617,0.15593
"""정직""",1.010214,0.404092
"""존중""",0.849843,0.495746
"""책임""",0.813332,0.518483
"""투명성""",0.669397,0.614183
"""전체""",0.940824,0.442077


In [45]:
# 
# 4) 학위
# 
degree = "Q45"
by_area_per_degree = group_per_personal(degree)

count = compute_group_personality_by_callback(by_area_per_degree, degree, pl.count).rename({"column_0": "학사수", "column_1": "석사수", "column_2": "박사수"})
pct = count.select(
    pl.col("column"),
    pl.col("학사수") / (pl.col("박사수") + pl.col("석사수") + pl.col("학사수")),
    pl.col("박사수") / (pl.col("박사수") + pl.col("석사수") + pl.col("학사수")),
    pl.col("석사수") / (pl.col("박사수") + pl.col("석사수") + pl.col("학사수")),
)
mean = compute_group_personality_by_callback(by_area_per_degree, degree, pl.mean).rename({"column_0": "학사평균", "column_1": "석사평균", "column_2": "박사평균"})
std = compute_group_personality_by_callback(by_area_per_degree, degree, pl.std).rename({"column_0": "학사표준편차", "column_1": "석사표준편차", "column_2": "박사표준편차"})

all = count.join(
    mean, on="column", how="inner", 
).join(
    pct, on="column", how="inner"
).join(
    std, on="column", how="inner"
)
all.write_csv("figure/5/라_4_학위_조사_대상별_차이_검정_통계량.csv", include_bom=True)
all

column,학사수,석사수,박사수,학사평균,석사평균,박사평균,학사수_right,박사수_right,석사수_right,학사표준편차,석사표준편차,박사표준편차
str,u32,u32,u32,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""공정""",9,52,106,4.08,4.15,4.27,0.053892,0.634731,0.311377,0.84,0.6,0.53
"""정직""",9,52,106,4.17,4.44,4.51,0.053892,0.634731,0.311377,0.82,0.65,0.54
"""존중""",9,52,106,4.12,4.4,4.46,0.053892,0.634731,0.311377,0.9,0.57,0.53
"""책임""",9,52,106,4.14,4.25,4.22,0.053892,0.634731,0.311377,0.91,0.61,0.58
"""투명성""",9,52,106,4.03,4.36,4.3,0.053892,0.634731,0.311377,0.85,0.57,0.6
"""전체""",9,52,106,4.11,4.32,4.36,0.053892,0.634731,0.311377,0.85,0.54,0.49


In [46]:


bachelor = by_area_per_degree.filter(pl.col(degree) == 1).select(pl_components)
master = by_area_per_degree.filter(pl.col(degree) == 2).select(pl_components)
doctor = by_area_per_degree.filter(pl.col(degree) == 3).select(pl_components)
# postdoctor = by_area_per_degree.filter(pl.col(degree) == 4).select(pl_components)

anova = stats.f_oneway(bachelor, master, doctor)
anova_degree = pl.DataFrame({
    "component": components,
    "f-value": anova[0],
    "p-value": anova[1],
})
anova_degree.write_csv("figure/5/라_4_학위_조사_대상별_차이_검정_anova.csv", include_bom=True)
anova_degree

component,f-value,p-value
str,f64,f64
"""공정""",1.076964,0.343025
"""정직""",1.485125,0.229504
"""존중""",1.6164,0.20176
"""책임""",0.149648,0.861128
"""투명성""",1.128388,0.326053
"""전체""",0.917041,0.40174


In [47]:
# 
# 5) 연구경력
# 

research_experience = "Q46"
by_area_per_research_experience = group_per_personal(research_experience)

count = compute_group_personality_by_callback(by_area_per_research_experience, research_experience, pl.count).rename({"column_0": "5년미만", "column_1": "5년이상10년미만수", "column_2": "10년이상15년미만수", "column_3": "15년이상수"})
pct = count.select(
    pl.col("column"),
    pl.col("5년미만") / (pl.col("5년미만") + pl.col("5년이상10년미만수") + pl.col("10년이상15년미만수") + pl.col("15년이상수")),
    pl.col("5년이상10년미만수") / (pl.col("5년미만") + pl.col("5년이상10년미만수") + pl.col("10년이상15년미만수") + pl.col("15년이상수")),
    pl.col("10년이상15년미만수") / (pl.col("5년미만") + pl.col("5년이상10년미만수") + pl.col("10년이상15년미만수") + pl.col("15년이상수")),
    pl.col("15년이상수") / (pl.col("5년미만") + pl.col("5년이상10년미만수") + pl.col("10년이상15년미만수") + pl.col("15년이상수")),
)
mean = compute_group_personality_by_callback(by_area_per_research_experience, research_experience, pl.mean).rename({"column_0": "5년미만평균", "column_1": "5년이상10년미만평균", "column_2": "10년이상15년미만평균", "column_3": "15년이상평균"})
std = compute_group_personality_by_callback(by_area_per_research_experience, research_experience, pl.std).rename({"column_0": "5년미만표준편차", "column_1": "5년이상10년미만표준편차", "column_2": "10년이상15년미만표준편차", "column_3": "15년이상표준편차"})

all = count.join(
    mean, on="column", how="inner", 
).join(
    pct, on="column", how="inner"
).join(
    std, on="column", how="inner"
)
all.write_csv("figure/5/라_5_연구경력_조사_대상별_차이_검정_통계량.csv", include_bom=True)
all

column,5년미만,5년이상10년미만수,10년이상15년미만수,15년이상수,5년미만평균,5년이상10년미만평균,10년이상15년미만평균,15년이상평균,5년미만_right,5년이상10년미만수_right,10년이상15년미만수_right,15년이상수_right,5년미만표준편차,5년이상10년미만표준편차,10년이상15년미만표준편차,15년이상표준편차
str,u32,u32,u32,u32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""공정""",60,22,28,57,4.2,4.19,4.24,4.25,0.359281,0.131737,0.167665,0.341317,0.58,0.62,0.5,0.58
"""정직""",60,22,28,57,4.41,4.48,4.6,4.47,0.359281,0.131737,0.167665,0.341317,0.59,0.62,0.52,0.63
"""존중""",60,22,28,57,4.41,4.58,4.39,4.39,0.359281,0.131737,0.167665,0.341317,0.59,0.55,0.5,0.59
"""책임""",60,22,28,57,4.29,4.27,4.2,4.17,0.359281,0.131737,0.167665,0.341317,0.63,0.62,0.54,0.61
"""투명성""",60,22,28,57,4.37,4.28,4.38,4.21,0.359281,0.131737,0.167665,0.341317,0.59,0.69,0.44,0.66
"""전체""",60,22,28,57,4.34,4.36,4.36,4.3,0.359281,0.131737,0.167665,0.341317,0.55,0.54,0.41,0.57


In [48]:

experience_1 = by_area_per_research_experience.filter(pl.col(research_experience) == 1).select(pl_components)
experience_2 = by_area_per_research_experience.filter(pl.col(research_experience) == 2).select(pl_components)
experience_3 = by_area_per_research_experience.filter(pl.col(research_experience) == 3).select(pl_components)
experience_4 = by_area_per_research_experience.filter(pl.col(research_experience) == 4).select(pl_components)

anova = stats.f_oneway(experience_1, experience_2, experience_3, experience_4)
anova_research_experience = pl.DataFrame({
    "component": components,
    "f-value": anova[0],
    "p-value": anova[1],
})
anova_research_experience.write_csv("figure/5/라_5_연구경력_조사_대상별_차이_검정_anova.csv", include_bom=True)
anova_research_experience

component,f-value,p-value
str,f64,f64
"""공정""",0.106364,0.956271
"""정직""",0.640982,0.589698
"""존중""",0.660098,0.577742
"""책임""",0.451089,0.716871
"""투명성""",0.799991,0.495544
"""전체""",0.119929,0.948278


In [49]:
# 
# 6) 연구분야
# 

from scipy.stats import tukey_hsd

research_field = "Q47"
by_area_per_research_field = group_per_personal(research_field)

count = compute_group_personality_by_callback(by_area_per_research_field, research_field, pl.count).rename({"column_0": "물리", "column_1": "화학", "column_2": "생물", "column_3": "지질", "column_4": "공학", "column_5": "정책", "column_6": "기타"})
pct = count.select(
    pl.col("column"),
    pl.col("물리") / (pl.col("물리") + pl.col("화학") + pl.col("생물") + pl.col("지질") + pl.col("공학") + pl.col("정책") + pl.col("기타")).round(2),
    pl.col("화학") / (pl.col("물리") + pl.col("화학") + pl.col("생물") + pl.col("지질") + pl.col("공학") + pl.col("정책") + pl.col("기타")).round(2),
    pl.col("생물") / (pl.col("물리") + pl.col("화학") + pl.col("생물") + pl.col("지질") + pl.col("공학") + pl.col("정책") + pl.col("기타")).round(2),
    pl.col("지질") / (pl.col("물리") + pl.col("화학") + pl.col("생물") + pl.col("지질") + pl.col("공학") + pl.col("정책") + pl.col("기타")).round(2),
    pl.col("공학") / (pl.col("물리") + pl.col("화학") + pl.col("생물") + pl.col("지질") + pl.col("공학") + pl.col("정책") + pl.col("기타")).round(2),
    pl.col("정책") / (pl.col("물리") + pl.col("화학") + pl.col("생물") + pl.col("지질") + pl.col("공학") + pl.col("정책") + pl.col("기타")).round(2),
    pl.col("기타") / (pl.col("물리") + pl.col("화학") + pl.col("생물") + pl.col("지질") + pl.col("공학") + pl.col("정책") + pl.col("기타")).round(2),
)
mean = compute_group_personality_by_callback(by_area_per_research_field, research_field, pl.mean).rename({"column_0": "물리평균", "column_1": "화학평균", "column_2": "생물평균", "column_3": "지질평균", "column_4": "공학평균", "column_5": "정책평균", "column_6": "기타평균"})
std = compute_group_personality_by_callback(by_area_per_research_field, research_field, pl.std).rename({"column_0": "물리표준편차", "column_1": "화학표준편차", "column_2": "생물표준편차", "column_3": "지질표준편차", "column_4": "공학표준편차", "column_5": "정책표준편차", "column_6": "기타표준편차"})

all = count.join(
    mean, on="column", how="inner", 
).join(
    pct, on="column", how="inner"
).join(
    std, on="column", how="inner"
)

all.write_csv("figure/5/라_6_연구분야_조사_대상별_차이_검정_통계량.csv", include_bom=True)
all

column,물리,화학,생물,지질,공학,정책,기타,물리평균,화학평균,생물평균,지질평균,공학평균,정책평균,기타평균,물리_right,화학_right,생물_right,지질_right,공학_right,정책_right,기타_right,물리표준편차,화학표준편차,생물표준편차,지질표준편차,공학표준편차,정책표준편차,기타표준편차
str,u32,u32,u32,u32,u32,u32,u32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""공정""",37,23,53,11,27,5,11,4.15,4.13,4.31,4.14,4.14,4.75,4.36,0.221557,0.137725,0.317365,0.065868,0.161677,0.02994,0.065868,0.61,0.48,0.53,0.57,0.65,0.43,0.52
"""정직""",37,23,53,11,27,5,11,4.45,4.37,4.51,4.27,4.38,5.0,4.73,0.221557,0.137725,0.317365,0.065868,0.161677,0.02994,0.065868,0.57,0.55,0.58,0.73,0.7,0.0,0.44
"""존중""",37,23,53,11,27,5,11,4.35,4.34,4.5,4.23,4.3,5.0,4.74,0.221557,0.137725,0.317365,0.065868,0.161677,0.02994,0.065868,0.58,0.46,0.49,0.72,0.71,0.0,0.47
"""책임""",37,23,53,11,27,5,11,4.18,4.12,4.3,4.0,4.16,4.65,4.52,0.221557,0.137725,0.317365,0.065868,0.161677,0.02994,0.065868,0.66,0.52,0.53,0.76,0.69,0.49,0.45
"""투명성""",37,23,53,11,27,5,11,4.35,4.12,4.34,4.05,4.19,4.95,4.57,0.221557,0.137725,0.317365,0.065868,0.161677,0.02994,0.065868,0.61,0.57,0.54,0.73,0.7,0.11,0.46
"""전체""",37,23,53,11,27,5,11,4.3,4.22,4.39,4.14,4.23,4.87,4.58,0.221557,0.137725,0.317365,0.065868,0.161677,0.02994,0.065868,0.54,0.45,0.47,0.65,0.64,0.19,0.4


In [89]:
field_1 = by_area_per_research_field.filter(pl.col(research_field) == 1).select(pl_components)
field_2 = by_area_per_research_field.filter(pl.col(research_field) == 2).select(pl_components)
field_3 = by_area_per_research_field.filter(pl.col(research_field) == 3).select(pl_components)
field_4 = by_area_per_research_field.filter(pl.col(research_field) == 4).select(pl_components)
field_5 = by_area_per_research_field.filter(pl.col(research_field) == 5).select(pl_components)
field_6 = by_area_per_research_field.filter(pl.col(research_field) == 6).select(pl_components)
field_7 = by_area_per_research_field.filter(pl.col(research_field) == 7).select(pl_components)

anova = stats.f_oneway(field_1, field_2, field_3, field_4, field_5, field_6, field_7)

from statsmodels.stats.multicomp import pairwise_tukeyhsd

target = by_area_per_research_field[research_field].to_pandas().astype('category')
tukey_hsd = pairwise_tukeyhsd(by_area_per_research_field['존중'], target)
print(tukey_hsd)

# tukey_hsd = pairwise_tukeyhsd(by_area_per_research_field['투명성'], by_area_per_research_field[research_field])
# print(tukey_hsd)

anova_research_field = pl.DataFrame({
    "component": components,
    "f-value": anova[0],
    "p-value": anova[1],
}).select("component", pl.col("f-value").round(3), pl.col("p-value").round(3))
anova_research_field.write_csv("figure/5/라_6_연구분야_조사_대상별_차이_검정_anova.csv", include_bom=True)
anova_research_field

Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj   lower  upper  reject
---------------------------------------------------
     1      2  -0.0049    1.0 -0.4475 0.4377  False
     1      3   0.1478 0.8792 -0.2094 0.5049  False
     1      4  -0.1143 0.9969 -0.6867 0.4582  False
     1      5  -0.0528 0.9998 -0.4747 0.3691  False
     1      6   0.6509 0.1862 -0.1434 1.4452  False
     1      7   0.3933 0.3867 -0.1791 0.9658  False
     2      3   0.1527 0.9288 -0.2636 0.5689  False
     2      4  -0.1094 0.9983 -0.7204 0.5017  False
     2      5  -0.0479 0.9999 -0.5209 0.4251  False
     2      6   0.6558 0.2136 -0.1667 1.4783  False
     2      7   0.3982 0.4533 -0.2129 1.0093  False
     3      4   -0.262 0.7922 -0.8143 0.2903  False
     3      5  -0.2006 0.7327 -0.5947 0.1936  False
     3      6   0.5031 0.4659 -0.2767  1.283  False
     3      7   0.2456  0.838 -0.3067 0.7979  False
     4      5   0.0614 0.9999 -0.5348 0.6577  False
     4      

component,f-value,p-value
str,f64,f64
"""공정""",1.389,0.222
"""정직""",1.488,0.185
"""존중""",2.27,0.04
"""책임""",1.477,0.189
"""투명성""",2.316,0.036
"""전체""",2.071,0.059


In [51]:
# 
# 에고 그램
# 

egogram = "map"
by_area_per_egogram = group_per_personal(egogram)

# columns = ["E", "N", "R", "V", "W", "M", "L"]
columns = ["E", "N", "R", "W", "M", "L"]

count = compute_group_personality_by_callback(by_area_per_egogram, egogram, pl.count).rename(
    {"column_0": "E", "column_1": "N", "column_2": "R", "column_3": "W", "column_4": "M", "column_5": "L"}
)
pct = count.select(
    pl.col("column"),
    pl.col("E") / (pl.col("E") + pl.col("N") + pl.col("R") + pl.col("W") + pl.col("M") + pl.col("L")),
    pl.col("N") / (pl.col("E") + pl.col("N") + pl.col("R") + pl.col("W") + pl.col("M") + pl.col("L")),
    pl.col("R") / (pl.col("E") + pl.col("N") + pl.col("R") + pl.col("W") + pl.col("M") + pl.col("L")),
    pl.col("W") / (pl.col("E") + pl.col("N") + pl.col("R") + pl.col("W") + pl.col("M") + pl.col("L")),
    pl.col("M") / (pl.col("E") + pl.col("N") + pl.col("R") + pl.col("W") + pl.col("M") + pl.col("L")),
    pl.col("L") / (pl.col("E") + pl.col("N") + pl.col("R") + pl.col("W") + pl.col("M") + pl.col("L")),
)
mean = compute_group_personality_by_callback(by_area_per_egogram, egogram, pl.mean).rename(
    {"column_0": "Em", "column_1": "Nm", "column_2": "Rm", "column_3": "Wm", "column_4": "Mm", "column_5": "Lm"}
)
std = compute_group_personality_by_callback(by_area_per_egogram, egogram, pl.std).rename(
    {"column_0": "Estd", "column_1": "Nstd", "column_2": "Rstd", "column_3": "Wstd", "column_4": "Mstd", "column_5": "Lstd"}
)

all = count.join(
    mean, on="column", how="inner", 
).join(
    pct, on="column", how="inner"
).join(
    std, on="column", how="inner"
)
all.write_csv("figure/5/라_7_에고그램_조사_대상별_차이_검정_통계량.csv", include_bom=True)
all

column,E,N,R,W,M,L,Em,Nm,Rm,Wm,Mm,Lm,E_right,N_right,R_right,W_right,M_right,L_right,Estd,Nstd,Rstd,Wstd,Mstd,Lstd
str,u32,u32,u32,u32,u32,u32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""공정""",73,34,5,51,1,3,4.2,4.39,4.5,4.2,3.5,3.42,0.437126,0.203593,0.02994,0.305389,0.005988,0.017964,0.56,0.53,0.59,0.54,,0.95
"""정직""",73,34,5,51,1,3,4.4,4.59,4.95,4.53,2.75,3.5,0.437126,0.203593,0.02994,0.305389,0.005988,0.017964,0.59,0.58,0.11,0.48,,1.32
"""존중""",73,34,5,51,1,3,4.35,4.5,4.77,4.52,2.75,3.75,0.437126,0.203593,0.02994,0.305389,0.005988,0.017964,0.55,0.56,0.32,0.51,,1.09
"""책임""",73,34,5,51,1,3,4.19,4.33,4.8,4.24,2.75,3.42,0.437126,0.203593,0.02994,0.305389,0.005988,0.017964,0.63,0.57,0.27,0.51,,0.95
"""투명성""",73,34,5,51,1,3,4.26,4.43,4.65,4.32,2.75,3.5,0.437126,0.203593,0.02994,0.305389,0.005988,0.017964,0.63,0.52,0.55,0.54,,1.09
"""전체""",73,34,5,51,1,3,4.28,4.45,4.73,4.36,2.9,3.52,0.437126,0.203593,0.02994,0.305389,0.005988,0.017964,0.54,0.5,0.35,0.43,,1.08


In [83]:
field_1 = by_area_per_egogram.filter(pl.col(egogram) == "E").select(pl_components)
field_2 = by_area_per_egogram.filter(pl.col(egogram) == "N").select(pl_components)
field_3 = by_area_per_egogram.filter(pl.col(egogram) == "R").select(pl_components)
field_4 = by_area_per_egogram.filter(pl.col(egogram) == "W").select(pl_components)
field_5 = by_area_per_egogram.filter(pl.col(egogram) == "M").select(pl_components)
field_6 = by_area_per_egogram.filter(pl.col(egogram) == "L").select(pl_components)

anova = stats.f_oneway(field_1, field_2, field_3, field_4, field_5, field_6)
anova_egogram = pl.DataFrame({
    "component": components,
    "f-value": anova[0],
    "p-value": anova[1],
})

anova_egogram.write_csv("figure/5/라_7_에고그램_조사_대상별_차이_검정_anova.csv", include_bom=True)
anova_egogram

component,f-value,p-value
str,f64,f64
"""공정""",2.525881,0.031261
"""정직""",4.986883,0.000284
"""존중""",3.850894,0.002533
"""책임""",3.724148,0.003232
"""투명성""",3.261818,0.007826
"""전체""",4.325498,0.001016


In [84]:
tukey_hsd = pairwise_tukeyhsd(by_area_per_egogram['공정'], by_area_per_egogram[egogram])
print(tukey_hsd)

tukey_hsd = pairwise_tukeyhsd(by_area_per_egogram['정직'], by_area_per_egogram[egogram])
print(tukey_hsd)

tukey_hsd = pairwise_tukeyhsd(by_area_per_egogram['존중'], by_area_per_egogram[egogram])
print(tukey_hsd)

tukey_hsd = pairwise_tukeyhsd(by_area_per_egogram['책임'], by_area_per_egogram[egogram])
print(tukey_hsd)

tukey_hsd = pairwise_tukeyhsd(by_area_per_egogram['투명성'], by_area_per_egogram[egogram])
print(tukey_hsd)

tukey_hsd = pairwise_tukeyhsd(by_area_per_egogram['전체'], by_area_per_egogram[egogram])
print(tukey_hsd)


Multiple Comparison of Means - Tukey HSD, FWER=0.05 
group1 group2 meandiff p-adj   lower   upper  reject
----------------------------------------------------
     E      L   0.1945 0.5423  -0.138   0.527  False
     E      M   0.3048 0.8423 -0.4355  1.0451  False
     E      N   0.0009    1.0 -0.2914  0.2932  False
     E      R  -0.6952 0.8147 -2.3077  0.9172  False
     E      W  -0.7785 0.1694  -1.722  0.1649  False
     L      M   0.1103 0.9984 -0.6568  0.8774  False
     L      N  -0.1936 0.6161 -0.5482   0.161  False
     L      R  -0.8897 0.6134 -2.5146  0.7352  False
     L      W   -0.973 0.0467 -1.9376 -0.0085   True
     M      N  -0.3039 0.8514 -1.0544  0.4466  False
     M      R     -1.0 0.5707 -2.7544  0.7544  False
     M      W  -1.0833 0.0867 -2.2529  0.0862  False
     N      R  -0.6961 0.8157 -2.3132  0.9211  False
     N      W  -0.7794 0.1758 -1.7309   0.172  False
     R      W  -0.0833    1.0 -1.9326  1.7659  False
----------------------------------------------

In [53]:
# 
# 도덕 정체성
# 

egogram = "map"
by_area_per_egogram = group_per_personal(egogram)

# columns = ["E", "N", "R", "V", "W", "M", "L"]
columns = ["E", "N", "R", "W", "M", "L"]

count = compute_group_personality_by_callback(by_area_per_egogram, egogram, pl.count).rename(
    {"column_0": "E", "column_1": "N", "column_2": "R", "column_3": "W", "column_4": "M", "column_5": "L"}
)
pct = count.select(
    pl.col("column"),
    pl.col("E") / (pl.col("E") + pl.col("N") + pl.col("R") + pl.col("W") + pl.col("M") + pl.col("L")),
    pl.col("N") / (pl.col("E") + pl.col("N") + pl.col("R") + pl.col("W") + pl.col("M") + pl.col("L")),
    pl.col("R") / (pl.col("E") + pl.col("N") + pl.col("R") + pl.col("W") + pl.col("M") + pl.col("L")),
    pl.col("W") / (pl.col("E") + pl.col("N") + pl.col("R") + pl.col("W") + pl.col("M") + pl.col("L")),
    pl.col("M") / (pl.col("E") + pl.col("N") + pl.col("R") + pl.col("W") + pl.col("M") + pl.col("L")),
    pl.col("L") / (pl.col("E") + pl.col("N") + pl.col("R") + pl.col("W") + pl.col("M") + pl.col("L")),
)
mean = compute_group_personality_by_callback(by_area_per_egogram, egogram, pl.mean).rename(
    {"column_0": "Em", "column_1": "Nm", "column_2": "Rm", "column_3": "Wm", "column_4": "Mm", "column_5": "Lm"}
)
std = compute_group_personality_by_callback(by_area_per_egogram, egogram, pl.std).rename(
    {"column_0": "Estd", "column_1": "Nstd", "column_2": "Rstd", "column_3": "Wstd", "column_4": "Mstd", "column_5": "Lstd"}
)

all = count.join(
    mean, on="column", how="inner", 
).join(
    pct, on="column", how="inner"
).join(
    std, on="column", how="inner"
)
all.write_csv("figure/5/라_6_연구분야_조사_대상별_차이_검정_통계량.csv", include_bom=True)

field_1 = by_area_per_egogram.filter(pl.col(egogram) == "E").select(pl_components)
field_2 = by_area_per_egogram.filter(pl.col(egogram) == "N").select(pl_components)
field_3 = by_area_per_egogram.filter(pl.col(egogram) == "R").select(pl_components)
field_4 = by_area_per_egogram.filter(pl.col(egogram) == "W").select(pl_components)
field_5 = by_area_per_egogram.filter(pl.col(egogram) == "M").select(pl_components)
field_6 = by_area_per_egogram.filter(pl.col(egogram) == "L").select(pl_components)

anova = stats.f_oneway(field_1, field_2, field_3, field_4, field_5, field_6)
anova_egogram = pl.DataFrame({
    "component": components,
    "f-value": anova[0],
    "p-value": anova[1],
})

anova_egogram.write_csv("figure/5/라_7_에고그램_조사_대상별_차이_검정_anova.csv", include_bom=True)
anova_egogram

component,f-value,p-value
str,f64,f64
"""공정""",2.525881,0.031261
"""정직""",4.986883,0.000284
"""존중""",3.850894,0.002533
"""책임""",3.724148,0.003232
"""투명성""",3.261818,0.007826
"""전체""",4.325498,0.001016


In [54]:
from scipy.stats import pearsonr

def pearsonr_pval(df):
   names = pl.DataFrame(df.columns, schema=["name"])
   return names.hstack(
      pl.DataFrame(
         ([pearsonr(x, y)[1] for x in df] for y in df),
         schema = df.columns
      )
   )

# 
# 지수별 상관분석
# 
# grading.pipe(pearsonr_pval)
grading.select(
   pl.col("공정_채점"),
    pl.col("정직_채점"),
    pl.col("존중_채점"),
    pl.col("책임_채점"),
    pl.col("투명성_채점"),
    pl.col("전체_채점"),
).corr()

공정_채점,정직_채점,존중_채점,책임_채점,투명성_채점,전체_채점
f64,f64,f64,f64,f64,f64
1.0,0.668924,0.649873,0.626919,0.61271,0.755078
0.668924,1.0,0.702698,0.658206,0.692634,0.852493
0.649873,0.702698,1.0,0.693606,0.744445,0.823025
0.626919,0.658206,0.693606,1.0,0.817017,0.791937
0.61271,0.692634,0.744445,0.817017,1.0,0.838162
0.755078,0.852493,0.823025,0.791937,0.838162,1.0


In [55]:
grading.mean()

전체_채점,공정_채점,정직_채점,존중_채점,책임_채점,투명성_채점
f64,f64,f64,f64,f64,f64
4.113772,4.023952,4.167665,4.197605,4.023952,4.113772


In [56]:
grading.std()

전체_채점,공정_채점,정직_채점,존중_채점,책임_채점,투명성_채점
f64,f64,f64,f64,f64,f64
0.846105,0.580321,0.967251,0.642281,0.76009,0.771629


In [57]:
# 
# elastic net
# 
from attributes import *
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.linear_model import ElasticNetCV

# 연구윤리 인식 영역, 연구윤리 교육 영역, 기관에 대한 인식, 인구통계학적 배경 변인
total = pl.read_csv("data/1_total.csv")

# pd_by_area = by_area.drop("ID").to_pandas()
pd_by_area = by_area['전체'].to_pandas()
pd_total = total.drop("ID").to_pandas()

kf = KFold(n_splits=100, shuffle=True, random_state=42)


count = np.zeros((pd_total.shape[1]))
mean = np.zeros(count.shape)

iteration = 100
for i in range(iteration):

    regr = ElasticNetCV(cv=100, random_state=i)

    X, y = pd_total, pd_by_area
    train_x, test_x, train_y, test_y = train_test_split(X, y, train_size=0.7)

    regr.fit(train_x, train_y)

    count += regr.coef_  != 0
    mean += regr.coef_

mean /= iteration

In [61]:
print(pd_total.head())
print(count)
print(mean.round(3))

   Q1  Q2  Q3_1  Q3_2  Q3_3  Q3_4  Q3_5  Q3_6  Q5  Q6_1  ...  Q47_7  map_L  \
0   4   7     4     4     5     4     4     4   3     1  ...      0      0   
1   4   9     3     3     3     3     3     4   4     1  ...      0      0   
2   5   8     4     5     4     4     4     4   4     1  ...      0      0   
3   5   8     5     4     5     5     4     4   3     0  ...      1      0   
4   5   9     4     4     3     4     4     4   4     1  ...      0      0   

   map_M  map_N  map_R  map_W     _duplicated_0  _duplicated_1  _duplicated_2  
0      0      0      0      0  0              1              0              0  
1      0      0      0      0  0              1              0              0  
2      0      0      0      0  0              1              0              0  
3      1      0      0      0  0              1              0              0  
4      0      0      1      0  0              0              0              0  

[5 rows x 58 columns]
[100. 100.  23.  15.  79.  7

In [59]:
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor


X, y = make_regression(n_targets=10)
print('Feature vector:', X.shape)
print('Target vector:', y.shape)

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

print('Build and fit a regressor model...')

model = RandomForestRegressor()
model.fit(X_train, y_train)
score = model.score(X_test, y_test)

print('Done. Score', score)

Feature vector: (100, 100)
Target vector: (100, 10)
Build and fit a regressor model...
Done. Score 0.3268548542287268
