In [532]:
import polars as pl
import altair as alt

# Research-ethics score table, read once from CSV into a polars DataFrame.
# The cells below read its "ID", "Q38_1" .. "Q38_19" and "Q39_1" columns.
research_ethics_score = pl.read_csv("data/5_1_research_ethics_score.csv")

In [533]:
def draw_bar_plot(df, x, y, title, MAP, rotate=True, legend=True, scale=None):
    """Build a bar chart with a two-decimal value label on top of every bar.

    Parameters
    ----------
    df
        Frame exposing ``df.plot.bar()`` that returns an Altair chart.
    x
        Field used for the bar position and for the bar color.
    y
        Field used for the bar height; the same field is printed as the label.
    title
        Chart title (also passed as the title of the hidden x axis).
    MAP
        Mapping or ``None``. When given, ``MAP.values()`` is used as the sort
        order of the bars, the color scale and the labels.
    rotate
        Accepted for call compatibility; this function never reads it.
    legend
        When true, a horizontal color legend is placed at the bottom;
        otherwise the legend is switched off.
    scale
        Optional ``alt.Scale`` applied to the y axis of the bars.

    Returns
    -------
    The layered chart ``bars + labels``.
    """
    has_order = MAP is not None

    # The x axis itself is hidden (axis=None); the color encoding names the bars.
    x_enc = alt.X(x, title=title, axis=None)
    if has_order:
        x_enc = x_enc.sort(MAP.values())

    color_enc = alt.Color(
        x,
        title=None,
        legend=alt.Legend(orient='bottom', direction='horizontal') if legend else None,
    )
    if has_order:
        color_enc = color_enc.sort(MAP.values())

    bars = (
        df.plot.bar()
        .encode(
            x=x_enc,
            y=alt.Y(y, title=None, scale=scale),
            color=color_enc,
        )
        .properties(title=title, width=400)
    )

    # The labels reuse the bar chart, swapping the mark and forcing black text.
    label_x = alt.X(x, sort=MAP.values()) if has_order else alt.X(x)
    labels = bars.mark_text(align="center", baseline="bottom").encode(
        x=label_x,
        y=y,
        text=alt.Text(y, format=".2f"),
        color=alt.value("black"),
    )
    return bars + labels

In [534]:
#
# A. Descriptive statistics per item
#
# Each statistic is bound to its name first and written as a separate step:
# write_csv(path) returns None, so chaining it into the assignment would leave
# the names holding None instead of the statistics. "ID" is only a key column
# and is left out of the exported tables.
mean = research_ethics_score.mean().drop("ID")
mean.write_csv("figure/5/가_문항별_기술_통계량_평균.csv")

standard_deviation = research_ethics_score.std().drop("ID")
standard_deviation.write_csv("figure/5/가_문항별_기술_통계량_표준편차.csv")

In [550]:
# 
# B. Descriptive statistics per area
#
# *Q39_1: research ethics regarding animal experiments
#
# Every area score is the plain average of its items.
fairness = (pl.col("Q38_1") + pl.col("Q38_2") + pl.col("Q38_3") + pl.col("Q38_4")) / 4
honesty = (pl.col("Q38_5") + pl.col("Q38_6") + pl.col("Q38_7") + pl.col("Q38_8")) / 4

# Q39_1 may be null; in that case the respect score averages the three
# remaining items instead of four.
respect = pl.when(pl.col("Q39_1").is_null()).then(
    (pl.col("Q38_9") + pl.col("Q38_10") + pl.col("Q38_11")) / 3
).otherwise(
    (pl.col("Q38_9") + pl.col("Q38_10") + pl.col("Q38_11") + pl.col("Q39_1")) / 4
)

responsibility = (pl.col("Q38_12") + pl.col("Q38_13") + pl.col("Q38_14") + pl.col("Q38_15")) / 4
transparency = (pl.col("Q38_16") + pl.col("Q38_17") + pl.col("Q38_18") + pl.col("Q38_19")) / 4

# Average of the five area scores. Named `overall` so that the built-in
# all() stays usable in the rest of the notebook.
overall = (fairness + honesty + respect + responsibility + transparency) / 5

by_area = research_ethics_score.select([
    pl.col("ID"),
    fairness.alias("공정"),
    honesty.alias("정직"),
    respect.alias("존중"),
    responsibility.alias("책임"),
    transparency.alias("투명성"),
    overall.alias("전체"),
])

# by_area_mean / by_area_std keep the "ID" column because the plotting cell
# drops it itself; only the exported tables leave the key column out, the same
# way the per-item statistics do.
by_area_mean = by_area.mean()
by_area_mean.drop("ID").write_csv("figure/5/나_영역별_기술_통계량_평균.csv")
by_area_std = by_area.std()
by_area_std.drop("ID").write_csv("figure/5/나_영역별_기술_통계량_표준편차.csv")


shape: (167, 7)
┌───────┬──────┬──────┬──────────┬──────┬────────┬──────────┐
│ ID    ┆ 공정 ┆ 정직 ┆ 존중     ┆ 책임 ┆ 투명성 ┆ 전체     │
│ ---   ┆ ---  ┆ ---  ┆ ---      ┆ ---  ┆ ---    ┆ ---      │
│ i64   ┆ f64  ┆ f64  ┆ f64      ┆ f64  ┆ f64    ┆ f64      │
╞═══════╪══════╪══════╪══════════╪══════╪════════╪══════════╡
│ 14630 ┆ 3.75 ┆ 3.5  ┆ 3.5      ┆ 3.5  ┆ 3.0    ┆ 3.45     │
│ 14629 ┆ 4.25 ┆ 4.5  ┆ 4.75     ┆ 4.0  ┆ 4.0    ┆ 4.3      │
│ 14626 ┆ 3.75 ┆ 4.0  ┆ 3.75     ┆ 4.0  ┆ 4.0    ┆ 3.9      │
│ 14624 ┆ 4.25 ┆ 5.0  ┆ 5.0      ┆ 4.25 ┆ 5.0    ┆ 4.7      │
│ 14627 ┆ 4.5  ┆ 4.75 ┆ 4.5      ┆ 4.5  ┆ 4.25   ┆ 4.5      │
│ …     ┆ …    ┆ …    ┆ …        ┆ …    ┆ …      ┆ …        │
│ 27015 ┆ 3.75 ┆ 3.75 ┆ 3.333333 ┆ 3.25 ┆ 4.0    ┆ 3.616667 │
│ 27046 ┆ 3.75 ┆ 3.75 ┆ 4.0      ┆ 4.0  ┆ 4.0    ┆ 3.9      │
│ 27725 ┆ 3.0  ┆ 3.0  ┆ 3.0      ┆ 3.0  ┆ 3.0    ┆ 3.0      │
│ 27863 ┆ 5.0  ┆ 5.0  ┆ 5.0      ┆ 4.75 ┆ 4.5    ┆ 4.85     │
│ 28045 ┆ 3.5  ┆ 4.0  ┆ 3.75     ┆ 4.0  ┆ 4.0    ┆ 3.85     │
└────

In [551]:
# Transpose the one-row mean frame so each area becomes a row: the header goes
# to "column" and the value to "column_0", the two fields plotted below.
tp_by_area = by_area_mean.drop("ID").transpose(include_header=True)

bar = draw_bar_plot(
    tp_by_area,
    "column",
    "column_0",
    "영역별 기술 통계량",
    None,
    rotate=False,
    legend=True,
    scale=alt.Scale(domain=[4, 4.8]),
)
bar.show()

In [560]:
# 
# C. Grading results
#

def make_grade(column, excellent, great, good, bad):
    """Build a when/then chain that turns a score expression into a grade 5..1.

    The cut-offs are tried from `excellent` down to `bad`; the first one that
    `column` reaches (``>=``) decides the grade: 5, 4, 3 or 2. A score that
    reaches none of them is graded 1.
    """
    chain = pl.when(column >= excellent).then(5)
    for grade, cutoff in ((4, great), (3, good), (2, bad)):
        chain = chain.when(column >= cutoff).then(grade)
    return chain.otherwise(1)

# Cut-offs per score column, from the grade-5 boundary down to the grade-2 boundary.
grading_all = make_grade(pl.col("전체"), excellent=4.70, great=3.85, good=3.40, bad=2.95)
grading_fairness = make_grade(pl.col("공정"), excellent=5.0, great=3.75, good=3.0, bad=2.50)
grading_honesty = make_grade(pl.col("정직"), excellent=5.0, great=4.0, good=3.5, bad=3.25)
grading_respect = make_grade(pl.col("존중"), excellent=5.0, great=3.75, good=3.0, bad=2.25)
grading_responsibility = make_grade(pl.col("책임"), excellent=5.0, great=3.75, good=3.25, bad=2.75)
grading_transparency = make_grade(pl.col("투명성"), excellent=5.0, great=3.75, good=3.25, bad=2.75)

# One grade column per score column of by_area, overall grade first.
grading = by_area.select(
    grading_all.alias("전체_채점"),
    grading_fairness.alias("공정_채점"),
    grading_honesty.alias("정직_채점"),
    grading_respect.alias("존중_채점"),
    grading_responsibility.alias("책임_채점"),
    grading_transparency.alias("투명성_채점"),
)

print(grading)

shape: (167, 6)
┌───────────┬───────────┬───────────┬───────────┬───────────┬─────────────┐
│ 전체_채점 ┆ 공정_채점 ┆ 정직_채점 ┆ 존중_채점 ┆ 책임_채점 ┆ 투명성_채점 │
│ ---       ┆ ---       ┆ ---       ┆ ---       ┆ ---       ┆ ---         │
│ i32       ┆ i32       ┆ i32       ┆ i32       ┆ i32       ┆ i32         │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══════════╪═════════════╡
│ 3         ┆ 4         ┆ 3         ┆ 3         ┆ 3         ┆ 2           │
│ 4         ┆ 4         ┆ 4         ┆ 4         ┆ 4         ┆ 4           │
│ 4         ┆ 4         ┆ 4         ┆ 4         ┆ 4         ┆ 4           │
│ 5         ┆ 4         ┆ 5         ┆ 5         ┆ 4         ┆ 5           │
│ 4         ┆ 4         ┆ 4         ┆ 4         ┆ 4         ┆ 4           │
│ …         ┆ …         ┆ …         ┆ …         ┆ …         ┆ …           │
│ 3         ┆ 4         ┆ 3         ┆ 3         ┆ 3         ┆ 4           │
│ 4         ┆ 4         ┆ 3         ┆ 4         ┆ 4         ┆ 4           │
│ 2         ┆ 3      

In [577]:
def _count_per_grade(grade_column, label):
    """Count the rows of `grading` per value of `grade_column`.

    The grade column is renamed to "등급" and the count column is called `label`.
    """
    per_grade = grading.group_by(grade_column).agg(pl.count(grade_column).alias(label))
    return per_grade.rename({grade_column: "등급"})


all_count = _count_per_grade("전체_채점", "전체")
fairness_count = _count_per_grade("공정_채점", "공정")
honesty_count = _count_per_grade("정직_채점", "정직")
respect_count = _count_per_grade("존중_채점", "존중")
responsibility_count = _count_per_grade("책임_채점", "책임")
transparency_count = _count_per_grade("투명성_채점", "투명성")

def check_zero_and_concat(dataframe, column_name, basis=5):
    """Append one filler row when `dataframe` has fewer than `basis` rows.

    The filler row holds ``count = 1`` (Int32) and ``column_name = 0`` (UInt32)
    and is appended with ``DataFrame.extend``, so the same `dataframe` object is
    returned in both cases.

    NOTE(review): nothing in the visible cells calls this helper, and the
    filler's column names/dtypes have to match `dataframe` for ``extend`` to
    succeed - confirm the intended use before relying on it.
    """
    if dataframe.shape[0] >= basis:
        return dataframe

    filler = pl.DataFrame({"count": [1], column_name: [0]}).with_columns(
        pl.col("count").cast(pl.Int32),
        pl.col(column_name).cast(pl.UInt32),
    )
    return dataframe.extend(filler)

# Start from every grade level 1-5 rather than from the grades that happen to
# occur in the overall score. A left join that starts from the overall counts
# silently drops any grade that only occurs in a single area, and a missing
# grade would also shift the column_N positions used for the ratios below.
grade_levels = pl.Series(
    "등급", [1, 2, 3, 4, 5], dtype=all_count.schema["등급"]
).to_frame()

all_count = (
    grade_levels
    .join(all_count, on="등급", how="left")
    .join(fairness_count, on="등급", how="left")
    .join(honesty_count, on="등급", how="left")
    .join(respect_count, on="등급", how="left")
    .join(responsibility_count, on="등급", how="left")
    .join(transparency_count, on="등급", how="left")
    .fill_null(0)  # a grade nobody received counts as 0
    .sort("등급")
)

all_count.write_csv("figure/5/다_연구윤리_영역별_연구윤리_수준_판정_결과_총합.csv")

# After the transpose there is one row per area and column_0 .. column_4 hold
# the counts for grades 1 .. 5.
counts_by_area = all_count.drop("등급").transpose(include_header=True)
respondents = (
    pl.col("column_0") + pl.col("column_1") + pl.col("column_2")
    + pl.col("column_3") + pl.col("column_4")
)
grade_labels = ["매우 미흡", "미흡", "보통", "우수", "매우 우수"]

# The alias has to wrap the whole ratio: `a / b.alias(name)` only names the
# denominator and the result keeps the name of `a`.
counts_by_area.select([
    (pl.col(f"column_{position}") / respondents).alias(label)
    for position, label in enumerate(grade_labels)
]).write_csv("figure/5/다_연구윤리_영역별_연구윤리_수준_판정_결과_비율.csv")

In [474]:
# 
# D. Tests of differences between respondent groups
#
from scipy import stats
# Respondent attribute table; the cells below select "ID" plus one of the
# columns Q42 .. Q47 from it.
personal_info = pl.read_csv("data/6_personal_info.csv")

def group_per_personal(column):
    """Attach one respondent attribute to the per-area scores.

    Parameters
    ----------
    column
        Name of the `personal_info` column to attach (e.g. "Q42").

    Returns
    -------
    `by_area` inner-joined with ("ID", `column`) from `personal_info`, with the
    score columns named "fairness", "honesty", "respect", "responsibility",
    "transparency" and "all".
    """
    # by_area labels its score columns in Korean, while the group statistics
    # and tests address them by English names. Rename whichever Korean labels
    # are present; a frame that already uses the English names passes through
    # unchanged.
    english_names = {
        "공정": "fairness",
        "정직": "honesty",
        "존중": "respect",
        "책임": "responsibility",
        "투명성": "transparency",
        "전체": "all",
    }
    present = {ko: en for ko, en in english_names.items() if ko in by_area.columns}

    personality = personal_info.select([pl.col("ID"), pl.col(column)])
    return by_area.rename(present).join(personality, on="ID", how="inner")

def compute_group_personality_by_callback(by_area_per_personality, column, callback):
    """Aggregate the six score columns per value of `column`.

    `callback` is called once with each score column name ("fairness",
    "honesty", "respect", "responsibility", "transparency", "all") and must
    return the aggregation to apply. The grouped result is sorted by `column`.
    """
    score_columns = ("fairness", "honesty", "respect", "responsibility", "transparency", "all")
    aggregations = [callback(name) for name in score_columns]
    grouped = by_area_per_personality.group_by([column]).agg(*aggregations)
    return grouped.sort(column)

# Score columns compared between respondent groups
components = ["fairness", "honesty", "respect", "responsibility", "transparency", "all"]
pl_components = pl.col(components)


# 1) Sex
sex = "Q42"
by_area_per_sex = group_per_personal(sex)

count_per_sex = compute_group_personality_by_callback(by_area_per_sex, sex, pl.count)
mean_per_sex = compute_group_personality_by_callback(by_area_per_sex, sex, pl.mean)
std_per_sex = compute_group_personality_by_callback(by_area_per_sex, sex, pl.std)

# Codes 1 / 2 are taken to be male / female, going by the variable names -
# confirm against the questionnaire codebook.
male, female = (
    by_area_per_sex.filter(pl.col(sex) == code).select(pl_components)
    for code in (1, 2)
)

# ttest_ind works along axis 0, so it yields one t statistic and one p-value
# per component column.
ttest = stats.ttest_ind(male, female)

# Keep the result table in `ttest_sex` and write it as a separate step:
# write_csv(path) returns None.
ttest_sex = pl.DataFrame({
    "component": components,
    "t-value": ttest[0],
    "p-value": ttest[1],
})
ttest_sex.write_csv("figure/5/D_1_research_ethics_score_ttest.csv")

In [475]:
# 2) Age
age = "Q43"
by_area_per_age = group_per_personal(age)

count_per_age = compute_group_personality_by_callback(by_area_per_age, age, pl.count)
mean_per_age = compute_group_personality_by_callback(by_area_per_age, age, pl.mean)
std_per_age = compute_group_personality_by_callback(by_area_per_age, age, pl.std)

# One frame of component scores per age code 1-5 (the names suggest the 20s
# through the 60s - confirm against the questionnaire codebook).
age_20, age_30, age_40, age_50, age_60 = (
    by_area_per_age.filter(pl.col(age) == code).select(pl_components)
    for code in (1, 2, 3, 4, 5)
)

# f_oneway works along axis 0: one F statistic and one p-value per component.
anova = stats.f_oneway(age_20, age_30, age_40, age_50, age_60)

# Keep the result table in `anova_age` and write it as a separate step:
# write_csv(path) returns None.
anova_age = pl.DataFrame({
    "component": components,
    "f-value": anova[0],
    "p-value": anova[1],
})
anova_age.write_csv("figure/5/D_2_research_ethics_score_anova.csv")

In [476]:
#
# 3) Position
#
role = "Q44"
by_area_per_role = group_per_personal(role)

count_per_role = compute_group_personality_by_callback(by_area_per_role, role, pl.count)
mean_per_role = compute_group_personality_by_callback(by_area_per_role, role, pl.mean)
std_per_role = compute_group_personality_by_callback(by_area_per_role, role, pl.std)

# One frame of component scores per position code 1-5 (the variable names are
# the presumed meaning of each code - confirm against the codebook).
researcher, professor, manager, director, executive = (
    by_area_per_role.filter(pl.col(role) == code).select(pl_components)
    for code in (1, 2, 3, 4, 5)
)

# f_oneway works along axis 0: one F statistic and one p-value per component.
anova = stats.f_oneway(researcher, professor, manager, director, executive)

# Keep the result table in `anova_role` and write it as a separate step:
# write_csv(path) returns None.
anova_role = pl.DataFrame({
    "component": components,
    "f-value": anova[0],
    "p-value": anova[1],
})
anova_role.write_csv("figure/5/D_3_research_ethics_score_anova.csv")

In [477]:
# 
# 4) Degree
#
degree = "Q45"
by_area_per_degree = group_per_personal(degree)

count_per_degree = compute_group_personality_by_callback(by_area_per_degree, degree, pl.count)
mean_per_degree = compute_group_personality_by_callback(by_area_per_degree, degree, pl.mean)
std_per_degree = compute_group_personality_by_callback(by_area_per_degree, degree, pl.std)

# One frame of component scores per degree code 1-3.
bachelor, master, doctor = (
    by_area_per_degree.filter(pl.col(degree) == code).select(pl_components)
    for code in (1, 2, 3)
)
# NOTE(review): degree code 4 is kept out of the test (disabled line below);
# the reason is not visible in this notebook.
# postdoctor = by_area_per_degree.filter(pl.col(degree) == 4).select(pl_components)

# f_oneway works along axis 0: one F statistic and one p-value per component.
anova = stats.f_oneway(bachelor, master, doctor)

# Keep the result table in `anova_degree` and write it as a separate step:
# write_csv(path) returns None.
anova_degree = pl.DataFrame({
    "component": components,
    "f-value": anova[0],
    "p-value": anova[1],
})
anova_degree.write_csv("figure/5/D_4_research_ethics_score_anova.csv")

In [478]:
# 
# 5) Research experience
#

research_experience = "Q46"
by_area_per_research_experience = group_per_personal(research_experience)

count_per_research_experience = compute_group_personality_by_callback(by_area_per_research_experience, research_experience, pl.count)
mean_per_research_experience = compute_group_personality_by_callback(by_area_per_research_experience, research_experience, pl.mean)
std_per_research_experience = compute_group_personality_by_callback(by_area_per_research_experience, research_experience, pl.std)

# One frame of component scores per experience code 1-4.
experience_1, experience_2, experience_3, experience_4 = (
    by_area_per_research_experience.filter(pl.col(research_experience) == code).select(pl_components)
    for code in (1, 2, 3, 4)
)

# f_oneway works along axis 0: one F statistic and one p-value per component.
anova = stats.f_oneway(experience_1, experience_2, experience_3, experience_4)

# Keep the result table in `anova_research_experience` and write it as a
# separate step: write_csv(path) returns None.
anova_research_experience = pl.DataFrame({
    "component": components,
    "f-value": anova[0],
    "p-value": anova[1],
})
anova_research_experience.write_csv("figure/5/D_5_research_ethics_score_anova.csv")

In [479]:
# 
# 6) Research field
#

research_field = "Q47"
by_area_per_research_field = group_per_personal(research_field)

count_per_research_field = compute_group_personality_by_callback(by_area_per_research_field, research_field, pl.count)
mean_per_research_field = compute_group_personality_by_callback(by_area_per_research_field, research_field, pl.mean)
std_per_research_field = compute_group_personality_by_callback(by_area_per_research_field, research_field, pl.std)

# One frame of component scores per research-field code 1-7.
field_1, field_2, field_3, field_4, field_5, field_6, field_7 = (
    by_area_per_research_field.filter(pl.col(research_field) == code).select(pl_components)
    for code in (1, 2, 3, 4, 5, 6, 7)
)

# f_oneway works along axis 0: one F statistic and one p-value per component.
anova = stats.f_oneway(field_1, field_2, field_3, field_4, field_5, field_6, field_7)

# Keep the result table in `anova_research_field` and write it as a separate
# step: write_csv(path) returns None.
anova_research_field = pl.DataFrame({
    "component": components,
    "f-value": anova[0],
    "p-value": anova[1],
})
anova_research_field.write_csv("figure/5/D_6_research_ethics_score_anova.csv")

In [480]:
# 
# elastic net
# 
from attributes import *
# polars_ols is imported for its side effect: it registers the `least_squares`
# expression namespace used below. It is aliased `pls` so that the import does
# not overwrite `pl_components`, the column selector used by the group tests.
import polars_ols as pls  # noqa: F401

# Research-ethics awareness area, research-ethics education area,
# perception of the institution, demographic background variables
total = pl.read_csv("data/1_total.csv")

# by_area labels its score columns in Korean, while the regression target is
# addressed by its English name. Rename whichever Korean labels are present; a
# frame that already uses the English names passes through unchanged.
area_names_en = {
    "공정": "fairness",
    "정직": "honesty",
    "존중": "respect",
    "책임": "responsibility",
    "투명성": "transparency",
    "전체": "all",
}
by_area_en = by_area.rename(
    {ko: en for ko, en in area_names_en.items() if ko in by_area.columns}
)

target = total.join(by_area_en, on="ID", how="left")
print(target)

# Every column of `total` except the key is used as a regressor.
features = total.drop('ID').columns

# NOTE(review): the variable is called `expr_fairness` but the regressed column
# is "respect" - confirm which area is intended.
expr_fairness = pl.col("respect").least_squares.elastic_net(
    *features,
    mode="statistics",
    l1_ratio=0.5,
    alpha=0.1,
    max_iter=1000,
    tol=0.0001,
)

some = target.select(expr_fairness).unnest('statistics')
print(some)

# The statistics struct holds one list per field; row 0 is the single fit.
to_select = pl.DataFrame({
    "features": features,
    "coefficients": some['coefficients'].to_list()[0],
    "p_values": some['p_values'].to_list()[0],
    "t_values": some['t_values'].to_list()[0],
})

# Keep the features with a positive coefficient and p < 0.05.
selected = to_select.filter(
    (pl.col("coefficients") > 0) & (pl.col("p_values") < 0.05)
)

shape: (167, 124)
┌───────┬─────┬─────┬──────┬───┬──────────┬────────────────┬──────────────┬──────────┐
│ ID    ┆ Q1  ┆ Q2  ┆ Q3_1 ┆ … ┆ respect  ┆ responsibility ┆ transparency ┆ all      │
│ ---   ┆ --- ┆ --- ┆ ---  ┆   ┆ ---      ┆ ---            ┆ ---          ┆ ---      │
│ i64   ┆ i64 ┆ i64 ┆ i64  ┆   ┆ f64      ┆ f64            ┆ f64          ┆ f64      │
╞═══════╪═════╪═════╪══════╪═══╪══════════╪════════════════╪══════════════╪══════════╡
│ 14630 ┆ 4   ┆ 7   ┆ 4    ┆ … ┆ 3.5      ┆ 3.5            ┆ 3.0          ┆ 3.45     │
│ 14629 ┆ 4   ┆ 9   ┆ 3    ┆ … ┆ 4.75     ┆ 4.0            ┆ 4.0          ┆ 4.3      │
│ 14626 ┆ 5   ┆ 8   ┆ 4    ┆ … ┆ 3.75     ┆ 4.0            ┆ 4.0          ┆ 3.9      │
│ 14624 ┆ 5   ┆ 8   ┆ 5    ┆ … ┆ 5.0      ┆ 4.25           ┆ 5.0          ┆ 4.7      │
│ 14627 ┆ 5   ┆ 9   ┆ 4    ┆ … ┆ 4.5      ┆ 4.5            ┆ 4.25         ┆ 4.5      │
│ …     ┆ …   ┆ …   ┆ …    ┆ … ┆ …        ┆ …              ┆ …            ┆ …        │
│ 27015 ┆ 4   ┆ 8   ┆ 4  