In [222]:
import polars as pl
import altair as alt

# Read the research-ethics score CSV into a polars DataFrame.
research_ethics_score = pl.read_csv("data/5_1_research_ethics_score.csv")

In [223]:
#
# A. Descriptive statistics per item
#
# DataFrame.mean() / .std() aggregate every column of the frame, so the ID
# column is summarised together with the item columns.
average = research_ethics_score.mean()
print(average)

# NOTE(review): `standard_deviaiton` is a misspelling of "standard_deviation";
# the name is left unchanged here in case other cells refer to it.
standard_deviaiton = research_ethics_score.std()
print(standard_deviaiton)

shape: (1, 21)
┌──────────────┬──────────┬──────────┬──────────┬───┬──────────┬──────────┬──────────┬──────────┐
│ ID           ┆ Q38_1    ┆ Q38_2    ┆ Q38_3    ┆ … ┆ Q38_17   ┆ Q38_18   ┆ Q38_19   ┆ Q39_1    │
│ ---          ┆ ---      ┆ ---      ┆ ---      ┆   ┆ ---      ┆ ---      ┆ ---      ┆ ---      │
│ f64          ┆ f64      ┆ f64      ┆ f64      ┆   ┆ f64      ┆ f64      ┆ f64      ┆ f64      │
╞══════════════╪══════════╪══════════╪══════════╪═══╪══════════╪══════════╪══════════╪══════════╡
│ 19078.952096 ┆ 4.281437 ┆ 4.071856 ┆ 4.305389 ┆ … ┆ 4.281437 ┆ 4.365269 ┆ 4.275449 ┆ 4.132075 │
└──────────────┴──────────┴──────────┴──────────┴───┴──────────┴──────────┴──────────┴──────────┘
shape: (1, 21)
┌─────────────┬──────────┬──────────┬──────────┬───┬──────────┬──────────┬──────────┬─────────┐
│ ID          ┆ Q38_1    ┆ Q38_2    ┆ Q38_3    ┆ … ┆ Q38_17   ┆ Q38_18   ┆ Q38_19   ┆ Q39_1   │
│ ---         ┆ ---      ┆ ---      ┆ ---      ┆   ┆ ---      ┆ ---      ┆ ---      ┆ ---   

In [224]:
#
# B. Descriptive statistics per area
#
# *Q39_1: research ethics regarding animal experiments
#
# Each area score is the arithmetic mean of the items that belong to it.
fairness = (pl.col("Q38_1") + pl.col("Q38_2") + pl.col("Q38_3") + pl.col("Q38_4")) / 4
honesty = (pl.col("Q38_5") + pl.col("Q38_6") + pl.col("Q38_7") + pl.col("Q38_8")) / 4
# Q39_1 can be null: when it is, "respect" averages the three Q38 items only;
# otherwise Q39_1 is included as a fourth item.
respect = pl.when(pl.col("Q39_1").is_null()).then(
    (pl.col("Q38_9") + pl.col("Q38_10") + pl.col("Q38_11")) / 3
).otherwise(
    (pl.col("Q38_9") + pl.col("Q38_10") + pl.col("Q38_11") + pl.col("Q39_1")) / 4
)

responsibility = (pl.col("Q38_12") + pl.col("Q38_13") + pl.col("Q38_14") + pl.col("Q38_15")) / 4
transparency = (pl.col("Q38_16") + pl.col("Q38_17") + pl.col("Q38_18") + pl.col("Q38_19")) / 4
# Bound to `overall` rather than `all` so the builtin all() is not shadowed for
# the rest of the kernel session. The output column is still named "all".
overall = (fairness + honesty + respect + responsibility + transparency) / 5

by_area = research_ethics_score.select([
    pl.col("ID"),
    fairness.alias("fairness"),
    honesty.alias("honesty"),
    respect.alias("respect"),
    responsibility.alias("responsibility"),
    transparency.alias("transparency"),
    overall.alias("all")
])


# Column-wise mean and standard deviation of the per-respondent area scores
# (the ID column is aggregated as well).
by_area_mean = by_area.mean()
print(by_area_mean)

by_area_std = by_area.std()
print(by_area_std)

shape: (1, 7)
┌──────────────┬──────────┬──────────┬──────────┬────────────────┬──────────────┬──────────┐
│ ID           ┆ fairness ┆ honesty  ┆ respect  ┆ responsibility ┆ transparency ┆ all      │
│ ---          ┆ ---      ┆ ---      ┆ ---      ┆ ---            ┆ ---          ┆ ---      │
│ f64          ┆ f64      ┆ f64      ┆ f64      ┆ f64            ┆ f64          ┆ f64      │
╞══════════════╪══════════╪══════════╪══════════╪════════════════╪══════════════╪══════════╡
│ 19078.952096 ┆ 4.226048 ┆ 4.471557 ┆ 4.424651 ┆ 4.229042       ┆ 4.303892     ┆ 4.331038 │
└──────────────┴──────────┴──────────┴──────────┴────────────────┴──────────────┴──────────┘
shape: (1, 7)
┌─────────────┬──────────┬──────────┬──────────┬────────────────┬──────────────┬──────────┐
│ ID          ┆ fairness ┆ honesty  ┆ respect  ┆ responsibility ┆ transparency ┆ all      │
│ ---         ┆ ---      ┆ ---      ┆ ---      ┆ ---            ┆ ---          ┆ ---      │
│ f64         ┆ f64      ┆ f64      ┆ f64    

In [225]:
# Turn the one-row mean table into long form: "column" holds the area name and
# "column_0" holds its mean, which is what the chart encodes below.
# The ID mean is dropped first: it is an identifier, not a score, and at about
# 19079 it would dwarf the area means (roughly 4.2-4.5) on the shared y-axis.
tp_by_area = by_area_mean.drop("ID").transpose(include_header=True)

tp_by_area.plot.bar().encode(
    x="column", y="column_0"
).properties(
    title="영역별 기술 통계량",
).show()

In [226]:
# 
# 다. 판정결과
# 

def make_grade(column, excellent, great, good, bad):
    """Build a polars expression that grades `column` on a 5-to-1 scale.

    The thresholds are tested in the order given and the first one that
    `column` meets or exceeds decides the grade:

        column >= excellent -> 5
        column >= great     -> 4
        column >= good      -> 3
        column >= bad       -> 2
        anything else       -> 1

    Args:
        column: polars expression holding the score to grade.
        excellent, great, good, bad: cut-off values for grades 5, 4, 3 and 2.

    Returns:
        The chained when/then/otherwise expression.
    """
    graded = pl.when(column >= excellent).then(5)
    graded = graded.when(column >= great).then(4)
    graded = graded.when(column >= good).then(3)
    graded = graded.when(column >= bad).then(2)
    return graded.otherwise(1)

# Grade expressions per area; each area has its own set of cut-offs.
grading_all = make_grade(pl.col("all"), excellent=4.70, great=3.85, good=3.40, bad=2.95)
grading_fairness = make_grade(pl.col("fairness"), excellent=5.0, great=3.75, good=3.0, bad=2.50)
grading_honesty = make_grade(pl.col("honesty"), excellent=5.0, great=4.0, good=3.5, bad=3.25)
grading_respect = make_grade(pl.col("respect"), excellent=5.0, great=3.75, good=3.0, bad=2.25)
grading_responsibility = make_grade(pl.col("responsibility"), excellent=5.0, great=3.75, good=3.25, bad=2.75)
grading_transparency = make_grade(pl.col("transparency"), excellent=5.0, great=3.75, good=3.25, bad=2.75)

# Evaluate the grade expressions against the area scores; every output column
# carries a "g_" prefix.
grading = by_area.select(
    g_all=grading_all,
    g_fairness=grading_fairness,
    g_honesty=grading_honesty,
    g_respect=grading_respect,
    g_responsibility=grading_responsibility,
    g_transparency=grading_transparency,
)

print(grading)

shape: (167, 6)
┌───────┬────────────┬───────────┬───────────┬──────────────────┬────────────────┐
│ g_all ┆ g_fairness ┆ g_honesty ┆ g_respect ┆ g_responsibility ┆ g_transparency │
│ ---   ┆ ---        ┆ ---       ┆ ---       ┆ ---              ┆ ---            │
│ i32   ┆ i32        ┆ i32       ┆ i32       ┆ i32              ┆ i32            │
╞═══════╪════════════╪═══════════╪═══════════╪══════════════════╪════════════════╡
│ 3     ┆ 4          ┆ 3         ┆ 3         ┆ 3                ┆ 2              │
│ 4     ┆ 4          ┆ 4         ┆ 4         ┆ 4                ┆ 4              │
│ 4     ┆ 4          ┆ 4         ┆ 4         ┆ 4                ┆ 4              │
│ 5     ┆ 4          ┆ 5         ┆ 5         ┆ 4                ┆ 5              │
│ 4     ┆ 4          ┆ 4         ┆ 4         ┆ 4                ┆ 4              │
│ …     ┆ …          ┆ …         ┆ …         ┆ …                ┆ …              │
│ 3     ┆ 4          ┆ 3         ┆ 3         ┆ 3                ┆ 4    

In [227]:
# Tally the number of rows per grade, one table per area. In every table the
# grade itself is renamed to "count" so the tables share a join key.
g_all_count = (
    grading.group_by("g_all")
    .agg(pl.col("g_all").count().alias("all"))
    .rename({"g_all": "count"})
)
g_fairness_count = (
    grading.group_by("g_fairness")
    .agg(pl.col("g_fairness").count().alias("fairness_count"))
    .rename({"g_fairness": "count"})
)
g_honesty_count = (
    grading.group_by("g_honesty")
    .agg(pl.col("g_honesty").count().alias("honesty_count"))
    .rename({"g_honesty": "count"})
)
g_respect_count = (
    grading.group_by("g_respect")
    .agg(pl.col("g_respect").count().alias("respect_count"))
    .rename({"g_respect": "count"})
)
g_responsibility_count = (
    grading.group_by("g_responsibility")
    .agg(pl.col("g_responsibility").count().alias("responsibility_count"))
    .rename({"g_responsibility": "count"})
)

# g_all_count.join(g_fairness_count, on="count", how="inner")
def check_zero_and_concat(dataframe, column_name, basis=5):
    """Pad a per-grade tally so that every grade 1..basis has a row.

    `dataframe` is expected to hold the grade in a "count" column and the
    number of rows with that grade in `column_name`. Any grade in
    ``1..basis`` that has no row gets one appended with a tally of 0, so that
    a later inner join on "count" does not lose grades.

    Previously a single grade-1 row was appended whenever the table had fewer
    than `basis` rows. That duplicated grade 1 if a different grade was the
    one missing, and could not fill more than one gap. The input frame is also
    no longer modified in place.

    Args:
        dataframe: tally table with columns "count" and `column_name`.
        column_name: name of the tally column in `dataframe`.
        basis: highest grade; grades are taken to run from 1 to `basis`.

    Returns:
        `dataframe` itself when no grade is missing, otherwise a new frame
        with the zero-tally rows stacked underneath.
    """
    present = set(dataframe.get_column("count").to_list())
    missing = [grade for grade in range(1, basis + 1) if grade not in present]
    if not missing:
        return dataframe

    # Cast to the dtypes the tally tables use (Int32 grade, UInt32 tally) so
    # the filler rows can be stacked onto the existing frame.
    filler = pl.DataFrame(
        {"count": missing, column_name: [0] * len(missing)}
    ).with_columns(
        pl.col("count").cast(pl.Int32),
        pl.col(column_name).cast(pl.UInt32),
    )
    return dataframe.vstack(filler)

# The overall tally keeps its numbers in a column named "all" (not "g_all"),
# so "all" is the name the zero-tally filler must use to match that table's
# schema. Passing "g_all" made the append fail whenever a row had to be added.
g_all_count = check_zero_and_concat(g_all_count, "all")
g_fairness_count = check_zero_and_concat(g_fairness_count, "fairness_count")
g_honesty_count = check_zero_and_concat(g_honesty_count, "honesty_count")
g_respect_count = check_zero_and_concat(g_respect_count, "respect_count")
g_responsibility_count = check_zero_and_concat(g_responsibility_count, "responsibility_count")

# Inner-join the per-area tallies on the shared grade key and order by grade.
# NOTE(review): no tally is built for g_transparency, so it is absent here.
g_agg = (
    g_all_count
    .join(g_fairness_count, on="count", how="inner")
    .join(g_honesty_count, on="count", how="inner")
    .join(g_respect_count, on="count", how="inner")
    .join(g_responsibility_count, on="count", how="inner")
    .sort("count")
)

# NOTE(review): g_agg is built above but `grading` is what gets printed —
# confirm whether g_agg was meant to be shown instead.
print(grading)

shape: (167, 6)
┌───────┬────────────┬───────────┬───────────┬──────────────────┬────────────────┐
│ g_all ┆ g_fairness ┆ g_honesty ┆ g_respect ┆ g_responsibility ┆ g_transparency │
│ ---   ┆ ---        ┆ ---       ┆ ---       ┆ ---              ┆ ---            │
│ i32   ┆ i32        ┆ i32       ┆ i32       ┆ i32              ┆ i32            │
╞═══════╪════════════╪═══════════╪═══════════╪══════════════════╪════════════════╡
│ 3     ┆ 4          ┆ 3         ┆ 3         ┆ 3                ┆ 2              │
│ 4     ┆ 4          ┆ 4         ┆ 4         ┆ 4                ┆ 4              │
│ 4     ┆ 4          ┆ 4         ┆ 4         ┆ 4                ┆ 4              │
│ 5     ┆ 4          ┆ 5         ┆ 5         ┆ 4                ┆ 5              │
│ 4     ┆ 4          ┆ 4         ┆ 4         ┆ 4                ┆ 4              │
│ …     ┆ …          ┆ …         ┆ …         ┆ …                ┆ …              │
│ 3     ┆ 4          ┆ 3         ┆ 3         ┆ 3                ┆ 4    

In [285]:
# 
# D. Tests of differences between respondent groups
# 
from scipy import stats
# Personal-information table; group_per_personal() joins it to the area scores on "ID".
personal_info = pl.read_csv("data/6_personal_info.csv")

def group_per_personal(column):
    """Attach one personal-information column to the per-area scores.

    Takes "ID" and `column` from the notebook-level `personal_info` frame and
    inner-joins them onto the notebook-level `by_area` frame on "ID".

    Args:
        column: name of the `personal_info` column to attach.

    Returns:
        `by_area` with `column` added, restricted to IDs present in both frames.
    """
    lookup = personal_info.select("ID", column)
    return by_area.join(lookup, on="ID", how="inner")

def compute_group_personality_by_callback(by_area_per_personality, column, callback):
    """Aggregate the six score columns per group with a caller-supplied function.

    Args:
        by_area_per_personality: frame holding the score columns plus `column`.
        column: name of the grouping column.
        callback: callable that takes a column name and returns a polars
            aggregation expression (the notebook passes pl.count, pl.mean
            and pl.std).

    Returns:
        One row per distinct value of `column`, sorted by `column`, with the
        aggregated "fairness", "honesty", "respect", "responsibility",
        "transparency" and "all" columns.
    """
    score_names = ("fairness", "honesty", "respect", "responsibility", "transparency", "all")
    aggregations = [callback(score_name) for score_name in score_names]
    grouped = by_area_per_personality.group_by([column])
    return grouped.agg(aggregations).sort(column)

# Score columns that are compared between groups
components = ["fairness", "honesty", "respect", "responsibility", "transparency", "all"]
pl_components = pl.col(components)


# 1) Sex
sex = "Q42"
by_area_per_sex = group_per_personal(sex)

# Per-group count, mean and standard deviation of every score column.
count_per_sex, mean_per_sex, std_per_sex = (
    compute_group_personality_by_callback(by_area_per_sex, sex, aggregate)
    for aggregate in (pl.count, pl.mean, pl.std)
)

# Rows coded 1 and 2 in the sex column form the two samples.
male, female = (
    by_area_per_sex.filter(pl.col(sex) == code).select(pl_components)
    for code in (1, 2)
)

# Independent two-sample t-test between the two groups.
ttest = stats.ttest_ind(male, female)
ttest_sex = pl.DataFrame({
    "component": components,
    "t-value": ttest.statistic,
    "p-value": ttest.pvalue,
})

In [287]:
# 2) Age
age = "Q43"
by_area_per_age = group_per_personal(age)

# Per-group count, mean and standard deviation of every score column.
count_per_age, mean_per_age, std_per_age = (
    compute_group_personality_by_callback(by_area_per_age, age, aggregate)
    for aggregate in (pl.count, pl.mean, pl.std)
)

# One frame of component scores for each age code 1 through 5.
age_20, age_30, age_40, age_50, age_60 = (
    by_area_per_age.filter(pl.col(age) == code).select(pl_components)
    for code in range(1, 6)
)

# One-way ANOVA across the five age groups.
anova = stats.f_oneway(age_20, age_30, age_40, age_50, age_60)
anova_age = pl.DataFrame({
    "component": components,
    "f-value": anova.statistic,
    "p-value": anova.pvalue,
})
print(anova_age)

shape: (6, 3)
┌────────────────┬──────────┬──────────┐
│ component      ┆ f-value  ┆ p-value  │
│ ---            ┆ ---      ┆ ---      │
│ str            ┆ f64      ┆ f64      │
╞════════════════╪══════════╪══════════╡
│ fairness       ┆ 0.483742 ┆ 0.747652 │
│ honesty        ┆ 0.309263 ┆ 0.871481 │
│ respect        ┆ 0.037862 ┆ 0.997243 │
│ responsibility ┆ 0.555954 ┆ 0.694954 │
│ transparency   ┆ 0.615545 ┆ 0.65205  │
│ all            ┆ 0.087149 ┆ 0.986335 │
└────────────────┴──────────┴──────────┘


In [289]:
#
# 3) Position
#
role = "Q44"
by_area_per_role = group_per_personal(role)

# Per-group count, mean and standard deviation of every score column.
count_per_role, mean_per_role, std_per_role = (
    compute_group_personality_by_callback(by_area_per_role, role, aggregate)
    for aggregate in (pl.count, pl.mean, pl.std)
)

# One frame of component scores for each position code 1 through 5.
researcher, professor, manager, director, executive = (
    by_area_per_role.filter(pl.col(role) == code).select(pl_components)
    for code in range(1, 6)
)

# One-way ANOVA across the five position groups.
anova = stats.f_oneway(researcher, professor, manager, director, executive)
anova_role = pl.DataFrame({
    "component": components,
    "f-value": anova.statistic,
    "p-value": anova.pvalue,
})
print(anova_role)

shape: (6, 3)
┌────────────────┬──────────┬──────────┐
│ component      ┆ f-value  ┆ p-value  │
│ ---            ┆ ---      ┆ ---      │
│ str            ┆ f64      ┆ f64      │
╞════════════════╪══════════╪══════════╡
│ fairness       ┆ 1.686617 ┆ 0.15593  │
│ honesty        ┆ 1.010214 ┆ 0.404092 │
│ respect        ┆ 0.849843 ┆ 0.495746 │
│ responsibility ┆ 0.813332 ┆ 0.518483 │
│ transparency   ┆ 0.669397 ┆ 0.614183 │
│ all            ┆ 0.940824 ┆ 0.442077 │
└────────────────┴──────────┴──────────┘


In [292]:
#
# 4) Degree
#
degree = "Q45"
by_area_per_degree = group_per_personal(degree)

# Per-group count, mean and standard deviation of every score column.
count_per_degree, mean_per_degree, std_per_degree = (
    compute_group_personality_by_callback(by_area_per_degree, degree, aggregate)
    for aggregate in (pl.count, pl.mean, pl.std)
)

# One frame of component scores for each degree code 1 through 3.
# Degree code 4 is not included in the test.
bachelor, master, doctor = (
    by_area_per_degree.filter(pl.col(degree) == code).select(pl_components)
    for code in range(1, 4)
)

# One-way ANOVA across the three degree groups.
anova = stats.f_oneway(bachelor, master, doctor)
anova_degree = pl.DataFrame({
    "component": components,
    "f-value": anova.statistic,
    "p-value": anova.pvalue,
})
print(anova_degree)

shape: (6, 3)
┌────────────────┬──────────┬──────────┐
│ component      ┆ f-value  ┆ p-value  │
│ ---            ┆ ---      ┆ ---      │
│ str            ┆ f64      ┆ f64      │
╞════════════════╪══════════╪══════════╡
│ fairness       ┆ 1.076964 ┆ 0.343025 │
│ honesty        ┆ 1.485125 ┆ 0.229504 │
│ respect        ┆ 1.6164   ┆ 0.20176  │
│ responsibility ┆ 0.149648 ┆ 0.861128 │
│ transparency   ┆ 1.128388 ┆ 0.326053 │
│ all            ┆ 0.917041 ┆ 0.40174  │
└────────────────┴──────────┴──────────┘


In [294]:
#
# 5) Research experience
#

research_experience = "Q46"
by_area_per_research_experience = group_per_personal(research_experience)

# Per-group count, mean and standard deviation of every score column.
(
    count_per_research_experience,
    mean_per_research_experience,
    std_per_research_experience,
) = (
    compute_group_personality_by_callback(
        by_area_per_research_experience, research_experience, aggregate
    )
    for aggregate in (pl.count, pl.mean, pl.std)
)

# One frame of component scores for each experience code 1 through 4.
experience_1, experience_2, experience_3, experience_4 = (
    by_area_per_research_experience
    .filter(pl.col(research_experience) == code)
    .select(pl_components)
    for code in range(1, 5)
)

# One-way ANOVA across the four experience groups.
anova = stats.f_oneway(experience_1, experience_2, experience_3, experience_4)
anova_research_experience = pl.DataFrame({
    "component": components,
    "f-value": anova.statistic,
    "p-value": anova.pvalue,
})
print(anova_research_experience)

shape: (6, 3)
┌────────────────┬──────────┬──────────┐
│ component      ┆ f-value  ┆ p-value  │
│ ---            ┆ ---      ┆ ---      │
│ str            ┆ f64      ┆ f64      │
╞════════════════╪══════════╪══════════╡
│ fairness       ┆ 0.106364 ┆ 0.956271 │
│ honesty        ┆ 0.640982 ┆ 0.589698 │
│ respect        ┆ 0.660098 ┆ 0.577742 │
│ responsibility ┆ 0.451089 ┆ 0.716871 │
│ transparency   ┆ 0.799991 ┆ 0.495544 │
│ all            ┆ 0.119929 ┆ 0.948278 │
└────────────────┴──────────┴──────────┘


In [296]:
#
# 6) Research field
#

research_field = "Q47"
by_area_per_research_field = group_per_personal(research_field)

# Per-group count, mean and standard deviation of every score column.
(
    count_per_research_field,
    mean_per_research_field,
    std_per_research_field,
) = (
    compute_group_personality_by_callback(
        by_area_per_research_field, research_field, aggregate
    )
    for aggregate in (pl.count, pl.mean, pl.std)
)

# One frame of component scores for each field code 1 through 7.
field_1, field_2, field_3, field_4, field_5, field_6, field_7 = (
    by_area_per_research_field
    .filter(pl.col(research_field) == code)
    .select(pl_components)
    for code in range(1, 8)
)

# One-way ANOVA across the seven field groups.
anova = stats.f_oneway(field_1, field_2, field_3, field_4, field_5, field_6, field_7)
anova_research_field = pl.DataFrame({
    "component": components,
    "f-value": anova.statistic,
    "p-value": anova.pvalue,
})
print(anova_research_field)

shape: (6, 3)
┌────────────────┬──────────┬──────────┐
│ component      ┆ f-value  ┆ p-value  │
│ ---            ┆ ---      ┆ ---      │
│ str            ┆ f64      ┆ f64      │
╞════════════════╪══════════╪══════════╡
│ fairness       ┆ 1.389076 ┆ 0.222054 │
│ honesty        ┆ 1.487784 ┆ 0.185486 │
│ respect        ┆ 2.269621 ┆ 0.039515 │
│ responsibility ┆ 1.476822 ┆ 0.189273 │
│ transparency   ┆ 2.315757 ┆ 0.035899 │
│ all            ┆ 2.070589 ┆ 0.059499 │
└────────────────┴──────────┴──────────┘


In [403]:
# 
# elastic net
# 
from attributes import *
# polars_ols is imported for the `least_squares` expression namespace used
# below. It is bound to `pls` rather than `pl_components`, because that name
# already holds the `pl.col(components)` expression defined in an earlier cell;
# rebinding it to a module broke every `.select(pl_components)` on re-run.
import polars_ols as pls

# Research-ethics awareness domain, research-ethics education domain,
# perception of the institution, demographic background variables
total = pl.read_csv("data/1_total.csv")


# Left join keeps every row of `total` and adds the per-area scores by ID.
target = total.join(by_area, on="ID", how="left")
print(target)

# Every column of `total` except the identifier is used as a predictor.
features = total.drop('ID').columns

# NOTE(review): despite its name, `expr_fairness` regresses the "respect"
# column — confirm the intended target.
expr_fairness = pl.col("respect").least_squares.elastic_net(
    *features,
    mode="statistics",
    l1_ratio=0.5,
    alpha=0.1,
    max_iter=1000,
    tol=0.0001,
)

some = target.select(expr_fairness).unnest('statistics')

# The statistics come back as one row of list-valued columns; take the first
# (only) row of each and line the values up with the feature names.
to_select = pl.DataFrame({
    "features": features,
    "coefficients": some['coefficients'].to_list()[0],
    "p_values": some['p_values'].to_list()[0],
    "t_values": some['t_values'].to_list()[0],
})

# Keep features with a positive coefficient and p-value below 0.05.
selected = to_select.filter(
    ((pl.col("coefficients") > 0) & (pl.col("p_values") < 0.05)).alias("selected")
)

shape: (167, 124)
┌───────┬─────┬─────┬──────┬───┬──────────┬────────────────┬──────────────┬──────────┐
│ ID    ┆ Q1  ┆ Q2  ┆ Q3_1 ┆ … ┆ respect  ┆ responsibility ┆ transparency ┆ all      │
│ ---   ┆ --- ┆ --- ┆ ---  ┆   ┆ ---      ┆ ---            ┆ ---          ┆ ---      │
│ i64   ┆ i64 ┆ i64 ┆ i64  ┆   ┆ f64      ┆ f64            ┆ f64          ┆ f64      │
╞═══════╪═════╪═════╪══════╪═══╪══════════╪════════════════╪══════════════╪══════════╡
│ 14630 ┆ 4   ┆ 7   ┆ 4    ┆ … ┆ 3.5      ┆ 3.5            ┆ 3.0          ┆ 3.45     │
│ 14629 ┆ 4   ┆ 9   ┆ 3    ┆ … ┆ 4.75     ┆ 4.0            ┆ 4.0          ┆ 4.3      │
│ 14626 ┆ 5   ┆ 8   ┆ 4    ┆ … ┆ 3.75     ┆ 4.0            ┆ 4.0          ┆ 3.9      │
│ 14624 ┆ 5   ┆ 8   ┆ 5    ┆ … ┆ 5.0      ┆ 4.25           ┆ 5.0          ┆ 4.7      │
│ 14627 ┆ 5   ┆ 9   ┆ 4    ┆ … ┆ 4.5      ┆ 4.5            ┆ 4.25         ┆ 4.5      │
│ …     ┆ …   ┆ …   ┆ …    ┆ … ┆ …        ┆ …              ┆ …            ┆ …        │
│ 27015 ┆ 4   ┆ 8   ┆ 4  