In [331]:
from attributes import *
import polars as pl
import altair as alt

# Load data/2_research_ethic_recognition.csv into a polars DataFrame.
# Later cells read the columns Q1, Q2, Q3_1..Q3_6, Q5 and ID from this frame.
df = pl.read_csv("data/2_research_ethic_recognition.csv")

In [332]:
def draw_bar_plot(df, x, y, title, MAP, rotate=True, legend=True):
    """Build a bar chart with the y value printed above every bar.

    Parameters
    ----------
    df
        Frame exposing ``df.plot.bar()`` that returns an Altair chart
        (the notebook passes polars DataFrames).
    x, y
        Altair encoding shorthands for the category and the value channel,
        e.g. ``"Q1:N"`` and ``"count:Q"``.
    title
        Chart title; also set as the title of the (hidden) x axis of the bars.
    MAP
        Optional mapping whose values, in insertion order, fix the order of
        the categories for the bars, the labels and the colours. ``None``
        leaves the ordering to Altair.
    rotate
        Currently unused; kept so that existing callers keep working.
    legend
        When true a horizontal legend is placed at the bottom, otherwise the
        colour legend is disabled.

    Returns
    -------
    The layered chart ``bar + text``.
    """
    # Materialise the category order once as a plain list. A ``dict_values``
    # view is not a JSON array and some Altair releases may reject it as a
    # ``sort`` value; a list is always accepted.
    order = None if MAP is None else list(MAP.values())

    legend_spec = alt.Legend(orient='bottom', direction='horizontal') if legend else None

    bar_x = alt.X(x, title=title, axis=None)
    txt_x = alt.X(x)
    color = alt.Color(x, title=None, legend=legend_spec)
    if order is not None:
        # Apply the same explicit order to all three channels so that bars,
        # value labels and legend entries line up.
        bar_x = bar_x.sort(order)
        txt_x = alt.X(x, sort=order)
        color = color.sort(order)

    bar = df.plot.bar().encode(
        x=bar_x,
        y=alt.Y(y, title=None),
        color=color,
    ).properties(
        title=title,
        width=400,
    )

    # The label layer is derived from the bar layer, so it shares its data and
    # properties; only the mark and the encodings below are overridden.
    text = bar.mark_text(
        align="center",
        baseline="bottom",
    ).encode(
        x=txt_x,
        y=y,
        text=y,
        color=alt.value("black"),
    )
    return bar + text

In [338]:
#
# A. Researchers' recognition of research ethics and of the importance of practising it (Q1)
# Research ethic recognition
#
title = "연구윤리 인식 및 실천의 중요성 인식"

# 1) Count each Q1 answer. The cast to Int64 gives `count` the same dtype as
#    the zero-count row built below, which `extend` requires.
rer_value_counts = df.select(pl.col("Q1").value_counts().cast(pl.Int64)).unnest("Q1")

# 2) Make sure answer 1 is present, with a count of zero, so that it still
#    shows up in the table and in the plot. The row is only appended when no
#    respondent chose 1; appending it unconditionally would duplicate the
#    category as soon as the data contains that answer.
add_null = pl.DataFrame({
    "Q1": [1],
    "count": [0]
})
if 1 not in rer_value_counts["Q1"].to_list():
    rer_value_counts = rer_value_counts.extend(add_null)
rer_value_counts = rer_value_counts.sort("Q1")

# 3) Replace the numeric answers by their labels (every answer must be a key
#    of MAP_IMPORTANCE, otherwise replace_strict raises).
rer_value_counts = rer_value_counts.with_columns(
    Q1 = pl.col("Q1").replace_strict(MAP_IMPORTANCE),
)

# 4) Share of each answer in percent of all answers.
rer_value_counts = rer_value_counts.with_columns(
    pct = pl.col("count") / pl.col("count").sum() * 100
)
print(rer_value_counts)
rer_value_counts.write_csv("figure/2/가_연구윤리_인식_및_실천의_중요성_인식.csv")

#
# plot
#
bar = draw_bar_plot(rer_value_counts, "Q1:N", "count:Q", title, MAP_IMPORTANCE)
bar.show()
bar.save("figure/2/가_연구윤리_인식_및_실천의_중요성_인식.svg")

shape: (5, 3)
┌────────────────────┬───────┬───────────┐
│ Q1                 ┆ count ┆ pct       │
│ ---                ┆ ---   ┆ ---       │
│ str                ┆ i64   ┆ f64       │
╞════════════════════╪═══════╪═══════════╡
│ 매우 중요하지 않다 ┆ 0     ┆ 0.0       │
│ 중요하지 않다      ┆ 1     ┆ 0.598802  │
│ 보통이다           ┆ 2     ┆ 1.197605  │
│ 중요하다           ┆ 52    ┆ 31.137725 │
│ 매우 중요하다      ┆ 112   ┆ 67.065868 │
└────────────────────┴───────┴───────────┘


In [339]:
#
# B. Researchers' level of research-ethics compliance (Q2)
# Research ethic compliance
#
title = "연구윤리 준수 수준"
q2 = pl.col("Q2")

# Frequency of every Q2 answer, most frequent first (sort=True).
# NOTE(review): "Compilance" is presumably a misspelling of "Compliance";
# the name is kept because other cells may refer to it.
Compilance = df.select(q2.value_counts(sort=True)).unnest("Q2")

# Summary table: mean, standard deviation and number of non-null answers.
degree = df.select(mean=q2.mean(), std=q2.std(), count=q2.count())

print(degree)
degree.write_csv("figure/2/나_연구윤리_준수_수준.csv")

# Bar chart of the answer frequencies: no category map, no legend.
bar = draw_bar_plot(Compilance, "Q2:N", "count:Q", title, None, legend=False)
bar.show()
bar.save("figure/2/나_연구윤리_준수_수준.svg")

shape: (1, 3)
┌──────────┬──────────┬───────┐
│ mean     ┆ std      ┆ count │
│ ---      ┆ ---      ┆ ---   │
│ f64      ┆ f64      ┆ u32   │
╞══════════╪══════════╪═══════╡
│ 8.497006 ┆ 1.307413 ┆ 167   │
└──────────┴──────────┴───────┘


In [335]:
#
# C. Factors influencing researchers' compliance with research ethics (Q3_1..Q3_6)
# Research Ethic Compliance Influencing Factors
# 1) raw data processing
#
def _q3_level_counts(question, count_name):
    """Count how often each answer occurs in column `question` of `df`.

    Returns a frame with the answer in column "v" and its frequency in a
    column named `count_name`.
    """
    answers = pl.col(question).alias('v')
    return df.select(answers.value_counts()).unnest("v").rename({"count": count_name})


# NOTE(review): "eduction" looks like a typo for "education"; it is kept
# because it is the name of an output column.
research_ethic_education = _q3_level_counts("Q3_1", "eduction")
research_ethic_broadcast = _q3_level_counts("Q3_2", "broadcast")
communication_mentoring = _q3_level_counts("Q3_3", "mentoring")
regulation_guideline = _q3_level_counts("Q3_4", "regulation")
fraud_verification_sanctions = _q3_level_counts("Q3_5", "fraud")
paper_conference_material = _q3_level_counts("Q3_6", "paper")

# Line the six count columns up on the answer level "v", order the rows by
# level and turn the level into a string so it can serve as a column name.
# NOTE(review): these are left joins anchored on Q3_1, so a level that never
# occurs in Q3_1 is absent from the result.
joined_df = (
    research_ethic_education
    .join(research_ethic_broadcast, on="v", how="left")
    .join(communication_mentoring, on="v", how="left", suffix="k")
    .join(regulation_guideline, on="v", how="left")
    .join(fraud_verification_sanctions, on="v", how="left")
    .join(paper_conference_material, on="v", how="left")
    .sort(by="v")
    .with_columns(pl.col("v").cast(pl.String))
)

# 2) Transpose: every count column becomes a row and the answer levels in "v"
#    become the column names.
ResearchEthicComplianceInfluencingFactors = joined_df.transpose(include_header=True, column_names="v")

# Raw Q3 answers, one column per factor. Despite its name, `means` holds the
# unaggregated answers at this point.
means = df.select(
    "Q3_1", "Q3_2", "Q3_3", "Q3_4", "Q3_5", "Q3_6",
)


def _count_per_level(frame, question, label):
    """Number of rows of `frame` per answer in `question`.

    The answer column is renamed to "중요도" and the count column to `label`.
    """
    return frame.group_by(pl.col(question)).len().rename({question: "중요도", "len": label})


q3_1 = _count_per_level(means, "Q3_1", "연구윤리 교육")
q3_2 = _count_per_level(means, "Q3_2", "연구윤리 관련 보도")
q3_3 = _count_per_level(means, "Q3_3", "연구자 간 대화 및 멘토링")
q3_4 = _count_per_level(means, "Q3_4", "관련 규정 및 지침")
q3_5 = _count_per_level(means, "Q3_5", "부정행위 검증 및 제재")
q3_6 = _count_per_level(means, "Q3_6", "학술지 및 학술 컨퍼런스 자료")

# Combine the six tables on the answer level. A full join with a coalesced key
# keeps every level that occurs in any of the questions; a left join anchored
# on Q3_1 would silently drop levels that nobody chose for Q3_1.
q3 = q3_1
for other in (q3_2, q3_3, q3_4, q3_5, q3_6):
    q3 = q3.join(other, on="중요도", how="full", coalesce=True)

# Levels missing for a question have a count of zero; order the rows by level.
q3 = q3.fill_null(0).sort(by="중요도")


# print(q3)

# q3_sum = q3.drop("중요도").select(pl.all().sum()).transpose(include_header=True).rename({"column_0": "합계"})
# q3_mean = q3.drop("중요도").select(pl.all().mean()).transpose(include_header=True).rename({"column_0": "평균"})
# q3_std = q3.drop("중요도").select(pl.all().std()).transpose(include_header=True).rename({"column_0": "표준편차"})

# print(q3_mean)

# q3 = q3.transpose(include_header=True, column_names="중요도")
# q3 = q3.join(q3_sum, on="column", how="left") \
#     .join(q3_mean, on="column", how="left") \
#     .join(q3_std, on="column", how="left")
# q3.write_csv("figure/2/다_연구자의_연구윤리_준수에_미치는_영향_요인.csv")
# print(q3)


# Average score per factor. `means` arrives here holding the raw Q3 answers
# (one column per factor, one row per input row). Transposing that frame
# directly yields one column per input row, so the chart's "column_0" would be
# the first row's answers instead of the mean announced by the axis title.
# Aggregate first, then transpose into the two columns "column" (factor name)
# and "column_0" (mean).
means = means.select(pl.all().mean()).transpose(include_header=True)
means.write_csv("figure/2/C_research_ethic_compliance_influencing_factors.csv")

means.plot.bar().encode(
    alt.X("column:N", title="평균"),
    y="column_0:Q",
    color=alt.Color("column:N", legend=None),
).properties(
    title="연구자의 연구윤리 준수에 미치는 영향 요인",
    width=400,
).save("figure/2/C_research_ethic_compliance_influencing_factors.png")

print(q3)

print(means)

# NOTE(review): this chart is built from `df`, but "연구윤리 교육" is a column of
# `q3` as far as this cell shows, and "sum():Q" names no field to sum. It looks
# like an unfinished experiment -- confirm the intent before relying on it.
df.plot.bar().encode(
    x="sum():Q",
    y="연구윤리 교육:N",
)


shape: (5, 7)
┌────────┬──────────┬───────────┬───────────┬──────────────┬──────────────────┬────────────────┐
│ 중요도 ┆ 연구윤리 ┆ 연구윤리  ┆ 연구자 간 ┆ 관련 규정 및 ┆ 부정행위 검증 및 ┆ 학술지 및 학술 │
│ ---    ┆ 교육     ┆ 관련 보도 ┆ 대화 및   ┆ 지침         ┆ 제재             ┆ 컨퍼런스 자료  │
│ i64    ┆ ---      ┆ ---       ┆ 멘토링    ┆ ---          ┆ ---              ┆ ---            │
│        ┆ u32      ┆ u32       ┆ ---       ┆ u32          ┆ u32              ┆ u32            │
│        ┆          ┆           ┆ u32       ┆              ┆                  ┆                │
╞════════╪══════════╪═══════════╪═══════════╪══════════════╪══════════════════╪════════════════╡
│ 1      ┆ 2        ┆ 5         ┆ 0         ┆ 0            ┆ 0                ┆ 2              │
│ 2      ┆ 4        ┆ 5         ┆ 2         ┆ 5            ┆ 1                ┆ 4              │
│ 3      ┆ 22       ┆ 34        ┆ 11        ┆ 14           ┆ 14               ┆ 30             │
│ 4      ┆ 88       ┆ 86        ┆ 76        ┆ 77           ┆ 93        

In [341]:
#
# D. Degree of KIOST's own efforts to establish research ethics (Q5)
# Effort for KIOST Research Ethic Establishment
#
effort_value_counts = (
    df.select(pl.col("Q5").value_counts())
    .unnest("Q5")
    # value_counts() without sort=True returns its rows in no fixed order;
    # sort by the raw answer code so the exported CSV is identical from run
    # to run.
    .sort("Q5")
)

# Share of each answer in percent of all answers.
effort_value_counts = effort_value_counts.with_columns(
    pct = pl.col("count") / pl.col("count").sum() * 100
)

# Replace the numeric answers by their labels (every answer must be a key of
# MAP_GOOD, otherwise replace_strict raises).
effort_value_counts = effort_value_counts.with_columns(
    Q5 = pl.col("Q5").replace_strict(MAP_GOOD),
)

effort_value_counts.write_csv("figure/2/라_KIOST_연구윤리_확립을_위한_자체_노력의_정도.csv")

bar = draw_bar_plot(effort_value_counts, "Q5:N", "count:Q", "KIOST 연구윤리 확립을 위한 자체 노력의 정도", MAP_GOOD)
bar.save("figure/2/라_KIOST_연구윤리_확립을_위한_자체_노력의_정도.svg")

In [337]:
from scipy import stats

#
# Chi-square test
#
# Read data/6_personal_info.csv and attach its columns to the survey answers.
# join() is called without `how`, i.e. with polars' default (inner) join, so
# only IDs present in both frames are kept.
personal_info = pl.read_csv("data/6_personal_info.csv")
recognition_stats = df.join(personal_info, on="ID")

def run_chi2_test(target_name):
    """Chi-square test of one answer column against every MAP_PERSON column.

    For each ``key -> column`` pair in ``MAP_PERSON`` the answers in
    ``target_name`` are cross-tabulated against ``column`` of
    ``recognition_stats`` (one row per value of ``column``, one column per
    answer, cells holding the number of rows) and the table is passed to
    ``stats.chi2_contingency``.

    Parameters
    ----------
    target_name : str
        Name of the answer column in ``recognition_stats``, e.g. ``"Q1"``.

    Returns
    -------
    dict
        Maps every key of ``MAP_PERSON`` to the value returned by
        ``stats.chi2_contingency`` for that column.
    """
    chi_analysis = {}
    for key, value in MAP_PERSON.items():
        contingency = (
            recognition_stats.pivot(
                target_name,
                index=value,
                values=target_name,
                # 'len' is the current name of the row-count aggregation;
                # the older spelling 'count' is deprecated in polars.
                aggregate_function='len',
                sort_columns=True,
            )
            .fill_null(0)   # combinations that never occur count as zero
            .drop(value)    # keep only the count columns for the test
        )
        chi_analysis[key] = stats.chi2_contingency(contingency)
    return chi_analysis

# Run the chi-square tests for every analysed question, in the same order as
# the sections above: Q1, Q2, the six Q3 factors, then Q5.
chi_recognition, chi_compliance = run_chi2_test("Q1"), run_chi2_test("Q2")
(
    chi_compliance_influencing_factor_1,
    chi_compliance_influencing_factor_2,
    chi_compliance_influencing_factor_3,
    chi_compliance_influencing_factor_4,
    chi_compliance_influencing_factor_5,
    chi_compliance_influencing_factor_6,
) = (
    run_chi2_test(question)
    for question in ("Q3_1", "Q3_2", "Q3_3", "Q3_4", "Q3_5", "Q3_6")
)
chi_effort = run_chi2_test("Q5")

  target = recognition_stats.pivot(target_name, index=value, values=target_name, aggregate_function='count', sort_columns=True).fill_null(0).drop(value)
