In [15]:
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

# Set font properties
plt.rcParams['font.family'] = 'serif'  # Use a serif font
plt.rcParams['font.serif'] = ['Times New Roman']  # Specify the serif font
plt.rcParams['font.size'] = 12  # Set the font size
plt.rcParams['axes.titlesize'] = 14  # Set the font size for axes titles
plt.rcParams['axes.labelsize'] = 12  # Set the font size for axes labels

from matplotlib import colormaps


In [16]:
from attributes import *
import polars as pl
import altair as alt

# get data
df = pl.read_csv("data/2_research_ethic_recognition.csv")

In [59]:
# 
# 가. 연구자의 연구윤리 인식과 실천의 중요성 인식 
# Research ethic recognition
# 
title = "연구윤리 인식 및 실천의 중요성 인식"

# 1) counting
rer_value_counts = df.select(pl.col("Q1").value_counts().cast(pl.Int64)).unnest("Q1")

print(df)
print(df.group_by("Q1").len())

# 2) add zero count to 1
add_null = pl.DataFrame({
    "Q1": [1],
    "count": [0]
})
rer_value_counts = rer_value_counts.extend(add_null).sort("Q1")

# 3) replace value
rer_value_counts = rer_value_counts.with_columns(
    Q1 = pl.col("Q1").replace_strict(MAP_IMPORTANCE),
)

rer_value_counts = rer_value_counts.with_columns(
    pct = pl.col("count") / pl.col("count").sum() * 100
)

rer_value_counts.write_csv("figure/2/A_research_ethic_recognition.csv")

# 
# plot
# 
rer_value_counts.plot.bar().encode(
    alt.X("Q1:N", title="연구자의 연구윤리 인식").sort(MAP_IMPORTANCE.values()),
    y="count",
    color=alt.Color("Q1:N", legend=None),
).properties(
    title=title,
    width=400,
).save("figure/2/A_research_ethic_recognition.png")


shape: (167, 11)
┌───────┬─────┬─────┬──────┬───┬──────┬──────┬────────────────────────────────┬─────┐
│ ID    ┆ Q1  ┆ Q2  ┆ Q3_1 ┆ … ┆ Q3_5 ┆ Q3_6 ┆ Q4                             ┆ Q5  │
│ ---   ┆ --- ┆ --- ┆ ---  ┆   ┆ ---  ┆ ---  ┆ ---                            ┆ --- │
│ i64   ┆ i64 ┆ i64 ┆ i64  ┆   ┆ i64  ┆ i64  ┆ str                            ┆ i64 │
╞═══════╪═════╪═════╪══════╪═══╪══════╪══════╪════════════════════════════════╪═════╡
│ 14630 ┆ 4   ┆ 7   ┆ 4    ┆ … ┆ 4    ┆ 4    ┆ .                              ┆ 3   │
│ 14629 ┆ 4   ┆ 9   ┆ 3    ┆ … ┆ 3    ┆ 4    ┆ Research Method and Material   ┆ 4   │
│ 14626 ┆ 5   ┆ 8   ┆ 4    ┆ … ┆ 4    ┆ 4    ┆ null                           ┆ 4   │
│ 14624 ┆ 5   ┆ 8   ┆ 5    ┆ … ┆ 4    ┆ 4    ┆ Personal Conscience and Ethics ┆ 3   │
│ 14627 ┆ 5   ┆ 9   ┆ 4    ┆ … ┆ 4    ┆ 4    ┆ null                           ┆ 4   │
│ …     ┆ …   ┆ …   ┆ …    ┆ … ┆ …    ┆ …    ┆ …                              ┆ …   │
│ 27015 ┆ 4   ┆ 8   ┆ 4    ┆ … ┆ 4   

In [18]:
# 
# 나. 연구자의 연구윤리 준수 수준 
# Research ethic compliance
# 
title = "연구윤리 준수 수준"
Compilance = df.select(pl.col("Q2").value_counts(sort=True)).unnest("Q2")

# draw table
degree = df.select(
    mean=pl.col("Q2").mean(),
    std=pl.col("Q2").std(),
    count=pl.col("Q2").count(),
)

degree.write_csv("figure/2/B_research_ethic_compliance.csv")

# visualization
Compilance.plot.bar().encode(
    alt.X("Q2:N", title="연구자의 연구윤리 인식"),
    y="count:Q",
    color=alt.Color("Q2:N", legend=None),
).properties(
    title=title,
    width=400,
).save("figure/2/B_research_ethic_compliance.png")

In [19]:
# 
# 다. 연구자의 연구윤리 준수에 미치는 영향 요인
# Research Ethic Compliance Influencing Factors
# 1) raw data processing
# 
research_ethic_education = df.select(pl.col("Q3_1").alias('v').value_counts()).unnest("v").rename({"count": "eduction"})
research_ethic_broadcast = df.select(pl.col("Q3_2").alias('v').value_counts()).unnest("v").rename({"count": "broadcast"})
communication_mentoring = df.select(pl.col("Q3_3").alias('v').value_counts()).unnest("v").rename({"count": "mentoring"})
regulation_guideline = df.select(pl.col("Q3_4").alias('v').value_counts()).unnest("v").rename({"count": "regulation"})
fraud_verification_sanctions = df.select(pl.col("Q3_5").alias('v').value_counts()).unnest("v").rename({"count": "fraud"})
paper_conference_material = df.select(pl.col("Q3_6").alias('v').value_counts()).unnest("v").rename({"count": "paper"})

# Perform join operations
joined_df = research_ethic_education.join(research_ethic_broadcast, on="v", how="left")
joined_df = joined_df.join(communication_mentoring, on="v", how="left", suffix="k")
joined_df = joined_df.join(regulation_guideline, on="v", how="left")
joined_df = joined_df.join(fraud_verification_sanctions, on="v", how="left")
joined_df = joined_df.join(paper_conference_material, on="v", how="left")
joined_df = joined_df.sort(by="v")
joined_df = joined_df.with_columns(pl.col("v").cast(pl.String))

# 2) visualization
ResearchEthicComplianceInfluencingFactors = joined_df.transpose(include_header=True, column_names="v")

means = df.select(
    pl.col(["Q3_1", "Q3_2", "Q3_3", "Q3_4", "Q3_5", "Q3_6"]).mean(),
)
means = means.transpose(include_header=True)
means.write_csv("figure/2/C_research_ethic_compliance_influencing_factors.csv")

means.plot.bar().encode(
    alt.X("column:N", title="평균"),
    y="column_0:Q",
    color=alt.Color("column:N", legend=None),
).properties(
    title="연구자의 연구윤리 준수에 미치는 영향 요인",
    width=400,
).save("figure/2/C_research_ethic_compliance_influencing_factors.png")

In [20]:
# 
# 라. KIOST 연구윤리 확립을 위한 자체 노력의 정도
# Effort for KIOST Research Ethic Establishment
# 
effort_value_counts = df.select(
    pl.col("Q5").value_counts(),
).unnest("Q5")

effort_value_counts = effort_value_counts.with_columns(
    pct = pl.col("count") / pl.col("count").sum() * 100
)

effort_value_counts = effort_value_counts.with_columns(
    Q5 = pl.col("Q5").replace_strict(MAP_GOOD),
)

effort_value_counts.write_csv("figure/2/D_effort_for_kiost_research_ethic_establishment.csv")

effort_value_counts.plot.bar().encode(
    alt.X("Q5:N", title="KIOST 연구윤리 확립을 위한 자체 노력의 정도").sort(MAP_GOOD.values()),
    y="count",
    color=alt.Color("Q5:N", legend=None),
).properties(
    title="KIOST 연구윤리 확립을 위한 자체 노력의 정도",
    width=400,
).save("figure/2/D_effort_for_kiost_research_ethic_establishment.png")

In [118]:
from scipy import stats

#
# Chi-square test
#
personal_info = pl.read_csv("data/6_personal_info.csv")
recognition_stats = df.join(personal_info, on="ID")

def run_chi2_test(target_name):
    chi_analysis = {}
    for key, value in MAP_PERSON.items():
        target = recognition_stats.pivot(target_name, index=value, values=target_name, aggregate_function='count', sort_columns=True).fill_null(0).drop(value)
        chi_analysis[key] = stats.chi2_contingency(target)
    return chi_analysis

chi_recognition = run_chi2_test("Q1")
chi_compliance = run_chi2_test("Q2")
chi_compliance_influencing_factor_1 = run_chi2_test("Q3_1")
chi_compliance_influencing_factor_2 = run_chi2_test("Q3_2")
chi_compliance_influencing_factor_3 = run_chi2_test("Q3_3")
chi_compliance_influencing_factor_4 = run_chi2_test("Q3_4")
chi_compliance_influencing_factor_5 = run_chi2_test("Q3_5")
chi_compliance_influencing_factor_6 = run_chi2_test("Q3_6")
chi_effort = run_chi2_test("Q5")

  target = recognition_stats.pivot(target_name, index=value, values=target_name, aggregate_function='count', sort_columns=True).fill_null(0).drop(value)
