In [None]:
import pandas as pd
import matplotlib.pyplot as plt


In [None]:
# Load the two scraped tables: `df` is read from anthology_info.csv and
# `df_wr` from papers_with_code_readme.csv (both relative to the working dir).
df, df_wr = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/acl/scraping/anthology_info.csv",
        "data/acl/scraping/papers_with_code_readme.csv",
    )
)


In [None]:
# Distinct values of the "venue" column, in order of first appearance.
pd.unique(df["venue"])


In [None]:
# Sum of the "count" column over every row of the table.
df.loc[:, "count"].sum()


In [None]:
# Sum of the "count_with_code" column over every row of the table.
df.loc[:, "count_with_code"].sum()


In [None]:
# Overall share of "count_with_code" relative to "count", as a percentage.
overall_totals = df[["count_with_code", "count"]].sum()
overall_totals["count_with_code"] / overall_totals["count"] * 100


In [None]:
# Per-venue sums of "count_with_code" and "count".
df.groupby("venue")[["count_with_code", "count"]].agg("sum")


In [None]:
# Analysis window: the years 2013 through 2021 inclusive.
limit_years = list(range(2013, 2022))

# Keep a venue only if it has at least one row for every year in the window.
years_seen_per_venue = df.groupby(["venue"])["year"].unique()
filtered_venues = [
    venue_name
    for venue_name, venue_years in years_seen_per_venue.items()
    if all(wanted_year in venue_years for wanted_year in limit_years)
]

# Restrict the table to those venues and to the years inside the window.
# NOTE: this rebinds `df`, so every later cell sees the filtered table.
venue_mask = df["venue"].isin(filtered_venues)
year_mask = df["year"].isin(limit_years)
df = df.loc[venue_mask & year_mask]


In [None]:
# Yearly sums of "count" and "count_with_code", plus the percentage of the
# former that the latter represents.
coded_statistics = (
    df.groupby(["year"])[["count", "count_with_code"]]
    .sum()
    .assign(
        percentage=lambda totals: totals["count_with_code"] / totals["count"] * 100
    )
)
coded_statistics


In [None]:

# Yearly sums of both count columns; a later cell reads `count_by_years`,
# so the name is kept.
count_by_years = df.groupby(["year"])[["count", "count_with_code"]].sum()

fig, ax = plt.subplots(figsize=(7, 4))

# Both series share the same thin dashed line; only marker, label and colour differ.
shared_line_style = dict(linewidth=1, linestyle="--")
ax.plot(
    count_by_years.index,
    count_by_years["count"],
    marker='d',
    label="Total",
    **shared_line_style,
)
ax.plot(
    count_by_years.index,
    count_by_years["count_with_code"],
    marker='X',
    label="Coded",
    color="darkgreen",
    **shared_line_style,
)

ax.legend(loc="upper left", fontsize=9, frameon=True, edgecolor="gray")
ax.set_xlabel("Year", fontsize=10)
ax.set_ylabel("Paper Count", fontsize=10)

fig.tight_layout()
# Writes the figure to disk; the "figures" directory must already exist.
fig.savefig("figures/acl_papers.svg", dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# Per-venue sums of both count columns, ordered by "count_with_code" ascending.
# A later cell reads `count_by_venue`, so the name is kept.
count_by_venue = (
    df.groupby(["venue"])[["count_with_code", "count"]]
    .sum()
    .sort_values(by=["count_with_code"])
)

ax = count_by_venue.plot(kind="bar", figsize=(10, 6), fontsize=13)

# Legend entries follow the column order above: count_with_code, then count.
# (Turkish labels: "Kodlu Makale" = coded paper, "Toplam Makale" = total paper.)
ax.legend(["Kodlu Makale", "Toplam Makale"], fontsize=12,
          frameon=True, edgecolor="gray", loc="upper left")
# Title: "Number of papers by conference".
ax.set_title("Konferanslara göre makale sayısı",
             fontsize=15, fontweight="bold")
plt.xlabel("Venue", fontsize=13, fontweight="bold")
# The y axis shows paper counts, not years: the previous label "Yıl" ("Year")
# was wrong. "Makale Sayısı" = "Paper count", matching the title.
plt.ylabel("Makale Sayısı", fontsize=13, fontweight="bold")
plt.xticks(rotation=45)
# Log scale so small and large venues are readable on the same axis.
plt.yscale("log")
plt.tight_layout()


In [None]:
# Percentage of papers with code per year.
# Only numeric columns are aggregated: summing the string column "venue"
# concatenates names on pandas >= 2.0 and the division then raises TypeError.
# Totals are computed here so the cell does not depend on another cell's state.
totals_by_year = df.groupby(["year"])[["count", "count_with_code"]].sum()
ratio_by_years = (
    totals_by_year[["count_with_code"]]
    .divide(totals_by_year["count"], axis=0)
    .multiply(100)
    .rename(columns={"count_with_code": "ratio"})
    .reset_index()
    # A line plot connects points in row order, so rows must be ordered by
    # year (ordering by ratio makes the line zigzag back and forth).
    .sort_values(by=["year"])
)

ax = ratio_by_years.plot(kind="line", x="year", y="ratio",
                         marker="o", figsize=(10, 6), fontsize=13, linewidth=3)
ax.set_title("Ratio of Coded Articles To All Articles by Year",
             fontsize=15, fontweight="bold")
ax.set_ylabel("Ratio(%)", fontsize=13, fontweight="bold")
ax.set_xlabel("Year", fontsize=13, fontweight="bold")
# Single series: the auto-generated legend adds nothing.
ax.legend().remove()
plt.tight_layout()


In [None]:
# Percentage of papers with code per venue, ordered by that percentage.
# The group key "venue" must not also be selected as a value column: on
# pandas >= 2.0 its strings are concatenated by sum(), the division raises
# TypeError, and reset_index() would collide with the duplicate column.
# Totals are computed here so the cell does not depend on another cell's state.
totals_by_venue = df.groupby(["venue"])[["count", "count_with_code"]].sum()
ratio_by_venue = (
    totals_by_venue[["count_with_code"]]
    .divide(totals_by_venue["count"], axis=0)
    .multiply(100)
    .rename(columns={"count_with_code": "ratio"})
    .reset_index()
    .sort_values(by=["ratio"])
)

ax = ratio_by_venue.plot(kind="bar", x="venue",
                         y="ratio", figsize=(10, 6), fontsize=13)
# Title (Turkish): "Ratio of coded papers to all papers, by conference".
ax.set_title("Konferanslara göre kodlu makalelerin tümüne oranı",
             fontsize=15, fontweight="bold")
# "Oran" = "Ratio".
ax.set_ylabel("Oran(%)", fontsize=13, fontweight="bold")
ax.set_xlabel("Venue", fontsize=13, fontweight="bold")
# Single series: the auto-generated legend adds nothing.
ax.legend().remove()
plt.xticks(rotation=45)
plt.tight_layout()
