In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the dataset from the repository-relative CSV and show it as the cell output.
df = pd.read_csv(filepath_or_buffer="data/euvsdisinfo.csv")
df

In [None]:
# For every (debunk_id, class) pair count the matching rows, then report the
# smallest such count for each class.
# `reset_index(name="count")` pins the label of the count column explicitly:
# only pandas >= 2.0 names the value_counts() result "count"; older versions
# leave it unnamed, so a bare reset_index() yields a column called 0 and the
# ["count"] lookup fails with KeyError.
(
    df.groupby("debunk_id")[["class"]]
    .value_counts()
    .reset_index(name="count")
    .groupby("class")["count"]
    .min()
)

In [None]:
# For every (debunk_id, class) pair count the matching rows, then report the
# largest such count for each class.
# `reset_index(name="count")` pins the label of the count column explicitly:
# only pandas >= 2.0 names the value_counts() result "count"; older versions
# leave it unnamed, so a bare reset_index() yields a column called 0 and the
# ["count"] lookup fails with KeyError.
(
    df.groupby("debunk_id")[["class"]]
    .value_counts()
    .reset_index(name="count")
    .groupby("class")["count"]
    .max()
)

In [None]:
# For every (debunk_id, class) pair count the matching rows, then report the
# average such count for each class.
# `reset_index(name="count")` pins the label of the count column explicitly:
# only pandas >= 2.0 names the value_counts() result "count"; older versions
# leave it unnamed, so a bare reset_index() yields a column called 0 and the
# ["count"] lookup fails with KeyError.
(
    df.groupby("debunk_id")[["class"]]
    .value_counts()
    .reset_index(name="count")
    .groupby("class")["count"]
    .mean()
)

In [None]:
# Mean length, in characters, of the "article_text" column.
# Entries whose length is NaN (e.g. missing text) are skipped by mean().
(
    df["article_text"]
    .str.len()
    .mean()
)

In [None]:
# Mean length, in characters, of the "summary" column.
# Entries whose length is NaN (e.g. missing text) are skipped by mean().
(
    df["summary"]
    .str.len()
    .mean()
)

In [None]:
# Mean length, in characters, of the "disproof" column.
# Entries whose length is NaN (e.g. missing text) are skipped by mean().
(
    df["disproof"]
    .str.len()
    .mean()
)

In [None]:
# Row counts per article language, with one bar per class inside each language.
sns.countplot(
    data=df,
    x="article_language",
    hue="class",
)

In [None]:
# Stacked bar chart of the 50 most frequent publishers, split by class.
top_publishers = df['article_publisher'].value_counts().head(50).index
filtered_df = df[df['article_publisher'].isin(top_publishers)]

# One row per (publisher, class) pair with the number of matching rows.
class_counts = (
    filtered_df
    .groupby(['article_publisher', 'class'])
    .size()
    .reset_index(name='counts')
)

# Wide layout: publishers as rows, classes as columns.
pivot_df = class_counts.pivot(index='article_publisher', columns='class', values='counts')

# Re-order the rows so the publisher with the largest total comes first.
pivot_df = pivot_df.reindex(
    pivot_df.sum(axis=1).sort_values(ascending=False).index
)

# The dict order fixes both the stacking order and the colour of each class.
color_mapping = {"support": "green", "misinformation": "red"}
pivot_df.loc[:, list(color_mapping)].plot(
    kind='bar',
    stacked=True,
    color=list(color_mapping.values()),
    figsize=(10, 5),
)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Keep only the rows labelled as misinformation.
df_misinfo = df.loc[df['class'] == 'misinformation']

# Split each comma-separated keyword string into individual keywords, trim the
# surrounding whitespace, and plot the 50 most frequent ones as a bar chart.
(
    df_misinfo["keywords"]
    .dropna()
    .str.split(",")
    .explode()
    .str.strip()
    .value_counts()
    .head(50)
    .plot(kind='bar', figsize=(10, 5))
)

In [None]:
# For each of the 15 most frequent misinformation keywords ("topics"), show
# which publishers account for the rows tagged with it, as one pie per topic.
top_topics = (
    df_misinfo["keywords"]
    .dropna()
    .apply(lambda x: x.split(","))
    .explode()
    .str.strip()
    .value_counts()
    .head(15)
)
# NOTE: not read anywhere else in this cell; kept unchanged in case other cells use it.
top_publishers = df['article_publisher'].value_counts().head(15).index

# Collect, per top topic, one publisher entry for every keyword occurrence.
# Rows missing either the publisher or the keywords are ignored.
top_topic_names = set(top_topics.index)
publisher_keyword_rows = df_misinfo[["article_publisher", "keywords"]].dropna()
topics_publishers = {}
for publisher, keywords in zip(
    publisher_keyword_rows["article_publisher"],
    publisher_keyword_rows["keywords"],
):
    for topic in (k.strip() for k in keywords.split(",")):
        if topic in top_topic_names:
            # setdefault creates the list on first sight AND records the publisher.
            # The previous if/else only created the list on first sight, silently
            # dropping the first publisher of every topic.
            topics_publishers.setdefault(topic, []).append(publisher)

# count the number of occurrences of each publisher for each topic
topic_publisher_counts = {}
for topic, publishers in topics_publishers.items():
    counts = {}
    for publisher in publishers:
        counts[publisher] = counts.get(publisher, 0) + 1
    topic_publisher_counts[topic] = counts

# transform the counts into shares of the topic total (fractions in [0, 1];
# the pie chart's autopct renders them as percentages)
topic_publisher_percentages = {}
for topic, publisher_counts in topic_publisher_counts.items():
    total = sum(publisher_counts.values())
    topic_publisher_percentages[topic] = {
        publisher: count / total for publisher, count in publisher_counts.items()
    }

# keep the top 5 publishers for each topic, and aggregate the rest into a new key called "Others"
topic_publisher_percentages_top5 = {}
for topic, publisher_counts in topic_publisher_counts.items():
    total = sum(publisher_counts.values())
    # sorted() is stable, so publishers with equal counts keep first-seen order.
    ranked = sorted(publisher_counts.items(), key=lambda item: item[1], reverse=True)
    shares = {publisher: count / total for publisher, count in ranked[:5]}
    # Derive "Others" from the integer counts instead of subtracting two float
    # sums: the float difference can come out slightly negative when there are
    # no other publishers, and pie() rejects negative wedge sizes.
    shares["Others"] = sum(count for _, count in ranked[5:]) / total
    topic_publisher_percentages_top5[topic] = shares

# plot the results: one pie per topic on a 5x3 grid, filled row by row
fig, axes = plt.subplots(nrows=5, ncols=3, figsize=(20, 20))
flat_axes = axes.flatten()
for ax, (topic, shares) in zip(flat_axes, topic_publisher_percentages_top5.items()):
    ax.set_title(topic)
    ax.pie(list(shares.values()), labels=list(shares.keys()), autopct='%1.1f%%')
# Hide any panels left over when fewer than 15 topics have publisher data.
for ax in flat_axes[len(topic_publisher_percentages_top5):]:
    ax.set_axis_off()
plt.show()