In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from dateutil.relativedelta import relativedelta
import numpy as np

In [None]:
# Load the full EUvsDisinfo dataset from CSV and display it for a first look.
DATASET_PATH = "../../data/euvsdisinfo_full.csv"
df = pd.read_csv(DATASET_PATH)
df

In [None]:
# Number of rows carrying each label of the "class" column.
df["class"].value_counts()

In [None]:
# Number of distinct debunk IDs in the dataset.
df["debunk_id"].nunique()

In [None]:
# Smallest number of rows that a single debunk contributes to each class.
(
    df.groupby(["debunk_id", "class"])
    .size()
    .groupby(level="class")
    .min()
    .rename("count")
)

In [None]:
# Largest number of rows that a single debunk contributes to each class.
(
    df.groupby(["debunk_id", "class"])
    .size()
    .groupby(level="class")
    .max()
    .rename("count")
)

In [None]:
# Average number of rows that a single debunk contributes to each class.
(
    df.groupby(["debunk_id", "class"])
    .size()
    .groupby(level="class")
    .mean()
    .rename("count")
)

In [None]:
# Mean length of the "article_text" values, in characters.
df["article_text"].str.len().mean()

In [None]:
# Average number of comma-separated keywords per row (rows without keywords are skipped).
df["keywords"].dropna().str.split(",").str.len().mean()

In [None]:
# Bar chart of the number of rows per article language, split by class.
sns.countplot(x="article_language", data=df, hue="class")
# Rotate the language labels so they do not overlap on the x-axis.
plt.xticks(rotation=90)

In [None]:
# Build a per-language summary table: the total number of articles and how
# many of them belong to each class (0 where a language has no article of a
# given class).
languages = df["article_language"].unique()
classes = df["class"].unique().tolist()

# Count every (language, class) pair in a single pass instead of filtering the
# whole frame once per pair; combinations that never occur are filled with 0.
language_class_counts = (
    df.groupby(["article_language", "class"])
    .size()
    .unstack(fill_value=0)
    .reindex(index=languages, columns=classes, fill_value=0)
)

# Kept for later cells: row totals per language and the raw count matrix
# (rows follow `languages`, columns follow `classes`).
total_articles = (
    df["article_language"].value_counts().reindex(languages, fill_value=0).tolist()
)
class_distributions = language_class_counts.to_numpy()

# Select the class columns by label rather than by position: the order of
# `classes` follows first appearance in the data, so positional indexing could
# silently swap the "disinformation" and "support" counts.
labelled_counts = language_class_counts.reindex(
    columns=["disinformation", "support"], fill_value=0
)

distributions_df = pd.DataFrame(
    {
        "total": class_distributions.sum(1),
        "disinformation": labelled_counts["disinformation"].to_numpy(),
        "support": labelled_counts["support"].to_numpy(),
    },
    index=languages,
).sort_values("total", ascending=False)
distributions_df

In [None]:
# Quartiles (25th, 50th and 75th percentiles) of the per-language article totals.
distributions_df["total"].quantile([0.25, 0.5, 0.75])

In [None]:
# Number of distinct values found in the "article_language" column.
len(languages)

In [None]:
# Share (a fraction between 0 and 1) of supporting articles per language,
# highest first. Averaging a boolean mask per language yields 0.0 for languages
# without any supporting article, whereas dividing two value_counts() results
# would leave those languages as NaN.
(
    df["class"].eq("support")
    .groupby(df["article_language"])
    .mean()
    .rename("support_share")
    .sort_values(ascending=False)
)

In [None]:
# Parse the debunk dates and report the first date, the last date and the
# calendar span between them.
dates = pd.to_datetime(df["debunk_date"], format="%d-%m-%Y")
earliest, latest = dates.min(), dates.max()
print(earliest, "||", latest)
period = relativedelta(latest, earliest)
print(period)

In [None]:
# Parse the debunk date once, then derive the calendar year and quarter from
# the parsed column instead of re-parsing it for every derived field.
df["debunk_date"] = pd.to_datetime(df["debunk_date"], format="%d-%m-%Y")
df['year'] = df['debunk_date'].dt.year
df['quarter'] = df['debunk_date'].dt.quarter

# Combined label such as "2021-Q3"; it sorts chronologically as a string.
df['year_quarter'] = df['year'].astype(str) + '-Q' + df['quarter'].astype(str)

# Number of rows per quarter and class (one column per class, 0 where absent).
grouped_df = df.groupby(['year_quarter', 'class']).size().unstack(fill_value=0)

# Reindex onto every quarter that ends inside the observed date range so that
# quarters without any data show up with a count of zero. freq='Q' produces
# quarter-end dates, so a trailing quarter that has not ended by the last
# debunk date is left out of the plot.
all_dates = pd.date_range(start=df['debunk_date'].min(), end=df['debunk_date'].max(), freq='Q')
quarter_labels = [f"{d.year}-Q{d.quarter}" for d in all_dates]
# fill_value=0 is required here: a plain reindex would insert NaN rows.
grouped_df = grouped_df.reindex(quarter_labels, fill_value=0)

grouped_df.plot(kind='bar', stacked=True)
plt.xlabel('Year and Quarter')
plt.ylabel('Count')
plt.title('Distribution of Data by Year and Quarter and Class')
plt.show()


In [None]:
# One row per (disinformation article, keyword): split the comma-separated
# keyword string, explode it, and normalise each keyword (trimmed, first
# letter upper-case and the rest lower-case).
df_fake = df[df["class"] == "disinformation"]
kw_ot_df = (
    df_fake[["keywords", "debunk_date"]]
    .dropna()
    .assign(keywords=lambda frame: frame["keywords"].str.split(","))
    .explode("keywords")
    .assign(keywords=lambda frame: frame["keywords"].str.strip().str.capitalize())
)
keyword_frequencies = kw_ot_df["keywords"].value_counts()
print("Unique Topics", kw_ot_df["keywords"].nunique())
print(keyword_frequencies.to_markdown())

In [None]:
# Keyword groups plotted over time, one list per theme. The spellings follow
# the normalised keywords produced by `str.capitalize()` (first letter
# upper-case, everything else lower-case).
ukraine_kw = [
    "War in ukraine",
    "Crimea",
    "Invasion of ukraine",
    "Donbas",
    # The comma after this entry matters: without it Python concatenates the
    # two adjacent literals into "Illegal annexationUkraine" and both keywords
    # silently disappear from the group.
    "Illegal annexation",
    "Ukraine",
    "Eastern ukraine",
    "War crimes",
    "Ukrainian statehood",
]

covid_kw = [
    "Coronavirus",
    "Vaccination",
    "Biological weapons",
    "Chemical weapons/attack",
    "Conspiracy theory",
    "Laboratory",
    "Virus / bacteria threat",
]

west_kw = [
    "West",
    "Nato",
    "European union",
    "International law",
    "Us presence in europe",
    "Eu/nato enlargement",
    "Europe",
    "United nations",
]

russia_kw = [
    "Anti-russian",
    "Russophobia",
    "Alexei navalny",
    "Encircling russia",
    "Destabilising russia",
    "Diplomacy with russia",
    "Ussr",
    "Russian world",
]

In [None]:
# Display names for the normalised keywords: each dict maps the capitalised
# spelling used for filtering to the properly cased label shown in legends.
map_ukraine = {
    "War in ukraine": "War in Ukraine",
    "Crimea": "Crimea",
    "Invasion of ukraine": "Invasion of Ukraine",
    "Donbas": "Donbas",
    "Illegal annexation": "Illegal annexation",
    "Ukraine": "Ukraine",
    "Eastern ukraine": "Eastern Ukraine",
    "War crimes": "War crimes",
    "Ukrainian statehood": "Ukrainian statehood",
}

map_west = {
    "West": "West",
    "Nato": "NATO",
    "European union": "European Union",
    "International law": "International law",
    "Us presence in europe": "US presence in Europe",
    "Eu/nato enlargement": "EU/NATO enlargement",
    "Europe": "Europe",
    "United nations": "United Nations",
}

map_covid = {
    "Coronavirus": "Coronavirus",
    "Vaccination": "Vaccination",
    "Biological weapons": "Biological weapons",
    "Chemical weapons/attack": "Chemical weapons/attack",
    "Conspiracy theory": "Conspiracy theory",
    "Laboratory": "Laboratory",
    "Virus / bacteria threat": "Virus/bacteria threat",
}

map_russia = {
    "Anti-russian": "Anti-Russian",
    "Russophobia": "Russophobia",
    "Alexei navalny": "Alexei Navalny",
    "Encircling russia": "Encircling Russia",
    "Destabilising russia": "Destabilising Russia",
    "Diplomacy with russia": "Diplomacy with Russia",
    "Ussr": "USSR",
    "Russian world": "Russian world",
}

# Single lookup table covering all four themes (Ukraine, West, COVID, Russia).
mapping = {**map_ukraine, **map_west, **map_covid, **map_russia}

In [None]:
# All "YYYY-Qn" labels present in the data, in ascending order, with the final
# (most recent) label dropped by the [:-1] slice.
year_quarters = sorted(df["year_quarter"].unique())[:-1]

In [None]:
# Four stacked panels (COVID-19, West, Russia, Ukraine), each showing the
# quarterly number of disinformation articles per keyword of that theme.
# The finished figure is written to "keywords_over_time.pdf".
fig, axes = plt.subplots(nrows=4, ncols=1, figsize=(10, 19))

big_fontsize = 20
medium_fontsize = 17

# Create a shared y-axis label
fig.text(-0.04, 0.5, "# Disinformation Articles", va='center', rotation='vertical', fontsize=big_fontsize)

# The panel order here must match the title selection further down (index 0..3).
for i, top_kw in enumerate([covid_kw, west_kw, russia_kw, ukraine_kw]):
    # Rebuild the one-row-per-(article, keyword) table for disinformation
    # articles: split, explode, trim and capitalise the keywords.
    df_fake = df[df["class"] == "disinformation"]
    kw_ot_df = df_fake[["keywords", "debunk_date"]].dropna()
    kw_ot_df["keywords"] = kw_ot_df["keywords"].apply(lambda x: x.split(","))
    kw_ot_df = kw_ot_df.explode("keywords")
    kw_ot_df["keywords"] = kw_ot_df["keywords"].str.strip()
    kw_ot_df["keywords"] = kw_ot_df["keywords"].str.capitalize()
    # Keep only the keywords of the current theme and switch to display names.
    kw_ot_df = kw_ot_df[kw_ot_df["keywords"].isin(top_kw)]
    kw_ot_df["keywords"] = kw_ot_df["keywords"].map(mapping)

    # Replace each date by its "YYYY-Qn" label.
    kw_ot_df["debunk_date"] = pd.to_datetime(kw_ot_df["debunk_date"], format="%Y-%m-%d")
    kw_ot_df["debunk_date"] = kw_ot_df["debunk_date"].dt.year.astype(str) + "-Q" + kw_ot_df["debunk_date"].dt.quarter.astype(str)
    kw_ot_df = kw_ot_df[kw_ot_df["debunk_date"] != "2023-Q3"]  # remove last quarter of 2023 because it is incomplete
    kw_ot_df.set_index("debunk_date", inplace=True)
    # Count rows per (quarter, keyword); quarters are rows, keywords are columns.
    kw_ot_df= kw_ot_df.pivot_table(columns="keywords", aggfunc="size", index="debunk_date").fillna(0)
    # Align on the shared quarter axis so every panel has the same x positions;
    # quarters without any matching article get 0.
    kw_ot_df = kw_ot_df.reindex(year_quarters, fill_value=0)

    ax = axes[i]  # Get the correct subplot axes

    kw_ot_df.plot(ax=ax)
    ax.set_xlabel("")

    title = "COVID-19" if i == 0 else "West" if i == 1 else "Russia" if i == 2 else "Ukraine"
    ax.set_title(title, fontsize=big_fontsize)
    ax.title.set_weight('bold')
    # Set x-axis tick positions
    ax.set_xticks(range(len(kw_ot_df.index)))

    # Set x-axis tick labels with increased font size
    ax.set_xticklabels(kw_ot_df.index, rotation=90, fontsize=medium_fontsize)

    # Set y-axis tick labels with increased font size
    ax.tick_params(axis='y', labelsize=medium_fontsize)

    # NOTE(review): this compares the lists by value, so it is true only for
    # the first panel as long as no other keyword list equals covid_kw.
    if top_kw == covid_kw:
        # Dashed vertical marker at the 2020-Q1 position on the quarter axis.
        ax.axvline(x=kw_ot_df.index.tolist().index("2020-Q1"), color="black", linestyle="--", linewidth=2)
        # Annotate the vertical line as "Pandemic"
        ax.annotate(
            "Start of COVID-19 Pandemic",
            xy=(kw_ot_df.index.tolist().index("2020-Q1"), 250),
            xytext=(kw_ot_df.index.tolist().index("2020-Q1") +0.3 , 270),
            arrowprops=dict(arrowstyle="->", color="black"),
            color="black",
            fontsize=18,
        )
    
    else:
        # All other panels mark 2022-Q1 instead; the label text is placed 14
        # tick positions to the left of the marker.
        ax.axvline(x=kw_ot_df.index.tolist().index("2022-Q1"), color="black", linestyle="--", linewidth=2)
        # Annotate the vertical line as "Ukraine Invasion"
        ax.annotate(
            "Invasion of Ukraine",
            xy=(kw_ot_df.index.tolist().index("2022-Q1"), 250),
            xytext=(kw_ot_df.index.tolist().index("2022-Q1") - 14, 270),
            arrowprops=dict(arrowstyle="->", color="black"),
            color="black",
            fontsize=20,
        )   

    # Show vertical grid lines
    # ax.xaxis.grid(True)

    # Show xticks only for the last plot
    if i != 3:
        ax.set_xticklabels([])

# Adjust the spacing between subplots and make the spacing for xticks the same for all axes
plt.tight_layout(pad=1.0, h_pad=1.0)

# Reduce the size of the legend box
for ax in axes:
    ax.legend(prop={'size': 15})

# Use the same fixed y-range on every panel; the annotation coordinates
# above (250 / 270) are chosen inside this 0-300 range.
for ax in axes:
    ax.set_ylim(0, 300)

# make all plots have the same number of xticks
# (kw_ot_df here is the table left over from the last loop iteration, which
# was reindexed to year_quarters like every other panel)
for ax in axes:
    ax.set_xticks(range(len(kw_ot_df.index)))
    ax.set_xlim(0, len(kw_ot_df.index))
# save as pdf
plt.savefig("keywords_over_time.pdf", bbox_inches="tight")

In [None]:
# Disinformation articles from 2021-Q3, broken down by article language.
in_2021_q3 = df["year_quarter"] == "2021-Q3"
is_disinformation = df["class"] == "disinformation"
df_ngrams = df[in_2021_q3 & is_disinformation]
df_ngrams["article_language"].value_counts()

In [None]:
# Two horizontal bar charts with the `topn` most frequent publishers of
# disinformation (top panel) and of supporting articles (bottom panel).
# The figure is written to "top_publishers.pdf".
class_counts = df.groupby(['article_publisher', 'class']).size().reset_index(name='counts')
topn = 25
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 15))


def _plot_top_publishers(ax, class_label, title, color):
    """Draw the `topn` publishers with the most rows of `class_label` on `ax`.

    Returns the plotted slice of `class_counts`, sorted by ascending count so
    that the largest publisher ends up at the top of the horizontal bar chart.
    """
    top_df = (
        class_counts[class_counts['class'] == class_label]
        .sort_values(by='counts', ascending=True)
        .tail(topn)
    )
    ax.barh(top_df['article_publisher'], top_df['counts'], color=color)
    ax.set_title(title, fontsize=23)
    ax.title.set_weight('bold')
    # Resize the existing tick labels instead of overwriting them with
    # set_yticklabels(), which is only meant for fixed tick positions.
    ax.tick_params(axis='both', labelsize=20)
    return top_df


misinfo_df = _plot_top_publishers(ax1, 'disinformation', "Disinformation", 'red')
support_df = _plot_top_publishers(ax2, 'support', "Trustworthy", 'green')

plt.tight_layout(pad=1.0, h_pad=1.0)
plt.savefig("top_publishers.pdf", bbox_inches="tight")

In [None]:
# Fraction of disinformation rows whose publisher name contains the
# (case-sensitive) substring "ria".
class_counts = (
    df.groupby(['article_publisher', 'class'])
    .size()
    .reset_index(name='counts')
)
misinfo_df = class_counts[class_counts['class'] == 'disinformation']
name_has_ria = misinfo_df['article_publisher'].str.contains("ria")
counts = misinfo_df.loc[name_has_ria, "counts"].sum()
total = misinfo_df["counts"].sum()

print(counts/total)

In [None]:
# Publishers that have at least one row labelled "support".
class_counts.loc[class_counts['class'] == 'support', "article_publisher"]

In [None]:
# Sum of the per-publisher counts held in misinfo_df.
misinfo_df["counts"].sum()

In [None]:
# Number of distinct publishers across the whole dataset.
df["article_publisher"].nunique()

In [None]:
# Number of distinct publishers among the disinformation rows.
df.loc[df["class"] == "disinformation", "article_publisher"].nunique()

In [None]:
# Number of distinct publishers among the support rows.
df.loc[df["class"] == "support", "article_publisher"].nunique()

In [None]:
# Bar chart of the 50 most frequent keywords among disinformation articles.
df_misinfo = df[df['class'] == 'disinformation']
misinfo_keyword_counts = (
    df_misinfo["keywords"]
    .dropna()
    .str.split(",")
    .explode()
    .str.strip()
    .value_counts()
)
misinfo_keyword_counts.head(50).plot(kind='bar', figsize=(10, 5))

In [None]:
# For each of the 15 most frequent disinformation keywords ("topics"), compute
# how its articles are distributed over article languages, keeping the five
# largest shares and lumping the remainder into "Others".
# NOTE: the variables are called "publisher(s)" but the values they hold come
# from the "article_language" column.
top_topics = df_misinfo["keywords"].dropna().apply(lambda x: x.split(",")).explode().str.strip().value_counts().head(15)
top_publishers = df['article_language'].value_counts().head(15).index

# topic -> list with one language entry per article mentioning that topic.
topics_publishers = {}
language_keyword_rows = df_misinfo[["article_language", "keywords"]].dropna()
for publisher, keyword_string in zip(
    language_keyword_rows["article_language"], language_keyword_rows["keywords"]
):
    for topic in (keyword.strip() for keyword in keyword_string.split(",")):
        if topic in top_topics.index:
            # setdefault() records the very first occurrence as well; creating
            # the list in one branch and appending only in the other would
            # drop one article per topic.
            topics_publishers.setdefault(topic, []).append(publisher)

# count the number of occurrences of each publisher for each topic
topic_publisher_counts = {}
for topic, publishers in topics_publishers.items():
    occurrences = {}
    for publisher in publishers:
        occurrences[publisher] = occurrences.get(publisher, 0) + 1
    topic_publisher_counts[topic] = occurrences

# transform the counts into fractions of the topic total (values sum to 1)
topic_publisher_percentages = {}
for topic, publisher_counts in topic_publisher_counts.items():
    total = sum(publisher_counts.values())
    topic_publisher_percentages[topic] = {publisher: count / total for publisher, count in publisher_counts.items()}

# keep the top 5 publishers for each topic, and aggregate the rest into a new key called "Others"
topic_publisher_percentages_top5 = {}
for topic, publisher_percentages in topic_publisher_percentages.items():
    top5_publishers = sorted(publisher_percentages.items(), key=lambda x: x[1], reverse=True)[:5]
    top5_share = dict(top5_publishers)
    remainder = sum(publisher_percentages.values()) - sum(top5_share.values())
    # Guard against a tiny negative remainder caused by floating-point rounding.
    top5_share["Others"] = max(0.0, remainder)
    topic_publisher_percentages_top5[topic] = top5_share

In [None]:
# Count, for each of the 9 most frequent disinformation languages, how often
# each of the k most frequent keywords occurs.
# `dicts` holds one {language: [keyword, ...]} mapping per article.
dicts = df_misinfo[["article_language", "keywords"]].dropna().apply(
    lambda row: {row["article_language"]: [part.strip() for part in row["keywords"].split(",")]},
    axis=1,
)
k = 8
top_keywords = (
    df_misinfo["keywords"]
    .dropna()
    .str.split(",")
    .explode()
    .str.strip()
    .value_counts()[:k]
    .keys()
)
top_languages = df_misinfo["article_language"].value_counts().head(9).keys()

# counts: language -> {keyword -> number of occurrences}
counts = {}
for article in dicts:
    for lang, keywords in article.items():
        if lang not in top_languages:
            continue
        # Register the language even when none of its keywords qualifies.
        per_language = counts.setdefault(lang, {})
        for keyword in keywords:
            if keyword in top_keywords:
                per_language[keyword] = per_language.get(keyword, 0) + 1

In [None]:
# Draw one pie chart per language in `counts` on a 3x3 grid, keeping the
# colour of each keyword consistent across all charts. The figure is written
# to "top_keywords_per_language.pdf".
colors = sns.color_palette("Paired", k+1)
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(30, 15))
for i, (lang, keyword_counts) in enumerate(counts.items()):
    # Fill the grid row by row.
    ax = axes[divmod(i, 3)]
    ax.set_title(lang, fontsize=25)
    ax.title.set_weight('bold')

    # Alphabetical keyword order so slices appear in the same sequence everywhere.
    keyword_counts = dict(sorted(keyword_counts.items()))
    slice_labels = list(keyword_counts.keys())
    slice_sizes = list(keyword_counts.values())
    # A keyword's colour is determined by its rank in top_keywords.
    slice_colors = [colors[top_keywords.get_loc(name)] for name in slice_labels]

    ax.pie(
        slice_sizes,
        labels=slice_labels,
        autopct='%1.1f%%',
        colors=slice_colors,
        textprops={'fontsize': 25},
    )

plt.tight_layout()
plt.savefig("top_keywords_per_language.pdf", bbox_inches="tight")

In [None]:
# Table with the three most frequent keywords for each of the 15 most common
# article languages (rows ordered by language frequency).
d = {}
top_15_languages = df["article_language"].value_counts().head(15).index.tolist()
df_top_15_languages = df[df["article_language"].isin(top_15_languages)]
keywords_by_language = df_top_15_languages.groupby('article_language')['keywords']

for i in range(3):
    # Slicing with [i:i + 1] instead of indexing with [i] yields None rather
    # than an IndexError when a language has fewer than i + 1 distinct keywords.
    ith_keyword = keywords_by_language.apply(
        lambda x: next(
            iter(x.str.split(',').explode().str.strip().value_counts().index[i:i + 1]),
            None,
        )
    )
    # groupby() returns the languages in sorted order, whereas the table index
    # is ordered by frequency: align by label before taking the values so each
    # keyword lands in the row of its own language.
    d[f"top{i+1}"] = ith_keyword.reindex(top_15_languages).tolist()

pd.DataFrame(d, index = top_15_languages)