# Flourish CSVs

1. Export from Python
2. Import to Flourish
3. Build Charts

_Formatting CSVs to match the Flourish templates._

In [None]:
import warnings
from pathlib import Path
import inspect
import sys
import plotly.express as px
import numpy as np
import pandas as pd
from ast import literal_eval
from collections import Counter
from itertools import chain

# Silence every warning for the rest of the session to keep cell output clean.
warnings.filterwarnings(action="ignore")

# Switch off pandas' chained-assignment check (no warning, no error).
pd.set_option("mode.chained_assignment", None)

In [4]:
# Load the scraped, enriched articles CSV; the path is resolved relative to
# the current working directory.
path_to_file = "../scraped_articles_enriched_full.csv"
df = pd.read_csv(filepath_or_buffer=path_to_file)

## Line Chart -> Loanword Density Over Time (2016 - 2024)

In [6]:
# For Slide 4, 11, 12
# Mean loanword density per year: one row per year, two columns.
df_line = df.groupby("year", as_index=False)["loanword_density"].mean()
df_line.to_csv(path_or_buf="loanword_density_by_year.csv", index=False)

## Confidence Interval + Loanword Usage Over Time

In [44]:
# Per-year mean, non-null count and standard deviation of loanword density.
summary = (
    df.groupby("year")["loanword_density"]
    .agg(["mean", "count", "std"])
    .reset_index()
)

# Normal-approximation 95% interval: mean +/- 1.96 standard errors.
summary["sem"] = summary["std"] / np.sqrt(summary["count"])
margin = 1.96 * summary["sem"]
summary["ci95_low"] = summary["mean"] - margin
summary["ci95_high"] = summary["mean"] + margin

# Display headers for the export; "std" and "sem" keep their names.
flourish_headers = {
    "year": "Year",
    "mean": "Avg Loanword Density",
    "count": "Article Count",
    "ci95_low": "CI Lower",
    "ci95_high": "CI Upper",
}
summary = summary.rename(columns=flourish_headers)

summary.to_csv("loanword_density_confidence_flourish.csv", index=False)

## Histogram -> Loanword Density Spread

_Histograms are not available on Flourish, so the bin counts are pre-computed here and exported as a table._

In [20]:
# Cut the density range into 10 equal-width intervals, then count the
# articles per interval and order the counts by interval.
df["density_bin"] = pd.cut(x=df["loanword_density"], bins=10)
bin_tally = df["density_bin"].value_counts()
df_hist = bin_tally.sort_index().reset_index()

In [21]:
# Preview the first rows of df, including the density_bin column.
df.head()

Unnamed: 0.1,Unnamed: 0,article_id,url,source_site,date,year,headline,word_count,text,loanwords,top_loanwords,sentiment,loanword_count,loanword_density,density_bin
0,0,0,https://www.businessinsider.de/wirtschaft/fina...,businessinsider.de,2022-10-14,2022,Wenn ihr euren Wohlstand im Alter sichern woll...,729,Die Niedrigzinspolitik wirkt sich auf die Alte...,[thema],['thema'],negative,9,0.012346,"(-0.000857, 0.0959]"
1,1,1,https://www.businessinsider.de/karriere/arbeit...,businessinsider.de,2022-10-17,2022,Narzisstische Führungskräfte überschätzen sich...,1089,"Wem es an Informationen mangelt, braucht ein g...","[informationen, professorin, empathie, team, e...","['team', 'feedback', 'informationen']",neutral,128,0.117539,"(0.0959, 0.192]"
2,2,2,https://www.businessinsider.de/gruenderszene/k...,businessinsider.de,2022-10-12,2022,Zwei CEOs statt eines: So hat dieses Startup d...,161,Whistleblowing Mehr von Gründerszene Folgen Ko...,[whistleblowing],['whistleblowing'],negative,18,0.111801,"(0.0959, 0.192]"
3,3,3,https://www.businessinsider.de/insider-picks/t...,businessinsider.de,2022-10-13,2022,"Die besten Serien, die ihr nur bei Amazon Prim...",3185,Disclaimer: Dieser Artikel enthält Affiliate-L...,"[blockbuster, testphase, york, gear, homecomin...","['york', 'homecoming', 'blockbuster']",neutral,136,0.0427,"(-0.000857, 0.0959]"
4,4,4,https://www.businessinsider.de/politik/deutsch...,businessinsider.de,2022-10-12,2022,Lieferung der Iris-T: Was kann das moderne Flu...,814,"Die Ukraine hat bestätigt, die erste Einheit d...",[eurofighter],['eurofighter'],neutral,15,0.018428,"(-0.000857, 0.0959]"


In [27]:
# For Slide 5
df["density_bin"] = pd.cut(df["loanword_density"], bins=20)
# df_hist = df["density_bin"].value_counts().sort_index().reset_index()
df_hist.columns = ["density_bin", "article_count"]
df_hist.to_csv("loanword_density_histogram_v3.csv", index=False)

In [34]:
# Display the binned article counts for a visual check.
df_hist

Unnamed: 0,label,article_count
0,"(-0.05, 0.06]",34626
1,"(0.06, 0.17]",45199
2,"(0.17, 0.28]",19922
3,"(0.28, 0.39]",6667
4,"(0.39, 0.5]",1725
5,"(0.5, 0.61]",358
6,"(0.61, 0.72]",101
7,"(0.72, 0.83]",16
8,"(0.83, 0.94]",3
9,"(0.94, 1.05]",1


In [42]:
# Eleven evenly spaced edges from -0.05 to 1.05 give ten bins; each bin is
# labelled "low - high" with two decimals.
bin_edges = np.linspace(-0.05, 1.05, 11)
bin_labels = [
    f"{low:.2f} - {high:.2f}"
    for low, high in zip(bin_edges[:-1], bin_edges[1:])
]

# include_lowest=True makes the first bin closed on its left edge as well.
df["density_bin"] = pd.cut(
    df["loanword_density"],
    bins=bin_edges,
    labels=bin_labels,
    include_lowest=True,
)

# Count articles per labelled bin, ordered by bin.
label_tally = df["density_bin"].value_counts().sort_index()
bin_counts = label_tally.reset_index()
bin_counts.columns = ["Loanword Density Range", "Article Count"]

bin_counts.to_csv("loanword_density_histogram_bins10.csv", index=False)

## Boxplot or Bar -> Topic vs. Loanword Usage

_Skipped, as only one domain uses topics; the export code below is commented out._

In [10]:
# For Slide 6
# df_topic = df.groupby("topic")["loanword_density"].mean().reset_index()
# df_topic.to_csv("loanword_density_by_topic.csv", index=False)

## Sentiment Correlation -> Boxplot / Bar

In [18]:
# For Slide 7
# Mean loanword density for each sentiment label.
sentiment_means = df.groupby("sentiment")["loanword_density"].mean()
df_sentiment = sentiment_means.reset_index()
df_sentiment.to_csv("loanword_density_by_sentiment.csv", index=False)

In [None]:
# Mean loanword density for every (year, sentiment) pair, in long form.
grouped = (
    df.groupby(["year", "sentiment"])["loanword_density"]
    .mean()
    .reset_index()
)

# Reshape to wide form: one row per year, one column per sentiment,
# ordered by year.
pivot_df = (
    grouped.pivot(index="year", columns="sentiment", values="loanword_density")
    .reset_index()
    .sort_values("year")
)

output_path = "loanword_density_by_sentiment_breakdown.csv"
pivot_df.to_csv(output_path, index=False)

## Top Loanwords -> Horizontal Bar Chart

In [None]:
# Parse string-encoded Python literals in the "loanwords" column into real
# objects. Values that are not strings (for example, entries already parsed
# by a previous run of this cell) are passed through unchanged, so the cell
# can be re-run without literal_eval raising on non-string input.
df["loanwords"] = df["loanwords"].apply(
    lambda value: literal_eval(value) if isinstance(value, str) else value
)

In [16]:
# For Slide 8
# Flatten every row's loanword collection into one list, then keep the 20
# most frequent words together with their counts.
all_words = [word for words in df["loanwords"] for word in words]
word_freq = Counter(all_words).most_common(20)
df_top = pd.DataFrame(data=word_freq, columns=["loanword", "frequency"])
df_top.to_csv("top_loanwords_v2.csv", index=False)

## Forecasting Future Usage

In [46]:
import pandas as pd
import plotly.graph_objects as go
from sklearn.linear_model import LinearRegression
import numpy as np

# Keep only rows whose year lies in 2016-2023 (both ends inclusive).
# NOTE: this rebinds the notebook-wide `df`, so any cell run afterwards
# sees only the filtered rows.
df = df[df["year"].between(2016, 2023)]

# Yearly mean density is the series the trend line is fitted to.
grouped = df.groupby("year")["loanword_density"].mean().reset_index()

# scikit-learn expects a 2-D feature matrix, hence the single-column reshape.
X = grouped["year"].to_numpy().reshape(-1, 1)
y = grouped["loanword_density"].to_numpy()

model = LinearRegression().fit(X, y)

# Predict for 2016 through 2027: the fitted years plus the years beyond them.
future_years = np.arange(2016, 2028).reshape(-1, 1)
forecast = model.predict(future_years)

forecast_df = pd.DataFrame(
    {
        "Year": future_years.ravel(),
        "Forecast Loanword Density": forecast,
    }
)

actuals = grouped.rename(
    columns={
        "year": "Year",
        "loanword_density": "Actual Loanword Density",
    }
)

# Left join keeps every forecast year; years without an observed mean get an
# empty "Actual Loanword Density" value.
flourish_df = forecast_df.merge(actuals, on="Year", how="left")

flourish_df.to_csv("loanword_forecast_flourish.csv", index=False)