# Visualisation with Plotly

In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None

import numpy as np
import plotly.express as px
from collections import Counter
from itertools import chain
from ast import literal_eval
from scipy import stats
import plotly.graph_objects as go

import sys
import inspect

import warnings
warnings.filterwarnings('ignore')

In [2]:
# CSV file that is loaded into `df`, the frame used by the cells below.
path_to_file = "scraped_articles_enriched_full.csv"
df = pd.read_csv(filepath_or_buffer=path_to_file)

## Loanword Density Over Time
_Line Chart_

In [3]:
# Mean loanword density per year, one row per year.
df_line = df.groupby("year", as_index=False)["loanword_density"].mean()

# Human-readable axis label for the plotted column.
axis_labels = {"loanword_density": "Avg. Loanword Density"}

fig = px.line(
    data_frame=df_line,
    x="year",
    y="loanword_density",
    markers=True,
    labels=axis_labels,
    title="Loanword Density Over Time",
)

fig.show()

Shows a rising trend, with a potential dip after 2020.

In [4]:
# Distinct values of the "year" column, in order of first appearance.
df.loc[:, "year"].unique()

array([2022, 2020, 2021, 2018, 2017, 2016, 2015, 2019, 2025, 2024, 2011,
       2012, 2013, 2014, 2023])

In [5]:
# Years whose average densities are compared. The variable names below are
# derived from these constants so that names and looked-up years cannot drift
# apart.
BASELINE_YEAR = 2016
COMPARISON_YEAR = 2023

# Mean loanword density per year.
df_agg = df.groupby("year")["loanword_density"].mean().reset_index()

# Select the single aggregated row for each year and take its density value.
density_baseline = df_agg.loc[
    df_agg["year"] == BASELINE_YEAR, "loanword_density"
].iloc[0]
density_comparison = df_agg.loc[
    df_agg["year"] == COMPARISON_YEAR, "loanword_density"
].iloc[0]

# A ratio above 1 means the comparison year has the higher average density.
increase = density_comparison / density_baseline
print(f"Loanword usage increased by a factor of: {increase:.2f}")

Loanword usage increased by a factor of: 1.10


## Histogram

In [6]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,article_id,url,source_site,date,year,headline,word_count,text,loanwords,top_loanwords,sentiment,loanword_count,loanword_density
0,0,0,https://www.businessinsider.de/wirtschaft/fina...,businessinsider.de,2022-10-14,2022,Wenn ihr euren Wohlstand im Alter sichern woll...,729,Die Niedrigzinspolitik wirkt sich auf die Alte...,['thema'],['thema'],negative,9,0.012346
1,1,1,https://www.businessinsider.de/karriere/arbeit...,businessinsider.de,2022-10-17,2022,Narzisstische Führungskräfte überschätzen sich...,1089,"Wem es an Informationen mangelt, braucht ein g...","['informationen', 'professorin', 'empathie', '...","['team', 'feedback', 'informationen']",neutral,128,0.117539
2,2,2,https://www.businessinsider.de/gruenderszene/k...,businessinsider.de,2022-10-12,2022,Zwei CEOs statt eines: So hat dieses Startup d...,161,Whistleblowing Mehr von Gründerszene Folgen Ko...,['whistleblowing'],['whistleblowing'],negative,18,0.111801
3,3,3,https://www.businessinsider.de/insider-picks/t...,businessinsider.de,2022-10-13,2022,"Die besten Serien, die ihr nur bei Amazon Prim...",3185,Disclaimer: Dieser Artikel enthält Affiliate-L...,"['blockbuster', 'testphase', 'york', 'gear', '...","['york', 'homecoming', 'blockbuster']",neutral,136,0.0427
4,4,4,https://www.businessinsider.de/politik/deutsch...,businessinsider.de,2022-10-12,2022,Lieferung der Iris-T: Was kann das moderne Flu...,814,"Die Ukraine hat bestätigt, die erste Einheit d...",['eurofighter'],['eurofighter'],neutral,15,0.018428


In [7]:
# Display names handed to Plotly Express for the histogram.
histogram_labels = {
    "loanword_density": "Loanword Density",
    "count": "Number of Articles",
}

fig = px.histogram(
    data_frame=df,
    x="loanword_density",
    nbins=10,
    labels=histogram_labels,
    title="Distribution of Loanword Density",
    template="plotly_white",
)

# Small gap between bars and two-decimal ticks on the density axis.
# NOTE(review): this runs after the figure is built, so the y-axis title set
# here is the one that ends up on the chart, not the "count" label above.
fig.update_layout(
    yaxis_title="Article Count",
    xaxis={"tickformat": ".2f"},
    bargap=0.1,
)

fig.show()

Shows that most articles have a low loanword density, with a long tail toward higher values.

## Average Density by Topic - Bar Chart

Shows that certain topics, such as tech, likely have the highest average loanword density.

## Confidence Interval

In [8]:
# Per-year mean, number of articles and standard deviation of the density.
summary = (
    df.groupby("year")["loanword_density"]
    .agg(["mean", "count", "std"])
    .reset_index()
)

# Interval bounds: mean -/+ 1.96 standard errors of the mean.
summary["sem"] = summary["std"] / np.sqrt(summary["count"])
half_width = 1.96 * summary["sem"]
summary["ci95_low"] = summary["mean"] - half_width
summary["ci95_high"] = summary["mean"] + half_width

# The band is one closed polygon: upper bound left-to-right, then the lower
# bound right-to-left.
band_x = pd.concat([summary["year"], summary["year"].iloc[::-1]])
band_y = pd.concat([summary["ci95_high"], summary["ci95_low"].iloc[::-1]])

ci_band = go.Scatter(
    x=band_x,
    y=band_y,
    name="95% CI",
    fill="toself",
    fillcolor="rgba(0, 123, 255, 0.2)",
    line={"color": "rgba(255, 255, 255, 0)"},  # fully transparent outline
    hoverinfo="skip",
    showlegend=False,
)

mean_line = go.Scatter(
    x=summary["year"],
    y=summary["mean"],
    name="Average Density",
    mode="lines+markers",
    line={"color": "blue"},
)

# Band first so the mean line is drawn on top of it.
fig = go.Figure(data=[ci_band, mean_line])

fig.update_layout(
    template="plotly_white",
    height=500,
    title="Loanword Usage Over Time with 95% Confidence Interval",
    xaxis_title="Year",
    yaxis_title="Average Loanword Density",
)

fig.show()

In [9]:
# Per-year mean, number of articles and standard deviation of the density.
summary = (
    df.groupby("year")["loanword_density"]
    .agg(["mean", "count", "std"])
    .reset_index()
)

# Interval bounds: mean -/+ 1.96 standard errors of the mean.
summary["sem"] = summary["std"] / np.sqrt(summary["count"])
half_width = 1.96 * summary["sem"]
summary["ci95_low"] = summary["mean"] - half_width
summary["ci95_high"] = summary["mean"] + half_width

# Closed polygon for the band: upper bound forwards, lower bound backwards.
band_x = pd.concat([summary["year"], summary["year"].iloc[::-1]])
band_y = pd.concat([summary["ci95_high"], summary["ci95_low"].iloc[::-1]])

fig = go.Figure(
    data=[
        go.Scatter(
            x=band_x,
            y=band_y,
            name="95% CI",
            fill="toself",
            fillcolor="rgba(0, 123, 255, 0.2)",
            line={"color": "rgba(255, 255, 255, 0)"},
            hoverinfo="skip",
            showlegend=False,
        ),
        go.Scatter(
            x=summary["year"],
            y=summary["mean"],
            name="Average Density",
            mode="lines+markers",
            line={"color": "blue"},
        ),
    ]
)

# NOTE(review): a secondary axis is configured here, but no trace in this cell
# is assigned to "y2" - presumably it has no visible effect; confirm.
fig.update_layout(
    yaxis={"title": "Avg Loanword Density"},
    yaxis2={
        "title": "Article Count",
        "overlaying": "y",
        "side": "right",
        "showgrid": False,
    },
)

fig.show()

In [10]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go

# Yearly aggregates of the loanword density (named aggregation).
summary = (
    df.groupby("year")["loanword_density"]
    .agg(mean="mean", count="count", std="std")
    .reset_index()
)

# Interval bounds: mean -/+ 1.96 standard errors of the mean.
summary["sem"] = summary["std"] / np.sqrt(summary["count"])
half_width = 1.96 * summary["sem"]
summary["ci95_low"] = summary["mean"] - half_width
summary["ci95_high"] = summary["mean"] + half_width

# Shaded band: upper bound left-to-right, then lower bound right-to-left, so
# that "toself" fills the area between the two bounds.
ci_band = go.Scatter(
    x=pd.concat([summary["year"], summary["year"].iloc[::-1]]),
    y=pd.concat([summary["ci95_high"], summary["ci95_low"].iloc[::-1]]),
    name="95% Confidence Interval",
    fill="toself",
    fillcolor="rgba(193, 128, 157, 0.4)",
    line={"color": "rgba(255, 255, 255, 0)"},
    hoverinfo="skip",
    showlegend=True,
)

# Yearly mean on the primary y-axis.
mean_line = go.Scatter(
    x=summary["year"],
    y=summary["mean"],
    name="Average Loanword Density",
    mode="lines+markers",
    line={"color": "rgba(63, 51, 77, 1)"},
)

# Number of articles per year, bound to the secondary y-axis ("y2").
count_bars = go.Bar(
    x=summary["year"],
    y=summary["count"],
    name="Article Count",
    yaxis="y2",
    opacity=0.6,
    marker={"color": "rgb(214, 64, 37)"},
)

fig = go.Figure(data=[ci_band, mean_line, count_bars])

fig.update_layout(
    # Title currently disabled:
    # title="Loanword Usage Over Time with Confidence Interval (2011–2025)",
    template="plotly_white",
    height=500,
    legend={"x": 0.01, "y": 0.99},
    xaxis={"title": "Year"},
    yaxis={"title": "Avg Loanword Density"},
    # Secondary axis drawn over the primary one, on the right-hand side.
    yaxis2={
        "title": "Article Count",
        "overlaying": "y",
        "side": "right",
        "showgrid": False,
    },
)

fig.show()

## Sentiment vs. Loanword Density
_Bar chart_

In [11]:
# Number of missing values in each column.
df.isna().sum(axis=0)

Unnamed: 0          0
article_id          0
url                 0
source_site         0
date                0
year                0
headline            0
word_count          0
text                0
loanwords           0
top_loanwords       0
sentiment           0
loanword_count      0
loanword_density    0
dtype: int64

In [12]:
# Mean loanword density for each sentiment class.
df_sentiment = df.groupby("sentiment")["loanword_density"].mean().reset_index()

fig = px.bar(
    df_sentiment,
    x="sentiment",
    # Refer to the column by name, like `x`, so the `labels` mapping below
    # is matched against it.
    y="loanword_density",
    title="Loanword Density by Sentiment",
    labels={
        # The key has to be the plotted column's name for the display label
        # to be applied to the axis.
        "loanword_density": "Avg. Loanword Density"
    }
)

fig.show()

Shows that articles with more English loanwords tend to be rated as more positive.

In [13]:
# !pip install ace_tools

In [14]:
# Distinct sentiment labels, in order of first appearance in the data.
sentiment = df.loc[:, "sentiment"].unique()

In [15]:
# Long format: mean density for every (year, sentiment) pair.
grouped = df.groupby(["year", "sentiment"], as_index=False)["loanword_density"].mean()

# Wide format: one row per year (as index), one column per sentiment label.
pivot_df = grouped.pivot(index="year", columns="sentiment", values="loanword_density")

In [16]:
# Same pivot as a flat frame: "year" becomes a regular column again.
pivot_df = (
    grouped
    .pivot(index="year", columns="sentiment", values="loanword_density")
    .reset_index()
)

In [17]:
# Preview the first five rows of the year-by-sentiment table.
pivot_df.head(n=5)

sentiment,year,negative,neutral,positive
0,2011,0.189818,0.23623,0.196462
1,2012,0.174908,0.228774,0.208842
2,2013,0.190026,0.222681,0.209566
3,2014,0.17365,0.206533,0.15627
4,2015,0.160412,0.174061,0.126971


In [18]:
#import ace_tools as tools
import plotly.graph_objects as go

# Long format: mean density for every (year, sentiment) pair.
grouped = df.groupby(["year", "sentiment"])["loanword_density"].mean().reset_index()

# Wide format: one row per year, one density column per sentiment label.
pivot_df = grouped.pivot(
    index="year",
    columns="sentiment",
    values="loanword_density"
).reset_index()

# Derive the sentiment labels inside this cell (order of first appearance in
# the data) so the plot does not depend on a variable created by another cell.
sentiment_labels = df["sentiment"].unique()

fig = go.Figure()

# One line per sentiment label, all sharing the year axis.
for value in sentiment_labels:
    fig.add_trace(go.Scatter(
        x=pivot_df["year"],
        y=pivot_df[value],
        mode="lines+markers",
        name=value.capitalize()
    ))

fig.update_layout(
    title="Average Loanword Density Over Time by Sentiment",
    xaxis_title="Year",
    yaxis_title="Avg. Loanword Density",
    legend_title="sentiment",
    template="plotly_white",
    height=500
)

fig.show()

## Top Loanwords
_Horizontal Bar Chart_

In [19]:
def _as_word_list(value):
    """Parse `value` with `literal_eval` when it is a string; otherwise return it unchanged."""
    return literal_eval(value) if isinstance(value, str) else value


# The column is parsed from its string form into Python objects. Only string
# cells are parsed, so running this cell again after the column has already
# been converted does not raise.
df["loanwords"] = df["loanwords"].apply(_as_word_list)

# Flatten the per-article lists and keep the 20 most common entries.
all_words = list(chain.from_iterable(df["loanwords"]))
word_freq = Counter(all_words).most_common(20)
df_top = pd.DataFrame(word_freq, columns=["loanword", "frequency"])

fig = px.bar(
    # Ascending sort puts the most frequent word at the top of a horizontal bar chart.
    df_top.sort_values("frequency", ascending=True),
    x="frequency",
    y="loanword",
    orientation="h",
    title="Top English loanwords in German Articles",
    labels={
        # Keys must match the column names of `df_top` to take effect.
        "loanword": "Loanwords",
        "frequency": "Frequency"
    }
)

fig.show()

Shows the most frequent loanwords across all articles.

## Forecasting Future Usage

Line chart with a forecast to 2026.

In [20]:
import pandas as pd
import plotly.graph_objects as go
from sklearn.linear_model import LinearRegression
import numpy as np

# Window of years used to fit the trend, and the last year to forecast.
# Adjust these constants to experiment with a different window.
FIT_START_YEAR = 2016
FIT_END_YEAR = 2023
FORECAST_END_YEAR = 2026

# Filter into a new frame instead of reassigning `df`: overwriting `df` would
# leave only this window of years for any cell that is run afterwards.
df_fit = df[(df["year"] >= FIT_START_YEAR) & (df["year"] <= FIT_END_YEAR)]

# Mean loanword density per year inside the fitting window.
grouped = df_fit.groupby("year")["loanword_density"].mean().reset_index()

# scikit-learn expects a 2-D feature matrix, hence the reshape to one column.
X = grouped["year"].values.reshape(-1, 1)
y = grouped["loanword_density"].values

model = LinearRegression()
model.fit(X, y)

# `+ 1` makes the range inclusive, so the forecast line ends exactly at
# FORECAST_END_YEAR, the year named in the figure title.
future_years = np.arange(FIT_START_YEAR, FORECAST_END_YEAR + 1).reshape(-1, 1)
forecast = model.predict(future_years)

fig = go.Figure()

# Observed yearly averages.
fig.add_trace(go.Scatter(
    x=grouped["year"],
    y=grouped["loanword_density"],
    mode="lines+markers",
    name="Actual",
    line=dict(color="blue")
))

# Fitted line over the fitting window, extended to the forecast year.
fig.add_trace(go.Scatter(
    x=future_years.flatten(),
    y=forecast,
    mode="lines+markers",
    name="Forecast",
    line=dict(dash="dash", color="orange")
))

fig.update_layout(
    title=f"Forecasting Loanword Usage to {FORECAST_END_YEAR}",
    xaxis_title="Year",
    yaxis_title="Avg. Loanword Density",
    template="plotly_white"
)

fig.show()