In [1]:
import plotly.express as px
import seaborn as sns
import pandas as pd
import datetime

# Apply seaborn's "darkgrid" style as the global plotting theme.
# NOTE(review): every figure in the visible cells is built with plotly express,
# which presumably does not pick up seaborn styling — confirm this call is still needed.
sns.set(style="darkgrid")

#### Loading datasets

In [2]:
# Raw inputs are JSON Lines files (one record per line).
tweets_df = pd.read_json('../../data/classified_tweets.jl', lines=True)
# dtype=False turns off pandas' dtype inference for the user records.
users_df = pd.read_json('../../data/users.jl', lines=True, dtype=False)
# Sentiment labels produced by the Brand24 company.
sentiment_df = pd.read_json('../../data/b24_pl_tweets_sentiment.jl', lines=True)

tweets_df['created_at'] = pd.to_datetime(tweets_df['created_at'])
# NOTE(review): this assignment aligns on the row index, so it presumably relies on
# the sentiment file listing tweets in the same order as classified_tweets.jl — confirm.
tweets_df['sentiment'] = sentiment_df['sentiment_b24']

# Keep tweets created after 2021-08-01 that are not replies.
tweets_df = tweets_df.loc[
    (tweets_df['created_at'] > datetime.datetime(2021, 8, 1))
    & tweets_df['in_reply_to_screen_name'].isnull()
]

# Cast both id columns to strings before they are used as the join key below.
tweets_df = tweets_df.astype({'id_str': 'str', 'user_id_str': 'str'})

# Attach the author's screen name and display name. Both frames carry an
# `id_str` column, so the merge suffixes them: keep the tweet id (`_x`),
# drop the user id (`_y`), then keep one row per tweet id.
tweets_df = (
    tweets_df
    .merge(
        users_df[['id_str', 'screen_name', 'name']],
        how='left',
        left_on='user_id_str',
        right_on='id_str',
    )
    .rename(columns={'id_str_x': 'id_str'})
    .drop(columns='id_str_y')
    .drop_duplicates(subset=['id_str'])
)

#### Analyzing the language distribution

In [3]:
def filter_langs(df: pd.DataFrame) -> "tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]":
    """Split a tweets frame into its Polish, English, German and French rows.

    Args:
        df: Frame with a ``lang`` column holding language codes.

    Returns:
        A 4-tuple ``(pl, en, de, fr)``; each element contains the rows of ``df``
        whose ``lang`` equals that code, with the original index preserved.
        Rows in any other language are not returned.
    """
    return tuple(df[df.lang == code] for code in ('pl', 'en', 'de', 'fr'))

In [4]:
# Tweets per language, keeping identified languages (not 'und') with more than 800 tweets.
langs = (
    tweets_df['lang']
    .value_counts()
    .rename_axis('lang')
    .reset_index(name='counts')
    .loc[lambda table: (table['lang'] != 'und') & (table['counts'] > 800)]
)

fig = px.bar(
    langs,
    x='lang',
    y='counts',
    labels={"lang": 'Język', "counts": "Liczba tweetów"},
    color_discrete_sequence=["#6b705c"],
    width=400,
    height=300,
)
# Transparent backgrounds, faint horizontal grid, thin baseline on the x axis.
fig.update_layout(
    legend_bgcolor='rgba(0, 0, 0, 0)',
    plot_bgcolor='rgba(0, 0, 0, 0)',
)
fig.update_xaxes(showgrid=False, showline=True, linewidth=1, linecolor='rgba(0, 0, 0, 0.1)')
fig.update_yaxes(showgrid=True, gridcolor='rgba(0, 0, 0, 0.1)')
fig.show()
fig.write_image('./figures/tweets_languages.pdf', format='pdf')

In [5]:
# Per-language views of the cleaned tweets (Polish, English, German, French).
(
    tweets_pl,
    tweets_en,
    tweets_de,
    tweets_fr,
) = filter_langs(tweets_df)

#### Analyzing stance over time

In [6]:
def group_by_weeks(df: pd.DataFrame, col: str, freq: str) -> pd.DataFrame:
    """Count rows per time bucket and per value of ``col``, plus each value's share of its bucket.

    Despite the name, the bucket width is whatever ``freq`` specifies
    (the notebook passes ``'1W'``).

    Args:
        df: Frame with a datetime ``created_at`` column and a ``col`` column.
            It is not modified.
        col: Name of the column whose values are counted within each bucket.
        freq: Pandas offset alias passed to ``pd.Grouper`` (e.g. ``'1W'``).

    Returns:
        A frame with one row per observed (bucket, value) pair and the columns
        ``created_at`` (bucket label), ``col``, ``count`` and ``percentage``
        (``count`` as a percentage of the bucket's total count).
    """
    counts = (
        df.groupby([pd.Grouper(key="created_at", freq=freq), col])
        .size()
        .reset_index(name="count")
    )
    # transform('sum') returns the bucket total aligned row-by-row with `counts`.
    # This avoids groupby.apply + reset_index, whose result index depends on the
    # pandas version (group_keys handling changed in pandas 2.0), and avoids
    # attaching the percentages purely by row position.
    bucket_totals = counts.groupby("created_at")["count"].transform("sum")
    counts["percentage"] = 100 * counts["count"] / bucket_totals
    return counts

In [7]:
# Weekly stance counts for all tweets and for the Polish, English and German subsets.
weeks, weeks_pl, weeks_en, weeks_de = (
    group_by_weeks(subset, 'target', '1W')
    for subset in (tweets_df, tweets_pl, tweets_en, tweets_de)
)

In [8]:
# Keep only tweets with a non-zero stance code, then split them by language.
tweets_extreme = tweets_df.loc[tweets_df['target'].ne(0)]
(
    tweets_extreme_pl,
    tweets_extreme_en,
    tweets_extreme_de,
    tweets_extreme_fr,
) = filter_langs(tweets_extreme)

In [9]:
# Weekly counts restricted to non-zero stances: overall, Polish, German, English.
weeks_extreme, weeks_extreme_pl, weeks_extreme_de, weeks_extreme_en = (
    group_by_weeks(subset, 'target', '1W')
    for subset in (tweets_extreme, tweets_extreme_pl, tweets_extreme_de, tweets_extreme_en)
)

In [10]:
def plot_area(df: pd.DataFrame, title: str, path: str) -> None:
    """Draw a stacked-area chart of stance share over time, show it and save it as a PDF.

    Args:
        df: Frame with ``created_at``, ``target`` and ``percentage`` columns
            (the columns produced by ``group_by_weeks``). It is not modified.
        title: Figure title.
        path: Destination file for the PDF export.

    Returns:
        None. The figure is displayed and written to ``path``.
    """
    # Stance codes may be integers or their string form, so map both.
    # Comparing an integer column against '-1' / '1' never matches, which
    # would leave the raw codes in the legend instead of the labels.
    stance_labels = {-1: 'Przeciw', 1: 'Za', '-1': 'Przeciw', '1': 'Za'}
    df_copy = df.copy()
    df_copy['target'] = df_copy['target'].map(lambda code: stance_labels.get(code, code))

    fig = px.area(df_copy, x='created_at', y='percentage',
                  color='target', height=400, width=800,
                  # Pin colour and trace order to the label, so they do not
                  # depend on which stance happens to appear first in the data.
                  color_discrete_map={'Przeciw': 'red', 'Za': 'green'},
                  category_orders={'target': ['Przeciw', 'Za']},
                  # Fallback palette for any value without an explicit colour.
                  color_discrete_sequence=["red", "green"],
                  labels={
                      "created_at": '',
                      "percentage": "Procentowy podział (%)",
                      "target": "Stanowisko"
                  },
                  title=title)
    fig.update_xaxes(
        tickformat="%d-%m-%Y",
        tickangle=30,
        showgrid=False)
    fig.update_yaxes(
        showgrid=True,
        gridcolor='rgba(0, 0, 0, 0.1)'
    )
    # Transparent legend and plot backgrounds.
    fig.update_layout({
        'legend_bgcolor': 'rgba(0, 0, 0, 0)',
        'plot_bgcolor': 'rgba(0, 0, 0, 0)',
    })
    fig.show()
    fig.write_image(path, format='pdf')

In [11]:
# Stance share over time for Polish-language tweets.
plot_area(
    df=weeks_extreme_pl,
    title='Zmiana nastawienia w czasie (tweety w języku polskim)',
    path='./figures/stance_over_time_tweets_pl.pdf',
)

In [12]:
# Stance share over time for English-language tweets.
plot_area(
    df=weeks_extreme_en,
    title='Zmiana nastawienia w czasie (tweety w języku angielskim)',
    path='./figures/stance_over_time_tweets_en.pdf',
)

In [13]:
# Stance share over time for German-language tweets.
plot_area(
    df=weeks_extreme_de,
    title='Zmiana nastawienia w czasie (tweety w języku niemieckim)',
    path='./figures/stance_over_time_tweets_de.pdf',
)

In [14]:
# Stance share over time across all languages.
plot_area(
    df=weeks_extreme,
    title='Zmiana nastawienia w czasie',
    path='./figures/stance_over_time_tweets.pdf',
)

In [15]:
def plot_line(df: pd.DataFrame, title: str, path: str) -> None:
    """Draw a line chart of tweet counts over time per stance, show it and save it as a PDF.

    Args:
        df: Frame with ``created_at``, ``target`` and ``count`` columns
            (the columns produced by ``group_by_weeks``). It is not modified.
        title: Figure title.
        path: Destination file for the PDF export.

    Returns:
        None. The figure is displayed and written to ``path``.
    """
    # Stance codes may be integers or their string form, so map both.
    # Comparing an integer column against '-1' / '1' never matches; the traces
    # would then keep their raw names and the name-based styling below would
    # not apply to any of them.
    stance_labels = {-1: 'Przeciw', 1: 'Za', '-1': 'Przeciw', '1': 'Za'}
    df_copy = df.copy()
    df_copy['target'] = df_copy['target'].map(lambda code: stance_labels.get(code, code))

    fig = px.line(df_copy, x="created_at", y="count", color='target',
                  height=400, width=800, color_discrete_sequence=["red", "green"],
                  labels={
                      "created_at": '',
                      "count": "Liczba tweetów",
                      "target": "Stanowisko"
                  },
                  title=title
                )
    # Style each trace by its label so the colour does not depend on trace order.
    fig.update_traces(line=dict(color="red", width=5), selector=dict(name='Przeciw'))
    fig.update_traces(line=dict(color="green", width=5), selector=dict(name='Za'))
    fig.update_xaxes(
        tickformat="%d-%m-%Y",
        tickangle=30,
        showgrid=False,
        showline=True, linewidth=1, linecolor='rgba(0, 0, 0, 0.1)'
        )
    fig.update_yaxes(
        showgrid=True,
        gridcolor='rgba(0, 0, 0, 0.1)'
    )
    # Transparent legend and plot backgrounds.
    fig.update_layout({
        'legend_bgcolor': 'rgba(0, 0, 0, 0)',
        'plot_bgcolor': 'rgba(0, 0, 0, 0)',
    })
    fig.show()
    fig.write_image(path, format='pdf')

In [16]:
# Weekly number of Polish-language tweets per stance.
plot_line(
    df=weeks_extreme_pl,
    title='Tygodniowa liczba tweetów na temat sytuacji na granicy',
    path='./figures/number_of_tweets_pl.pdf',
)

#### Cumulative stance by tweet language

In [17]:
# Normalised stance distribution per language, one titled section each,
# with a blank line between sections.
print(
    "\n\n".join(
        "{}\n{}".format(header, subset['target'].value_counts(normalize=True))
        for header, subset in (
            ("Polish stance", tweets_extreme_pl),
            ("English stance", tweets_extreme_en),
            ("German stance", tweets_extreme_de),
            ("French stance", tweets_extreme_fr),
        )
    )
)

Polish stance
 1    0.836004
-1    0.163996
Name: target, dtype: float64

English stance
 1    0.6826
-1    0.3174
Name: target, dtype: float64

German stance
 1    0.827322
-1    0.172678
Name: target, dtype: float64

French stance
 1    0.658017
-1    0.341983
Name: target, dtype: float64


#### Stance of the biggest Polish media outlets

In [18]:
# Polish-language tweets posted by each of the three broadcaster accounts.
polsat, tvn, tvp = (
    tweets_pl[tweets_pl['screen_name'] == handle]
    for handle in ('PolsatNewsPL', 'tvn24', 'tvp_info')
)

In [19]:
# Normalised stance distribution per broadcaster, one titled section each,
# with a blank line between sections.
print(
    "\n\n".join(
        "{}\n{}".format(header, subset['target'].value_counts(normalize=True))
        for header, subset in (
            ("PLSAT stance", polsat),
            ("TVN stance", tvn),
            ("TVP stance", tvp),
        )
    )
)

PLSAT stance
 1    0.548571
 0    0.365714
-1    0.085714
Name: target, dtype: float64

TVN stance
 1    0.467492
 0    0.433437
-1    0.099071
Name: target, dtype: float64

TVP stance
 1    0.508333
 0    0.479167
-1    0.012500
Name: target, dtype: float64


#### Polish tweets sentiment analysis

In [20]:
# Normalised sentiment distribution of Polish tweets for each stance code,
# one titled section each; every section is followed by a blank line.
print(
    "\n\n".join(
        "{}\n{}".format(
            header,
            tweets_pl.loc[tweets_pl["target"] == stance, "sentiment"].value_counts(normalize=True),
        )
        for header, stance in (
            ("Sentiment in negative stance", -1),
            ("Sentiment in positive stance", 1),
            ("Sentiment in neutral stance", 0),
        )
    ),
    end="\n\n",
)

Sentiment in negative stance
-1    0.757844
 0    0.146587
 1    0.095569
Name: sentiment, dtype: float64

Sentiment in positive stance
-1    0.521496
 1    0.285486
 0    0.193018
Name: sentiment, dtype: float64

Sentiment in neutral stance
 0    0.461487
-1    0.427148
 1    0.111365
Name: sentiment, dtype: float64

