In [1]:
import seaborn as sns
import pandas as pd
import datetime

sns.set(style="darkgrid")

# Loading datasets

In [2]:
def filter_langs(df: pd.DataFrame) -> "tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]":
    """Split a tweet frame into per-language subsets.

    Args:
        df: Frame with a ``lang`` column holding language codes.

    Returns:
        A 4-tuple ``(pl, en, de, fr)`` holding the rows whose ``lang`` equals
        ``'pl'``, ``'en'``, ``'de'`` and ``'fr'`` respectively. Rows in any
        other language are not part of the result.
    """
    return tuple(df[df['lang'] == code] for code in ('pl', 'en', 'de', 'fr'))

In [7]:
# Load the classified tweets, the sentiment analysis output and the user metadata.
# `target` and the id columns of the tweet file are parsed as strings.
tweets_df = pd.read_csv('../../data/classified_tweets_no_pca.tsv', sep='\t', converters={'target': str, 'id_str': str, 'user_id_str': str}, index_col=None)
sentiment_df = pd.read_json('../../data/b24_analyzed.jsonl', lines=True)
users = pd.read_json('../../data/users.jsonl', lines=True, dtype=False)


tweets_df.created_at = pd.to_datetime(tweets_df.created_at)
# NOTE(review): this assignment aligns on the row index, so it assumes both
# files list the same tweets in the same order — confirm.
tweets_df['sentiment'] = sentiment_df.sentiment_b24
# Keep only tweets created after 2021-08-01 that are not replies.
tweets_df = tweets_df[tweets_df.created_at > datetime.datetime(2021, 8, 1)]
tweets_df = tweets_df[tweets_df.in_reply_to_screen_name.isnull()]
# Attach the author's screen name and display name; the users' own id column
# (suffixed `_y` by the merge) is dropped and the tweet id gets its name back.
tweets_df = tweets_df.merge(users[['id_str', 'screen_name', 'name']], how='left', right_on='id_str', left_on='user_id_str').drop(columns=['id_str_y'])
tweets_df = tweets_df.rename(columns={'id_str_x': 'id_str'})
# De-duplicate on the tweet id BEFORE splitting by language. Splitting first
# leaves duplicate rows in the per-language frames that tweets_df itself no
# longer contains, so per-language and overall figures would disagree.
tweets_df = tweets_df.drop_duplicates(subset=['id_str'])
tweets_pl, tweets_en, tweets_de, tweets_fr = filter_langs(tweets_df)

In [4]:
# Keep only the first row seen for every tweet id.
tweets_df = tweets_df.drop_duplicates(subset=['id_str'], keep='first')

# Stance over time

In [5]:
def group_by_weeks(df: pd.DataFrame, col: str, freq: str) -> pd.DataFrame:
    """Count rows per time bucket and per value of ``col``, with in-bucket shares.

    Args:
        df: Frame with a datetime ``created_at`` column and a ``col`` column.
        col: Name of the column whose values are counted (e.g. ``'target'``).
        freq: Pandas offset alias giving the bucket size (e.g. ``'1W'``).
            Despite the function's name, any frequency is accepted.

    Returns:
        One row per (bucket, value) pair with the columns ``created_at``,
        ``col``, ``count`` and ``percentage``; ``percentage`` is the share
        (0-100) of that value among all counted rows of the same bucket.
    """
    counts = (
        df.groupby([pd.Grouper(key="created_at", freq=freq), col])[col]
        .count()
        .rename('count')
        .reset_index()
    )
    # transform('sum') broadcasts every bucket's total back onto its own rows,
    # so the division is index-aligned instead of relying on the row order of
    # a separately computed groupby.apply result.
    bucket_totals = counts.groupby('created_at')['count'].transform('sum')
    counts['percentage'] = 100 * counts['count'] / bucket_totals
    return counts

In [6]:
# Weekly counts per `target` value: all tweets first, then Polish, English and German.
weeks, weeks_pl, weeks_en, weeks_de = (
    group_by_weeks(frame, 'target', '1W')
    for frame in (tweets_df, tweets_pl, tweets_en, tweets_de)
)

## Extreme stances (tweets labelled '0' excluded)

In [75]:
# Drop every tweet labelled '0', then split the remainder by language.
tweets_extreme = tweets_df.loc[tweets_df['target'] != '0']
(
    tweets_extreme_pl,
    tweets_extreme_en,
    tweets_extreme_de,
    tweets_extreme_fr,
) = filter_langs(tweets_extreme)

In [8]:
# Weekly counts per `target` value for the Polish, German and English subsets without '0'.
weeks_extreme_pl, weeks_extreme_de, weeks_extreme_en = (
    group_by_weeks(frame, 'target', '1W')
    for frame in (tweets_extreme_pl, tweets_extreme_de, tweets_extreme_en)
)

In [10]:
import plotly.express as px


def print_area(df: pd.DataFrame, title: str, path: str) -> None:
    """Show a stacked area chart of stance shares over time and save it as a PNG.

    Args:
        df: Frame with ``created_at``, ``target`` and ``percentage`` columns,
            e.g. the output of ``group_by_weeks``. It is not modified.
        title: Chart title.
        path: Destination of the exported PNG image.
    """
    df_copy = df.copy()
    # Replace the raw class labels with the Polish legend entries.
    for raw, label in (('-1', 'Przeciw'), ('1', 'Za')):
        df_copy.loc[df_copy['target'] == raw, 'target'] = label

    fig = px.area(df_copy, x='created_at', y='percentage',
                  color='target', height=400, width=800, color_discrete_sequence=["red", "green"],
                  # Bind colours and trace order to the stance names. The positional
                  # colour sequence alone would swap red and green whenever 'Za'
                  # happens to be the first label met in the data.
                  color_discrete_map={'Przeciw': 'red', 'Za': 'green'},
                  category_orders={'target': ['Przeciw', 'Za']},
                  labels={
                      "created_at": '',
                      "percentage": "Procentowy podział (%)",
                      "target": "Stanowisko"
                  },
                  title=title)
    fig.update_xaxes(
        tickformat="%d-%m-%Y",
        tickangle=30,
        showgrid=False)
    fig.update_yaxes(
        showgrid=True,
        gridcolor='rgba(0, 0, 0, 0.1)'
    )
    # Transparent backgrounds for the legend and the plotting area.
    fig.update_layout({
        'legend_bgcolor': 'rgba(0, 0, 0, 0)',
        'plot_bgcolor': 'rgba(0, 0, 0, 0)',
    })
    fig.show()
    fig.write_image(path, format='png')

# Stance share over time for Polish-language tweets.
print_area(
    df=weeks_extreme_pl,
    title='Zmiana nastawienia w czasie (tweety w języku polskim)',
    path='../../data/charts/perc_pl.png',
)

In [59]:
# Stance share over time for German-language tweets.
print_area(
    df=weeks_extreme_de,
    title='Zmiana nastawienia w czasie (tweety w języku niemieckim)',
    path='../../data/charts/perc_de.png',
)

In [60]:
# Stance share over time for English-language tweets.
print_area(
    df=weeks_extreme_en,
    title='Zmiana nastawienia w czasie (tweety w języku angielskim)',
    path='../../data/charts/perc_en.png',
)

In [11]:
def plot_line(df: pd.DataFrame, title: str, path: str = '../../data/charts/lines.png') -> None:
    """Show a line chart of tweet counts over time per stance and save it as a PNG.

    Args:
        df: Frame with ``created_at``, ``target`` and ``count`` columns,
            e.g. the output of ``group_by_weeks``. It is not modified.
        title: Chart title.
        path: Destination of the exported PNG image. Defaults to the location
            that used to be hard-coded, so existing two-argument calls keep
            writing to the same file.
    """
    df_copy = df.copy()
    # Replace the raw class labels with the Polish legend entries.
    for raw, label in (('-1', 'Przeciw'), ('1', 'Za')):
        df_copy.loc[df_copy['target'] == raw, 'target'] = label

    fig = px.line(df_copy, x="created_at", y="count", color='target',
                  height=400, width=800, color_discrete_sequence=["red", "green"],
                  labels={
                      "created_at": '',
                      "count": "Liczba tweetów",
                      "target": "Stanowisko"
                  },
                  title=title
                )
    # Style the two stance traces by name, so the colours do not depend on the
    # order in which the labels appear in the data.
    for stance, colour in (('Przeciw', 'red'), ('Za', 'green')):
        fig.update_traces(line=dict(color=colour, width=5), selector=dict(name=stance))
    fig.update_xaxes(
        tickformat="%d-%m-%Y",
        tickangle=30,
        showgrid=False,
        showline=True, linewidth=1, linecolor='rgba(0, 0, 0, 0.1)'
        )
    fig.update_yaxes(
        showgrid=True,
        gridcolor='rgba(0, 0, 0, 0.1)'
    )
    # Transparent backgrounds for the legend and the plotting area.
    fig.update_layout({
        'legend_bgcolor': 'rgba(0, 0, 0, 0)',
        'plot_bgcolor': 'rgba(0, 0, 0, 0)',
    })
    fig.show()
    fig.write_image(path, format='png')

# Weekly tweet counts per stance for Polish-language tweets.
plot_line(
    df=weeks_extreme_pl,
    title='Tygodniowa liczba tweetów na temat sytuacji na granicy',
)

## Cumulative stance by tweet language

In [49]:
# Share of each `target` value among Polish tweets not labelled '0'.
tweets_extreme_pl['target'].value_counts(normalize=True)

1     0.836011
-1    0.163989
Name: target, dtype: float64

In [51]:
# Share of each `target` value among German tweets not labelled '0'.
tweets_extreme_de['target'].value_counts(normalize=True)

1     0.827322
-1    0.172678
Name: target, dtype: float64

In [76]:
# Share of each `target` value among French tweets not labelled '0'.
tweets_extreme_fr['target'].value_counts(normalize=True)

1     0.676791
-1    0.323209
Name: target, dtype: float64

# Sentiment

In [30]:
# Sentiment shares among Polish tweets whose `target` is '-1'.
tweets_pl.loc[tweets_pl['target'] == '-1', 'sentiment'].value_counts(normalize=True)

-1    0.757844
 0    0.146587
 1    0.095569
Name: sentiment, dtype: float64

In [31]:
# Sentiment shares among Polish tweets whose `target` is '1'.
tweets_pl.loc[tweets_pl['target'] == '1', 'sentiment'].value_counts(normalize=True)

-1    0.521519
 1    0.285473
 0    0.193009
Name: sentiment, dtype: float64

In [29]:
# Sentiment shares among Polish tweets whose `target` is '0'. Normalised like
# the '-1' and '1' cells above so the three distributions are comparable.
tweets_pl[tweets_pl.target == '0'].sentiment.value_counts(normalize=True)


 0    0.461487
-1    0.427148
 1    0.111365
Name: sentiment, dtype: float64

# Specific media accounts

In [49]:
# Polish-language tweets posted by three specific accounts, selected by screen name.
tvn, tvp, polsat = (
    tweets_pl[tweets_pl['screen_name'] == handle]
    for handle in ('tvn24', 'tvp_info', 'PolsatNewsPL')
)

In [48]:
# Share of each `target` value among the tvn24 tweets.
tvn['target'].value_counts(normalize=True)

1     0.467492
0     0.433437
-1    0.099071
Name: target, dtype: float64

In [47]:
# Share of each `target` value among the tvp_info tweets.
tvp['target'].value_counts(normalize=True)

1     0.508333
0     0.479167
-1    0.012500
Name: target, dtype: float64

In [50]:
# Share of each `target` value among the PolsatNewsPL tweets.
polsat['target'].value_counts(normalize=True)

1     0.548571
0     0.365714
-1    0.085714
Name: target, dtype: float64

# Language distribution

In [29]:
# Tweet count per language code as a two-column frame (`lang`, `counts`).
langs = tweets_df['lang'].value_counts().rename_axis('lang').reset_index(name='counts')
# Keep languages with more than 800 tweets and drop the 'und' code.
langs = langs[(langs['counts'] > 800) & (langs['lang'] != 'und')]

In [37]:
# Bar chart of the tweet counts for the languages kept in `langs`.
fig = px.bar(
    langs,
    x='lang',
    y='counts',
    labels={"lang": 'Język', "counts": "Liczba tweetów"},
    color_discrete_sequence=["#6b705c"],
    width=400,
    height=300,
)
# Transparent backgrounds for the legend and the plotting area.
fig.update_layout(legend_bgcolor='rgba(0, 0, 0, 0)', plot_bgcolor='rgba(0, 0, 0, 0)')
# Faint axis line on x, faint horizontal grid on y.
fig.update_xaxes(showgrid=False, showline=True, linewidth=1, linecolor='rgba(0, 0, 0, 0.1)')
fig.update_yaxes(showgrid=True, gridcolor='rgba(0, 0, 0, 0.1)')
fig.show()
fig.write_image('../../data/charts/langs.png', format='png')
