In [1]:
import pandas as pd
import polars as pl

In [2]:
# Input locations, relative to the notebook's working directory.
TWEETS_PARQUET_PATH = './master_fnn.parquet'
USERS_CSV_PATH = '../database/master_users.csv'

# Load both inputs as polars DataFrames.
tweets = pl.read_parquet(TWEETS_PARQUET_PATH)
users = pl.read_csv(USERS_CSV_PATH)

In [16]:
import plotly.express as px

# Count rows per label. polars' groupby does not guarantee the order of the
# resulting groups, so sort explicitly and take the slice names from the data
# itself instead of hard-coding them; otherwise a name could be attached to
# the wrong count (or the lengths could mismatch if another label value exists).
label_counts = tweets.groupby('label').count().sort('label')

fig = px.pie(
    values=label_counts['count'].to_list(),
    # Cast so the names are plain strings regardless of the column's dtype.
    names=label_counts['label'].cast(pl.Utf8).to_list(),
)

fig.update_layout(title='Label Distribution', height=600, width=600)
# NOTE: write_image requires the ./output directory to already exist.
fig.write_image("./output/label_distribution.png")
fig.show()

In [81]:
# Number of rows per (label, year). Computed once and shared by both
# per-label frames below instead of repeating the same pipeline twice.
label_year_counts = (
    tweets
    .select([
        pl.col('label'),
        pl.col('created_at').dt.year(),  # keep only the year of the timestamp
    ])
    .groupby(['label', 'created_at'])
    .count()
    .sort('created_at')
)

label_time_series_true = label_year_counts.filter(pl.col('label') == 'true')
label_time_series_false = label_year_counts.filter(pl.col('label') == 'false')

# Per-year share of each label, in whole percent.
# After the outer join, `count` holds the 'true' count and `count_right` the
# 'false' count for a year.
label_time_series = (
    label_time_series_true
    .join(label_time_series_false, on='created_at', how='outer')
    .with_columns([
        # A year that has only one of the two labels gets a null count from
        # the outer join; treat it as 0 so the percentages do not become null.
        pl.col('count').fill_null(0),
        pl.col('count_right').fill_null(0),
    ])
    .with_columns([
        (pl.col('count') / (pl.col('count') + pl.col('count_right')) * 100)
        .round(0)
        .alias('true_percentage')
    ])
    .with_columns([
        # Derived from the rounded value so the two columns always sum to 100.
        (100 - pl.col('true_percentage')).alias('false_percentage')
    ])
    .select([
        'created_at',
        'true_percentage',
        'false_percentage'
    ])
    # Join output order is not guaranteed; sort for a stable time axis.
    .sort('created_at')
)

label_time_series

created_at,true_percentage,false_percentage
i32,f64,f64
2007,98.0,2.0
2008,84.0,16.0
2009,72.0,28.0
2010,58.0,42.0
2011,45.0,55.0
2012,45.0,55.0
2013,33.0,67.0
2014,32.0,68.0
2015,37.0,63.0
2016,36.0,64.0


In [85]:
import plotly.graph_objects as go

x = label_time_series['created_at']

# One bar trace per label; barmode="relative" stacks them per year.
fig = go.Figure()
for label_name in ('true', 'false'):
    percentages = label_time_series[f'{label_name}_percentage']
    fig.add_bar(
        x=x.to_list(),
        y=percentages.to_list(),
        name=label_name,
        # Cast explicitly before appending "%": adding a str to a numeric
        # Series relies on implicit coercion that is not supported across
        # polars versions.
        text=(percentages.cast(pl.Utf8) + "%").to_list(),
    )

fig.update_layout(
    title='Label distribution across time',
    xaxis_title='Year',
    yaxis_title='Share of labels (%)',
    height=600,
    width=800,
    barmode="relative",
)
# NOTE: write_image requires the ./output directory to already exist.
fig.write_image("./output/label_distribution_time_series.png")
fig.show()