In [1]:
import pandas as pd
import numpy as np
import datetime

### Loading datasets

Load and merge tweets and users datasets.

In [2]:
tweets = pd.read_json('../../data/tweets.jl', lines=True, dtype=False)
users = pd.read_json('../../data/users.jl', lines=True, dtype=False)

# Attach the author's screen name to every tweet. Both frames carry an
# `id_str` column, so the merge suffixes them (`_x` / `_y`); the tweet id is
# copied into `id` beforehand so it survives dropping the suffixed columns.
author_names = users[['id_str', 'screen_name']]
tweets = (
    tweets
    .assign(id=tweets['id_str'])
    .merge(author_names, how='left', left_on='user_id_str', right_on='id_str')
    .drop(columns=['id_str_x', 'id_str_y'])
    .drop_duplicates(subset=['id'])
)

# Flatten multi-line tweets onto a single line.
tweets = tweets.assign(full_text=tweets['full_text'].str.replace('\n', ' '))

Select only Polish tweets created after 01.08.2021 for annotation, excluding those whose text contains 'wieszwięcej'.

In [3]:
# Keep tweets that are tagged as Polish, were created after 2021-08-01 00:00
# and do not contain the literal text 'wieszwięcej'.
is_polish = tweets['lang'] == 'pl'
is_recent = tweets['created_at'] > datetime.datetime(2021, 8, 1)
# regex=False: the pattern is a plain substring, not a regular expression.
# na=False: a missing `full_text` counts as "does not contain"; without it the
# mask would hold NaN and the `~` negation below would fail.
has_excluded_text = tweets['full_text'].str.contains('wieszwięcej', regex=False, na=False)

tweets_pl = tweets[is_polish & is_recent & ~has_excluded_text]

### Media tweets

Calculate what share of the tweets in the dataset was published by the most popular Polish media outlets.

In [4]:
# Screen names of the media accounts treated as "media" in this analysis.
media = [
    "tvn24", "FaktyTVN", "tvp_info", "PolsatNewsPL", "wPolityce_pl",
    "OnetWiadomosci", "wirtualnapolska", "gazeta_wyborcza", "se_pl",
    "niezaleznapl", "gazetapl_news", "rzeczpospolita", "RMF24pl",
    "RadioZET_NEWS", "NewsweekPolska", "natematpl", "gazetaprawnapl",
]

# Split the Polish tweets by whether their author is one of the media accounts.
is_media = tweets_pl['screen_name'].isin(media)
media_tweets = tweets_pl[is_media]
non_media_tweets = tweets_pl[~is_media]

# Fraction of Polish tweets that come from media accounts.
len(media_tweets) / len(tweets_pl)

0.03128491620111732

### Tweets' distribution over months

Analyse how the tweets are distributed over the months, so that the annotation sample can represent each period proportionally.

In [5]:
def get_month_dist(df: pd.DataFrame, col: str, months=None) -> np.ndarray:
  """Return the fraction of rows that fall into each calendar month of ``col``.

  Rows are counted per ``df[col].dt.month`` (the year is ignored; rows with a
  missing date are not counted) and the counts are divided by their total.

  Args:
    df: Frame holding the datetime column.
    col: Name of a column that supports the ``.dt`` accessor.
    months: Optional iterable of month numbers (1-12). When given, the result
      has exactly one entry per listed month, in the listed order, with 0 for
      months that have no rows; the fractions are relative to the rows in the
      listed months only. When omitted, only the months that occur in the data
      are returned, in ascending order, so a month without rows produces no
      entry and positions cannot be mapped back to months.

  Returns:
    1-D float array of fractions. It sums to 1 unless there are no rows to
    count, in which case every entry is 0.
  """
  month_counts = df[col].dt.month.value_counts(sort=False).sort_index()
  if months is not None:
    # Align positions with the requested months; absent months count as 0.
    month_counts = month_counts.reindex(list(months), fill_value=0)

  counts = month_counts.to_numpy()
  total = counts.sum()
  if total == 0:
    # Nothing to distribute: avoid 0/0 and return all-zero shares.
    return np.zeros(len(counts), dtype=float)

  return counts / total

All Polish tweets

In [6]:
# Monthly shares over every tweet in `tweets_pl`.
percentage = get_month_dist(df=tweets_pl, col='created_at')
percentage

array([0.24581006, 0.11731844, 0.13296089, 0.43910615, 0.06480447])

Non-media Polish tweets (`non_media_tweets`)

In [7]:
# Monthly shares over the tweets in `non_media_tweets`.
percentage_non_media = get_month_dist(df=non_media_tweets, col='created_at')
percentage_non_media

array([0.24913495, 0.11764706, 0.1349481 , 0.43137255, 0.06689735])

Media Polish tweets (`media_tweets`)

In [8]:
# Monthly shares over the tweets in `media_tweets`.
percentage_media = get_month_dist(df=media_tweets, col='created_at')
percentage_media

array([0.14285714, 0.10714286, 0.07142857, 0.67857143])

The goal is to create labels for 2500 tweets. Although media tweets make up only about 3% of the whole dataset, it has been decided that they will make up 10% of the training dataset.

In [9]:
def sample_from_df(df: pd.DataFrame, dist, month_start: int, random_state=None) -> pd.DataFrame:
  """Draw a fixed number of random rows from each calendar month.

  ``dist[i]`` is the number of rows to draw from the rows whose
  ``created_at`` month equals ``month_start + i``. The mapping is purely
  positional, so ``dist`` must contain one entry for every consecutive month
  starting at ``month_start`` (use 0 for months to skip). Entries that would
  map past December (month 12) are ignored.

  Args:
    df: Frame with a datetime-like ``created_at`` column.
    dist: Sequence of per-month sample sizes; each value is truncated with
      ``int()``.
    month_start: Calendar month (1-12) that ``dist[0]`` refers to.
    random_state: Optional seed or random generator forwarded to
      ``DataFrame.sample``. An integer seeds a single generator that is shared
      by all months. ``None`` (default) keeps pandas' default unseeded
      behaviour.

  Returns:
    The sampled rows, ordered by month, with their original index labels. An
    empty ``DataFrame`` when ``dist`` is empty.

  Raises:
    ValueError: Propagated from ``DataFrame.sample`` when a month has fewer
      rows than requested (sampling is without replacement).
  """
  if isinstance(random_state, (int, np.integer)):
    # One shared generator, so different months do not reuse the same seed.
    random_state = np.random.RandomState(random_state)

  month_end = min(12, month_start + len(dist) - 1)
  row_months = df['created_at'].dt.month

  # Collect the per-month samples and concatenate once instead of growing a
  # DataFrame inside the loop.
  samples = []
  for offset, month in enumerate(range(month_start, month_end + 1)):
    month_rows = df[row_months == month]
    samples.append(month_rows.sample(n=int(dist[offset]), random_state=random_state))

  if not samples:
    return pd.DataFrame()

  return pd.concat(samples)


In [10]:
SAMPLE_SIZE = 2500   # total number of tweets to annotate
START_MONTH = 8      # calendar month that the first distribution entry refers to
MEDIA_SHARE = 0.1    # media tweets are deliberately over-represented in the sample
RANDOM_SEED = 42

# DataFrame.sample draws from NumPy's global random state when no random_state
# is passed, so seeding it here makes the annotation sample reproducible.
np.random.seed(RANDOM_SEED)

# NOTE(review): sample_from_df maps distribution entries to months by
# position, starting at START_MONTH. This assumes each distribution has an
# entry for every consecutive month — confirm for `percentage_media`.
media_sample_df = sample_from_df(media_tweets, (percentage_media * MEDIA_SHARE * SAMPLE_SIZE).round(), START_MONTH)
non_media_sample_df = sample_from_df(non_media_tweets, (percentage_non_media * (1 - MEDIA_SHARE) * SAMPLE_SIZE).round(), START_MONTH)

Combine the media and non-media samples and save them to a file for annotation.

In [11]:
# Stack both samples and add a placeholder label column for the annotators.
df_to_annotate = pd.concat([media_sample_df, non_media_sample_df]).assign(target='NA')

# Only these columns are written, tab-separated and without the index.
annotation_columns = ['id', 'created_at', 'full_text', 'target']
df_to_annotate.to_csv(
    '../../data/sample_to_annotate.tsv',
    sep='\t',
    columns=annotation_columns,
    index=False,
)