In [2]:
import pandas as pd
import datetime
import numpy as np

# Loading datasets

In the first step, all tweets and users are loaded and merged, so that every tweet carries the screen name of its author.

In [7]:
# Raw inputs: all collected tweets, their authors, and the tweets that
# already carry an annotation.
annoted_data = pd.read_csv('../../data/tweets_train.tsv', sep='\t', converters={'id_str': str})
users = pd.read_json('../../data/users.jsonl', lines=True, dtype=False)
tweets = pd.read_json('../../data/tweets.jsonl', lines=True, dtype=False)

# Attach the author's screen name to every tweet. Both frames have an
# `id_str` column, so the merge yields `id_str_x` / `id_str_y`; the tweet id
# is kept in `id` beforehand and the two suffixed columns are dropped.
author_names = users[['id_str', 'screen_name']]
tweets = (
    tweets
    .assign(id=tweets['id_str'])
    .merge(author_names, how='left', left_on='user_id_str', right_on='id_str')
    .drop(columns=['id_str_x', 'id_str_y'])
    .drop_duplicates(subset=['id'])
)

# Replace line breaks with spaces in the tweet text.
tweets = tweets.assign(full_text=tweets['full_text'].str.replace('\n', ' '))

In [11]:
# Keep only tweets that have not been annotated yet and are not replies.
tweets = (
    tweets
    .loc[lambda frame: ~frame['id'].isin(annoted_data['id_str'])]
    .loc[lambda frame: frame['in_reply_to_screen_name'].isnull()]
)

In [17]:
# Polish tweets created after 2021-08-01 whose text does not contain
# 'wieszwięcej'. The filters run one after another, in this order.
tweets_pl = (
    tweets
    .loc[lambda frame: frame['lang'] == 'pl']
    .loc[lambda frame: frame['created_at'] > datetime.datetime(2021, 8, 1)]
    .loc[lambda frame: ~frame['full_text'].str.contains('wieszwięcej')]
)


### Media tweets

Below we calculate what share of the tweets in the dataset was published by the most popular media outlets in Poland.

In [18]:
# Screen names of the media accounts whose tweets are treated separately.
media = [
    "tvn24", "FaktyTVN", "tvp_info", "PolsatNewsPL", "wPolityce_pl",
    "OnetWiadomosci", "wirtualnapolska", "gazeta_wyborcza", "se_pl",
    "niezaleznapl", "gazetapl_news", "rzeczpospolita", "RMF24pl",
    "RadioZET_NEWS", "NewsweekPolska", "natematpl", "gazetaprawnapl",
]

# Split the Polish tweets by author: media accounts vs. everybody else.
is_media = tweets_pl['screen_name'].isin(media)
media_tweets = tweets_pl[is_media]
non_media_tweets = tweets_pl[~is_media]

# Fraction of the Polish tweets that come from media accounts.
len(media_tweets) / len(tweets_pl)

0.03985602094240838

### Tweets' distribution over months

In [19]:
def get_month_dist(df: pd.DataFrame, col: str) -> np.ndarray:
  """Return the share of rows of `df` that fall into each calendar month.

  Rows are grouped by the month number of the datetime column `col` (years
  are not distinguished) and the per-month counts are divided by their total.

  Args:
    df: frame holding the datetime column.
    col: name of a datetime-typed column (it is accessed through `.dt`).

  Returns:
    1-D float array with one entry per month that occurs in `col`, ordered by
    ascending month number. Months without any rows are skipped, so positions
    map to consecutive months only when the data has no gaps.
  """
  # value_counts ignores missing timestamps; sort_index orders by month number.
  month_count = df[col].dt.month.value_counts().sort_index().to_numpy()
  return month_count / month_count.sum()

In [20]:
# Share of all Polish tweets per calendar month (ascending month number).
percentage = get_month_dist(tweets_pl, 'created_at')
percentage

array([0.12385471, 0.09803665, 0.15818063, 0.50788613, 0.11204188])

In [21]:
# Same monthly distribution, restricted to tweets from non-media accounts.
percentage_non_media = get_month_dist(non_media_tweets, 'created_at')
percentage_non_media

array([0.12252062, 0.09607389, 0.15844182, 0.50858837, 0.1143753 ])

In [22]:
# Same monthly distribution, restricted to tweets from the media accounts.
percentage_media = get_month_dist(media_tweets, 'created_at')
percentage_media

array([0.15599343, 0.1453202 , 0.15188834, 0.4909688 , 0.05582923])

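The goal is to create labels for 2500 tweets. Although media tweets make up only about 4% of the whole dataset, it has been decided that they will make up 10% of the training dataset (250 media tweets and 2250 non-media tweets). Within each group, tweets are sampled in proportion to that group's monthly distribution.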

In [23]:
def sample_from_df(df: pd.DataFrame, dist, month_start: int, random_state=None) -> pd.DataFrame:
  """Draw a random sample from `df` with a fixed number of rows per month.

  `dist[i]` is the number of rows to draw from calendar month
  `month_start + i` of the `created_at` column. Entries that would fall after
  December are ignored.

  Args:
    df: frame with a datetime-typed `created_at` column.
    dist: indexable sequence of per-month row counts; each entry is truncated
      with `int()`.
    month_start: calendar month (1-12) that `dist[0]` refers to.
    random_state: optional integer seed. When given, a single generator is
      created from it and shared by all months, so repeated calls return the
      same rows. When None, sampling uses NumPy's global random state, as
      before.

  Returns:
    The sampled rows with their original index, concatenated in month order.
    If no month is sampled at all, an empty frame with the columns of `df`.

  Raises:
    ValueError: if more rows are requested for a month than `df` contains.
  """
  month_end = min(12, month_start + len(dist) - 1)
  rng = None if random_state is None else np.random.RandomState(random_state)
  row_months = df['created_at'].dt.month

  samples = []
  for offset, month in enumerate(range(month_start, month_end + 1)):
    n_rows = int(dist[offset])
    month_rows = df[row_months == month]
    if n_rows > len(month_rows):
      raise ValueError(
          f'Cannot sample {n_rows} rows for month {month}: '
          f'only {len(month_rows)} available.')
    samples.append(month_rows.sample(n=n_rows, random_state=rng))

  if not samples:
    # Nothing to sample: keep the schema instead of returning a column-less frame.
    return df.iloc[:0]
  # Concatenate once rather than growing a frame inside the loop.
  return pd.concat(samples)


In [24]:
# Fixed seed so that "Restart & Run All" draws the same tweets every time;
# DataFrame.sample uses NumPy's global random state when it is not given a
# random_state.
np.random.seed(42)

N_MEDIA = 250       # 10% of the 2500 tweets to annotate
N_NON_MEDIA = 2250  # the remaining 90%
FIRST_MONTH = 8     # tweets_pl only keeps tweets created after 2021-08-01


def allocate_quota(dist, total: int) -> np.ndarray:
  """Split `total` into integer per-month counts proportional to `dist`.

  Rounding each `dist[i] * total` independently does not preserve the total
  (the non-media quotas add up to 2249 instead of 2250). Here every month
  first gets the floor of its exact quota and the units still missing go to
  the months with the largest fractional parts, so the counts add up to
  `total` whenever `dist` sums to 1.
  """
  exact = np.asarray(dist, dtype=float) * total
  counts = np.floor(exact).astype(int)
  missing = max(total - int(counts.sum()), 0)
  largest_fraction_first = np.argsort(exact - counts)[::-1]
  counts[largest_fraction_first[:missing]] += 1
  return counts


media_sample_df = sample_from_df(media_tweets, allocate_quota(percentage_media, N_MEDIA), FIRST_MONTH)
non_media_sample_df = sample_from_df(non_media_tweets, allocate_quota(percentage_non_media, N_NON_MEDIA), FIRST_MONTH)

assert len(media_sample_df) == N_MEDIA
assert len(non_media_sample_df) == N_NON_MEDIA

In [27]:
# Stack both samples and add the label column, filled with the string 'NA'.
train_df = pd.concat([media_sample_df, non_media_sample_df]).assign(target='NA')

In [34]:
# Shuffle the rows so the exported file is not ordered by source and month;
# the fixed seed keeps the row order identical across re-runs.
train_df = train_df.sample(frac=1, random_state=42)
train_df.to_csv('../../data/annotate_no_reply.tsv', sep='\t', index=False, columns=['id', 'created_at', 'full_text', 'target'])