In [1]:
import pandas as pd

# Load the labelled training set (TSV) and the full tweet dump (JSON Lines).
df_train = pd.read_csv(
    '../../data/tweets_train.tsv',
    sep='\t',
    # Read the label and the tweet id as text so ids never go through numeric
    # type inference. A later cell renames 'id' to 'id_str', so the raw header
    # is presumably 'id' (TODO confirm); both spellings are listed so the id is
    # kept as a string either way.
    converters={'target': str, 'id': str, 'id_str': str},
)
# dtype=False turns off dtype inference for the JSON data.
df_tweets = pd.read_json('../../data/tweets.jsonl', lines=True, dtype=False)

In [4]:
# Normalise the key column of the training frame: call it 'id_str' and store
# it as text.
df_train = (
    df_train
    .rename(columns={'id': 'id_str'})
    .astype({'id_str': str})
)

### Filter out training tweets

In [5]:
# Gather the ids present in the training frame, then keep only the tweets
# whose id is not among them.
train_ids = df_train["id_str"].tolist()
in_train = df_tweets["id_str"].isin(train_ids)
df_tweets = df_tweets.loc[~in_train]

### Remove *@user* mentions and *URLs* from tweets

User mentions and URLs are removed from the tweets because they do not contribute to the semantics of the text.  
Hashtags are kept in the tweets.

In [6]:
import re

def clean_tweet(tweet: str) -> str:
    """Strip user mentions and links from a tweet and normalise its whitespace.

    Args:
        tweet: Raw tweet text.

    Returns:
        The text with every ``@...`` token and every ``http://`` / ``https://``
        URL removed, runs of whitespace collapsed to a single space, and
        leading/trailing whitespace dropped. Hashtags are left untouched.
    """
    # Raw string literals: '\s' inside a plain literal is an invalid escape
    # sequence and produces a warning on recent Python versions.
    wo_usernames = re.sub(r'@\S+', '', tweet)
    # Require the '://' scheme separator so that ordinary words which merely
    # start with "http" (e.g. "https", "httpd") are not deleted.
    wo_links = re.sub(r'https?://\S+', '', wo_usernames)
    # split() with no argument splits on any whitespace run and discards
    # empty pieces, so re-joining collapses and trims whitespace.
    return ' '.join(wo_links.split())

In [7]:
# Overwrite each tweet's text with its cleaned version.
df_tweets["full_text"] = df_tweets["full_text"].map(clean_tweet)

In [8]:
# Summary statistics of the tweet text lengths, in characters.
tweet_lengths = df_tweets["full_text"].str.len()
tweet_lengths.describe()

count    90169.000000
mean       155.070102
std         78.606502
min          0.000000
25%         87.000000
50%        153.000000
75%        227.000000
max        336.000000
Name: full_text, dtype: float64

### Embed tweets with the LaBSE model

In [41]:
from sentence_transformers import SentenceTransformer

# Instantiate the LaBSE sentence-transformers model used to embed the tweets.
labse_model_name = "sentence-transformers/LaBSE"
model = SentenceTransformer(labse_model_name)

In [45]:
# Encode all tweets in a single batched call. Passing the whole list lets the
# model batch the inputs internally instead of being invoked once per row,
# which is far slower on a corpus of this size.
tweet_texts = df_tweets["full_text"].tolist()
tweet_embeddings = model.encode(tweet_texts, show_progress_bar=True)
# Iterating the 2-D result yields one embedding vector per tweet, in the same
# order as the rows of the frame.
df_tweets["embedding"] = list(tweet_embeddings)

In [None]:
# Write the tweet frame, including the embedding column, to a tab-separated file.
# NOTE(review): the embedding cells are written via their string form;
# confirm that downstream readers can parse it.
embedded_tsv_path = '../../data/embedded_tweets.tsv'
df_tweets.to_csv(embedded_tsv_path, sep='\t')

In [None]:
# Write the tweet frame as JSON Lines: one record (row) per line.
embedded_jsonl_path = '../../data/embedded_tweets.jl'
df_tweets.to_json(embedded_jsonl_path, orient='records', lines=True)