In [1]:
import pandas as pd

# Notebook-wide display settings: show every column of wide frames and never
# truncate the contents of a cell (useful for reading full tweet text).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

In [2]:
import seaborn as sns
import matplotlib.pyplot as plt

# Use seaborn's "whitegrid" theme for every plot drawn after this point.
sns.set_theme(style="whitegrid")

def plot_numeric(df, feature, label_col="label"):
    """Plot overlaid histograms of a numeric feature, one per label value.

    Every row is weighted by the reciprocal of its label group's row count,
    so the weights within each label group total 1 and groups of different
    sizes can be compared on the same axis.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both ``feature`` and ``label_col``.
    feature : str
        Name of the column to histogram.
    label_col : str, default "label"
        Name of the column used to split (and colour) the histograms.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Row count of the label group each row belongs to, aligned to df's index.
    group_sizes = df.groupby(label_col)[feature].transform("size")
    per_row_weight = 1 / group_sizes

    fig, ax = plt.subplots(figsize=(8, 5))
    sns.histplot(
        data=df,
        x=feature,
        hue=label_col,
        weights=per_row_weight,
        bins=50,
        alpha=0.6,
        ax=ax,
    )
    ax.set_title(f"{feature} by {label_col} (normalized)")
    ax.set_ylabel("Relative frequency")
    plt.show()


def plot_boolean(df, feature, label_col="label"):
    """Plot, for each label value, the share of rows taking each feature value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both ``feature`` and ``label_col``.
    feature : str
        Name of the column whose value distribution is shown on the x axis.
    label_col : str, default "label"
        Name of the column used to group (and colour) the bars.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Within-label relative frequency of every feature value, as a tidy frame
    # with columns: label_col, feature, "proportion".
    shares = df.groupby(label_col)[feature].value_counts(normalize=True)
    prop_df = shares.rename("proportion").reset_index()

    fig, ax = plt.subplots(figsize=(6, 4))
    sns.barplot(data=prop_df, x=feature, y="proportion", hue=label_col, ax=ax)
    ax.set_title(f"{feature} by {label_col}")
    ax.set_ylabel("Proportion")
    plt.show()

In [20]:
import torch
from sklearn.base import BaseEstimator, TransformerMixin
from transformers import AutoModel, AutoTokenizer
import numpy as np
from torch.utils.data import DataLoader
from tqdm import tqdm

# Torch device shared by the cells below: the GPU when CUDA is available,
# otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class SentenceEmbedder(BaseEstimator, TransformerMixin):
    """Scikit-learn style transformer that embeds texts with a pretrained encoder.

    Each text is tokenized, passed through the Hugging Face model, and
    represented by the last-layer hidden state at sequence position 0.

    Parameters
    ----------
    model_name : str, default "vinai/bertweet-base"
        Identifier handed to ``AutoModel.from_pretrained`` and
        ``AutoTokenizer.from_pretrained``.
    batch_size : int, default 1024
        Number of texts encoded per forward pass.
    max_length : int, default 64
        Maximum number of tokens per text; longer inputs are truncated.

    Notes
    -----
    The model and tokenizer are loaded in ``__init__`` and the model is moved
    to the module-level ``device`` and put in eval mode.
    """

    def __init__(self, model_name="vinai/bertweet-base", batch_size=1024, max_length=64):
        self.model_name = model_name
        self.batch_size = batch_size
        self.max_length = max_length
        self.model = AutoModel.from_pretrained(self.model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model.to(device)
        self.model.eval()

    def fit(self, X, y=None):
        """No-op: the encoder is pretrained, so there is nothing to fit."""
        return self

    def transform(self, X):
        """Encode every text in ``X``.

        Parameters
        ----------
        X : pandas.Series or sequence of str
            Texts to embed. A Series is reduced to its underlying values.

        Returns
        -------
        numpy.ndarray
            Object array of shape ``(len(X), 1)``; cell ``[i, 0]`` holds the
            1-D float embedding of the i-th text. Empty input yields an empty
            ``(0, 1)`` array.
        """
        if isinstance(X, pd.Series):
            X = X.values

        out = np.empty((len(X), 1), dtype=object)
        if len(X) == 0:
            # Nothing to encode; avoids running the model on an empty loader.
            return out

        loader = DataLoader(X, batch_size=self.batch_size, shuffle=False)

        row = 0
        with torch.no_grad():
            for batch in tqdm(loader, desc="Encoding bios"):
                inputs = self.tokenizer(
                    list(batch),
                    padding=True,
                    truncation=True,
                    max_length=self.max_length,
                    return_tensors="pt",
                )
                inputs = {k: v.to(device) for k, v in inputs.items()}  # move tensors to the model's device

                outputs = self.model(**inputs)
                # Hidden state at position 0 of every sequence: (batch, hidden_size).
                batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()

                # Fill the object column one cell at a time. Assigning a list of
                # equal-length arrays to an object slice makes NumPy build a 2-D
                # array and fail to broadcast it into the 1-D column.
                for vector in batch_embeddings:
                    out[row, 0] = vector
                    row += 1

        torch.cuda.empty_cache()
        return out

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import joblib

# Load the precomputed user-level and tweet-level feature tables.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load artifacts produced by this project's own pipeline.
user_features = joblib.load("../../02_data/user_features_1.joblib")
tweet_features = joblib.load("../../02_data/tweet_features_1.joblib")

In [4]:
# Preview the first rows of the tweet-level feature table.
tweet_features.head()

Unnamed: 0,author_id,id,text,created_at,is_reply,is_sensitive,like_count,quote_count,reply_count,retweet_count,label,tweet_is_present,tweet_length,tweet_num_words,tweet_num_sentences,tweet_avg_sentence_length,tweet_avg_word_length,tweet_std_word_length,tweet_unique_word_ratio,tweet_guiraud_index,tweet_repetition_ratio,tweet_hapax_ratio,tweet_digit_ratio,tweet_uppercase_ratio,tweet_lowercase_ratio,tweet_special_char_ratio,tweet_punctuation_ratio,tweet_whitespace_ratio,tweet_emoji_count,tweet_emoji_ratio,tweet_mention_count,tweet_contains_mention,tweet_url_count,tweet_contains_url,tweet_hashtag_count,tweet_cashtag_count,tweet_email_count,tweet_contains_bot_word_or_hashtag,tweet_contains_ai_hashtag,tweet_sentiment,tweet_sentiment_abs,tweet_sentiment_neutrality,tweet_sentiment_subjectivity,tweet_flesch_reading_ease,tweet_flesch_kincaid_grade,tweet_avg_syllables_per_word,tweet_polysyllabic_word_ratio,tweet_char_entropy,tweet_word_entropy,tweet_avg_word_repetition,tweet_compression_ratio,tweet_starts_with_emoji,tweet_ends_with_emoji,tweet_starts_with_url,tweet_ends_with_url,tweet_contains_pipe_or_bullet,tweet_contains_call_to_action,tweet_contains_ai_phrase,tweet_function_word_ratio,tweet_noun_ratio,tweet_verb_ratio,tweet_pronoun_ratio,tweet_adjective_ratio,tweet_contains_repeated_chars,tweet_is_retweet,tweet_is_quote
0,u1001495628738957312,t1502310945158275074,"Join us for a special screening of the documentary #SAPELO and a Q&amp;A with the filmmakers on Thursday, March 31 at the @CarterCenter!üìΩÔ∏èüá®üá≠ @CarterLibrary @SWISS_FILMS https://t.co/53nsRtRI8u",2022-03-11 15:50:15+00:00,0,0,1,0.0,0.0,1,0,True,153,27,1,27.0,4.037037,2.71459,0.777778,4.041452,0.222222,0.666667,0.013072,0.238532,0.761468,0.111111,0.084967,0.163399,1,0.006536,3,True,1,True,1,0,0,False,False,0.636,0.636,0.364,0.285714,63.486154,7.633846,1.538462,0.074074,4.914588,4.226567,1.285714,0.986928,False,False,False,True,False,True,False,0.407407,0.37037,0.037037,0.037037,0.111111,False,False,False
1,u1002590470097154048,t1459274835377500161,Looking forward to meeting the final chapterüëÄüëÄ https://t.co/6NsqPBe272,2021-11-12 21:40:07+00:00,0,0,0,,,0,1,True,52,7,1,7.0,4.857143,2.030381,1.0,2.645751,0.0,1.0,0.0,0.097561,0.902439,0.076923,0.038462,0.134615,2,0.038462,0,False,1,True,0,0,0,False,False,0.0,0.0,1.0,1.0,61.24,6.705,1.625,0.0,4.30292,2.807355,1.0,1.173077,False,False,False,True,False,False,False,0.285714,0.142857,0.285714,0.0,0.142857,False,False,False
2,u1002590470097154048,t1405835036847443969,"RT @HamidrezaKasaei: Our new service robot is getting ready to be used in amazing researches on ""generalizable perception and manipulation‚Ä¶",2021-06-18 10:29:26+00:00,0,0,0,,,1,1,True,129,19,1,19.0,4.894737,3.193657,1.0,4.358899,0.0,1.0,0.0,0.066667,0.933333,0.03876,0.031008,0.147287,0,0.0,1,True,0,False,0,0,0,False,False,0.743,0.743,0.257,0.618182,17.335,15.81,2.0,0.263158,4.385255,4.247928,1.0,0.899225,False,False,False,False,False,False,False,0.315789,0.263158,0.210526,0.052632,0.210526,False,True,False
3,u1002590470097154048,t1401642372044296199,"RT @yao_weijia: I'm happy to be a finalist for the ICRA Best Paper Award &amp; grateful to coauthors @HectorGdeMarina, Zhiyong, @MingCao10. I t‚Ä¶",2021-06-06 20:49:17+00:00,0,0,0,,,3,1,True,125,22,2,11.0,3.954545,2.285871,0.818182,3.837613,0.181818,0.681818,0.0,0.269663,0.730337,0.112,0.104,0.176,0,0.0,3,True,0,False,0,0,0,False,False,0.9371,0.9371,0.0629,0.65,54.75087,11.336522,1.521739,0.090909,4.795863,4.061482,1.222222,0.976,False,False,False,False,False,False,False,0.272727,0.454545,0.090909,0.0,0.136364,False,True,False
4,u1002590470097154048,t1379337484384014336,"RT @corl_conf: The Conference on #Robot #Learning 2021 will be held on Nov 8-11 in London, UK &amp; virtually. Exciting new changes, including:‚Ä¶",2021-04-06 07:37:37+00:00,0,0,0,,,44,1,True,140,21,2,10.5,4.666667,2.678545,0.952381,4.364358,0.047619,0.904762,0.05,0.153061,0.846939,0.092857,0.085714,0.157143,0,0.0,1,True,0,False,2,0,0,True,False,0.4939,0.4939,0.5061,0.627273,51.710326,8.903696,1.695652,0.238095,4.84372,4.297079,1.05,0.985714,False,False,False,False,False,False,False,0.238095,0.333333,0.238095,0.0,0.142857,False,True,False


In [5]:
# Report columns with missing values (count and percentage of rows).
# The per-column null counts are computed once and reused for both the check
# and the summary, instead of scanning the whole frame twice.
null_counts = tweet_features.isnull().sum()

if null_counts.gt(0).any():
    print("Missing values found in the dataset.")
    na_summary = null_counts.loc[lambda x: x > 0].to_frame(name='Missing Count')
    na_summary['Missing Percentage'] = (na_summary['Missing Count'] / tweet_features.shape[0]) * 100
    print("\nSummary of missing values:")
    print(na_summary)
else:
    print("No missing values found in the dataset.")

Missing values found in the dataset.

Summary of missing values:
                               Missing Count  Missing Percentage
quote_count                           796093           75.899847
reply_count                           796093           75.899847
tweet_avg_word_length                     25            0.002384
tweet_std_word_length                     25            0.002384
tweet_unique_word_ratio                   25            0.002384
tweet_guiraud_index                       25            0.002384
tweet_repetition_ratio                    25            0.002384
tweet_hapax_ratio                         25            0.002384
tweet_uppercase_ratio                      2            0.000191
tweet_lowercase_ratio                      2            0.000191
tweet_polysyllabic_word_ratio             25            0.002384
tweet_word_entropy                        25            0.002384
tweet_avg_word_repetition                 25            0.002384
tweet_function_word_ratio

In [6]:
# Remember which rows had no quote/reply counts BEFORE those counts are
# zero-filled below. The flags are only created when absent, so re-running this
# cell after the fill does not overwrite them with all-False values.
for count_col in ("quote_count", "reply_count"):
    flag_col = f"{count_col}_missing"
    if flag_col not in tweet_features.columns:
        tweet_features[flag_col] = tweet_features[count_col].isnull()

# Columns whose missing values are replaced with 0.
zero_fill = [
    "quote_count",
    "reply_count",
    "tweet_unique_word_ratio",
    "tweet_repetition_ratio",
    "tweet_hapax_ratio",
    "tweet_uppercase_ratio",
    "tweet_lowercase_ratio",
    "tweet_polysyllabic_word_ratio",
    "tweet_word_entropy",
    "tweet_function_word_ratio",
    "tweet_noun_ratio",
    "tweet_verb_ratio",
    "tweet_pronoun_ratio",
    "tweet_adjective_ratio"
]

# Columns whose missing values are replaced with the column median.
# NOTE(review): the median is taken over the whole table; if a train/test split
# happens later, confirm this does not leak test information.
median_fill = [
    "tweet_avg_word_length",
    "tweet_std_word_length",
    "tweet_guiraud_index",
    "tweet_avg_word_repetition"
]

tweet_features[zero_fill] = tweet_features[zero_fill].fillna(0)
# fillna with a Series fills each column with the value stored under its label.
tweet_features[median_fill] = tweet_features[median_fill].fillna(
    tweet_features[median_fill].median()
)

In [7]:
# Re-check for missing values after imputation.
# The per-column null counts are computed once and reused for both the check
# and the summary, instead of scanning the whole frame twice.
null_counts = tweet_features.isnull().sum()

if null_counts.gt(0).any():
    print("Missing values found in the dataset.")
    na_summary = null_counts.loc[lambda x: x > 0].to_frame(name='Missing Count')
    na_summary['Missing Percentage'] = (na_summary['Missing Count'] / tweet_features.shape[0]) * 100
    print("\nSummary of missing values:")
    print(na_summary)
else:
    print("No missing values found in the dataset.")

No missing values found in the dataset.


In [8]:
# List every column with its dtype, one per line, to spot columns that still
# need conversion (e.g. timestamps stored as object, flags stored as int).
for col, dtype in tweet_features.dtypes.items():
    print(f"{col}: {dtype}")

author_id: object
id: object
text: object
created_at: object
is_reply: int32
is_sensitive: int32
like_count: int64
quote_count: float64
reply_count: float64
retweet_count: int64
label: int64
tweet_is_present: bool
tweet_length: int64
tweet_num_words: int64
tweet_num_sentences: int64
tweet_avg_sentence_length: float64
tweet_avg_word_length: float64
tweet_std_word_length: float64
tweet_unique_word_ratio: float64
tweet_guiraud_index: float64
tweet_repetition_ratio: float64
tweet_hapax_ratio: float64
tweet_digit_ratio: float64
tweet_uppercase_ratio: float64
tweet_lowercase_ratio: float64
tweet_special_char_ratio: float64
tweet_punctuation_ratio: float64
tweet_whitespace_ratio: float64
tweet_emoji_count: int64
tweet_emoji_ratio: float64
tweet_mention_count: int64
tweet_contains_mention: bool
tweet_url_count: int64
tweet_contains_url: bool
tweet_hashtag_count: int64
tweet_cashtag_count: int64
tweet_email_count: int64
tweet_contains_bot_word_or_hashtag: bool
tweet_contains_ai_hashtag: bool
tw

In [9]:
# Parse the creation timestamp into timezone-aware (UTC) datetimes.
# errors="coerce" turns unparseable values into NaT instead of raising.
# NOTE(review): this runs after the missing-value check, so any NaT introduced
# here would go unreported; consider asserting that none were created.
tweet_features["created_at"] = pd.to_datetime(
    tweet_features["created_at"],
    errors="coerce",
    utc=True
)

# 0/1 integer flags that should be stored as booleans.
bool_cols = ["is_reply", "is_sensitive"]
# Count columns that are currently float and are cast to integers.
int_cols = ["quote_count", "reply_count"]

tweet_features[bool_cols] = tweet_features[bool_cols].astype("bool")
tweet_features[int_cols] = tweet_features[int_cols].astype("int64")

In [17]:
import re
from emoji import demojize
from nltk.tokenize import TweetTokenizer

# "@" not preceded by a word character, followed by 1-15 letters, digits or
# underscores and ending at a word boundary.
USER_PATTERN = r"(?<!\w)@[A-Za-z0-9_]{1,15}\b"
# "http://", "https://" or "www." followed by any run of non-whitespace.
URL_PATTERN = r"(https?://[^\s]+|www\.[^\s]+)"
# local-part "@" domain "." top-level domain of at least two letters.
EMAIL_PATTERN = r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b"

# Shared NLTK tweet tokenizer (default settings) used by normalize_text.
tokenizer = TweetTokenizer()

def normalize_entities(text):
    """Replace e-mail addresses, URLs and @-mentions with placeholder tags.

    Substitutions run in the order e-mail, URL, mention, producing
    ``<EMAIL>``, ``<URL>`` and ``<USER>`` respectively.

    Parameters
    ----------
    text : object
        Text to process. Non-string values and blank strings are returned
        unchanged.

    Returns
    -------
    object
        The text with entities replaced, or the input itself when it is not a
        non-blank string.
    """
    if isinstance(text, str) and text.strip() != "":
        # E-mails go first so that the "@domain" part is not taken for a mention.
        substitutions = (
            (EMAIL_PATTERN, "<EMAIL>"),
            (URL_PATTERN, "<URL>"),
            (USER_PATTERN, "<USER>"),
        )
        for pattern, placeholder in substitutions:
            text = re.sub(pattern, placeholder, text)

    return text

def normalize_token(token):
    """Normalize a single token.

    Curly apostrophes become ``'`` and the ellipsis character becomes ``...``;
    the ``<USER>`` and ``<URL>`` placeholders are mapped to ``@USER`` and
    ``HTTPURL``; any remaining single-character token is passed through
    ``demojize``.

    Parameters
    ----------
    token : str
        One token produced by the tokenizer.

    Returns
    -------
    str
        The normalized token.
    """
    # Legacy literals: these are the mis-decoded spellings of the two
    # characters that this function previously matched. They are kept, and
    # applied first, so input that was handled before is still handled.
    token = token.replace("‚Äô", "'").replace("‚Ä¶", "...")
    # The real code points, written as escapes so the source file's encoding
    # cannot corrupt them: U+2019 (right single quote) and U+2026 (ellipsis).
    token = token.replace("\u2019", "'").replace("\u2026", "...")

    if token == "<USER>":
        return "@USER"
    if token == "<URL>":
        return "HTTPURL"
    if len(token) == 1:
        return demojize(token)

    return token

def normalize_text(text):
    """Normalize a tweet following the BERTweet tweet-normalization recipe.

    Steps: strip, replace entities with placeholders, map curly apostrophes
    and ellipsis characters to ASCII, tokenize, normalize each token, then
    re-space contractions and a.m./p.m. expressions and collapse whitespace.

    Parameters
    ----------
    text : object
        Raw tweet text.

    Returns
    -------
    str
        The normalized text, or ``""`` when ``text`` is not a non-blank string.
    """
    if not isinstance(text, str) or text.strip() == "":
        return ""

    text = normalize_entities(text.strip())

    # Done before tokenizing so that e.g. "don\u2019t" reaches the tokenizer as
    # "don't" and stays a single token.
    text = text.replace("\u2019", "'").replace("\u2026", "...")

    tokens = tokenizer.tokenize(text)
    norm_tweet = " ".join(normalize_token(t) for t in tokens)

    # Negative contractions: split "n't" off the verb ("don't" -> "do n't"),
    # then re-join the two forms that are kept whole ("can't", "ain't").
    # The previous replacements had identical pattern and replacement and so
    # never changed anything.
    norm_tweet = (
        norm_tweet.replace("cannot ", "can not ")
                  .replace("n't ", " n't ")
                  .replace("n 't ", " n't ")
                  .replace("ca n't", "can't")
                  .replace("ai n't", "ain't")
    )

    # Verb contractions: split the clitic off its host ("I'm" -> "I 'm").
    norm_tweet = (
        norm_tweet.replace("'m ", " 'm ")
                  .replace("'re ", " 're ")
                  .replace("'s ", " 's ")
                  .replace("'ll ", " 'll ")
                  .replace("'d ", " 'd ")
                  .replace("'ve ", " 've ")
    )

    # Time expressions: undo the tokenizer's splitting of "p.m." / "a.m.".
    norm_tweet = (
        norm_tweet.replace(" p . m .", " p.m.")
                  .replace(" p . m ", " p.m ")
                  .replace(" a . m .", " a.m.")
                  .replace(" a . m ", " a.m ")
    )

    # Collapse the double spaces introduced by the replacements above.
    return " ".join(norm_tweet.split())

In [21]:
# Normalize every tweet. This overwrites the "text" column in place, so the
# raw text is no longer available in this frame afterwards.
tweet_features["text"] = tweet_features["text"].apply(normalize_text)

In [None]:
# Embed the normalized tweet text with the default encoder.
# The result is an object array with one embedding vector per row.
embedder = SentenceEmbedder()
tweet_emb = embedder.transform(tweet_features["text"])