In [1]:
import pandas as pd

# Display configuration: show every column and never truncate cell contents,
# so the wide feature tables rendered below are shown in full.
for _display_option in ("display.max_columns", "display.max_colwidth"):
    pd.set_option(_display_option, None)

In [2]:
import nltk
# Fetch the NLTK resources used further down: the punkt_tab tokenizer data
# (sent_tokenize / word_tokenize) and the English perceptron tagger (pos_tag).
# The second call is the cell's last expression, so its return value is displayed.
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Julia\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Julia\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [3]:
import re
import string
import zlib
import numpy as np
import emoji

from collections import Counter
from scipy.stats import entropy

from nltk import sent_tokenize, word_tokenize, pos_tag
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import textstat
from textblob import TextBlob

# VADER analyzer created once at module level and reused by extract_text_features.
analyzer = SentimentIntensityAnalyzer()

# Regular expressions, kept as raw pattern strings and passed to re.* at call time.
DIGIT_PATTERN = r"\d"  # one digit
SPECIAL_CHAR_PATTERN = r"[^A-Za-z0-9 ]"  # anything except ASCII letters, digits and the space character
USER_PATTERN = r"(?<!\w)@[A-Za-z0-9_]{1,15}\b"  # @handle of 1-15 characters, not preceded by a word character
URL_PATTERN = r"(https?://[^\s]+|www\.[^\s]+)"  # http(s):// or www. followed by non-whitespace
HASHTAG_PATTERN = r"#\w+"
CASHTAG_PATTERN = r"\$\w+"  # NOTE(review): also matches amounts such as "$100" - confirm that is intended
EMAIL_PATTERN = r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b"

# Lower-cased words counted as function words by function_word_ratio.
FUNCTION_WORDS = {
  "the", "a", "an", "and", "or", "but", "if", "while", "with", "to", "of", "in", "on",
  "for", "from", "by", "is", "are", "was", "were", "be", "been", "being"
}

def normalize_entities(text):
  """Replace e-mail addresses, URLs and @mentions with placeholder tokens.

  Args:
    text: Text to normalize.

  Returns:
    The text with matches substituted by "<EMAIL>", "<URL>" and "<USER>".
    Non-string or blank input is returned unchanged.
  """
  is_missing = not isinstance(text, str) or not text.strip()
  if is_missing:
    return text

  # Substitutions run in this order: e-mails, then URLs, then mentions.
  substitutions = (
      (EMAIL_PATTERN, "<EMAIL>"),
      (URL_PATTERN, "<URL>"),
      (USER_PATTERN, "<USER>"),
  )
  normalized = text
  for pattern, placeholder in substitutions:
    normalized = re.sub(pattern, placeholder, normalized)
  return normalized

def extract_text_features(text, is_tweet=False):
  """Compute hand-crafted stylometric features for a single text.

  Args:
    text: Raw text (a profile description or a tweet). Anything that is not a
      non-blank string is treated as missing.
    is_tweet: When True the tweet-only markers ``is_retweet`` / ``is_quote``
      are evaluated on present text; otherwise they are returned as None.

  Returns:
    A flat dict of features. For missing text ``is_present`` is False, numeric
    features are None and boolean flags are False. Both branches return the
    same set of keys, so the dicts can be stacked into one DataFrame.
  """
  if not isinstance(text, str) or text.strip() == "":
    return {
        "is_present": False,
        "length": None,
        "num_words": None,
        "num_sentences": None,
        "avg_sentence_length": None,
        "avg_word_length": None,
        "std_word_length": None,
        "unique_word_ratio": None,
        "guiraud_index": None,
        "repetition_ratio": None,
        "hapax_ratio": None,
        "digit_ratio": None,
        "uppercase_ratio": None,
        "lowercase_ratio": None,
        "special_char_ratio": None,
        "punctuation_ratio": None,
        "whitespace_ratio": None,
        "emoji_count": None,
        "emoji_ratio": None,
        "mention_count": None,
        "contains_mention": False,
        "url_count": None,
        "contains_url": False,
        "hashtag_count": None,
        "cashtag_count": None,
        "email_count": None,
        "contains_bot_word_or_hashtag": False,
        "contains_ai_hashtag": False,
        "sentiment": None,
        "sentiment_abs": None,
        "sentiment_neutrality": None,
        "sentiment_subjectivity": None,
        "flesch_reading_ease": None,
        "flesch_kincaid_grade": None,
        "avg_syllables_per_word": None,
        "polysyllabic_word_ratio": None,
        "char_entropy": None,
        "word_entropy": None,
        # Previously missing from this branch although the normal branch returns it.
        "avg_word_repetition": None,
        "compression_ratio": None,
        "starts_with_emoji": False,
        "ends_with_emoji": False,
        "starts_with_url": False,
        "ends_with_url": False,
        "contains_pipe_or_bullet": False,
        "contains_call_to_action": False,
        "contains_ai_phrase": False,
        "function_word_ratio": None,
        "noun_ratio": None,
        "verb_ratio": None,
        "pronoun_ratio": None,
        "adjective_ratio": None,
        "contains_repeated_chars": False,
        "is_retweet": False,
        "is_quote": False
    }

  # Every feature below is computed on the stripped, entity-normalized text,
  # i.e. after e-mails / URLs / mentions became <EMAIL> / <URL> / <USER>.
  text = text.strip()
  text = normalize_entities(text)
  char_len = len(text)  # > 0 here because blank text returned above

  # NOTE(review): the placeholders appear to be tokenized into alphabetic words
  # (USER / URL / EMAIL) and are therefore counted in the lexical statistics -
  # confirm this is intended.
  words = word_tokenize(text)
  words_lower = [w.lower() for w in words if w.isalpha()]
  num_words = len(words_lower)

  sentences = sent_tokenize(text)
  num_sentences = len(sentences)

  # --- A: Presence & length
  avg_sentence_length = num_words / num_sentences if num_sentences else None

  # --- B: Lexical structure
  word_lengths = [len(w) for w in words_lower]
  avg_word_length = np.mean(word_lengths) if word_lengths else None
  std_word_length = np.std(word_lengths) if word_lengths else None

  word_counts = Counter(words_lower)
  unique_word_ratio = len(word_counts) / num_words if num_words else None
  guiraud_index = len(word_counts) / np.sqrt(num_words) if num_words else None
  repetition_ratio = 1 - unique_word_ratio if unique_word_ratio is not None else None
  hapax_ratio = sum(1 for count in word_counts.values() if count == 1) / num_words if num_words else None

  # --- C: Character composition
  digits = len(re.findall(DIGIT_PATTERN, text))
  letters = re.findall(r"[A-Za-z]", text)
  uppercase = sum(1 for c in letters if c.isupper())
  lowercase = sum(1 for c in letters if c.islower())
  special_chars = len(re.findall(SPECIAL_CHAR_PATTERN, text))
  punctuation = sum(1 for c in text if c in string.punctuation)
  whitespaces = text.count(" ")

  # Counted per code point that is a key of emoji.EMOJI_DATA.
  emoji_count = sum(1 for c in text if c in emoji.EMOJI_DATA)

  digit_ratio = digits / char_len
  uppercase_ratio = uppercase / len(letters) if letters else None
  lowercase_ratio = lowercase / len(letters) if letters else None
  special_char_ratio = special_chars / char_len
  punctuation_ratio = punctuation / char_len
  whitespace_ratio = whitespaces / char_len
  emoji_ratio = emoji_count / char_len

  # --- D: Token usage
  mention_count = text.count("<USER>")
  url_count = text.count("<URL>")
  hashtag_count = len(re.findall(HASHTAG_PATTERN, text))
  cashtag_count = len(re.findall(CASHTAG_PATTERN, text))
  email_count = text.count("<EMAIL>")

  # --- E: Semantic signals
  contains_bot_word_or_hashtag = bool(re.search(r"(?i)(\bbot\b|#\w*bot\b)", text))
  # The former pattern r"(?i)\b#ai\b|#\w+ai\b" could not match a standalone "#ai":
  # "\b" before "#" demands a preceding word character and "\w+" demands at least
  # one character before "ai". "#\w*ai\b" matches "#ai" as well as "#somethingai".
  contains_ai_hashtag = bool(re.search(r"(?i)#\w*ai\b", text))

  sentiment = analyzer.polarity_scores(text)["compound"]
  sentiment_abs = abs(sentiment)
  sentiment_neutrality = 1 - sentiment_abs

  blob = TextBlob(text)
  sentiment_subjectivity = blob.sentiment.subjectivity

  # --- F: Readability
  flesch_reading_ease = textstat.flesch_reading_ease(text)
  flesch_kincaid_grade = textstat.flesch_kincaid_grade(text)
  avg_syllables_per_word = textstat.avg_syllables_per_word(text)
  polysyllabic_word_ratio = textstat.polysyllabcount(text) / num_words if num_words else None

  # --- G: Entropy & compression
  char_entropy = entropy(list(Counter(text).values()), base=2)
  word_entropy = entropy(list(word_counts.values()), base=2) if word_counts else None
  avg_word_repetition = np.mean(list(word_counts.values())) if word_counts else None
  # Compressed byte length over character length; can exceed 1 for short texts.
  compression_ratio = len(zlib.compress(text.encode("utf-8"))) / char_len

  # --- H: Template indicators
  starts_with_emoji = text[0] in emoji.EMOJI_DATA
  ends_with_emoji = text[-1] in emoji.EMOJI_DATA
  starts_with_url = text.startswith("<URL>")
  ends_with_url = text.endswith("<URL>")
  # \u2022 is the bullet character; written as an escape so that a re-encoding of
  # the file cannot turn the character class into unrelated characters.
  contains_pipe_or_bullet = bool(re.search(r"\s[|\u2022]\s", text))
  contains_call_to_action = bool(re.search(r"(?i)\b(follow|dm|click|join|subscribe|contact|call|buy|giveaway|free|win|retweet|apply)\b", text))
  contains_ai_phrase = bool(re.search(r"(?i)\b(powered by AI|autogenerated|generated by AI|AI assistant)\b", text))

  # --- I: Grammatical composition (self-reference & POS)
  function_word_ratio = sum(w in FUNCTION_WORDS for w in words_lower) / num_words if num_words else None

  pos_tags = pos_tag(words_lower)
  pos_counts = Counter(tag for _, tag in pos_tags)

  noun_ratio = sum(pos_counts[t] for t in ["NN", "NNS", "NNP", "NNPS"]) / num_words if num_words else None
  verb_ratio = sum(pos_counts[t] for t in ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]) / num_words if num_words else None
  pronoun_ratio = sum(pos_counts[t] for t in ["PRP", "PRP$"]) / num_words if num_words else None
  adjective_ratio = sum(pos_counts[t] for t in ["JJ", "JJR", "JJS"]) / num_words if num_words else None

  # --- J: Noise & stylistic irregularities
  # Any character repeated three or more times in a row.
  contains_repeated_chars = bool(re.search(r'(.)\1{2,}', text))

  # --- K: Platform-specific discourse markers (tweets only)
  if is_tweet:
    is_retweet = bool(re.match(r'^RT\s+<USER>', text))
    # \u201c is the left double quotation mark (escaped for the same reason as the bullet).
    is_quote = bool(re.match(r'^(QT|\u201c|")', text))
  else:
    is_retweet = None
    is_quote = None

  return {
      "is_present": True,
      "length": char_len,
      "num_words": num_words,
      "num_sentences": num_sentences,
      "avg_sentence_length": avg_sentence_length,
      "avg_word_length": avg_word_length,
      "std_word_length": std_word_length,
      "unique_word_ratio": unique_word_ratio,
      "guiraud_index": guiraud_index,
      "repetition_ratio": repetition_ratio,
      "hapax_ratio": hapax_ratio,
      "digit_ratio": digit_ratio,
      "uppercase_ratio": uppercase_ratio,
      "lowercase_ratio": lowercase_ratio,
      "special_char_ratio": special_char_ratio,
      "punctuation_ratio": punctuation_ratio,
      "whitespace_ratio": whitespace_ratio,
      "emoji_count": emoji_count,
      "emoji_ratio": emoji_ratio,
      "mention_count": mention_count,
      "contains_mention": mention_count > 0,
      "url_count": url_count,
      "contains_url": url_count > 0,
      "hashtag_count": hashtag_count,
      "cashtag_count": cashtag_count,
      "email_count": email_count,
      "contains_bot_word_or_hashtag": contains_bot_word_or_hashtag,
      "contains_ai_hashtag": contains_ai_hashtag,
      "sentiment": sentiment,
      "sentiment_abs": sentiment_abs,
      "sentiment_neutrality": sentiment_neutrality,
      "sentiment_subjectivity": sentiment_subjectivity,
      "flesch_reading_ease": flesch_reading_ease,
      "flesch_kincaid_grade": flesch_kincaid_grade,
      "avg_syllables_per_word": avg_syllables_per_word,
      "polysyllabic_word_ratio": polysyllabic_word_ratio,
      "char_entropy": char_entropy,
      "word_entropy": word_entropy,
      "avg_word_repetition": avg_word_repetition,
      "compression_ratio": compression_ratio,
      "starts_with_emoji": starts_with_emoji,
      "ends_with_emoji": ends_with_emoji,
      "starts_with_url": starts_with_url,
      "ends_with_url": ends_with_url,
      "contains_pipe_or_bullet": contains_pipe_or_bullet,
      "contains_call_to_action": contains_call_to_action,
      "contains_ai_phrase": contains_ai_phrase,
      "function_word_ratio": function_word_ratio,
      "noun_ratio": noun_ratio,
      "verb_ratio": verb_ratio,
      "pronoun_ratio": pronoun_ratio,
      "adjective_ratio": adjective_ratio,
      "contains_repeated_chars": contains_repeated_chars,
      "is_retweet": is_retweet,
      "is_quote": is_quote
  }

In [4]:
import re
from emoji import demojize
from nltk.tokenize import TweetTokenizer

# NOTE(review): these three patterns repeat, character for character, the
# definitions in the feature-extraction cell above; keep both copies in sync
# or define them in one place only.
USER_PATTERN = r"(?<!\w)@[A-Za-z0-9_]{1,15}\b"
URL_PATTERN = r"(https?://[^\s]+|www\.[^\s]+)"
EMAIL_PATTERN = r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b"

# Module-level NLTK TweetTokenizer used by normalize_text.
tokenizer = TweetTokenizer()

def normalize_entities(text):
  """Swap e-mail addresses, URLs and @mentions for placeholder tokens.

  NOTE(review): this re-declares the function already defined in the
  feature-extraction cell with the same body; this later definition is the
  one in effect once the cell has run.

  Args:
    text: Text to normalize.

  Returns:
    The text with "<EMAIL>", "<URL>" and "<USER>" placeholders. Non-string or
    blank input is handed back untouched.
  """
  if not isinstance(text, str):
    return text
  if not text.strip():
    return text

  # Applied in sequence: e-mails first, URLs second, mentions last.
  result = text
  for pattern, placeholder in (
      (EMAIL_PATTERN, "<EMAIL>"),
      (URL_PATTERN, "<URL>"),
      (USER_PATTERN, "<USER>"),
  ):
    result = re.sub(pattern, placeholder, result)
  return result

def normalize_token(token):
    """Map one token to the form fed to the embedding model.

    Args:
        token: A single token string produced by the tokenizer.

    Returns:
        "@USER" for the "<USER>" placeholder, "HTTPURL" for the "<URL>"
        placeholder, the ``demojize`` result for any other single-character
        token, and otherwise the token with curly apostrophes and ellipsis
        characters replaced by their ASCII forms.
    """
    # The two characters are written as escapes (U+2019 right single quotation
    # mark, U+2026 horizontal ellipsis). The previous literals were mis-encoded
    # byte sequences that never occur in real text, so nothing was replaced.
    token = token.replace("\u2019", "'").replace("\u2026", "...")

    if token == "<USER>":
        return "@USER"
    if token == "<URL>":
        return "HTTPURL"
    if len(token) == 1:
        # A single character may be an emoji; demojize leaves other characters as they are.
        return demojize(token)

    return token

def normalize_text(text):
    """Normalize raw text into the whitespace-separated token format for the encoder.

    Steps: strip, replace entities with placeholders (normalize_entities),
    tokenize with the module-level TweetTokenizer, map each token through
    normalize_token, then split contractions and re-join time expressions.

    Args:
        text: Raw text; anything that is not a non-blank string counts as missing.

    Returns:
        The normalized text with single spaces between tokens, or "" for
        missing input.
    """
    if not isinstance(text, str) or text.strip() == "":
        return ""

    text = text.strip()
    text = normalize_entities(text)

    tokens = tokenizer.tokenize(text)
    norm_tweet = " ".join(normalize_token(t) for t in tokens)

    # contractions
    # The previous replacements had identical search and replacement strings
    # (" n't " -> " n't "), so they did nothing. The forms below follow the
    # BERTweet reference normalizer: split the suffix off its word, then
    # re-join the two special cases "can't" and "ain't".
    norm_tweet = (
        norm_tweet.replace("cannot ", "can not ")
                  .replace("n't ", " n't ")
                  .replace("n 't ", " n't ")
                  .replace("ca n't", "can't")
                  .replace("ai n't", "ain't")
    )

    # verb contractions (same no-op defect as above; the suffix is now split off)
    norm_tweet = (
        norm_tweet.replace("'m ", " 'm ")
                  .replace("'re ", " 're ")
                  .replace("'s ", " 's ")
                  .replace("'ll ", " 'll ")
                  .replace("'d ", " 'd ")
                  .replace("'ve ", " 've ")
    )

    # time expressions: undo the tokenizer's split of "p.m." / "a.m."
    norm_tweet = (
        norm_tweet.replace(" p . m .", " p.m.")
                  .replace(" p . m ", " p.m ")
                  .replace(" a . m .", " a.m.")
                  .replace(" a . m ", " a.m ")
    )

    # Collapse the double spaces introduced by the replacements above.
    return " ".join(norm_tweet.split())

In [5]:
import torch
from sklearn.base import BaseEstimator, TransformerMixin
from transformers import AutoModel, AutoTokenizer
import numpy as np
from torch.utils.data import DataLoader
from tqdm import tqdm

# Module-level torch device used by SentenceEmbedder: CUDA when available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class SentenceEmbedder(BaseEstimator, TransformerMixin):
    """scikit-learn style transformer that embeds texts with a Hugging Face encoder.

    The model and tokenizer are loaded in the constructor and moved to the
    module-level ``device``. ``transform`` returns, for every input text, the
    last-layer hidden state of the first token of the encoded sequence.

    Args:
        model_name: Hugging Face model identifier to load.
        batch_size: Number of texts encoded per forward pass (default 1024,
            the value that used to be hard-coded).
        max_length: Token length at which inputs are truncated (default 64,
            the value that used to be hard-coded).
    """

    def __init__(self, model_name="vinai/bertweet-base", batch_size=1024, max_length=64):
      self.model_name = model_name
      self.batch_size = batch_size
      self.max_length = max_length
      self.model = AutoModel.from_pretrained(self.model_name)
      self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
      self.model.to(device)
      self.model.eval()  # inference mode: disables dropout

    def fit(self, X, y=None):
      """No-op; the encoder is pretrained. Returns ``self``."""
      return self

    def transform(self, X):
      """Embed every text in ``X``.

      Args:
          X: Sequence of strings (a pandas Series is converted to its values).

      Returns:
          Object array of shape (len(X), 1); each cell holds one embedding
          vector as a 1-D numpy array.
      """
      if isinstance(X, pd.Series):
        X = X.values

      out = np.empty((len(X), 1), dtype=object)
      if len(X) == 0:
        # Nothing to embed; torch.cat on an empty list would raise.
        return out

      loader = DataLoader(
          X,
          batch_size=self.batch_size,
          shuffle=False  # keep row order aligned with the input
          )
      embeddings = []

      with torch.no_grad():
        for batch in tqdm(loader, desc="Embedding text"):
          inputs = self.tokenizer(
              batch,
              padding=True,
              truncation=True,
              max_length=self.max_length,
              return_tensors="pt"
              )
          inputs = {k: v.to(device) for k, v in inputs.items()}  # move tensors to the model's device

          outputs = self.model(**inputs)
          # Hidden state of the first token of every sequence: (batch, hidden_size).
          batch_embeddings = outputs.last_hidden_state[:, 0, :]
          embeddings.append(batch_embeddings.cpu())

      embeddings = torch.cat(embeddings, dim=0)  # shape: (N, hidden_size)
      out[:, 0] = list(embeddings.numpy())  # one 1-D vector per row

      if device.type == "cuda":
        torch.cuda.empty_cache()
      return out

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load the per-user feature table and preview its first two rows.
user_features = pd.read_parquet(
    "../../02_data/user_features.parquet",
    engine="pyarrow",
)
user_features.head(2)

Unnamed: 0,id,name_length,username_length,username_name_length_ratio,description,description_length,has_name,has_username,has_description,has_url,has_location,has_pinned_tweet,has_bot_word_in_name,has_bot_word_in_description,ratio_digits_in_name,ratio_digits_in_username,ratio_digits_in_description,ratio_special_chars_in_name,ratio_special_chars_in_username,ratio_special_chars_in_description,name_upper_to_lower_ratio,username_upper_to_lower_ratio,name_entropy,username_entropy,username_name_levenshtein,description_sentiment,cashtag_in_description_count,hashtag_in_description_count,mention_in_description_count,url_in_description_count,is_protected,is_verified,created_at,account_age_seconds,followers_count,following_count,listed_count,tweet_count,followers_over_following,double_followers_over_following,following_over_followers,following_over_followers_squared,following_over_total_connections,listed_over_followers,tweets_over_followers,listed_over_tweets,follower_rate,following_rate,listed_rate,tweet_rate,label
0,u1000115670657318912,4,6,1.5,"Open source tool for data & models versioning for ML projects. Join our stellar community https://t.co/vBp8rcV4bf for help, support and insights.",145,True,True,True,False,True,True,False,False,0.0,0.0,0.013793,0.25,0.0,0.062069,3.0,1.0,2.0,2.584963,0.666667,0.765,0,0,0,1,False,False,2018-05-25 20:45:31+00:00,241981329,3488,325,79,911,10.732308,21.464615,0.093177,2.7e-05,0.085235,0.022649,0.261181,0.086718,1.4e-05,1e-06,3.264715e-07,4e-06,human
1,u1000483839800627200,12,11,0.916667,"Theoretical biologist and advisor to data scientists. I have published in evolution, biochemistry, infectious disease, economics, education. Opinions my own.",157,True,True,True,True,True,True,False,False,0.0,0.0,0.0,0.0,0.0,0.044586,0.222222,0.222222,3.022055,2.845351,0.083333,0.0,0,0,0,0,False,False,2018-05-26 21:08:30+00:00,241893550,2563,458,60,2002,5.59607,11.19214,0.178697,7e-05,0.151605,0.02341,0.781116,0.02997,1.1e-05,2e-06,2.48043e-07,8e-06,human


In [7]:
# Report the size of the loaded user table.
n_rows = user_features.shape[0]
n_columns = user_features.shape[1]
print(f"The dataset contains {n_rows} rows and {n_columns} columns.")

The dataset contains 99967 rows and 51 columns.


In [8]:
# Description-derived columns shipped with the loaded table; they are removed
# here and description features are rebuilt from the raw text in the next cell.
columns_to_drop = [
    "description_length",
    "has_description",
    "has_bot_word_in_description",
    "ratio_digits_in_description",
    "ratio_special_chars_in_description",
    "description_sentiment",
    "cashtag_in_description_count",
    "hashtag_in_description_count",
    "mention_in_description_count",
    "url_in_description_count",
]
user_features = user_features.drop(columns_to_drop, axis="columns")

# Encode the target as integers: human -> 0, bot -> 1.
label_codes = {"human": 0, "bot": 1}
user_features["label"] = user_features["label"].map(label_codes)

In [9]:
# One feature dict per description, expanded into columns in a single DataFrame
# construction. The previous `.apply(pd.Series)` built a Series object per row,
# which is very slow at this table size; the in-place rename is replaced by
# add_prefix.
desc_records = user_features["description"].apply(extract_text_features)
desc_feature_df = pd.DataFrame(
    desc_records.tolist(),
    index=user_features.index,  # keep row alignment for the concat below
).add_prefix("desc_")

user_features_1 = pd.concat([user_features, desc_feature_df], axis=1)
user_features_1.head(2)

Unnamed: 0,id,name_length,username_length,username_name_length_ratio,description,has_name,has_username,has_url,has_location,has_pinned_tweet,has_bot_word_in_name,ratio_digits_in_name,ratio_digits_in_username,ratio_special_chars_in_name,ratio_special_chars_in_username,name_upper_to_lower_ratio,username_upper_to_lower_ratio,name_entropy,username_entropy,username_name_levenshtein,is_protected,is_verified,created_at,account_age_seconds,followers_count,following_count,listed_count,tweet_count,followers_over_following,double_followers_over_following,following_over_followers,following_over_followers_squared,following_over_total_connections,listed_over_followers,tweets_over_followers,listed_over_tweets,follower_rate,following_rate,listed_rate,tweet_rate,label,desc_is_present,desc_length,desc_num_words,desc_num_sentences,desc_avg_sentence_length,desc_avg_word_length,desc_std_word_length,desc_unique_word_ratio,desc_guiraud_index,desc_repetition_ratio,desc_hapax_ratio,desc_digit_ratio,desc_uppercase_ratio,desc_lowercase_ratio,desc_special_char_ratio,desc_punctuation_ratio,desc_whitespace_ratio,desc_emoji_count,desc_emoji_ratio,desc_mention_count,desc_contains_mention,desc_url_count,desc_contains_url,desc_hashtag_count,desc_cashtag_count,desc_email_count,desc_contains_bot_word_or_hashtag,desc_contains_ai_hashtag,desc_sentiment,desc_sentiment_abs,desc_sentiment_neutrality,desc_sentiment_subjectivity,desc_flesch_reading_ease,desc_flesch_kincaid_grade,desc_avg_syllables_per_word,desc_polysyllabic_word_ratio,desc_char_entropy,desc_word_entropy,desc_avg_word_repetition,desc_compression_ratio,desc_starts_with_emoji,desc_ends_with_emoji,desc_starts_with_url,desc_ends_with_url,desc_contains_pipe_or_bullet,desc_contains_call_to_action,desc_contains_ai_phrase,desc_function_word_ratio,desc_noun_ratio,desc_verb_ratio,desc_pronoun_ratio,desc_adjective_ratio,desc_contains_repeated_chars,desc_is_retweet,desc_is_quote
0,u1000115670657318912,4,6,1.5,"Open source tool for data & models versioning for ML projects. Join our stellar community https://t.co/vBp8rcV4bf for help, support and insights.",True,True,False,True,True,False,0.0,0.0,0.25,0.0,3.0,1.0,2.0,2.584963,0.666667,False,False,2018-05-25 20:45:31+00:00,241981329,3488,325,79,911,10.732308,21.464615,0.093177,2.7e-05,0.085235,0.022649,0.261181,0.086718,1.4e-05,1e-06,3.264715e-07,4e-06,0,True,127.0,20.0,2.0,10.0,5.05,2.312466,0.9,4.024922,0.1,0.85,0.0,0.069307,0.930693,0.047244,0.047244,0.15748,0.0,0.0,0.0,False,1.0,True,0.0,0.0,0.0,False,False,0.765,0.765,0.235,0.375,57.095,7.78,1.65,0.1,4.407766,4.084184,1.111111,0.889764,False,False,False,False,False,True,False,0.2,0.5,0.1,0.05,0.15,False,,
1,u1000483839800627200,12,11,0.916667,"Theoretical biologist and advisor to data scientists. I have published in evolution, biochemistry, infectious disease, economics, education. Opinions my own.",True,True,True,True,True,False,0.0,0.0,0.0,0.0,0.222222,0.222222,3.022055,2.845351,0.083333,False,False,2018-05-26 21:08:30+00:00,241893550,2563,458,60,2002,5.59607,11.19214,0.178697,7e-05,0.151605,0.02341,0.781116,0.02997,1.1e-05,2e-06,2.48043e-07,8e-06,0,True,157.0,20.0,3.0,6.666667,6.55,3.44202,1.0,4.472136,0.0,1.0,0.0,0.022901,0.977099,0.044586,0.044586,0.121019,0.0,0.0,0.0,False,0.0,False,0.0,0.0,0.0,False,False,0.0,0.0,1.0,0.55,-15.661667,17.1,2.55,0.5,4.234203,4.321928,1.0,0.789809,False,False,False,False,False,False,False,0.15,0.45,0.2,0.05,0.15,False,,


In [10]:
# Normalize every description into the token format fed to the encoder.
normalized_descriptions = user_features_1["description"].apply(normalize_text)
user_features_1["description_normalized"] = normalized_descriptions

# transform returns an (n, 1) object array, so column 0 holds the embedding vectors.
embedder = SentenceEmbedder()
desc_embeddings = embedder.transform(user_features_1["description_normalized"])
user_features_1["desc_embedding"] = desc_embeddings[:, 0]

Embedding text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 98/98 [1:25:04<00:00, 52.09s/it]


In [11]:
import joblib

# Persist the user table with description features and embeddings to disk.
user_features_1_path = "../../02_data/user_features_1.joblib"
joblib.dump(user_features_1, user_features_1_path)

['../../02_data/user_features_1.joblib']

In [6]:
# Load the per-tweet table and preview its first two rows.
tweet_features = pd.read_parquet(
    "../../02_data/tweet_features.parquet",
    engine="pyarrow",
)
tweet_features.head(2)

Unnamed: 0,author_id,id,text,created_at,in_reply_to_user_id,is_reply,is_sensitive,like_count,quote_count,reply_count,retweet_count,label
0,u1001495628738957312,t1502310945158275074,"Join us for a special screening of the documentary #SAPELO and a Q&amp;A with the filmmakers on Thursday, March 31 at the @CarterCenter!üìΩÔ∏èüá®üá≠ @CarterLibrary @SWISS_FILMS https://t.co/53nsRtRI8u",2022-03-11 15:50:15+00:00,,0,0,1,0.0,0.0,1,human
1,u1002590470097154048,t1459274835377500161,Looking forward to meeting the final chapterüëÄüëÄ https://t.co/6NsqPBe272,2021-11-12 21:40:07+00:00,,0,0,0,,,0,bot


In [7]:
# Report the size of the loaded tweet table.
n_rows = tweet_features.shape[0]
n_columns = tweet_features.shape[1]
print(f"The dataset contains {n_rows} rows and {n_columns} columns.")

The dataset contains 1048873 rows and 12 columns.


In [8]:
# Encode the target as integers (human -> 0, bot -> 1). The identity entries
# keep already-encoded labels intact, so running this cell a second time no
# longer turns the whole column into NaN.
label_codes = {"human": 0, "bot": 1, 0: 0, 1: 1}
tweet_features["label"] = tweet_features["label"].map(label_codes)

In [9]:
# One feature dict per tweet, expanded into columns in a single DataFrame
# construction. The previous `.apply(pd.Series)` built a Series object per row,
# which is very slow on a table of this size.
# Column names are left unprefixed here (the description features use "desc_").
tweet_records = tweet_features["text"].apply(lambda x: extract_text_features(x, is_tweet=True))
tweet_feature_df = pd.DataFrame(
    tweet_records.tolist(),
    index=tweet_features.index,  # keep row alignment for the concat below
)

tweet_features_1 = pd.concat([tweet_features, tweet_feature_df], axis=1)
tweet_features_1.head()

Unnamed: 0,author_id,id,text,created_at,in_reply_to_user_id,is_reply,is_sensitive,like_count,quote_count,reply_count,retweet_count,label,is_present,length,num_words,num_sentences,avg_sentence_length,avg_word_length,std_word_length,unique_word_ratio,guiraud_index,repetition_ratio,hapax_ratio,digit_ratio,uppercase_ratio,lowercase_ratio,special_char_ratio,punctuation_ratio,whitespace_ratio,emoji_count,emoji_ratio,mention_count,contains_mention,url_count,contains_url,hashtag_count,cashtag_count,email_count,contains_bot_word_or_hashtag,contains_ai_hashtag,sentiment,sentiment_abs,sentiment_neutrality,sentiment_subjectivity,flesch_reading_ease,flesch_kincaid_grade,avg_syllables_per_word,polysyllabic_word_ratio,char_entropy,word_entropy,avg_word_repetition,compression_ratio,starts_with_emoji,ends_with_emoji,starts_with_url,ends_with_url,contains_pipe_or_bullet,contains_call_to_action,contains_ai_phrase,function_word_ratio,noun_ratio,verb_ratio,pronoun_ratio,adjective_ratio,contains_repeated_chars,is_retweet,is_quote
0,u1001495628738957312,t1502310945158275074,"Join us for a special screening of the documentary #SAPELO and a Q&amp;A with the filmmakers on Thursday, March 31 at the @CarterCenter!üìΩÔ∏èüá®üá≠ @CarterLibrary @SWISS_FILMS https://t.co/53nsRtRI8u",2022-03-11 15:50:15+00:00,,0,0,1,0.0,0.0,1,0,True,153,27,1,27.0,4.037037,2.71459,0.777778,4.041452,0.222222,0.666667,0.013072,0.238532,0.761468,0.111111,0.084967,0.163399,1,0.006536,3,True,1,True,1,0,0,False,False,0.636,0.636,0.364,0.285714,63.486154,7.633846,1.538462,0.074074,4.914588,4.226567,1.285714,0.986928,False,False,False,True,False,True,False,0.407407,0.37037,0.037037,0.037037,0.111111,False,False,False
1,u1002590470097154048,t1459274835377500161,Looking forward to meeting the final chapterüëÄüëÄ https://t.co/6NsqPBe272,2021-11-12 21:40:07+00:00,,0,0,0,,,0,1,True,52,7,1,7.0,4.857143,2.030381,1.0,2.645751,0.0,1.0,0.0,0.097561,0.902439,0.076923,0.038462,0.134615,2,0.038462,0,False,1,True,0,0,0,False,False,0.0,0.0,1.0,1.0,61.24,6.705,1.625,0.0,4.30292,2.807355,1.0,1.173077,False,False,False,True,False,False,False,0.285714,0.142857,0.285714,0.0,0.142857,False,False,False
2,u1002590470097154048,t1405835036847443969,"RT @HamidrezaKasaei: Our new service robot is getting ready to be used in amazing researches on ""generalizable perception and manipulation‚Ä¶",2021-06-18 10:29:26+00:00,,0,0,0,,,1,1,True,129,19,1,19.0,4.894737,3.193657,1.0,4.358899,0.0,1.0,0.0,0.066667,0.933333,0.03876,0.031008,0.147287,0,0.0,1,True,0,False,0,0,0,False,False,0.743,0.743,0.257,0.618182,17.335,15.81,2.0,0.263158,4.385255,4.247928,1.0,0.899225,False,False,False,False,False,False,False,0.315789,0.263158,0.210526,0.052632,0.210526,False,True,False
3,u1002590470097154048,t1401642372044296199,"RT @yao_weijia: I'm happy to be a finalist for the ICRA Best Paper Award &amp; grateful to coauthors @HectorGdeMarina, Zhiyong, @MingCao10. I t‚Ä¶",2021-06-06 20:49:17+00:00,,0,0,0,,,3,1,True,125,22,2,11.0,3.954545,2.285871,0.818182,3.837613,0.181818,0.681818,0.0,0.269663,0.730337,0.112,0.104,0.176,0,0.0,3,True,0,False,0,0,0,False,False,0.9371,0.9371,0.0629,0.65,54.75087,11.336522,1.521739,0.090909,4.795863,4.061482,1.222222,0.976,False,False,False,False,False,False,False,0.272727,0.454545,0.090909,0.0,0.136364,False,True,False
4,u1002590470097154048,t1379337484384014336,"RT @corl_conf: The Conference on #Robot #Learning 2021 will be held on Nov 8-11 in London, UK &amp; virtually. Exciting new changes, including:‚Ä¶",2021-04-06 07:37:37+00:00,,0,0,0,,,44,1,True,140,21,2,10.5,4.666667,2.678545,0.952381,4.364358,0.047619,0.904762,0.05,0.153061,0.846939,0.092857,0.085714,0.157143,0,0.0,1,True,0,False,2,0,0,True,False,0.4939,0.4939,0.5061,0.627273,51.710326,8.903696,1.695652,0.238095,4.84372,4.297079,1.05,0.985714,False,False,False,False,False,False,False,0.238095,0.333333,0.238095,0.0,0.142857,False,True,False


In [10]:
import joblib

# Persist the tweet table with the hand-crafted text features to disk.
tweet_features_1_path = "../../02_data/tweet_features_1.joblib"
joblib.dump(tweet_features_1, tweet_features_1_path)

['../../02_data/tweet_features_1.joblib']

In [8]:
import joblib

# Reload the tweet feature table written by the dump cell above and preview it.
tweet_features_1_path = "../../02_data/tweet_features_1.joblib"
tweet_features_2 = joblib.load(tweet_features_1_path)
tweet_features_2.head(2)

Unnamed: 0,author_id,id,text,created_at,in_reply_to_user_id,is_reply,is_sensitive,like_count,quote_count,reply_count,retweet_count,label,is_present,length,num_words,num_sentences,avg_sentence_length,avg_word_length,std_word_length,unique_word_ratio,guiraud_index,repetition_ratio,hapax_ratio,digit_ratio,uppercase_ratio,lowercase_ratio,special_char_ratio,punctuation_ratio,whitespace_ratio,emoji_count,emoji_ratio,mention_count,contains_mention,url_count,contains_url,hashtag_count,cashtag_count,email_count,contains_bot_word_or_hashtag,contains_ai_hashtag,sentiment,sentiment_abs,sentiment_neutrality,sentiment_subjectivity,flesch_reading_ease,flesch_kincaid_grade,avg_syllables_per_word,polysyllabic_word_ratio,char_entropy,word_entropy,avg_word_repetition,compression_ratio,starts_with_emoji,ends_with_emoji,starts_with_url,ends_with_url,contains_pipe_or_bullet,contains_call_to_action,contains_ai_phrase,function_word_ratio,noun_ratio,verb_ratio,pronoun_ratio,adjective_ratio,contains_repeated_chars,is_retweet,is_quote
0,u1001495628738957312,t1502310945158275074,"Join us for a special screening of the documentary #SAPELO and a Q&amp;A with the filmmakers on Thursday, March 31 at the @CarterCenter!üìΩÔ∏èüá®üá≠ @CarterLibrary @SWISS_FILMS https://t.co/53nsRtRI8u",2022-03-11 15:50:15+00:00,,0,0,1,0.0,0.0,1,0,True,153,27,1,27.0,4.037037,2.71459,0.777778,4.041452,0.222222,0.666667,0.013072,0.238532,0.761468,0.111111,0.084967,0.163399,1,0.006536,3,True,1,True,1,0,0,False,False,0.636,0.636,0.364,0.285714,63.486154,7.633846,1.538462,0.074074,4.914588,4.226567,1.285714,0.986928,False,False,False,True,False,True,False,0.407407,0.37037,0.037037,0.037037,0.111111,False,False,False
1,u1002590470097154048,t1459274835377500161,Looking forward to meeting the final chapterüëÄüëÄ https://t.co/6NsqPBe272,2021-11-12 21:40:07+00:00,,0,0,0,,,0,1,True,52,7,1,7.0,4.857143,2.030381,1.0,2.645751,0.0,1.0,0.0,0.097561,0.902439,0.076923,0.038462,0.134615,2,0.038462,0,False,1,True,0,0,0,False,False,0.0,0.0,1.0,1.0,61.24,6.705,1.625,0.0,4.30292,2.807355,1.0,1.173077,False,False,False,True,False,False,False,0.285714,0.142857,0.285714,0.0,0.142857,False,False,False


In [None]:
# Normalize every tweet into the token format fed to the encoder.
normalized_tweets = tweet_features_2["text"].apply(normalize_text)
tweet_features_2["text_normalized"] = normalized_tweets

# transform returns an (n, 1) object array, so column 0 holds the embedding vectors.
embedder = SentenceEmbedder()
tweet_embeddings = embedder.transform(tweet_features_2["text_normalized"])
tweet_features_2["embedding"] = tweet_embeddings[:, 0]

In [None]:
import joblib

# Persist the tweet table with normalized text and embeddings to disk.
tweet_features_2_path = "../../02_data/tweet_features_2.joblib"
joblib.dump(tweet_features_2, tweet_features_2_path)

['../../02_data/tweet_features_1.joblib']