In [2]:
import pandas as pd
import numpy as np

import joblib

# Do not truncate the column list when a wide feature frame is displayed.
pd.set_option('display.max_columns', None)
# Optional: uncomment to also lift the per-cell text width limit.
#pd.set_option('display.max_colwidth', None)

In [9]:
import nltk
# Tokenizer data; presumably what sent_tokenize/word_tokenize rely on — verify
# against the installed NLTK version.
nltk.download('punkt_tab')
# English POS-tagger data; presumably what pos_tag relies on.
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Julia\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Julia\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [10]:
def extract_user_features(user_df, reference_time=None):
    """Build a per-user frame of profile and activity features.

    Parameters
    ----------
    user_df : pandas.DataFrame
        Raw user records. The columns read are ``id``, ``name``, ``username``,
        ``description``, ``protected``, ``verified``, ``url``, ``location``,
        ``pinned_tweet_id``, ``profile_image_url``, ``cashtags``, ``hashtags``,
        ``mentions``, ``urls``, ``created_at``, ``followers_count``,
        ``following_count``, ``listed_count`` and ``tweet_count``.
    reference_time : timestamp-like, optional
        Moment against which the account age is measured. Defaults to
        2022-01-01 UTC. Naive values are interpreted as UTC.

    Returns
    -------
    pandas.DataFrame
        Identity columns, ``has_*`` presence flags, per-entity counts and flags
        for the description, ``account_age`` in days and the four ``*_rate``
        columns (count divided by account age in days).

    Notes
    -----
    An account age of exactly zero yields NaN rates instead of ``inf``.
    Negative ages (accounts created after ``reference_time``) are left as they
    are and therefore produce negative rates.
    """

    def count_entities(x):
        # Entity columns hold a sequence per row or a missing value; any
        # non-sequence (None, NaN, ...) counts as zero entities.
        if isinstance(x, (np.ndarray, list, tuple)):
            return len(x)
        return 0

    if reference_time is None:
        reference_time = pd.Timestamp("2022-01-01", tz="UTC")
    else:
        reference_time = pd.Timestamp(reference_time)
        if reference_time.tzinfo is None:
            reference_time = reference_time.tz_localize("UTC")
        else:
            reference_time = reference_time.tz_convert("UTC")

    out = pd.DataFrame({
        "id": user_df["id"],
        "name": user_df["name"],
        "username": user_df["username"],
        "description": user_df["description"],
        "is_protected": user_df["protected"],
        "is_verified": user_df["verified"]
        })

    # A profile field counts as present when it is neither missing nor "".
    presence_columns = {
        "has_url": "url",
        "has_location": "location",
        "has_pinned_tweet": "pinned_tweet_id",
        "has_profile_image": "profile_image_url",
    }
    for feature, column in presence_columns.items():
        out[feature] = user_df[column].notna() & (user_df[column] != "")

    entity_columns = {
        "cashtag": "cashtags",
        "hashtag": "hashtags",
        "mention": "mentions",
        "url": "urls",
    }
    for entity, column in entity_columns.items():
        out[f"{entity}_in_description_count"] = user_df[column].apply(count_entities)
    # Flags are added after all counts to keep the original column order.
    for entity in entity_columns:
        out[f"has_{entity}_in_description"] = out[f"{entity}_in_description_count"] > 0

    created_at = pd.to_datetime(user_df["created_at"], utc=True)
    account_age_days = (reference_time - created_at).dt.total_seconds() / 86400
    out["account_age"] = account_age_days

    # Mask a zero age so the rates become NaN rather than +/-inf.
    rate_denominator = account_age_days.where(account_age_days != 0)

    out["follower_rate"] = user_df["followers_count"] / rate_denominator
    out["following_rate"] = user_df["following_count"] / rate_denominator
    out["listed_rate"] = user_df["listed_count"] / rate_denominator
    out["tweet_rate"] = user_df["tweet_count"] / rate_denominator

    return out

In [11]:
import re
import string
import zlib
import numpy as np
import emoji

from collections import Counter
from scipy.stats import entropy

from nltk import sent_tokenize, word_tokenize, pos_tag
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import textstat
from textblob import TextBlob

# One shared VADER analyzer instance, reused for every text.
analyzer = SentimentIntensityAnalyzer()

# A single digit character.
DIGIT_PATTERN = r"\d"
# Any character that is not an ASCII letter, a digit or a plain space.
SPECIAL_CHAR_PATTERN = r"[^A-Za-z0-9 ]"
# "@" followed by 1-15 letters/digits/underscores, not preceded by a word character.
USER_PATTERN = r"(?<!\w)@[A-Za-z0-9_]{1,15}\b"
# "http://", "https://" or "www." followed by a run of non-whitespace characters.
URL_PATTERN = r"(https?://[^\s]+|www\.[^\s]+)"
# "#" followed by one or more word characters.
HASHTAG_PATTERN = r"#\w+"
# "$" followed by one or more word characters.
# NOTE(review): this also matches amounts such as "$100" — confirm that is intended.
CASHTAG_PATTERN = r"\$\w+"
# Simple e-mail shape: local part, "@", domain, and a TLD of at least two letters.
EMAIL_PATTERN = r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b"

# Lower-case English function words used for the function_word_ratio feature.
FUNCTION_WORDS = {
  "the", "a", "an", "and", "or", "but", "if", "while", "with", "to", "of", "in", "on",
  "for", "from", "by", "is", "are", "was", "were", "be", "been", "being"
}

def normalize_entities(text):
  """Replace e-mail addresses, URLs and @mentions with placeholder tokens.

  Substitutions run in the order e-mail -> URL -> mention, producing
  ``<EMAIL>``, ``<URL>`` and ``<USER>``. Anything that is not a non-blank
  string is handed back unchanged.
  """
  if isinstance(text, str) and text.strip() != "":
    substitutions = (
        (EMAIL_PATTERN, "<EMAIL>"),
        (URL_PATTERN, "<URL>"),
        (USER_PATTERN, "<USER>"),
    )
    for pattern, placeholder in substitutions:
      text = re.sub(pattern, placeholder, text)

  return text

def extract_text_features(text, is_tweet=False):
  """Compute lexical, stylistic, sentiment and readability features for one text.

  The text is stripped and passed through ``normalize_entities`` before any
  statistic is taken, so e-mails, URLs and @mentions are counted through their
  ``<EMAIL>``/``<URL>``/``<USER>`` placeholders, and those placeholders also
  contribute to the length and character-ratio features.

  Args:
    text: Raw text. Anything that is not a non-blank string is treated as
      missing.
    is_tweet: When true, ``is_retweet`` and ``is_quote`` are derived from the
      text; otherwise both are ``None``.

  Returns:
    A dict with one entry per feature. Missing and present texts produce the
    same keys in the same order. For a missing text ``is_present`` is False,
    numeric features are ``None`` and boolean flags are False (except
    ``is_retweet``/``is_quote``, which are ``None`` when ``is_tweet`` is
    false).
  """
  if not isinstance(text, str) or text.strip() == "":
    # Keep this dict key-for-key aligned with the populated result below so
    # frames built from these dicts always share one schema.
    return {
        "is_present": False,
        "length": None,
        "num_words": None,
        "num_sentences": None,
        "avg_sentence_length": None,
        "avg_word_length": None,
        "std_word_length": None,
        "unique_word_ratio": None,
        "guiraud_index": None,
        "repetition_ratio": None,
        "hapax_ratio": None,
        "digit_ratio": None,
        "uppercase_ratio": None,
        "lowercase_ratio": None,
        "special_char_ratio": None,
        "punctuation_ratio": None,
        "whitespace_ratio": None,
        "emoji_count": None,
        "emoji_ratio": None,
        "mention_count": None,
        "contains_mention": False,
        "url_count": None,
        "contains_url": False,
        "hashtag_count": None,
        "cashtag_count": None,
        "email_count": None,
        "contains_bot_word_or_hashtag": False,
        "contains_ai_hashtag": False,
        "sentiment": None,
        "sentiment_abs": None,
        "sentiment_neutrality": None,
        "sentiment_subjectivity": None,
        "flesch_reading_ease": None,
        "flesch_kincaid_grade": None,
        "avg_syllables_per_word": None,
        "polysyllabic_word_ratio": None,
        "char_entropy": None,
        "word_entropy": None,
        "avg_word_repetition": None,
        "compression_ratio": None,
        "starts_with_emoji": False,
        "ends_with_emoji": False,
        "starts_with_url": False,
        "ends_with_url": False,
        "contains_pipe_or_bullet": False,
        "contains_call_to_action": False,
        "contains_ai_phrase": False,
        "function_word_ratio": None,
        "noun_ratio": None,
        "verb_ratio": None,
        "pronoun_ratio": None,
        "adjective_ratio": None,
        "contains_repeated_chars": False,
        # Same convention as for present texts: only tweets carry these flags.
        "is_retweet": False if is_tweet else None,
        "is_quote": False if is_tweet else None
    }

  text = text.strip()
  text = normalize_entities(text)
  char_len = len(text)  # > 0: the stripped text is non-empty

  words = word_tokenize(text)
  # Only purely alphabetic tokens count as words.
  words_lower = [w.lower() for w in words if w.isalpha()]
  num_words = len(words_lower)

  sentences = sent_tokenize(text)
  num_sentences = len(sentences)

  # --- A: Presence & length
  avg_sentence_length = num_words / num_sentences if num_sentences else None

  # --- B: Lexical structure
  word_lengths = [len(w) for w in words_lower]
  avg_word_length = np.mean(word_lengths) if word_lengths else None
  std_word_length = np.std(word_lengths) if word_lengths else None

  word_counts = Counter(words_lower)
  unique_word_ratio = len(word_counts) / num_words if num_words else None
  guiraud_index = len(word_counts) / np.sqrt(num_words) if num_words else None
  repetition_ratio = 1 - unique_word_ratio if unique_word_ratio is not None else None
  hapax_ratio = sum(1 for w in word_counts if word_counts[w] == 1) / num_words if num_words else None

  # --- C: Character composition
  digits = len(re.findall(DIGIT_PATTERN, text))
  letters = re.findall(r"[A-Za-z]", text)
  uppercase = sum(1 for c in letters if c.isupper())
  lowercase = sum(1 for c in letters if c.islower())
  special_chars = len(re.findall(SPECIAL_CHAR_PATTERN, text))
  punctuation = sum(1 for c in text if c in string.punctuation)
  whitespaces = text.count(" ")

  # Counted per code point, so a multi-code-point emoji may count more than once.
  emoji_count = sum(1 for c in text if c in emoji.EMOJI_DATA)

  digit_ratio = digits / char_len
  uppercase_ratio = uppercase / len(letters) if letters else None
  lowercase_ratio = lowercase / len(letters) if letters else None
  special_char_ratio = special_chars / char_len
  punctuation_ratio = punctuation / char_len
  whitespace_ratio = whitespaces / char_len
  emoji_ratio = emoji_count / char_len

  # --- D: Token usage
  mention_count = text.count("<USER>")
  url_count = text.count("<URL>")
  hashtag_count = len(re.findall(HASHTAG_PATTERN, text))
  cashtag_count = len(re.findall(CASHTAG_PATTERN, text))
  email_count = text.count("<EMAIL>")

  # --- E: Semantic signals
  contains_bot_word_or_hashtag = bool(re.search(r"(?i)(\bbot\b|#\w*bot\b)", text))
  # "#ai" itself or any hashtag ending in "ai". The former pattern required a
  # word character in front of "#ai", so a standalone "#ai" never matched.
  contains_ai_hashtag = bool(re.search(r"(?i)#\w*ai\b", text))

  sentiment = analyzer.polarity_scores(text)["compound"]
  sentiment_abs = abs(sentiment)
  sentiment_neutrality = 1 - sentiment_abs

  blob = TextBlob(text)
  sentiment_subjectivity = blob.sentiment.subjectivity

  # --- F: Readability
  flesch_reading_ease = textstat.flesch_reading_ease(text)
  flesch_kincaid_grade = textstat.flesch_kincaid_grade(text)
  avg_syllables_per_word = textstat.avg_syllables_per_word(text)
  polysyllabic_word_ratio = textstat.polysyllabcount(text) / num_words if num_words else None

  # --- G: Entropy & compression
  char_entropy = entropy(list(Counter(text).values()), base=2)
  word_entropy = entropy(list(word_counts.values()), base=2) if word_counts else None
  avg_word_repetition = np.mean(list(word_counts.values())) if word_counts else None
  # Compressed byte size relative to the character count.
  compression_ratio = len(zlib.compress(text.encode("utf-8"))) / char_len

  # --- H: Template indicators
  starts_with_emoji = text[0] in emoji.EMOJI_DATA
  ends_with_emoji = text[-1] in emoji.EMOJI_DATA
  starts_with_url = text.startswith("<URL>")
  ends_with_url = text.endswith("<URL>")
  contains_pipe_or_bullet = bool(re.search(r"\s[|•]\s", text))
  contains_call_to_action = bool(re.search(r"(?i)\b(follow|dm|click|join|subscribe|contact|call|buy|giveaway|free|win|retweet|apply)\b", text))
  contains_ai_phrase = bool(re.search(r"(?i)\b(powered by AI|autogenerated|generated by AI|AI assistant)\b", text))

  # --- I: Grammatical composition (self-reference & POS)
  function_word_ratio = sum(w in FUNCTION_WORDS for w in words_lower) / num_words if num_words else None

  pos_tags = pos_tag(words_lower)
  pos_counts = Counter(tag for _, tag in pos_tags)

  noun_ratio = sum(pos_counts[t] for t in ["NN", "NNS", "NNP", "NNPS"]) / num_words if num_words else None
  verb_ratio = sum(pos_counts[t] for t in ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]) / num_words if num_words else None
  pronoun_ratio = sum(pos_counts[t] for t in ["PRP", "PRP$"]) / num_words if num_words else None
  adjective_ratio = sum(pos_counts[t] for t in ["JJ", "JJR", "JJS"]) / num_words if num_words else None

  # --- J: Noise & stylistic irregularities
  # Any character repeated three or more times in a row.
  contains_repeated_chars = bool(re.search(r'(.)\1{2,}', text))

  # --- K: Platform-specific discourse markers (tweets only)
  if is_tweet:
    is_retweet = bool(re.match(r'^RT\s+<USER>', text))
    is_quote = bool(re.match(r'^(QT|“|")', text))
  else:
    is_retweet = None
    is_quote = None

  return {
      "is_present": True,
      "length": char_len,
      "num_words": num_words,
      "num_sentences": num_sentences,
      "avg_sentence_length": avg_sentence_length,
      "avg_word_length": avg_word_length,
      "std_word_length": std_word_length,
      "unique_word_ratio": unique_word_ratio,
      "guiraud_index": guiraud_index,
      "repetition_ratio": repetition_ratio,
      "hapax_ratio": hapax_ratio,
      "digit_ratio": digit_ratio,
      "uppercase_ratio": uppercase_ratio,
      "lowercase_ratio": lowercase_ratio,
      "special_char_ratio": special_char_ratio,
      "punctuation_ratio": punctuation_ratio,
      "whitespace_ratio": whitespace_ratio,
      "emoji_count": emoji_count,
      "emoji_ratio": emoji_ratio,
      "mention_count": mention_count,
      "contains_mention": mention_count > 0,
      "url_count": url_count,
      "contains_url": url_count > 0,
      "hashtag_count": hashtag_count,
      "cashtag_count": cashtag_count,
      "email_count": email_count,
      "contains_bot_word_or_hashtag": contains_bot_word_or_hashtag,
      "contains_ai_hashtag": contains_ai_hashtag,
      "sentiment": sentiment,
      "sentiment_abs": sentiment_abs,
      "sentiment_neutrality": sentiment_neutrality,
      "sentiment_subjectivity": sentiment_subjectivity,
      "flesch_reading_ease": flesch_reading_ease,
      "flesch_kincaid_grade": flesch_kincaid_grade,
      "avg_syllables_per_word": avg_syllables_per_word,
      "polysyllabic_word_ratio": polysyllabic_word_ratio,
      "char_entropy": char_entropy,
      "word_entropy": word_entropy,
      "avg_word_repetition": avg_word_repetition,
      "compression_ratio": compression_ratio,
      "starts_with_emoji": starts_with_emoji,
      "ends_with_emoji": ends_with_emoji,
      "starts_with_url": starts_with_url,
      "ends_with_url": ends_with_url,
      "contains_pipe_or_bullet": contains_pipe_or_bullet,
      "contains_call_to_action": contains_call_to_action,
      "contains_ai_phrase": contains_ai_phrase,
      "function_word_ratio": function_word_ratio,
      "noun_ratio": noun_ratio,
      "verb_ratio": verb_ratio,
      "pronoun_ratio": pronoun_ratio,
      "adjective_ratio": adjective_ratio,
      "contains_repeated_chars": contains_repeated_chars,
      "is_retweet": is_retweet,
      "is_quote": is_quote
  }
  
def extract_basic_text_features(text):
  """Compute a small set of character-level features for a short text.

  Args:
    text: Raw text. Anything that is not a non-blank string is treated as
      missing.

  Returns:
    A dict with the keys ``is_present``, ``length``, ``digit_ratio``,
    ``uppercase_ratio``, ``lowercase_ratio``, ``special_char_ratio``,
    ``contains_bot_word_or_hashtag`` and ``char_entropy``. For a missing text
    the two boolean entries are False and every other entry is ``None``.
  """
  if not (isinstance(text, str) and text.strip() != ""):
    absent = dict.fromkeys((
        "is_present",
        "length",
        "digit_ratio",
        "uppercase_ratio",
        "lowercase_ratio",
        "special_char_ratio",
        "contains_bot_word_or_hashtag",
        "char_entropy",
    ))
    absent["is_present"] = False
    absent["contains_bot_word_or_hashtag"] = False
    return absent

  stripped = text.strip()
  total_chars = len(stripped)

  # Case shares are taken over ASCII letters only, not over all characters.
  ascii_letters = re.findall(r"[A-Za-z]", stripped)
  n_letters = len(ascii_letters)
  if n_letters:
    upper_share = sum(map(str.isupper, ascii_letters)) / n_letters
    lower_share = sum(map(str.islower, ascii_letters)) / n_letters
  else:
    upper_share = None
    lower_share = None

  n_digits = len(re.findall(DIGIT_PATTERN, stripped))
  n_special = len(re.findall(SPECIAL_CHAR_PATTERN, stripped))
  mentions_bot = re.search(r"(?i)(\bbot\b|#\w*bot\b)", stripped) is not None

  # Shannon entropy (in bits) of the character frequency distribution.
  symbol_frequencies = list(Counter(stripped).values())

  return {
      "is_present": True,
      "length": total_chars,
      "digit_ratio": n_digits / total_chars,
      "uppercase_ratio": upper_share,
      "lowercase_ratio": lower_share,
      "special_char_ratio": n_special / total_chars,
      "contains_bot_word_or_hashtag": mentions_bot,
      "char_entropy": entropy(symbol_frequencies, base=2)
  }

In [12]:
import re
from emoji import demojize
from nltk.tokenize import TweetTokenizer

# These three patterns repeat, with identical values, the definitions from the
# feature-extraction cell; keep both copies in sync if either one changes.
# "@" followed by 1-15 letters/digits/underscores, not preceded by a word character.
USER_PATTERN = r"(?<!\w)@[A-Za-z0-9_]{1,15}\b"
# "http://", "https://" or "www." followed by a run of non-whitespace characters.
URL_PATTERN = r"(https?://[^\s]+|www\.[^\s]+)"
# Simple e-mail shape: local part, "@", domain, and a TLD of at least two letters.
EMAIL_PATTERN = r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b"

# Shared tokenizer instance used by normalize_text.
tokenizer = TweetTokenizer()

def normalize_entities(text):
  """Mask e-mail addresses, URLs and @mentions with placeholder tokens.

  E-mails become ``<EMAIL>``, URLs ``<URL>`` and mentions ``<USER>``, applied
  in that order. Input that is not a non-blank string is returned as is.

  NOTE(review): a function with this name and the same body is also defined
  in the feature-extraction cell; this later definition replaces it.
  """
  is_blank = not isinstance(text, str) or text.strip() == ""
  if is_blank:
    return text

  without_emails = re.sub(EMAIL_PATTERN, "<EMAIL>", text)
  without_urls = re.sub(URL_PATTERN, "<URL>", without_emails)
  return re.sub(USER_PATTERN, "<USER>", without_urls)

def normalize_token(token):
    """Map one token to its normalized form.

    Typographic apostrophes and ellipses are replaced by their ASCII forms
    first. After that, ``<USER>`` becomes ``@USER``, ``<URL>`` becomes
    ``HTTPURL``, a single-character token is passed through ``demojize``, and
    every other token is returned as it is.
    """
    cleaned = token.replace("’", "'").replace("…", "...")

    placeholder = {"<USER>": "@USER", "<URL>": "HTTPURL"}.get(cleaned)
    if placeholder is not None:
        return placeholder

    # Only single-character tokens are handed to demojize.
    return demojize(cleaned) if len(cleaned) == 1 else cleaned

def normalize_text(text):
    """Normalize raw text into the space-separated token form fed to the embedder.

    Steps: strip, mask entities with ``normalize_entities``, tokenize with the
    shared ``TweetTokenizer``, map every token through ``normalize_token``,
    then split contractions and re-join time expressions following the
    BERTweet ``TweetNormalizer`` rules.

    The contraction rules previously replaced each pattern with itself (for
    example ``" n't "`` -> ``" n't "``) and therefore did nothing; they now
    insert the separating space they were meant to add.

    Args:
        text: Raw text. Anything that is not a non-blank string yields ``""``.

    Returns:
        The normalized text with single spaces between tokens.
    """
    if not isinstance(text, str) or text.strip() == "":
        return ""

    text = text.strip()
    text = normalize_entities(text)

    tokens = tokenizer.tokenize(text)
    norm_tweet = " ".join(normalize_token(t) for t in tokens)

    # Negations: "don't " -> "do n't ", while "can't" and "ain't" stay whole.
    norm_tweet = (
        norm_tweet.replace("cannot ", "can not ")
                  .replace("n't ", " n't ")
                  .replace("n 't ", " n't ")
                  .replace("ca n't", "can't")
                  .replace("ai n't", "ain't")
    )

    # Verb contractions: "it's " -> "it 's ", "we'll " -> "we 'll ", ...
    norm_tweet = (
        norm_tweet.replace("'m ", " 'm ")
                  .replace("'re ", " 're ")
                  .replace("'s ", " 's ")
                  .replace("'ll ", " 'll ")
                  .replace("'d ", " 'd ")
                  .replace("'ve ", " 've ")
    )

    # Time expressions split by the tokenizer are glued back together.
    norm_tweet = (
        norm_tweet.replace(" p . m .", " p.m.")
                  .replace(" p . m ", " p.m ")
                  .replace(" a . m .", " a.m.")
                  .replace(" a . m ", " a.m ")
    )

    # Collapse the double spaces introduced by the replacements above.
    return " ".join(norm_tweet.split())

In [13]:
import torch
from sklearn.base import BaseEstimator, TransformerMixin
from transformers import AutoModel, AutoTokenizer
import numpy as np
from torch.utils.data import DataLoader
from tqdm import tqdm

# Run the model on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class SentenceEmbedder(BaseEstimator, TransformerMixin):
    """Scikit-learn style transformer that embeds texts with a pretrained model.

    Each text is tokenized, run through the model, and represented by the mean
    of the last hidden states over its non-padding tokens.

    Parameters
    ----------
    model_name : str
        Identifier passed to ``AutoModel``/``AutoTokenizer.from_pretrained``.
    batch_size : int
        Number of texts per forward pass.
    max_length : int
        Token limit per text; longer inputs are truncated.

    Notes
    -----
    The model and tokenizer are loaded in ``__init__`` and the model is moved
    to the module-level ``device``.
    """

    def __init__(self, model_name="vinai/bertweet-base", batch_size=1024, max_length=64):
        self.model_name = model_name
        self.batch_size = batch_size
        self.max_length = max_length
        self.model = AutoModel.from_pretrained(self.model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model.to(device)
        self.model.eval()

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the model is used as loaded."""
        return self

    def transform(self, X):
        """Embed every text in ``X``.

        Parameters
        ----------
        X : pandas.Series or sequence of str
            Texts to embed.

        Returns
        -------
        numpy.ndarray
            Object array of shape ``(len(X), 1)``; each cell holds the
            embedding vector of the corresponding text (768-dimensional
            according to the original comments for the default model).
        """
        if isinstance(X, pd.Series):
            X = X.values

        out = np.empty((len(X), 1), dtype=object)
        if len(X) == 0:
            # torch.cat would fail on an empty list of batches.
            return out

        loader = DataLoader(X, batch_size=self.batch_size, shuffle=False)
        embeddings = []

        with torch.no_grad():
            for batch in tqdm(loader, desc="Embedding text"):
                inputs = self.tokenizer(
                    batch,
                    padding=True,
                    truncation=True,
                    max_length=self.max_length,
                    return_tensors="pt"
                )
                # Move the input tensors to the same device as the model.
                inputs = {k: v.to(device) for k, v in inputs.items()}

                outputs = self.model(**inputs)

                last_hidden = outputs.last_hidden_state  # (B, T, H)
                attention_mask = inputs["attention_mask"].unsqueeze(-1)  # (B, T, 1)

                # Mean pooling: zero out padding positions, then divide by the
                # number of real tokens per text.
                masked_hidden = last_hidden * attention_mask
                batch_embeddings = (
                    masked_hidden.sum(dim=1) / attention_mask.sum(dim=1)
                )  # (B, H)

                embeddings.append(batch_embeddings.cpu())

        embeddings = torch.cat(embeddings, dim=0)  # (N, H)
        out[:, 0] = list(embeddings.numpy())  # one vector per row

        torch.cuda.empty_cache()
        return out

In [7]:
# Load the raw user table from parquet and preview its first row.
user_df = pd.read_parquet("../../02_data/user_df.parquet", engine='pyarrow')
user_df.head(1)

Unnamed: 0,id,name,username,created_at,description,url,cashtags,hashtags,mentions,urls,location,pinned_tweet_id,profile_image_url,protected,followers_count,following_count,listed_count,tweet_count,verified
0,u1217628182611927040,Boaz Barak,boazbaraktcs,2020-01-16 02:02:55+00:00,Theoretical Computer Scientist. See also https...,https://t.co/BoMip9FF17,,,,"[{'display_url': 'windowsontheory.org', 'end':...","Cambridge, MA",,https://pbs.twimg.com/profile_images/125226236...,False,7316,215,69,3098,False


In [8]:
# Stage 1: profile and activity features derived from the raw user table.
user_df_1 = extract_user_features(user_df)
user_df_1.head(1)

Unnamed: 0,id,name,username,description,is_protected,is_verified,has_url,has_location,has_pinned_tweet,has_profile_image,cashtag_in_description_count,hashtag_in_description_count,mention_in_description_count,url_in_description_count,has_cashtag_in_description,has_hashtag_in_description,has_mention_in_description,has_url_in_description,account_age,follower_rate,following_rate,listed_rate,tweet_rate
0,u1217628182611927040,Boaz Barak,boazbaraktcs,Theoretical Computer Scientist. See also https...,False,False,True,True,False,True,0,0,0,2,False,False,False,True,715.914641,10.219095,0.300315,0.09638,4.327332


In [9]:
# Stage 2: character-level features of the display name, prefixed with "name_".
# Building the frame once from the list of feature dicts avoids creating one
# pandas Series per row, which `.apply(pd.Series)` does.
name_records = user_df_1["name"].map(extract_basic_text_features).tolist()
name_feature_df = pd.DataFrame(name_records, index=user_df_1.index).add_prefix("name_")

user_df_2 = pd.concat([user_df_1, name_feature_df], axis=1)
user_df_2.head(1)

Unnamed: 0,id,name,username,description,is_protected,is_verified,has_url,has_location,has_pinned_tweet,has_profile_image,cashtag_in_description_count,hashtag_in_description_count,mention_in_description_count,url_in_description_count,has_cashtag_in_description,has_hashtag_in_description,has_mention_in_description,has_url_in_description,account_age,follower_rate,following_rate,listed_rate,tweet_rate,name_is_present,name_length,name_digit_ratio,name_uppercase_ratio,name_lowercase_ratio,name_special_char_ratio,name_contains_bot_word_or_hashtag,name_char_entropy
0,u1217628182611927040,Boaz Barak,boazbaraktcs,Theoretical Computer Scientist. See also https...,False,False,True,True,False,True,0,0,0,2,False,False,False,True,715.914641,10.219095,0.300315,0.09638,4.327332,True,10.0,0.0,0.222222,0.777778,0.0,False,2.646439


In [10]:
# Stage 3: character-level features of the username, prefixed with "username_".
# Building the frame once from the list of feature dicts avoids creating one
# pandas Series per row, which `.apply(pd.Series)` does.
username_records = user_df_2["username"].map(extract_basic_text_features).tolist()
username_feature_df = pd.DataFrame(username_records, index=user_df_2.index).add_prefix("username_")

user_df_3 = pd.concat([user_df_2, username_feature_df], axis=1)
user_df_3.head(1)

Unnamed: 0,id,name,username,description,is_protected,is_verified,has_url,has_location,has_pinned_tweet,has_profile_image,cashtag_in_description_count,hashtag_in_description_count,mention_in_description_count,url_in_description_count,has_cashtag_in_description,has_hashtag_in_description,has_mention_in_description,has_url_in_description,account_age,follower_rate,following_rate,listed_rate,tweet_rate,name_is_present,name_length,name_digit_ratio,name_uppercase_ratio,name_lowercase_ratio,name_special_char_ratio,name_contains_bot_word_or_hashtag,name_char_entropy,username_is_present,username_length,username_digit_ratio,username_uppercase_ratio,username_lowercase_ratio,username_special_char_ratio,username_contains_bot_word_or_hashtag,username_char_entropy
0,u1217628182611927040,Boaz Barak,boazbaraktcs,Theoretical Computer Scientist. See also https...,False,False,True,True,False,True,0,0,0,2,False,False,False,True,715.914641,10.219095,0.300315,0.09638,4.327332,True,10.0,0.0,0.222222,0.777778,0.0,False,2.646439,True,12,0.0,0.0,1.0,0.0,False,3.022055


In [None]:
# Checkpoint stage 3 so later cells can resume from disk.
joblib.dump(user_df_3, "../../02_data/user_df_3.joblib")

['../../02_data/user_df_3.joblib']

In [None]:
# Resume from the stage-3 checkpoint.
user_df_3 = joblib.load("../../02_data/user_df_3.joblib")
user_df_3.head(1)

Unnamed: 0,id,name,username,description,is_protected,is_verified,has_url,has_location,has_pinned_tweet,has_profile_image,cashtag_in_description_count,hashtag_in_description_count,mention_in_description_count,url_in_description_count,has_cashtag_in_description,has_hashtag_in_description,has_mention_in_description,has_url_in_description,account_age,follower_rate,following_rate,listed_rate,tweet_rate,name_is_present,name_length,name_digit_ratio,name_uppercase_ratio,name_lowercase_ratio,name_special_char_ratio,name_contains_bot_word_or_hashtag,name_char_entropy,username_is_present,username_length,username_digit_ratio,username_uppercase_ratio,username_lowercase_ratio,username_special_char_ratio,username_contains_bot_word_or_hashtag,username_char_entropy
0,u1217628182611927040,Boaz Barak,boazbaraktcs,Theoretical Computer Scientist. See also https...,False,False,True,True,False,True,0,0,0,2,False,False,False,True,715.914641,10.219095,0.300315,0.09638,4.327332,True,10.0,0.0,0.222222,0.777778,0.0,False,2.646439,True,12,0.0,0.0,1.0,0.0,False,3.022055


In [9]:
# Stage 4: text features of the profile description, prefixed with "desc_".
# Building the frame once from the list of feature dicts avoids creating one
# pandas Series per row, which `.apply(pd.Series)` does.
desc_records = user_df_3["description"].map(extract_text_features).tolist()
desc_feature_df = (
    pd.DataFrame(desc_records, index=user_df_3.index)
    .add_prefix("desc_")
    # The retweet/quote markers are only computed for tweets.
    .drop(columns=["desc_is_retweet", "desc_is_quote"])
)

user_df_4 = pd.concat([user_df_3, desc_feature_df], axis=1)
# Normalized description text, later used as the embedding input.
user_df_4["description_normalized"] = user_df_4["description"].apply(normalize_text)

user_df_4.head(1)

Unnamed: 0,id,name,username,description,is_protected,is_verified,has_url,has_location,has_pinned_tweet,has_profile_image,cashtag_in_description_count,hashtag_in_description_count,mention_in_description_count,url_in_description_count,has_cashtag_in_description,has_hashtag_in_description,has_mention_in_description,has_url_in_description,account_age,follower_rate,following_rate,listed_rate,tweet_rate,name_is_present,name_length,name_digit_ratio,name_uppercase_ratio,name_lowercase_ratio,name_special_char_ratio,name_contains_bot_word_or_hashtag,name_char_entropy,username_is_present,username_length,username_digit_ratio,username_uppercase_ratio,username_lowercase_ratio,username_special_char_ratio,username_contains_bot_word_or_hashtag,username_char_entropy,desc_is_present,desc_length,desc_num_words,desc_num_sentences,desc_avg_sentence_length,desc_avg_word_length,desc_std_word_length,desc_unique_word_ratio,desc_guiraud_index,desc_repetition_ratio,desc_hapax_ratio,desc_digit_ratio,desc_uppercase_ratio,desc_lowercase_ratio,desc_special_char_ratio,desc_punctuation_ratio,desc_whitespace_ratio,desc_emoji_count,desc_emoji_ratio,desc_mention_count,desc_contains_mention,desc_url_count,desc_contains_url,desc_hashtag_count,desc_cashtag_count,desc_email_count,desc_contains_bot_word_or_hashtag,desc_contains_ai_hashtag,desc_sentiment,desc_sentiment_abs,desc_sentiment_neutrality,desc_sentiment_subjectivity,desc_flesch_reading_ease,desc_flesch_kincaid_grade,desc_avg_syllables_per_word,desc_polysyllabic_word_ratio,desc_char_entropy,desc_word_entropy,desc_avg_word_repetition,desc_compression_ratio,desc_starts_with_emoji,desc_ends_with_emoji,desc_starts_with_url,desc_ends_with_url,desc_contains_pipe_or_bullet,desc_contains_call_to_action,desc_contains_ai_phrase,desc_function_word_ratio,desc_noun_ratio,desc_verb_ratio,desc_pronoun_ratio,desc_adjective_ratio,desc_contains_repeated_chars,description_normalized
0,u1217628182611927040,Boaz Barak,boazbaraktcs,Theoretical Computer Scientist. See also https...,False,False,True,True,False,True,0,0,0,2,False,False,False,True,715.914641,10.219095,0.300315,0.09638,4.327332,True,10.0,0.0,0.222222,0.777778,0.0,False,2.646439,True,12,0.0,0.0,1.0,0.0,False,3.022055,True,56.0,8.0,2.0,4.0,5.5,3.082207,0.875,2.474874,0.125,0.75,0.0,0.227273,0.772727,0.089286,0.089286,0.125,0.0,0.0,0.0,False,2.0,True,0.0,0.0,0.0,False,False,0.0,0.0,1.0,0.1,23.0,11.045,2.125,0.375,4.389035,2.75,1.142857,1.053571,False,False,False,True,False,False,False,0.125,0.375,0.0,0.0,0.375,False,Theoretical Computer Scientist . See also HTTP...


In [10]:
# Checkpoint stage 4 so later cells can resume from disk.
joblib.dump(user_df_4, "../../02_data/user_df_4.joblib")

['../../02_data/user_df_4.joblib']

In [7]:
# Resume from the stage-4 checkpoint.
user_df_4 = joblib.load("../../02_data/user_df_4.joblib")
user_df_4.head(1)

Unnamed: 0,id,name,username,description,is_protected,is_verified,has_url,has_location,has_pinned_tweet,has_profile_image,cashtag_in_description_count,hashtag_in_description_count,mention_in_description_count,url_in_description_count,has_cashtag_in_description,has_hashtag_in_description,has_mention_in_description,has_url_in_description,account_age,follower_rate,following_rate,listed_rate,tweet_rate,name_is_present,name_length,name_digit_ratio,name_uppercase_ratio,name_lowercase_ratio,name_special_char_ratio,name_contains_bot_word_or_hashtag,name_char_entropy,username_is_present,username_length,username_digit_ratio,username_uppercase_ratio,username_lowercase_ratio,username_special_char_ratio,username_contains_bot_word_or_hashtag,username_char_entropy,desc_is_present,desc_length,desc_num_words,desc_num_sentences,desc_avg_sentence_length,desc_avg_word_length,desc_std_word_length,desc_unique_word_ratio,desc_guiraud_index,desc_repetition_ratio,desc_hapax_ratio,desc_digit_ratio,desc_uppercase_ratio,desc_lowercase_ratio,desc_special_char_ratio,desc_punctuation_ratio,desc_whitespace_ratio,desc_emoji_count,desc_emoji_ratio,desc_mention_count,desc_contains_mention,desc_url_count,desc_contains_url,desc_hashtag_count,desc_cashtag_count,desc_email_count,desc_contains_bot_word_or_hashtag,desc_contains_ai_hashtag,desc_sentiment,desc_sentiment_abs,desc_sentiment_neutrality,desc_sentiment_subjectivity,desc_flesch_reading_ease,desc_flesch_kincaid_grade,desc_avg_syllables_per_word,desc_polysyllabic_word_ratio,desc_char_entropy,desc_word_entropy,desc_avg_word_repetition,desc_compression_ratio,desc_starts_with_emoji,desc_ends_with_emoji,desc_starts_with_url,desc_ends_with_url,desc_contains_pipe_or_bullet,desc_contains_call_to_action,desc_contains_ai_phrase,desc_function_word_ratio,desc_noun_ratio,desc_verb_ratio,desc_pronoun_ratio,desc_adjective_ratio,desc_contains_repeated_chars,description_normalized
0,u1217628182611927040,Boaz Barak,boazbaraktcs,Theoretical Computer Scientist. See also https...,False,False,True,True,False,True,0,0,0,2,False,False,False,True,715.914641,10.219095,0.300315,0.09638,4.327332,True,10.0,0.0,0.222222,0.777778,0.0,False,2.646439,True,12,0.0,0.0,1.0,0.0,False,3.022055,True,56.0,8.0,2.0,4.0,5.5,3.082207,0.875,2.474874,0.125,0.75,0.0,0.227273,0.772727,0.089286,0.089286,0.125,0.0,0.0,0.0,False,2.0,True,0.0,0.0,0.0,False,False,0.0,0.0,1.0,0.1,23.0,11.045,2.125,0.375,4.389035,2.75,1.142857,1.053571,False,False,False,True,False,False,False,0.125,0.375,0.0,0.0,0.375,False,Theoretical Computer Scientist . See also HTTP...


In [9]:
# Load the user labels (columns: id, label).
label_df = pd.read_csv("../../02_data/label.csv")
label_df.head()

Unnamed: 0,id,label
0,u1217628182611927040,human
1,u2664730894,human
2,u1266703520205549568,human
3,u1089159225148882949,human
4,u36741729,bot


In [11]:
# Stage 5: attach the label to every user. The left join keeps all user rows;
# `validate` raises if an id occurs more than once in label_df, which would
# otherwise silently duplicate user rows.
user_df_5 = user_df_4.merge(
    label_df,
    on="id",
    how="left",
    validate="many_to_one"
    )
user_df_5.head(1)

Unnamed: 0,id,name,username,description,is_protected,is_verified,has_url,has_location,has_pinned_tweet,has_profile_image,cashtag_in_description_count,hashtag_in_description_count,mention_in_description_count,url_in_description_count,has_cashtag_in_description,has_hashtag_in_description,has_mention_in_description,has_url_in_description,account_age,follower_rate,following_rate,listed_rate,tweet_rate,name_is_present,name_length,name_digit_ratio,name_uppercase_ratio,name_lowercase_ratio,name_special_char_ratio,name_contains_bot_word_or_hashtag,name_char_entropy,username_is_present,username_length,username_digit_ratio,username_uppercase_ratio,username_lowercase_ratio,username_special_char_ratio,username_contains_bot_word_or_hashtag,username_char_entropy,desc_is_present,desc_length,desc_num_words,desc_num_sentences,desc_avg_sentence_length,desc_avg_word_length,desc_std_word_length,desc_unique_word_ratio,desc_guiraud_index,desc_repetition_ratio,desc_hapax_ratio,desc_digit_ratio,desc_uppercase_ratio,desc_lowercase_ratio,desc_special_char_ratio,desc_punctuation_ratio,desc_whitespace_ratio,desc_emoji_count,desc_emoji_ratio,desc_mention_count,desc_contains_mention,desc_url_count,desc_contains_url,desc_hashtag_count,desc_cashtag_count,desc_email_count,desc_contains_bot_word_or_hashtag,desc_contains_ai_hashtag,desc_sentiment,desc_sentiment_abs,desc_sentiment_neutrality,desc_sentiment_subjectivity,desc_flesch_reading_ease,desc_flesch_kincaid_grade,desc_avg_syllables_per_word,desc_polysyllabic_word_ratio,desc_char_entropy,desc_word_entropy,desc_avg_word_repetition,desc_compression_ratio,desc_starts_with_emoji,desc_ends_with_emoji,desc_starts_with_url,desc_ends_with_url,desc_contains_pipe_or_bullet,desc_contains_call_to_action,desc_contains_ai_phrase,desc_function_word_ratio,desc_noun_ratio,desc_verb_ratio,desc_pronoun_ratio,desc_adjective_ratio,desc_contains_repeated_chars,description_normalized,label
0,u1217628182611927040,Boaz Barak,boazbaraktcs,Theoretical Computer Scientist. See also https...,False,False,True,True,False,True,0,0,0,2,False,False,False,True,715.914641,10.219095,0.300315,0.09638,4.327332,True,10.0,0.0,0.222222,0.777778,0.0,False,2.646439,True,12,0.0,0.0,1.0,0.0,False,3.022055,True,56.0,8.0,2.0,4.0,5.5,3.082207,0.875,2.474874,0.125,0.75,0.0,0.227273,0.772727,0.089286,0.089286,0.125,0.0,0.0,0.0,False,2.0,True,0.0,0.0,0.0,False,False,0.0,0.0,1.0,0.1,23.0,11.045,2.125,0.375,4.389035,2.75,1.142857,1.053571,False,False,False,True,False,False,False,0.125,0.375,0.0,0.0,0.375,False,Theoretical Computer Scientist . See also HTTP...,human


In [12]:
# Checkpoint stage 5 so later cells can resume from disk.
joblib.dump(user_df_5, "../../02_data/user_df_5.joblib")

['../../02_data/user_df_5.joblib']

In [8]:
# Resume from the stage-5 checkpoint.
user_df_5 = joblib.load("../../02_data/user_df_5.joblib")
user_df_5.head(1)

Unnamed: 0,id,name,username,description,is_protected,is_verified,has_url,has_location,has_pinned_tweet,has_profile_image,cashtag_in_description_count,hashtag_in_description_count,mention_in_description_count,url_in_description_count,has_cashtag_in_description,has_hashtag_in_description,has_mention_in_description,has_url_in_description,account_age,follower_rate,following_rate,listed_rate,tweet_rate,name_is_present,name_length,name_digit_ratio,name_uppercase_ratio,name_lowercase_ratio,name_special_char_ratio,name_contains_bot_word_or_hashtag,name_char_entropy,username_is_present,username_length,username_digit_ratio,username_uppercase_ratio,username_lowercase_ratio,username_special_char_ratio,username_contains_bot_word_or_hashtag,username_char_entropy,desc_is_present,desc_length,desc_num_words,desc_num_sentences,desc_avg_sentence_length,desc_avg_word_length,desc_std_word_length,desc_unique_word_ratio,desc_guiraud_index,desc_repetition_ratio,desc_hapax_ratio,desc_digit_ratio,desc_uppercase_ratio,desc_lowercase_ratio,desc_special_char_ratio,desc_punctuation_ratio,desc_whitespace_ratio,desc_emoji_count,desc_emoji_ratio,desc_mention_count,desc_contains_mention,desc_url_count,desc_contains_url,desc_hashtag_count,desc_cashtag_count,desc_email_count,desc_contains_bot_word_or_hashtag,desc_contains_ai_hashtag,desc_sentiment,desc_sentiment_abs,desc_sentiment_neutrality,desc_sentiment_subjectivity,desc_flesch_reading_ease,desc_flesch_kincaid_grade,desc_avg_syllables_per_word,desc_polysyllabic_word_ratio,desc_char_entropy,desc_word_entropy,desc_avg_word_repetition,desc_compression_ratio,desc_starts_with_emoji,desc_ends_with_emoji,desc_starts_with_url,desc_ends_with_url,desc_contains_pipe_or_bullet,desc_contains_call_to_action,desc_contains_ai_phrase,desc_function_word_ratio,desc_noun_ratio,desc_verb_ratio,desc_pronoun_ratio,desc_adjective_ratio,desc_contains_repeated_chars,description_normalized,label
0,u1217628182611927040,Boaz Barak,boazbaraktcs,Theoretical Computer Scientist. See also https...,False,False,True,True,False,True,0,0,0,2,False,False,False,True,715.914641,10.219095,0.300315,0.09638,4.327332,True,10.0,0.0,0.222222,0.777778,0.0,False,2.646439,True,12,0.0,0.0,1.0,0.0,False,3.022055,True,56.0,8.0,2.0,4.0,5.5,3.082207,0.875,2.474874,0.125,0.75,0.0,0.227273,0.772727,0.089286,0.089286,0.125,0.0,0.0,0.0,False,2.0,True,0.0,0.0,0.0,False,False,0.0,0.0,1.0,0.1,23.0,11.045,2.125,0.375,4.389035,2.75,1.142857,1.053571,False,False,False,True,False,False,False,0.125,0.375,0.0,0.0,0.375,False,Theoretical Computer Scientist . See also HTTP...,human


In [11]:
# Number of rows embedded per run of the next cell.
CHUNK_SIZE = 300_000

# Zero-based index of the chunk to embed in this run. Change it by hand and
# re-run this cell and the next one once per chunk.
i = 3

START = i * CHUNK_SIZE
END = min(START + CHUNK_SIZE, len(user_df_5))

# An out-of-range index would otherwise give END < START and an empty slice.
if not 0 <= START < len(user_df_5):
    raise IndexError(f"chunk index {i} is out of range for {len(user_df_5)} rows")

print(f"Start: {START}, End: {END}")

Start: 900000, End: 1000000


In [12]:
import gc

embedder = SentenceEmbedder()

# Work on a private copy of rows START:END so user_df_5 itself is not modified.
chunk = user_df_5.iloc[START:END].copy()
chunk["desc_embedding"] = embedder.transform(chunk["description_normalized"])[:, 0]

# One file per slice, named after its row bounds.
joblib.dump(chunk, f"../../02_data/user_df_6_{START}_{END}.joblib")

# Drop the slice and force a collection before the next chunk is processed.
del chunk
gc.collect()

print(f"Saved embeddings {START}:{END}")

Embedding text: 100%|██████████| 98/98 [1:27:40<00:00, 53.68s/it]


Saved embeddings 900000:1000000


In [7]:
import os
import re
import joblib

DATA_DIR = "../../02_data"
# Chunk files are named user_df_6_<start>_<end>.joblib; group(1) is the start row.
pattern = re.compile(r"user_df_6_(\d+)_(\d+)\.joblib")

# fullmatch (not match) so that names with trailing text, e.g. "*.joblib.bak",
# are not mistaken for chunk files.
files = [f for f in os.listdir(DATA_DIR) if pattern.fullmatch(f)]

# Order numerically by start row; a plain string sort would put 300000 after 1000000.
files_sorted = sorted(files, key=lambda f: int(pattern.fullmatch(f).group(1)))
files_sorted

['user_df_6_0_300000.joblib',
 'user_df_6_300000_600000.joblib',
 'user_df_6_600000_900000.joblib',
 'user_df_6_900000_1000000.joblib']

In [8]:
# Load every chunk in start-row order, then stitch them into a single frame
# with a fresh 0..n-1 index and persist the result.
dfs = []

for fname in files_sorted:
    print(f"Loading {fname}")
    dfs.append(joblib.load(os.path.join(DATA_DIR, fname)))

user_df_6 = pd.concat(dfs, axis=0, ignore_index=True)
joblib.dump(user_df_6, "../../02_data/user_df_6.joblib")

Loading user_df_6_0_300000.joblib
Loading user_df_6_300000_600000.joblib
Loading user_df_6_600000_900000.joblib
Loading user_df_6_900000_1000000.joblib


['../../02_data/user_df_6.joblib']

In [7]:
# Reload the combined user frame (features plus description embeddings) from disk.
user_df_6 = joblib.load("../../02_data/user_df_6.joblib")
user_df_6.head(1)

Unnamed: 0,id,name,username,description,is_protected,is_verified,has_url,has_location,has_pinned_tweet,has_profile_image,cashtag_in_description_count,hashtag_in_description_count,mention_in_description_count,url_in_description_count,has_cashtag_in_description,has_hashtag_in_description,has_mention_in_description,has_url_in_description,account_age,follower_rate,following_rate,listed_rate,tweet_rate,name_is_present,name_length,name_digit_ratio,name_uppercase_ratio,name_lowercase_ratio,name_special_char_ratio,name_contains_bot_word_or_hashtag,name_char_entropy,username_is_present,username_length,username_digit_ratio,username_uppercase_ratio,username_lowercase_ratio,username_special_char_ratio,username_contains_bot_word_or_hashtag,username_char_entropy,desc_is_present,desc_length,desc_num_words,desc_num_sentences,desc_avg_sentence_length,desc_avg_word_length,desc_std_word_length,desc_unique_word_ratio,desc_guiraud_index,desc_repetition_ratio,desc_hapax_ratio,desc_digit_ratio,desc_uppercase_ratio,desc_lowercase_ratio,desc_special_char_ratio,desc_punctuation_ratio,desc_whitespace_ratio,desc_emoji_count,desc_emoji_ratio,desc_mention_count,desc_contains_mention,desc_url_count,desc_contains_url,desc_hashtag_count,desc_cashtag_count,desc_email_count,desc_contains_bot_word_or_hashtag,desc_contains_ai_hashtag,desc_sentiment,desc_sentiment_abs,desc_sentiment_neutrality,desc_sentiment_subjectivity,desc_flesch_reading_ease,desc_flesch_kincaid_grade,desc_avg_syllables_per_word,desc_polysyllabic_word_ratio,desc_char_entropy,desc_word_entropy,desc_avg_word_repetition,desc_compression_ratio,desc_starts_with_emoji,desc_ends_with_emoji,desc_starts_with_url,desc_ends_with_url,desc_contains_pipe_or_bullet,desc_contains_call_to_action,desc_contains_ai_phrase,desc_function_word_ratio,desc_noun_ratio,desc_verb_ratio,desc_pronoun_ratio,desc_adjective_ratio,desc_contains_repeated_chars,description_normalized,label,desc_embedding
0,u1217628182611927040,Boaz Barak,boazbaraktcs,Theoretical Computer Scientist. See also https...,False,False,True,True,False,True,0,0,0,2,False,False,False,True,715.914641,10.219095,0.300315,0.09638,4.327332,True,10.0,0.0,0.222222,0.777778,0.0,False,2.646439,True,12,0.0,0.0,1.0,0.0,False,3.022055,True,56.0,8.0,2.0,4.0,5.5,3.082207,0.875,2.474874,0.125,0.75,0.0,0.227273,0.772727,0.089286,0.089286,0.125,0.0,0.0,0.0,False,2.0,True,0.0,0.0,0.0,False,False,0.0,0.0,1.0,0.1,23.0,11.045,2.125,0.375,4.389035,2.75,1.142857,1.053571,False,False,False,True,False,False,False,0.125,0.375,0.0,0.0,0.375,False,Theoretical Computer Scientist . See also HTTP...,human,"[0.06274554, 0.11472696, 0.11544624, -0.019800..."


In [8]:
# Keep only the user id, the normalized description and its embedding.
user_embedding_cols = ["id", "description_normalized", "desc_embedding"]
user_df_embedding = user_df_6.loc[:, user_embedding_cols].copy()
user_df_embedding.head(1)

Unnamed: 0,id,description_normalized,desc_embedding
0,u1217628182611927040,Theoretical Computer Scientist . See also HTTP...,"[0.06274554, 0.11472696, 0.11544624, -0.019800..."


In [9]:
# Persist the slim id / text / embedding table for users.
joblib.dump(user_df_embedding, "../../02_data/user_df_embedding.joblib")

['../../02_data/user_df_embedding.joblib']

In [10]:
# Load the tweet feature table from disk.
tweet_df = joblib.load("../../02_data/tweet_features_3.joblib")
tweet_df.head(1)

Unnamed: 0,author_id,id,text,created_at,in_reply_to_user_id,is_reply,is_sensitive,like_count,quote_count,reply_count,retweet_count,label,is_present,length,num_words,num_sentences,avg_sentence_length,avg_word_length,std_word_length,unique_word_ratio,guiraud_index,repetition_ratio,hapax_ratio,digit_ratio,uppercase_ratio,lowercase_ratio,special_char_ratio,punctuation_ratio,whitespace_ratio,emoji_count,emoji_ratio,mention_count,contains_mention,url_count,contains_url,hashtag_count,cashtag_count,email_count,contains_bot_word_or_hashtag,contains_ai_hashtag,sentiment,sentiment_abs,sentiment_neutrality,sentiment_subjectivity,flesch_reading_ease,flesch_kincaid_grade,avg_syllables_per_word,polysyllabic_word_ratio,char_entropy,word_entropy,avg_word_repetition,compression_ratio,starts_with_emoji,ends_with_emoji,starts_with_url,ends_with_url,contains_pipe_or_bullet,contains_call_to_action,contains_ai_phrase,function_word_ratio,noun_ratio,verb_ratio,pronoun_ratio,adjective_ratio,contains_repeated_chars,is_retweet,is_quote,text_normalized,embedding
0,u1001495628738957312,t1502310945158275074,Join us for a special screening of the documen...,2022-03-11 15:50:15+00:00,,0,0,1,0.0,0.0,1,0,True,153,27,1,27.0,4.037037,2.71459,0.777778,4.041452,0.222222,0.666667,0.013072,0.238532,0.761468,0.111111,0.084967,0.163399,1,0.006536,3,True,1,True,1,0,0,False,False,0.636,0.636,0.364,0.285714,63.486154,7.633846,1.538462,0.074074,4.914588,4.226567,1.285714,0.986928,False,False,False,True,False,True,False,0.407407,0.37037,0.037037,0.037037,0.111111,False,False,False,Join us for a special screening of the documen...,"[0.010525476, -0.11652835, 0.047724664, 0.0785..."


In [11]:
# Keep only the tweet id, the normalized text and its embedding.
tweet_embedding_cols = ["id", "text_normalized", "embedding"]
tweet_df_embedding = tweet_df.loc[:, tweet_embedding_cols].copy()
tweet_df_embedding.head(1)

Unnamed: 0,id,text_normalized,embedding
0,t1502310945158275074,Join us for a special screening of the documen...,"[0.010525476, -0.11652835, 0.047724664, 0.0785..."


In [12]:
# Persist the slim id / text / embedding table for tweets.
joblib.dump(tweet_df_embedding, "../../02_data/tweet_df_embedding.joblib")

['../../02_data/tweet_df_embedding.joblib']

In [7]:
# The raw tweet_0 data is read from two parquet parts and stacked row-wise.
# Each part keeps its own index labels (no ignore_index on the concat).
tweet_0_pt_1 = pd.read_parquet(
    "../../02_data/raw/tweet_0_pt_1.parquet",
    engine="fastparquet",
)
tweet_0_pt_2 = pd.read_parquet(
    "../../02_data/raw/tweet_0_pt_2.parquet",
    engine="fastparquet",
)

tweet_0 = pd.concat([tweet_0_pt_1, tweet_0_pt_2])
tweet_0.head(1)

Unnamed: 0,author_id,context_annotations,conversation_id,created_at,id,in_reply_to_user_id,lang,possibly_sensitive,referenced_tweets,reply_settings,source,text,attachments.media_keys,attachments.poll_ids,entities.annotations,entities.cashtags,entities.hashtags,entities.media,entities.mentions,entities.symbols,entities.urls,entities.user_mentions,geo.coordinates,geo.place_id,geo.type,public_metrics.like_count,public_metrics.quote_count,public_metrics.reply_count,public_metrics.retweet_count,withheld.copyright,withheld.country_codes,withheld.scope
0,391092102.0,,1502784561763295233,2022-03-12 23:12:14+00:00,t1502784561763295233,,en,0.0,,everyone,Twitter Web App,RT @ChelseaSTrust: Join the CST now.\n\n#Toget...,,,,,,,,,,,,,,0.0,0.0,0.0,42.0,,,


In [8]:
# Reload the tweet id / text / embedding table.
# NOTE(review): the cell that dumps this table writes to
# ../../02_data/tweet_df_embedding.joblib, while this cell reads from the
# processed/ subfolder - presumably the file was moved by hand; confirm.
tweet_df_embedding = joblib.load("../../02_data/processed/tweet_df_embedding.joblib")
tweet_df_embedding.head(1)

Unnamed: 0,id,text_normalized,embedding
0,t1502310945158275074,Join us for a special screening of the documen...,"[0.010525476, -0.11652835, 0.047724664, 0.0785..."


In [9]:
# Keep the rows of tweet_0 whose id does not appear in the embedding table.
embedded_ids = tweet_df_embedding["id"].unique()
has_embedding = tweet_0["id"].isin(embedded_ids)
tweet_0_wo_embedding = tweet_0[~has_embedding]
tweet_0_wo_embedding.head(1)

Unnamed: 0,author_id,context_annotations,conversation_id,created_at,id,in_reply_to_user_id,lang,possibly_sensitive,referenced_tweets,reply_settings,source,text,attachments.media_keys,attachments.poll_ids,entities.annotations,entities.cashtags,entities.hashtags,entities.media,entities.mentions,entities.symbols,entities.urls,entities.user_mentions,geo.coordinates,geo.place_id,geo.type,public_metrics.like_count,public_metrics.quote_count,public_metrics.reply_count,public_metrics.retweet_count,withheld.copyright,withheld.country_codes,withheld.scope
1,7.942668e+17,,1502782357639770113,2022-03-12 23:04:53+00:00,t1502782713371238406,7.942668e+17,en,0.0,,everyone,Twitter for iPhone,By media law it was embargoed until 10.30pm to...,,,,,,,,,,,,,,160.0,0.0,1.0,3.0,,,


In [11]:
# Checkpoint the not-yet-embedded tweets.
joblib.dump(tweet_0_wo_embedding, "../../02_data/tweet_0_wo_embedding.joblib")

['../../02_data/tweet_0_wo_embedding.joblib']

In [7]:
# Resume from the checkpoint of not-yet-embedded tweets.
tweet_0_wo_embedding = joblib.load("../../02_data/tweet_0_wo_embedding.joblib")
tweet_0_wo_embedding.head(1)

Unnamed: 0,author_id,context_annotations,conversation_id,created_at,id,in_reply_to_user_id,lang,possibly_sensitive,referenced_tweets,reply_settings,source,text,attachments.media_keys,attachments.poll_ids,entities.annotations,entities.cashtags,entities.hashtags,entities.media,entities.mentions,entities.symbols,entities.urls,entities.user_mentions,geo.coordinates,geo.place_id,geo.type,public_metrics.like_count,public_metrics.quote_count,public_metrics.reply_count,public_metrics.retweet_count,withheld.copyright,withheld.country_codes,withheld.scope
1,7.942668e+17,,1502782357639770113,2022-03-12 23:04:53+00:00,t1502782713371238406,7.942668e+17,en,0.0,,everyone,Twitter for iPhone,By media law it was embargoed until 10.30pm to...,,,,,,,,,,,,,,160.0,0.0,1.0,3.0,,,


In [8]:
# Build a new frame with a text_normalized column (normalize_text applied to
# each tweet's text); tweet_0_wo_embedding itself is left as loaded.
tweet_0_wo_embedding_1 = tweet_0_wo_embedding.assign(
    text_normalized=tweet_0_wo_embedding["text"].apply(normalize_text)
)

tweet_0_wo_embedding_1.head()

Unnamed: 0,author_id,context_annotations,conversation_id,created_at,id,in_reply_to_user_id,lang,possibly_sensitive,referenced_tweets,reply_settings,source,text,attachments.media_keys,attachments.poll_ids,entities.annotations,entities.cashtags,entities.hashtags,entities.media,entities.mentions,entities.symbols,entities.urls,entities.user_mentions,geo.coordinates,geo.place_id,geo.type,public_metrics.like_count,public_metrics.quote_count,public_metrics.reply_count,public_metrics.retweet_count,withheld.copyright,withheld.country_codes,withheld.scope,text_normalized
1,7.942668e+17,,1502782357639770113,2022-03-12 23:04:53+00:00,t1502782713371238406,7.942668e+17,en,0.0,,everyone,Twitter for iPhone,By media law it was embargoed until 10.30pm to...,,,,,,,,,,,,,,160.0,0.0,1.0,3.0,,,,By media law it was embargoed until 10.30 pm t...
2,7.942668e+17,,1502782357639770113,2022-03-12 23:03:29+00:00,t1502782357639770113,,en,0.0,,everyone,Twitter for iPhone,Important to add these comments from Tuchel we...,,,,,,,,,,,,,,530.0,0.0,7.0,31.0,,,,Important to add these comments from Tuchel we...
3,4504719000.0,,1502781496343638028,2022-03-12 23:00:03+00:00,t1502781496343638028,,en,0.0,,everyone,Twitter Web App,More than 300 parties are said to have express...,,,,,,,,,,,,,,2585.0,35.0,48.0,144.0,,,,More than 300 parties are said to have express...
4,4504719000.0,,1502781310875676674,2022-03-12 22:59:19+00:00,t1502781310875676674,,en,0.0,,everyone,Twitter Web App,"Government spokesman: \n\n""We have said all al...",,,,,,,,,,,,,,317.0,2.0,7.0,25.0,,,,"Government spokesman : "" We have said all alon..."
5,4504719000.0,,1502781198413750272,2022-03-12 22:58:52+00:00,t1502781198413750272,,en,0.0,,everyone,Twitter Web App,Sources: The new terms [of licence] will allow...,,,,,,,,,,,,,,1826.0,15.0,18.0,148.0,,,,Sources : The new terms [ of licence ] will al...


In [9]:
# Checkpoint the tweets together with their normalized text.
joblib.dump(tweet_0_wo_embedding_1, "../../02_data/tweet_0_wo_embedding_1.joblib")

['../../02_data/tweet_0_wo_embedding_1.joblib']

In [7]:
# Resume from the checkpoint that already carries text_normalized.
tweet_0_wo_embedding_1 = joblib.load("../../02_data/tweet_0_wo_embedding_1.joblib")
tweet_0_wo_embedding_1.head(1)

Unnamed: 0,author_id,context_annotations,conversation_id,created_at,id,in_reply_to_user_id,lang,possibly_sensitive,referenced_tweets,reply_settings,source,text,attachments.media_keys,attachments.poll_ids,entities.annotations,entities.cashtags,entities.hashtags,entities.media,entities.mentions,entities.symbols,entities.urls,entities.user_mentions,geo.coordinates,geo.place_id,geo.type,public_metrics.like_count,public_metrics.quote_count,public_metrics.reply_count,public_metrics.retweet_count,withheld.copyright,withheld.country_codes,withheld.scope,text_normalized
1,7.942668e+17,,1502782357639770113,2022-03-12 23:04:53+00:00,t1502782713371238406,7.942668e+17,en,0.0,,everyone,Twitter for iPhone,By media law it was embargoed until 10.30pm to...,,,,,,,,,,,,,,160.0,0.0,1.0,3.0,,,,By media law it was embargoed until 10.30 pm t...


In [8]:
# Restrict to tweets whose lang column is exactly "en"; copy so later column
# assignments do not touch the unfiltered frame.
is_english = tweet_0_wo_embedding_1["lang"] == "en"
tweet_0_wo_embedding_en = tweet_0_wo_embedding_1.loc[is_english].copy()
tweet_0_wo_embedding_en.head(1)

Unnamed: 0,author_id,context_annotations,conversation_id,created_at,id,in_reply_to_user_id,lang,possibly_sensitive,referenced_tweets,reply_settings,source,text,attachments.media_keys,attachments.poll_ids,entities.annotations,entities.cashtags,entities.hashtags,entities.media,entities.mentions,entities.symbols,entities.urls,entities.user_mentions,geo.coordinates,geo.place_id,geo.type,public_metrics.like_count,public_metrics.quote_count,public_metrics.reply_count,public_metrics.retweet_count,withheld.copyright,withheld.country_codes,withheld.scope,text_normalized
1,7.942668e+17,,1502782357639770113,2022-03-12 23:04:53+00:00,t1502782713371238406,7.942668e+17,en,0.0,,everyone,Twitter for iPhone,By media law it was embargoed until 10.30pm to...,,,,,,,,,,,,,,160.0,0.0,1.0,3.0,,,,By media law it was embargoed until 10.30 pm t...


In [18]:
# Sanity check: list the distinct lang values left after the English filter.
tweet_0_wo_embedding_en["lang"].unique()

array(['en'], dtype=object)

In [27]:
# Slice selection for the tweet embedding run: chunk number `i` starts at
# i * CHUNK_SIZE and is clipped to the length of the English-only frame.
CHUNK_SIZE = 50_000
i = 0

START = CHUNK_SIZE * i
END = min(len(tweet_0_wo_embedding_en), START + CHUNK_SIZE)

print(f"Start: {START}, End: {END}")

Start: 0, End: 50000


In [28]:
import gc

chunk = tweet_0_wo_embedding_1.iloc[START:END].copy()

embedder = SentenceEmbedder()
chunk["embedding"] = embedder.transform(chunk["text_normalized"])[:, 0]

joblib.dump(chunk, f"../../02_data/tweet_0_wo_embedding_2_{START}_{END}.joblib")

del chunk
gc.collect()

print(f"Saved embeddings {START}:{END}")

Embedding text:   2%|▏         | 1/49 [00:44<35:39, 44.57s/it]

: 

In [15]:
user = pd.read_json("../../02_data/raw/user.jsonl", lines=True)

# Expand the `entities` and `public_metrics` columns with json_normalize,
# append the resulting columns side by side, then drop the two originals.
nested_cols = ["entities", "public_metrics"]
user = pd.concat(
    [user] + [pd.json_normalize(user[col]) for col in nested_cols],
    axis=1,
).drop(columns=nested_cols)
user.head(1)

Unnamed: 0,created_at,description,id,location,name,pinned_tweet_id,profile_image_url,protected,url,username,verified,withheld,url.urls,description.urls,description.mentions,description.hashtags,description.cashtags,followers_count,following_count,tweet_count,listed_count
0,2020-01-16 02:02:55+00:00,Theoretical Computer Scientist. See also https...,u1217628182611927040,"Cambridge, MA",Boaz Barak,,https://pbs.twimg.com/profile_images/125226236...,False,https://t.co/BoMip9FF17,boazbaraktcs,False,,"[{'start': 0, 'end': 23, 'url': 'https://t.co/...","[{'start': 41, 'end': 64, 'url': 'https://t.co...",,,,7316,215,3098,69


In [16]:
# Cache the flattened user table next to the raw JSONL.
joblib.dump(user, "../../02_data/raw/user.joblib")

['../../02_data/raw/user.joblib']

In [5]:
# Read only the first record (nrows=1) of the raw tweet JSONL.
tweet_0 = pd.read_json("../../02_data/raw/tweet_0.jsonl", lines=True, nrows=1)

# Expand the `entities` and `public_metrics` columns with json_normalize,
# append the resulting columns side by side, then drop the two originals.
nested_cols = ["entities", "public_metrics"]
tweet_0 = pd.concat(
    [tweet_0] + [pd.json_normalize(tweet_0[col]) for col in nested_cols],
    axis=1,
).drop(columns=nested_cols)
tweet_0.head(1)

Unnamed: 0,attachments,author_id,context_annotations,conversation_id,created_at,geo,id,in_reply_to_user_id,lang,possibly_sensitive,referenced_tweets,reply_settings,source,text,withheld,hashtags,symbols,user_mentions,urls,media,retweet_count,reply_count,like_count,quote_count
0,,1304855289208819713,,1497798545872588801,2022-02-27 04:59:35+00:00,,t1497798545872588801,976935699793539073,en,False,,,"<a href=""http://twitter.com/download/android"" ...",@phaseknight_ Although I didn't base this sket...,,[],[],"[{'screen_name': 'phaseknight_', 'name': 'sapp...",[],"[{'id': 1497798542869422086, 'id_str': '149779...",0,,8,


In [None]:


# Select the working columns, then strip the "public_metrics." prefix from
# the like and retweet counts.
metric_renames = {
    "public_metrics.like_count": "like_count",
    "public_metrics.retweet_count": "retweet_count",
}
tweet_df = tweet_0[tw_cols_to_select].rename(columns=metric_renames)
tweet_df.head(1)

Unnamed: 0,author_id,created_at,id,in_reply_to_user_id,lang,source,text,like_count,retweet_count
0,u391092102,2022-03-12 23:12:14+00:00,t1502784561763295233,,en,Twitter Web App,RT @ChelseaSTrust: Join the CST now.\n\n#Toget...,0.0,42.0


In [9]:
# Persist the trimmed tweet table under processed/.
joblib.dump(tweet_df, "../../02_data/processed/tweet_df.joblib")

['../../02_data/processed/tweet_df.joblib']