In [1]:
# Automatically load changes in dependency files (may be unnecessary here, but useful tool in case you're modifying packages that this file relies on)
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler


import os

# You may have to install torch as well in order to run some of the later tokenization cells

RANDOM_STATE = 42

In [5]:
# Recursive find for bot data from current directory ($SCRATCHDIR on ISAAC machines)
path = "/lustre/isaac/scratch/jdosch1/COSC325_Final"
'''for dirpath, dirnames, filenames in os.walk("$SCRATCHDIR"):
    for dirname in dirnames:
        if dirname == "hashtag.json":
            path = os.path.join(dirpath, dirname)

assert path is not None, "botometer-feedback directory not found" 
'''
print(f"Path to csv file: {path}")

user_path = os.path.join(path, "user.json")
label_path = os.path.join(path, "label.csv")


Path to csv file: /lustre/isaac/scratch/jdosch1/COSC325_Final


In [4]:
# Read in data into 2 dataframes and merge on the user id column
df_node = pd.read_json(user_path)
df_label = pd.read_csv(label_path)

df = pd.merge(df_node, df_label, on='id')

df

Unnamed: 0,created_at,description,entities,id,location,name,pinned_tweet_id,profile_image_url,protected,public_metrics,url,username,verified,withheld,label
0,2020-01-16 02:02:55+00:00,Theoretical Computer Scientist. See also https...,"{'url': {'urls': [{'start': 0, 'end': 23, 'url...",u1217628182611927040,"Cambridge, MA",Boaz Barak,,https://pbs.twimg.com/profile_images/125226236...,False,"{'followers_count': 7316, 'following_count': 2...",https://t.co/BoMip9FF17,boazbaraktcs,False,,human
1,2014-07-02 17:56:46+00:00,creative _,,u2664730894,🎈,olawale 💨,,https://pbs.twimg.com/profile_images/147837638...,False,"{'followers_count': 123, 'following_count': 10...",,wale_io,False,,human
2,2020-05-30 12:10:45+00:00,👽,,u1266703520205549568,,panagiota_.b,,https://pbs.twimg.com/profile_images/142608606...,False,"{'followers_count': 3, 'following_count': 62, ...",,b_panagiota,False,,human
3,2019-01-26 13:52:49+00:00,mama to maya. ABIM research pathway fellow @UV...,"{'description': {'mentions': [{'start': 43, 'e...",u1089159225148882949,"Charlottesville, VA","Jacqueline Hodges, MD MPH",,https://pbs.twimg.com/profile_images/130229171...,False,"{'followers_count': 350, 'following_count': 57...",,jachodges_md,False,,human
4,2009-04-30 19:01:42+00:00,Father / SWT Alumnus / Longhorn Fan,,u36741729,United States,Matthew Stubblefield,,https://pbs.twimg.com/profile_images/145808462...,True,"{'followers_count': 240, 'following_count': 29...",,Matthew_Brody,False,,bot
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,2013-02-05 14:50:17+00:00,イラストACは高品質イラストアート/年賀状等が全無料DL可能♪AIベクター・EPS形式素材全...,"{'url': {'urls': [{'start': 0, 'end': 23, 'url...",u1151138281,↓利用者600万人無料素材サイト↓　商用利用編集ＯＫ表記不要,フリー素材集かわいい無料イラストAC/おしゃれフレーム枠★IllustACイラストレーター,1.301109e+18,https://pbs.twimg.com/profile_images/139750409...,False,"{'followers_count': 1877, 'following_count': 2...",https://t.co/L6PE11Blkl,Illustratorjpn,False,,human
999996,2013-04-09 12:09:34+00:00,next➬未定 紫･緑ﾃﾞｨｯｷ 色々な曲聴きます,"{'url': {'urls': [{'start': 0, 'end': 23, 'url...",u1339035361,OKAYAMA CITY,りょうやん,6.067787e+17,https://pbs.twimg.com/profile_images/135842413...,False,"{'followers_count': 13952, 'following_count': ...",https://t.co/NjDtATyqGc,_y3oa,False,,human
999997,2011-06-16 20:09:29+00:00,Heart of a lion with a Mind of a maniac. Louis...,,u318636852,"Lake Charles, LA",Gavin Cecchini,,https://pbs.twimg.com/profile_images/781352355...,False,"{'followers_count': 13743, 'following_count': ...",,GavinCecchini2,True,,human
999998,2009-05-30 00:25:19+00:00,"Marketplace Minister, Christ follower, Indepen...","{'url': {'urls': [{'start': 0, 'end': 23, 'url...",u43443354,Rockhampton Australia,Martin Allan,,https://pbs.twimg.com/profile_images/131699997...,False,"{'followers_count': 2460, 'following_count': 2...",https://t.co/r3R5Bkng9m,MartinfromOz,False,,human


In [5]:
df.dtypes

created_at           datetime64[ns, UTC]
description                       object
entities                          object
id                                object
location                          object
name                              object
pinned_tweet_id                  float64
profile_image_url                 object
protected                           bool
public_metrics                    object
url                               object
username                          object
verified                            bool
withheld                          object
label                             object
dtype: object

In [6]:
# Change created at to Unix epoch time in seconds
df["created_at"] = pd.to_datetime(df["created_at"]).astype("int64") // 10**9 # Convert to seconds

# Drop columns with urls or identifying information (username, id, etc)
df.drop(columns=['entities', 'id', 'profile_image_url', 'url'])
df.head()

Unnamed: 0,created_at,description,entities,id,location,name,pinned_tweet_id,profile_image_url,protected,public_metrics,url,username,verified,withheld,label
0,1579140175,Theoretical Computer Scientist. See also https...,"{'url': {'urls': [{'start': 0, 'end': 23, 'url...",u1217628182611927040,"Cambridge, MA",Boaz Barak,,https://pbs.twimg.com/profile_images/125226236...,False,"{'followers_count': 7316, 'following_count': 2...",https://t.co/BoMip9FF17,boazbaraktcs,False,,human
1,1404323806,creative _,,u2664730894,🎈,olawale 💨,,https://pbs.twimg.com/profile_images/147837638...,False,"{'followers_count': 123, 'following_count': 10...",,wale_io,False,,human
2,1590840645,👽,,u1266703520205549568,,panagiota_.b,,https://pbs.twimg.com/profile_images/142608606...,False,"{'followers_count': 3, 'following_count': 62, ...",,b_panagiota,False,,human
3,1548510769,mama to maya. ABIM research pathway fellow @UV...,"{'description': {'mentions': [{'start': 43, 'e...",u1089159225148882949,"Charlottesville, VA","Jacqueline Hodges, MD MPH",,https://pbs.twimg.com/profile_images/130229171...,False,"{'followers_count': 350, 'following_count': 57...",,jachodges_md,False,,human
4,1241118102,Father / SWT Alumnus / Longhorn Fan,,u36741729,United States,Matthew Stubblefield,,https://pbs.twimg.com/profile_images/145808462...,True,"{'followers_count': 240, 'following_count': 29...",,Matthew_Brody,False,,bot


In [7]:
print(df[df['withheld'].notna()].shape)
print(df[df['pinned_tweet_id'].notna()].shape)

# Significant number of both of these missing, we will drop these too
df = df.drop(columns=['pinned_tweet_id', 'withheld'])

(59, 15)
(389865, 15)


In [8]:
df.dtypes

created_at            int64
description          object
entities             object
id                   object
location             object
name                 object
profile_image_url    object
protected              bool
public_metrics       object
url                  object
username             object
verified               bool
label                object
dtype: object

In [10]:
print(f"Number of human entries: {df[df['label'] == 'human'].shape[0]}")
print(f"Number of bot entries: {df[df['label'] == 'bot'].shape[0]}")

Number of human entries: 860057
Number of bot entries: 139943


In [11]:
# We'll narrow the dataset down to 5000 bots and 5000 normal users, the dataset file should be ~ 1 GB
human_samples = df[df['label'] == 'human'].sample(
    n=5000,
    random_state=RANDOM_STATE
)

bot_samples = df[df['label'] == 'bot'].sample(
    n=5000,
    random_state=RANDOM_STATE
)

# Concatentate two dataframes back together and shuffle
df = pd.concat([human_samples, bot_samples], ignore_index=True)
df = df.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)

df.head()

Unnamed: 0,created_at,description,entities,id,location,name,profile_image_url,protected,public_metrics,url,username,verified,label
0,1615388944,Pro America | Pro Joe Biden | Pro Human Rights...,,u1369666304819535884,"New York, NY",🌸American Woman🌸,https://pbs.twimg.com/profile_images/148811519...,False,"{'followers_count': 88, 'following_count': 172...",,AmericanWomanMC,False,bot
1,1601646654,"Hello, I'm from Ranchi, Jharkhand. I'm a Journ...","{'url': {'urls': [{'start': 0, 'end': 23, 'url...",u1312027120785219585,"Ranchi, India",HIMANSHU KUMAR DEO,https://pbs.twimg.com/profile_images/147906643...,False,"{'followers_count': 37, 'following_count': 437...",https://t.co/Wz2nVg1fjW,HIMANSHU2732003,False,human
2,1540439944,Mama | wife | Nurse Practitioner & Clinical Sc...,"{'url': {'urls': [{'start': 0, 'end': 23, 'url...",u1055307732226531328,"Boston, MA","Erin C. Sanders, MSN, WHNP-BC (She/Her)",https://pbs.twimg.com/profile_images/106364911...,False,"{'followers_count': 14761, 'following_count': ...",https://t.co/nYa84sqxcf,ErinSandersNP,False,human
3,1281992242,Recruiting Contributor for @BON_SBNation. UAB ...,"{'url': {'urls': [{'start': 0, 'end': 23, 'url...",u179233659,United States,Daniel Seahorn,https://pbs.twimg.com/profile_images/149190910...,False,"{'followers_count': 3183, 'following_count': 1...",https://t.co/hWD0QYWy7p,DanielSeahorn,False,human
4,1245809190,♚AKA♚ I will block you weirdos. Please don’t p...,,u50179868,Philly/NYC,Brittany♚,https://pbs.twimg.com/profile_images/148615645...,False,"{'followers_count': 9452, 'following_count': 5...",,britshaniece,False,human


In [12]:
# Drop urls, as these are not as useful as other information within the dataset
processed_df = df.drop(columns=['url', 'profile_image_url', 'entities'])

# Expand public_metrics values each to its own column
metrics_df = processed_df["public_metrics"].apply(pd.Series)

processed_df = pd.concat([processed_df.drop(columns='public_metrics'), metrics_df], axis=1)

processed_df.head()


Unnamed: 0,created_at,description,id,location,name,protected,username,verified,label,followers_count,following_count,tweet_count,listed_count
0,1615388944,Pro America | Pro Joe Biden | Pro Human Rights...,u1369666304819535884,"New York, NY",🌸American Woman🌸,False,AmericanWomanMC,False,bot,88,172,3083,1
1,1601646654,"Hello, I'm from Ranchi, Jharkhand. I'm a Journ...",u1312027120785219585,"Ranchi, India",HIMANSHU KUMAR DEO,False,HIMANSHU2732003,False,human,37,437,40,0
2,1540439944,Mama | wife | Nurse Practitioner & Clinical Sc...,u1055307732226531328,"Boston, MA","Erin C. Sanders, MSN, WHNP-BC (She/Her)",False,ErinSandersNP,False,human,14761,6917,10597,230
3,1281992242,Recruiting Contributor for @BON_SBNation. UAB ...,u179233659,United States,Daniel Seahorn,False,DanielSeahorn,False,human,3183,1096,410848,87
4,1245809190,♚AKA♚ I will block you weirdos. Please don’t p...,u50179868,Philly/NYC,Brittany♚,False,britshaniece,False,human,9452,572,428270,65


In [13]:
# Sanity check
print(f"Number of human entries: {df[df['label'] == 'human'].shape[0]}")
print(f"Number of bot entries: {df[df['label'] == 'bot'].shape[0]}")

Number of human entries: 5000
Number of bot entries: 5000


In [14]:
# Now need to one-hot encode location
label_encoder = LabelEncoder()

'''Print the length of Location before. We can see that some locations are repeated,
 but there are still 25k unique values. Some more preprocessing may be necessary here.

'''
print(f"{len(processed_df['location'])}")

processed_df["location"] = label_encoder.fit_transform(processed_df["location"])

len(processed_df["location"].unique())

10000


3471

In [15]:
processed_df.head()

Unnamed: 0,created_at,description,id,location,name,protected,username,verified,label,followers_count,following_count,tweet_count,listed_count
0,1615388944,Pro America | Pro Joe Biden | Pro Human Rights...,u1369666304819535884,1973,🌸American Woman🌸,False,AmericanWomanMC,False,bot,88,172,3083,1
1,1601646654,"Hello, I'm from Ranchi, Jharkhand. I'm a Journ...",u1312027120785219585,2333,HIMANSHU KUMAR DEO,False,HIMANSHU2732003,False,human,37,437,40,0
2,1540439944,Mama | wife | Nurse Practitioner & Clinical Sc...,u1055307732226531328,476,"Erin C. Sanders, MSN, WHNP-BC (She/Her)",False,ErinSandersNP,False,human,14761,6917,10597,230
3,1281992242,Recruiting Contributor for @BON_SBNation. UAB ...,u179233659,2859,Daniel Seahorn,False,DanielSeahorn,False,human,3183,1096,410848,87
4,1245809190,♚AKA♚ I will block you weirdos. Please don’t p...,u50179868,2228,Brittany♚,False,britshaniece,False,human,9452,572,428270,65


In [57]:
# Need to embed the string columns. However, we first need to append the Tweets to the dataframe
import ijson
import re
from collections import defaultdict

author_set = set(processed_df['id'])

# Map all potential substrings to the larger id for faster lookup
substr_map: dict[str, list[str]] = {}
for full_id in author_set:
    for num in re.findall(r'\d+', full_id):
        substr_map.setdefault(num, []).append(full_id)


user_tweets = defaultdict(list)

for i in range(9):
    file = os.path.join(path, f'tweet_{i}.json')
    with open(file=file, mode='rb') as f:
        for tweet in ijson.items(f, 'item'):
            author_id = tweet.get('author_id')
            # Check if this doesn't work first
            full_id = substr_map.get(str(author_id), [])
            assert len(full_id) <= 1, "maps to too many full ids"

            for matching_full_id in full_id:
                    text = tweet.get('text', '')
                    if text:
                        user_tweets[matching_full_id].append(text)

In [None]:
import math

max_len = max(len(arr) for arr in user_tweets.values())

for k, arr in user_tweets.items():
    pad = [math.nan] * (max_len - len(arr))
    user_tweets[k] = arr + pad
    
tweets = pd.DataFrame(user_tweets)

                                          u825789434  \
0                @Chvngeling https://t.co/ELfFaUAwTg   
1  RT @SamAllenX: yeah i'm into NFTs (nonbinary f...   
2             @quadkorps It’s mind boggling honestly   
3  American Christians would be far more effectiv...   
4  RT @gaia_writes: Being queer saved my life. Of...   

                                u1208734837546323968  \
0                     @Ph_Obidon At least he's happy   
1  RT @ilearnblock: Set your reminders, invite yo...   
2  I’m going to @ilearnblock’s upcoming Space. Wi...   
3   @ItzroyaleO We are already connected via twitter   
4          Out of every crisis, comes opportunities.   

                                u1452876872480555010  \
0  RT @fighter_divine: Top 10 gainers in February...   
1  Top 10 gainers in February 🔥🚀🌙\n\n#mafa #ironb...   
2  RT @fighter_divine: @MContent_  is live on @ME...   
3  @MContent_  is live on @MEXC_Global 🔥🥳🌙\n\n#MC...   
4  RT @Hydra72266132: @fighter_divine Market w

"\ntext_df = processed_df.merge(tweets, on='id', how='left')\n\ntext_df.head()"

In [67]:
tweets = tweets.T.reset_index().rename(columns={'index': 'id'})
tweets.head()

Unnamed: 0,id,0,1,2,3,4,5,6,7,8,...,1905,1906,1907,1908,1909,1910,1911,1912,1913,1914
0,u825789434,@Chvngeling https://t.co/ELfFaUAwTg,RT @SamAllenX: yeah i'm into NFTs (nonbinary f...,@quadkorps It’s mind boggling honestly,American Christians would be far more effectiv...,RT @gaia_writes: Being queer saved my life. Of...,RT @dennisbhooper: I’ve had mangoes that were ...,would love it if white people would stop forci...,RT @discotekmedia: Street Sharks\n\nIt's time ...,RT @moonchilld36: 😂😭😭😭😭😭😭😭😭 https://t.co/HuGqL...,...,,,,,,,,,,
1,u1208734837546323968,@Ph_Obidon At least he's happy,"RT @ilearnblock: Set your reminders, invite yo...",I’m going to @ilearnblock’s upcoming Space. Wi...,@ItzroyaleO We are already connected via twitter,"Out of every crisis, comes opportunities.",RT @ilearnblock: Here's a book written by @Vos...,@ilearnblock It's a book not an article,Join me on Twitter Space on Thursday as we unv...,Thoughtful investing in quality cryptocurrency...,...,,,,,,,,,,
2,u1452876872480555010,RT @fighter_divine: Top 10 gainers in February...,Top 10 gainers in February 🔥🚀🌙\n\n#mafa #ironb...,RT @fighter_divine: @MContent_ is live on @ME...,@MContent_ is live on @MEXC_Global 🔥🥳🌙\n\n#MC...,RT @Hydra72266132: @fighter_divine Market will...,Russia owns 12% of cryptocurrencies 🤯🤯\n\nWill...,RT @fighter_divine: @Hotbit_news listed @bitge...,@Hotbit_news listed @bitgertbrise 🔥\n\n#Brise ...,RT @fighter_divine: @Shib_nobi has surpassed 6...,...,,,,,,,,,,
3,u848818578665070592,5/5 We want to do a burial service traditiona...,4/5 He was also mentally ill. The junta troop...,3/5 Another source also said they had first he...,2/5 ...Dawei. I was there at that time. At fir...,1/5 Families say the army has not returned the...,RT @aung_myo_minn: The Ministry of Human Right...,4/4 The perpetrator groups are the BGF Battali...,"3/4 Saw Bree, 70, Saw Henry, 46, Naw Bu Htoo, ...",2/4 In addition to the killings &amp; injuries...,...,,,,,,,,,,
4,u791632941210226693,IICE is pleased to announce today's Entreprene...,Congratulations to Mr. Ankur Guria (BS-MS fina...,In the publication that came in Molecular Phyl...,Congratulations to IISER Bhopal's final year B...,"The Earth Environment (TEE) Science seminar"" s...",RT @IndiaDST: .@iiserbhopal team develops orga...,RT @Healthjagaran: @iiserbhopal @AmerChemSocie...,RT @SciResMatters: Scientists from the @iiserb...,On the occasion of Indian National Science Da...,...,,,,,,,,,,


In [68]:
text_df = processed_df.merge(tweets, on='id', how='left')

text_df.head()

Unnamed: 0,created_at,description,id,location,name,protected,username,verified,label,followers_count,...,1905,1906,1907,1908,1909,1910,1911,1912,1913,1914
0,1615388944,Pro America | Pro Joe Biden | Pro Human Rights...,u1369666304819535884,1973,🌸American Woman🌸,False,AmericanWomanMC,False,bot,88,...,,,,,,,,,,
1,1601646654,"Hello, I'm from Ranchi, Jharkhand. I'm a Journ...",u1312027120785219585,2333,HIMANSHU KUMAR DEO,False,HIMANSHU2732003,False,human,37,...,,,,,,,,,,
2,1540439944,Mama | wife | Nurse Practitioner & Clinical Sc...,u1055307732226531328,476,"Erin C. Sanders, MSN, WHNP-BC (She/Her)",False,ErinSandersNP,False,human,14761,...,,,,,,,,,,
3,1281992242,Recruiting Contributor for @BON_SBNation. UAB ...,u179233659,2859,Daniel Seahorn,False,DanielSeahorn,False,human,3183,...,,,,,,,,,,
4,1245809190,♚AKA♚ I will block you weirdos. Please don’t p...,u50179868,2228,Brittany♚,False,britshaniece,False,human,9452,...,,,,,,,,,,


In [69]:
text_df.to_parquet('data/revised_data.parquet')

  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


In [3]:
text_df = pd.read_parquet('data/revised_data.parquet')

text_df.head()

Unnamed: 0,created_at,description,id,location,name,protected,username,verified,label,followers_count,...,1905,1906,1907,1908,1909,1910,1911,1912,1913,1914
0,1615388944,Pro America | Pro Joe Biden | Pro Human Rights...,u1369666304819535884,1973,🌸American Woman🌸,False,AmericanWomanMC,False,bot,88,...,,,,,,,,,,
1,1601646654,"Hello, I'm from Ranchi, Jharkhand. I'm a Journ...",u1312027120785219585,2333,HIMANSHU KUMAR DEO,False,HIMANSHU2732003,False,human,37,...,,,,,,,,,,
2,1540439944,Mama | wife | Nurse Practitioner & Clinical Sc...,u1055307732226531328,476,"Erin C. Sanders, MSN, WHNP-BC (She/Her)",False,ErinSandersNP,False,human,14761,...,,,,,,,,,,
3,1281992242,Recruiting Contributor for @BON_SBNation. UAB ...,u179233659,2859,Daniel Seahorn,False,DanielSeahorn,False,human,3183,...,,,,,,,,,,
4,1245809190,♚AKA♚ I will block you weirdos. Please don’t p...,u50179868,2228,Brittany♚,False,britshaniece,False,human,9452,...,,,,,,,,,,


In [4]:
# Too many features because of tweet values, need to condense them, we will use average embeddings for now
'''
https://datascience.stackexchange.com/questions/107462/why-does-averaging-word-embedding-vectors-exctracted-from-the-nn-embedding-laye
a) do we average embedddings of individual tweets (padded ofc) to get a user embedding?
b) could also do max pooling to get most important words
'''

'\nhttps://datascience.stackexchange.com/questions/107462/why-does-averaging-word-embedding-vectors-exctracted-from-the-nn-embedding-laye\na) do we average embedddings of individual tweets (padded ofc) to get a user embedding?\nb) could also do max pooling to get most important words\n'

In [None]:
# NOTE: this code will take forever to run in a regular environment, so sending this as job to ISAAC machines
import torch
from transformers import AutoTokenizer, AutoModel

print(f"cuda is available: {torch.cuda.is_available()}")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased").to(device)
model.eval() # Turn off dropout layers for inference

def embed_texts(texts, max_len=128, batch_size=256):
    embs = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i: i + batch_size]

        # tokenize values
        enc = tokenizer(
            batch,
            padding='longest',
            truncation=True,
            max_length=max_len,
            return_tensors='pt', # return torch tensors
        )
        with torch.inference_mode():
            out = model(**enc)
        # Take out [CLS] token from last hidden state
        cls_vectors = out.last_hidden_state[:, 0, :].cpu()
        embs.append(cls_vectors)
    return torch.cat(embs, dim=0).numpy()

tweet_cols = [f'{i}' for i in range(1915)]

# Average user tweet embeddings to get one embedding per user
def embed_user_tweets(row):
    tweets = []
    for c in tweet_cols:
        txt = row[c]
        if pd.notna(txt) and isinstance(txt, str) and txt.lower() != 'none' and txt.strip():
            tweets.append(txt)
    if not tweets:
        # 0 vector
        return np.zeros(model.config.hidden_size, dtype=float)

    tweet_embs = embed_texts(tweets)

    # Average tweet values
    return tweet_embs.mean(axis=0)

user_vectors = text_df.apply(embed_user_tweets, axis = 1)
user_vectors.head()

cuda is available: False


KeyboardInterrupt: 

In [None]:
dvc = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Encode description, name, and username columns
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased").to(device)
model.eval() # Turn off dropout layers for inference


def embed_text(text, max_len=128, batch_size=32):
    embs = []

    for i in tqdm(range(0, len(text), batch_size), desc='batch embedding'):
        batch = text[i:i+batch_size]
        enc = tokenizer(
            batch, 
            padding='max_length',
            max_length=max_len,
            truncation=True,
            return_tensors='pt'
        )

        enc = {k: v.to(device) for k, v in enc.items() }

        with torch.inference_mode():
            out = model(**enc)
        cls_vectors = out.last_hidden_state[:, 0, :].cpu()
        embs.append(cls_vectors)
    return torch.cat(embs, dim=0).numpy()

In [None]:
from tqdm import tqdm
tqdm.pandas()

text_df["description"] = text_df["description"].fillna("").astype(str)
text_df["description"] = text_df["description"].progress_apply(
    lambda txt: embed_text([txt], batch_size=128)[0]
)

text_df["description"].head()

batch embedding: 100%|██████████| 1/1 [00:01<00:00,  1.54s/it]
batch embedding: 100%|██████████| 1/1 [00:01<00:00,  1.70s/it]
batch embedding: 100%|██████████| 1/1 [00:01<00:00,  1.50s/it]
batch embedding: 100%|██████████| 1/1 [00:01<00:00,  1.70s/it]
batch embedding: 100%|██████████| 1/1 [00:01<00:00,  1.70s/it]
batch embedding: 100%|██████████| 1/1 [00:03<00:00,  3.10s/it]
batch embedding: 100%|██████████| 1/1 [00:03<00:00,  3.01s/it]
batch embedding: 100%|██████████| 1/1 [00:03<00:00,  3.50s/it]
batch embedding: 100%|██████████| 1/1 [00:01<00:00,  1.40s/it]
batch embedding: 100%|██████████| 1/1 [00:01<00:00,  1.30s/it]
batch embedding: 100%|██████████| 1/1 [00:03<00:00,  3.40s/it]
batch embedding: 100%|██████████| 1/1 [00:07<00:00,  7.80s/it]
batch embedding: 100%|██████████| 1/1 [00:09<00:00,  9.70s/it]
batch embedding: 100%|██████████| 1/1 [00:04<00:00,  4.50s/it]
batch embedding: 100%|██████████| 1/1 [00:06<00:00,  6.20s/it]
batch embedding: 100%|██████████| 1/1 [00:01<00:00,  1.

In [None]:
text_df["name"] = text_df["name"].fillna("").astype(str)
text_df["name"] = text_df["name"].progress_apply(
    lambda txt: embed_text([txt], batch_size=128)[0]
)

text_df["username"] = text_df["username"].fillna("").astype(str)
text_df["username"] = text_df["username"].progress_apply(
    lambda txt: embed_text([txt], batch_size=128)[0]
)