In [1]:
import pandas as pd

# Render frames in full: never elide columns and never truncate long cell text.
pd.set_option(
    'display.max_columns', None,
    'display.max_colwidth', None,
)

In [2]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def cosine_sim(a, b):
    """Return the cosine similarity between two embedding vectors.

    Parameters
    ----------
    a, b : np.ndarray
        Vectors with the same number of elements. They are flattened before
        the comparison, so any shape with a matching size is accepted.

    Returns
    -------
    float
        ``dot(a, b) / (|a| * |b|)``. ``np.nan`` is returned when either
        argument is not an ``np.ndarray`` (e.g. the NaN that ``shift`` yields
        for an author's first tweet). ``0.0`` is returned when either vector
        has zero norm, which matches what scikit-learn's ``cosine_similarity``
        reports for an all-zero row.

    Raises
    ------
    ValueError
        If the two vectors do not have the same number of elements.

    Notes
    -----
    Computed directly with numpy: this function is called once per row via
    ``DataFrame.apply``, where the input validation of the pairwise
    scikit-learn routine dominates the run time. NaN entries inside a vector
    propagate to a NaN result instead of raising.
    """
    if not isinstance(a, np.ndarray) or not isinstance(b, np.ndarray):
        return np.nan

    a_flat = a.ravel()
    b_flat = b.ravel()

    norm_product = np.linalg.norm(a_flat) * np.linalg.norm(b_flat)
    if norm_product == 0:
        # Direction is undefined for a zero vector; report "no similarity".
        return 0.0

    return float(np.dot(a_flat, b_flat) / norm_product)

def add_tweet_temporal_and_semantic_features(df):
    """Add per-tweet temporal and embedding-based features to a tweet table.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``author_id``, ``created_at`` and ``embedding`` columns.
        ``created_at`` has to be a datetime column already (the ``.dt``
        accessor is used), so parse it before calling this function.
        ``embedding`` entries that are not ``np.ndarray`` are treated as
        missing.

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` sorted by ``author_id`` then ``created_at`` (the
        original index labels are kept) with these columns added:
        ``tweet_hour``, ``tweet_day_of_week``, ``tweet_is_weekend``,
        ``tweet_hour_sin``/``tweet_hour_cos``, ``tweet_dow_sin``/``tweet_dow_cos``,
        ``tweet_time_delta_sec``, ``prev_embedding``, ``tweet_cosine_sim_prev``,
        ``user_embedding_centroid`` and ``tweet_cosine_sim_centroid``.
        The first tweet of every author has NaN for the "previous tweet"
        features.
    """
    df = df.copy()

    # --- Temporal features ---
    df["tweet_hour"] = df["created_at"].dt.hour
    df["tweet_day_of_week"] = df["created_at"].dt.dayofweek
    # dayofweek is Monday=0 ... Sunday=6, so 5 and 6 are Saturday and Sunday.
    df["tweet_is_weekend"] = df["tweet_day_of_week"].isin([5, 6]).astype(int)

    # Cyclical encoding: map hour/day onto the unit circle so that 23h and 0h
    # (or Sunday and Monday) end up close to each other.
    df["tweet_hour_sin"] = np.sin(2 * np.pi * df["tweet_hour"] / 24)
    df["tweet_hour_cos"] = np.cos(2 * np.pi * df["tweet_hour"] / 24)

    df["tweet_dow_sin"] = np.sin(2 * np.pi * df["tweet_day_of_week"] / 7)
    df["tweet_dow_cos"] = np.cos(2 * np.pi * df["tweet_day_of_week"] / 7)

    # --- Sort for sequential features ---
    # diff()/shift() below operate on row order within each author.
    df = df.sort_values(["author_id", "created_at"])

    # --- Inter-tweet timing ---
    df["tweet_time_delta_sec"] = (
        df.groupby("author_id")["created_at"]
        .diff()
        .dt.total_seconds()
    )

    # --- Previous embedding ---
    df["prev_embedding"] = (
        df.groupby("author_id")["embedding"]
        .shift(1)
    )

    df["tweet_cosine_sim_prev"] = df.apply(
        lambda row: cosine_sim(row["embedding"], row["prev_embedding"]),
        axis=1
    )

    # --- User centroid ---
    def _mean_embedding(embeddings):
        """Average the array-valued entries of one author's embeddings."""
        # Skip missing entries: stacking a NaN/None next to real vectors would
        # raise, although the similarity steps already tolerate such rows.
        valid = [e for e in embeddings if isinstance(e, np.ndarray)]
        if not valid:
            return np.nan
        return np.mean(np.vstack(valid), axis=0)

    user_centroids = (
        df.groupby("author_id")["embedding"]
        .apply(_mean_embedding)
    )

    df = df.join(
        user_centroids.rename("user_embedding_centroid"),
        on="author_id"
    )

    # --- Similarity to centroid ---
    # cosine_sim returns NaN when the tweet or the centroid has no embedding.
    df["tweet_cosine_sim_centroid"] = df.apply(
        lambda row: cosine_sim(
            row["embedding"], row["user_embedding_centroid"]
        ),
        axis=1
    )

    return df

In [3]:
import joblib

# Load the previously saved user-level and tweet-level feature tables.
# NOTE(review): joblib.load unpickles its input, which can execute arbitrary
# code - only ever point these paths at trusted, locally produced files.
user_features = joblib.load(f"../../02_data/user_features_1.joblib")
tweet_features = joblib.load(f"../../02_data/tweet_features_1.joblib")

In [4]:
# Show a single row to check which tweet-level columns are available.
tweet_features.head(1)

Unnamed: 0,author_id,id,text,created_at,is_reply,is_sensitive,like_count,quote_count,reply_count,retweet_count,label,tweet_is_present,tweet_length,tweet_num_words,tweet_num_sentences,tweet_avg_sentence_length,tweet_avg_word_length,tweet_std_word_length,tweet_unique_word_ratio,tweet_guiraud_index,tweet_repetition_ratio,tweet_hapax_ratio,tweet_digit_ratio,tweet_uppercase_ratio,tweet_lowercase_ratio,tweet_special_char_ratio,tweet_punctuation_ratio,tweet_whitespace_ratio,tweet_emoji_count,tweet_emoji_ratio,tweet_mention_count,tweet_contains_mention,tweet_url_count,tweet_contains_url,tweet_hashtag_count,tweet_cashtag_count,tweet_email_count,tweet_contains_bot_word_or_hashtag,tweet_contains_ai_hashtag,tweet_sentiment,tweet_sentiment_abs,tweet_sentiment_neutrality,tweet_sentiment_subjectivity,tweet_flesch_reading_ease,tweet_flesch_kincaid_grade,tweet_avg_syllables_per_word,tweet_polysyllabic_word_ratio,tweet_char_entropy,tweet_word_entropy,tweet_avg_word_repetition,tweet_compression_ratio,tweet_starts_with_emoji,tweet_ends_with_emoji,tweet_starts_with_url,tweet_ends_with_url,tweet_contains_pipe_or_bullet,tweet_contains_call_to_action,tweet_contains_ai_phrase,tweet_function_word_ratio,tweet_noun_ratio,tweet_verb_ratio,tweet_pronoun_ratio,tweet_adjective_ratio,tweet_contains_repeated_chars,tweet_is_retweet,tweet_is_quote
0,u1001495628738957312,t1502310945158275074,"Join us for a special screening of the documentary #SAPELO and a Q&amp;A with the filmmakers on Thursday, March 31 at the @CarterCenter!üìΩÔ∏èüá®üá≠ @CarterLibrary @SWISS_FILMS https://t.co/53nsRtRI8u",2022-03-11 15:50:15+00:00,0,0,1,0.0,0.0,1,0,True,153,27,1,27.0,4.037037,2.71459,0.777778,4.041452,0.222222,0.666667,0.013072,0.238532,0.761468,0.111111,0.084967,0.163399,1,0.006536,3,True,1,True,1,0,0,False,False,0.636,0.636,0.364,0.285714,63.486154,7.633846,1.538462,0.074074,4.914588,4.226567,1.285714,0.986928,False,False,False,True,False,True,False,0.407407,0.37037,0.037037,0.037037,0.111111,False,False,False


In [5]:
# Count missing values per column once and reuse the result for both the
# "anything missing?" check and the summary table (the frame is large, so a
# second full isnull() scan is wasted work).
missing_counts = tweet_features.isnull().sum()

if missing_counts.gt(0).any():
    print("Missing values found in the dataset.")
    # Keep only the columns that actually have gaps.
    na_summary = missing_counts.loc[lambda x: x > 0].to_frame(name='Missing Count')
    na_summary['Missing Percentage'] = (na_summary['Missing Count'] / tweet_features.shape[0]) * 100
    print("\nSummary of missing values:")
    print(na_summary)
else:
    print("No missing values found in the dataset.")

Missing values found in the dataset.

Summary of missing values:
                               Missing Count  Missing Percentage
quote_count                           796093           75.899847
reply_count                           796093           75.899847
tweet_avg_word_length                     25            0.002384
tweet_std_word_length                     25            0.002384
tweet_unique_word_ratio                   25            0.002384
tweet_guiraud_index                       25            0.002384
tweet_repetition_ratio                    25            0.002384
tweet_hapax_ratio                         25            0.002384
tweet_uppercase_ratio                      2            0.000191
tweet_lowercase_ratio                      2            0.000191
tweet_polysyllabic_word_ratio             25            0.002384
tweet_word_entropy                        25            0.002384
tweet_avg_word_repetition                 25            0.002384
tweet_function_word_ratio

In [6]:
# Remember which rows had no quote/reply count before those gaps are filled.
# The flags are only created when absent: re-running this cell after the fill
# below would otherwise overwrite them with all-False values.
for count_col in ("quote_count", "reply_count"):
    flag_col = f"{count_col}_missing"
    if flag_col not in tweet_features.columns:
        tweet_features[flag_col] = tweet_features[count_col].isnull()

# Columns whose missing values are replaced by 0.
zero_fill = [
    "quote_count",
    "reply_count",
    "tweet_unique_word_ratio",
    "tweet_repetition_ratio",
    "tweet_hapax_ratio",
    "tweet_uppercase_ratio",
    "tweet_lowercase_ratio",
    "tweet_polysyllabic_word_ratio",
    "tweet_word_entropy",
    "tweet_function_word_ratio",
    "tweet_noun_ratio",
    "tweet_verb_ratio",
    "tweet_pronoun_ratio",
    "tweet_adjective_ratio"
]

# Columns whose missing values are replaced by the column median.
median_fill = [
    "tweet_avg_word_length",
    "tweet_std_word_length",
    "tweet_guiraud_index",
    "tweet_avg_word_repetition"
]

tweet_features[zero_fill] = tweet_features[zero_fill].fillna(0)
# fillna with a Series fills each column with the value stored under its name.
tweet_features[median_fill] = tweet_features[median_fill].fillna(
    tweet_features[median_fill].median()
)

In [7]:
# Re-check for gaps after imputation. Missing values are counted once and the
# result is reused for both the check and the summary table.
missing_counts = tweet_features.isnull().sum()

if missing_counts.gt(0).any():
    print("Missing values found in the dataset.")
    # Keep only the columns that actually have gaps.
    na_summary = missing_counts.loc[lambda x: x > 0].to_frame(name='Missing Count')
    na_summary['Missing Percentage'] = (na_summary['Missing Count'] / tweet_features.shape[0]) * 100
    print("\nSummary of missing values:")
    print(na_summary)
else:
    print("No missing values found in the dataset.")

No missing values found in the dataset.


In [8]:
# Print one "column: dtype" line per column of the tweet table.
for col, col_dtype in tweet_features.dtypes.items():
    print(f"{col}: {col_dtype}")

author_id: object
id: object
text: object
created_at: object
is_reply: int32
is_sensitive: int32
like_count: int64
quote_count: float64
reply_count: float64
retweet_count: int64
label: int64
tweet_is_present: bool
tweet_length: int64
tweet_num_words: int64
tweet_num_sentences: int64
tweet_avg_sentence_length: float64
tweet_avg_word_length: float64
tweet_std_word_length: float64
tweet_unique_word_ratio: float64
tweet_guiraud_index: float64
tweet_repetition_ratio: float64
tweet_hapax_ratio: float64
tweet_digit_ratio: float64
tweet_uppercase_ratio: float64
tweet_lowercase_ratio: float64
tweet_special_char_ratio: float64
tweet_punctuation_ratio: float64
tweet_whitespace_ratio: float64
tweet_emoji_count: int64
tweet_emoji_ratio: float64
tweet_mention_count: int64
tweet_contains_mention: bool
tweet_url_count: int64
tweet_contains_url: bool
tweet_hashtag_count: int64
tweet_cashtag_count: int64
tweet_email_count: int64
tweet_contains_bot_word_or_hashtag: bool
tweet_contains_ai_hashtag: bool
tw

In [9]:
bool_cols = ["is_reply", "is_sensitive"]
int_cols = ["quote_count", "reply_count"]

# Integer flags become booleans; the zero-filled float counts become integers
# (the int64 cast requires that no NaN is left in those columns).
for cast_cols, cast_dtype in ((bool_cols, "bool"), (int_cols, "int64")):
    tweet_features[cast_cols] = tweet_features[cast_cols].astype(cast_dtype)

In [10]:
# Identifier, raw-text, timestamp and target columns are not aggregated.
exclude_cols = {"author_id", "id", "text", "created_at", "label"}

feature_cols = list(
    filter(lambda name: name not in exclude_cols, tweet_features.columns)
)

# Split the remaining columns by dtype; boolean columns are not matched by
# np.number, so the two lists do not overlap.
candidate_features = tweet_features[feature_cols]
bool_cols = candidate_features.select_dtypes(include=bool).columns.tolist()
numeric_cols = candidate_features.select_dtypes(include=np.number).columns.tolist()

In [11]:
# Aggregation spec per column: boolean flags get their mean (share of True),
# numeric columns get both mean and standard deviation.
agg_dict = {
    **{name: "mean" for name in bool_cols},
    **{name: ["mean", "std"] for name in numeric_cols},
}

In [12]:
# Collapse the tweet table to one row per author using the spec in agg_dict.
tweets_by_author = tweet_features.groupby("author_id")
user_features_from_tweets = tweets_by_author.agg(agg_dict)

In [13]:
# Flatten the (column, statistic) pairs produced by the aggregation into
# "<column>_<statistic>" names. Both steps are guarded so the cell can be run
# again: on a second run the names are already flat strings (unpacking them as
# pairs would fail) and author_id is already a regular column.
if isinstance(user_features_from_tweets.columns, pd.MultiIndex):
    user_features_from_tweets.columns = [
        f"{col}_{stat}" if isinstance(stat, str) else col
        for col, stat in user_features_from_tweets.columns
    ]

if "author_id" not in user_features_from_tweets.columns:
    # Move author_id from the index back into a column for the later merge.
    user_features_from_tweets = user_features_from_tweets.reset_index()

user_features_from_tweets.head()

Unnamed: 0,author_id,is_reply_mean,is_sensitive_mean,tweet_is_present_mean,tweet_contains_mention_mean,tweet_contains_url_mean,tweet_contains_bot_word_or_hashtag_mean,tweet_contains_ai_hashtag_mean,tweet_starts_with_emoji_mean,tweet_ends_with_emoji_mean,tweet_starts_with_url_mean,tweet_ends_with_url_mean,tweet_contains_pipe_or_bullet_mean,tweet_contains_call_to_action_mean,tweet_contains_ai_phrase_mean,tweet_contains_repeated_chars_mean,tweet_is_retweet_mean,tweet_is_quote_mean,quote_count_missing_mean,reply_count_missing_mean,like_count_mean,like_count_std,quote_count_mean,quote_count_std,reply_count_mean,reply_count_std,retweet_count_mean,retweet_count_std,tweet_length_mean,tweet_length_std,tweet_num_words_mean,tweet_num_words_std,tweet_num_sentences_mean,tweet_num_sentences_std,tweet_avg_sentence_length_mean,tweet_avg_sentence_length_std,tweet_avg_word_length_mean,tweet_avg_word_length_std,tweet_std_word_length_mean,tweet_std_word_length_std,tweet_unique_word_ratio_mean,tweet_unique_word_ratio_std,tweet_guiraud_index_mean,tweet_guiraud_index_std,tweet_repetition_ratio_mean,tweet_repetition_ratio_std,tweet_hapax_ratio_mean,tweet_hapax_ratio_std,tweet_digit_ratio_mean,tweet_digit_ratio_std,tweet_uppercase_ratio_mean,tweet_uppercase_ratio_std,tweet_lowercase_ratio_mean,tweet_lowercase_ratio_std,tweet_special_char_ratio_mean,tweet_special_char_ratio_std,tweet_punctuation_ratio_mean,tweet_punctuation_ratio_std,tweet_whitespace_ratio_mean,tweet_whitespace_ratio_std,tweet_emoji_count_mean,tweet_emoji_count_std,tweet_emoji_ratio_mean,tweet_emoji_ratio_std,tweet_mention_count_mean,tweet_mention_count_std,tweet_url_count_mean,tweet_url_count_std,tweet_hashtag_count_mean,tweet_hashtag_count_std,tweet_cashtag_count_mean,tweet_cashtag_count_std,tweet_email_count_mean,tweet_email_count_std,tweet_sentiment_mean,tweet_sentiment_std,tweet_sentiment_abs_mean,tweet_sentiment_abs_std,tweet_sentiment_neutrality_mean,tweet_sentiment_neutrality_std,tweet_sentiment_subjectivity_mean,twe
et_sentiment_subjectivity_std,tweet_flesch_reading_ease_mean,tweet_flesch_reading_ease_std,tweet_flesch_kincaid_grade_mean,tweet_flesch_kincaid_grade_std,tweet_avg_syllables_per_word_mean,tweet_avg_syllables_per_word_std,tweet_polysyllabic_word_ratio_mean,tweet_polysyllabic_word_ratio_std,tweet_char_entropy_mean,tweet_char_entropy_std,tweet_word_entropy_mean,tweet_word_entropy_std,tweet_avg_word_repetition_mean,tweet_avg_word_repetition_std,tweet_compression_ratio_mean,tweet_compression_ratio_std,tweet_function_word_ratio_mean,tweet_function_word_ratio_std,tweet_noun_ratio_mean,tweet_noun_ratio_std,tweet_verb_ratio_mean,tweet_verb_ratio_std,tweet_pronoun_ratio_mean,tweet_pronoun_ratio_std,tweet_adjective_ratio_mean,tweet_adjective_ratio_std
0,u1000016040288555009,0.0,0.0,1.0,1.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.25,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,133.25,8.845903,19.0,2.160247,2.0,0.816497,11.125,6.005206,4.839035,0.909734,2.312111,1.070755,0.9875,0.025,4.297501,0.242172,0.0125,0.025,0.975,0.05,0.041293,0.009071,0.113984,0.026531,0.886016,0.026531,0.110613,0.045812,0.076361,0.029509,0.148557,0.012375,1.5,1.290994,0.011413,0.009693,1.0,0.0,0.25,0.5,1.25,1.258306,0.25,0.5,0.0,0.0,0.272625,0.56222,0.465575,0.355214,0.534425,0.355214,0.29375,0.196659,62.736845,18.884894,7.83152,3.069916,1.542819,0.230711,0.100063,0.099979,4.753024,0.065509,4.215543,0.162067,1.013158,0.026316,1.023679,0.065583,0.183968,0.018181,0.409156,0.108177,0.169619,0.087152,0.038221,0.050016,0.108882,0.086652
1,u1000028117430304769,0.0,0.0,1.0,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.5,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8152.5,5193.699308,88.5,44.547727,12.5,9.192388,2.0,1.414214,6.166667,0.235702,4.429825,1.277754,1.869666,0.78777,0.921053,0.111648,3.060071,0.863492,0.078947,0.111648,0.842105,0.223297,0.038816,0.019538,0.671053,0.465202,0.328947,0.465202,0.175658,0.024501,0.098904,0.033805,0.135526,0.067609,1.5,2.12132,0.0125,0.017678,1.5,0.707107,0.5,0.707107,0.0,0.0,0.5,0.707107,0.0,0.0,-0.11315,0.160018,0.11315,0.160018,0.88685,0.160018,0.227273,0.321412,64.4975,34.517418,6.7475,4.111826,1.5625,0.441942,0.166667,0.235702,4.569622,0.285808,3.25855,0.952597,1.09375,0.132583,1.10636,0.026361,0.078947,0.111648,0.574561,0.365959,0.188596,0.031013,0.026316,0.037216,0.078947,0.111648
2,u100003281,0.0,0.0,1.0,0.05,0.95,0.0,0.0,0.0,0.0,0.0,0.95,0.1,0.05,0.0,0.05,0.05,0.0,0.95,0.95,1.8,1.542384,0.0,0.0,0.0,0.0,0.45,0.604805,231.45,43.952696,37.15,8.430989,2.05,1.276302,23.95,12.751688,4.458438,0.231612,2.055153,0.228171,0.862803,0.058749,5.216955,0.699704,0.137197,0.058749,0.734063,0.116732,0.011512,0.01078,0.107876,0.033497,0.892124,0.033497,0.080183,0.031146,0.047095,0.014725,0.168763,0.012925,0.95,0.887041,0.004189,0.003905,0.15,0.67082,1.8,0.523148,0.1,0.307794,0.0,0.0,0.0,0.0,0.58954,0.273813,0.58954,0.273813,0.41046,0.273813,0.474646,0.26078,55.164534,14.692166,12.949087,5.239014,1.436185,0.086451,0.086289,0.042858,4.681185,0.145252,4.89155,0.386064,1.1644,0.083601,0.841447,0.069634,0.198033,0.058156,0.327072,0.082513,0.156557,0.062121,0.078623,0.045159,0.140664,0.049233
3,u1000034604756160513,0.1,0.0,1.0,0.95,0.2,0.0,0.0,0.1,0.0,0.0,0.2,0.0,0.1,0.0,0.05,0.75,0.0,0.65,0.65,0.1,0.307794,0.0,0.0,0.0,0.0,2.7,3.079645,136.05,34.010022,20.3,4.725073,1.45,0.510418,15.275,5.439802,4.616721,0.533013,2.518966,0.726887,0.879624,0.079188,3.922007,0.421392,0.120376,0.079188,0.797536,0.122629,0.014028,0.018947,0.196816,0.058559,0.803184,0.058559,0.085788,0.022112,0.065956,0.02363,0.157819,0.016402,0.7,0.923381,0.004744,0.005997,2.4,1.602629,0.25,0.55012,0.75,0.910465,0.0,0.0,0.0,0.0,0.37855,0.29625,0.37855,0.29625,0.62145,0.29625,0.252083,0.307979,54.411256,20.545528,9.787742,3.953668,1.602838,0.204797,0.143529,0.080543,4.625432,0.151043,4.027786,0.297357,1.14551,0.101628,0.943362,0.073114,0.169017,0.05687,0.36434,0.068252,0.171425,0.070249,0.050542,0.047805,0.105219,0.047535
4,u1000038151036067841,0.05,0.0,1.0,0.95,0.9,0.0,0.0,0.0,0.05,0.0,0.15,0.0,0.25,0.0,0.1,0.75,0.0,1.0,1.0,3.35,8.267088,0.0,0.0,0.0,0.0,17.4,10.049352,124.35,24.940508,14.2,7.804182,1.05,0.223607,13.65,7.610761,4.928103,0.661386,2.115222,0.543755,0.934021,0.137718,3.375822,0.689895,0.065979,0.137718,0.912495,0.176411,0.087697,0.050187,0.321411,0.162291,0.678589,0.162291,0.169248,0.043763,0.093674,0.028129,0.100979,0.033773,0.9,1.97084,0.006897,0.015718,1.9,2.403944,1.1,0.640723,0.05,0.223607,0.0,0.0,0.0,0.0,0.1077,0.365068,0.1963,0.323917,0.8037,0.323917,0.206067,0.343384,50.097206,15.014933,10.085141,2.603673,1.668529,0.202873,0.190926,0.116967,5.041302,0.352019,3.500228,0.480401,1.101825,0.223428,1.020124,0.15588,0.041495,0.05069,0.663236,0.171888,0.107934,0.109061,0.008034,0.024964,0.11151,0.080845


In [14]:
# Attach the per-author tweet aggregates to the user table. A left join keeps
# every user, including those with no rows in the tweet table.
user_features_1 = pd.merge(
    user_features,
    user_features_from_tweets,
    how="left",
    left_on="id",
    right_on="author_id",
)
#user_features_1.head(1)

In [None]:
# Parse the timestamp strings into timezone-aware (UTC) datetimes; values that
# cannot be parsed become NaT instead of raising.
tweet_features["created_at"] = tweet_features["created_at"].pipe(
    pd.to_datetime, errors="coerce", utc=True
)

#tweet_features_1 = add_tweet_temporal_and_semantic_features(tweet_features)
#tweet_features_1.head(1)

In [63]:
import numpy as np

def entropy(x):
    """Shannon entropy (natural log) of the value distribution in Series ``x``.

    Relative frequencies come from ``value_counts(normalize=True)``, which
    ignores NaN entries.
    """
    probabilities = x.value_counts(normalize=True)
    return -probabilities.mul(np.log(probabilities)).sum()

def pct_within_threshold(x, threshold):
    """Fraction of the non-missing values in ``x`` that are <= ``threshold``.

    Returns 0.0 when ``x`` has no non-missing values.
    """
    observed = x.dropna()
    return (observed <= threshold).mean() if len(observed) != 0 else 0.0

from sklearn.linear_model import LinearRegression
import numpy as np

def sentiment_slope(group):
    """Least-squares slope of ``tweet_sentiment`` over time for one author.

    Parameters
    ----------
    group : pd.DataFrame
        Rows of a single author with a datetime ``created_at`` column and a
        numeric ``tweet_sentiment`` column.

    Returns
    -------
    float
        Change in sentiment per second, i.e. the slope of an ordinary
        least-squares line (with intercept) fitted through the points
        ``(seconds since the first tweet, sentiment)``.
        ``np.nan`` when fewer than two usable rows exist; ``0.0`` when every
        usable row carries the same timestamp (no slope can be identified).

    Notes
    -----
    Rows whose timestamp is NaT or whose sentiment is NaN are skipped instead
    of raising, since timestamps parsed with ``errors="coerce"`` can contain
    NaT. The slope is computed in closed form, which avoids building an
    estimator object for every author.
    """
    if len(group) < 2:
        return np.nan

    elapsed_sec = (
        group["created_at"] - group["created_at"].min()
    ).dt.total_seconds().to_numpy(dtype=float)
    sentiment = group["tweet_sentiment"].to_numpy(dtype=float)

    usable = np.isfinite(elapsed_sec) & np.isfinite(sentiment)
    if usable.sum() < 2:
        return np.nan

    # slope = cov(t, y) / var(t), computed on centred values.
    t_centered = elapsed_sec[usable] - elapsed_sec[usable].mean()
    y_centered = sentiment[usable] - sentiment[usable].mean()

    time_spread = np.dot(t_centered, t_centered)
    if time_spread == 0:
        return 0.0

    return np.dot(t_centered, y_centered) / time_spread

In [None]:
# NOTE(review): `test` is not assigned anywhere in the visible notebook. The
# columns read below (tweet_hour, tweet_time_delta_sec,
# tweet_cosine_sim_centroid, ...) match what
# add_tweet_temporal_and_semantic_features adds, so it is presumably that
# function's output - confirm and assign it explicitly before this cell.
test = test.sort_values(["author_id", "created_at"])

# --- Posting-time habits per author ---
user_temporal_features = (
    test
    .groupby("author_id")
    .agg(
        weekend_post_ratio=("tweet_is_weekend", "mean"),
        mean_posting_hour=("tweet_hour", "mean"),
        posting_hour_std=("tweet_hour", "std"),
        posting_hour_entropy=("tweet_hour", entropy),
        posting_dow_entropy=("tweet_day_of_week", entropy),
    )
    .fillna(0)  # std is NaN for authors with a single tweet
)

# --- Gaps between consecutive tweets ---
user_delta_stats = (
    test
    .groupby("author_id")["tweet_time_delta_sec"]
    .agg(
        delta_mean="mean",
        delta_median="median",
        delta_std="std",
        delta_min="min",
    )
)

# Coefficient of variation of the gaps; a zero mean yields inf/NaN, which is
# replaced by 0 when user_intertweet_features is assembled below.
user_delta_stats["delta_burstiness"] = (
    user_delta_stats["delta_std"] / user_delta_stats["delta_mean"]
)

# Share of gaps that are at most 10 seconds / 60 seconds / 5 minutes long.
user_fast_posting = (
    test
    .groupby("author_id")["tweet_time_delta_sec"]
    .agg(
        pct_within_10s=lambda x: pct_within_threshold(x, 10),
        pct_within_60s=lambda x: pct_within_threshold(x, 60),
        pct_within_5min=lambda x: pct_within_threshold(x, 300),
    )
)

user_intertweet_features = (
    pd.concat([user_delta_stats, user_fast_posting], axis=1)
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)

# --- Similarity of each tweet to the author's embedding centroid ---
user_semantic_features = (
    test
    .groupby("author_id")["tweet_cosine_sim_centroid"]
    .agg(
        sem_centroid_sim_mean="mean",
        sem_centroid_sim_max="max",
        sem_centroid_sim_std="std",
    )
    .fillna(0)
)

# Mean over embedding dimensions of the per-dimension standard deviation.
# NOTE(review): this reads "text_embedding" while
# add_tweet_temporal_and_semantic_features reads "embedding" - confirm which
# column `test` actually carries. np.vstack also requires every entry of a
# group to be an array.
user_embedding_std_mean = (
    test
    .groupby("author_id")["text_embedding"]
    .apply(lambda x: np.std(np.vstack(x), axis=0).mean())
    .rename("embedding_std_mean")
)

user_embedding_features = user_embedding_std_mean.to_frame()

# --- Share of retweets / replies / quotes among an author's tweets ---
user_activity = (
    test
    .groupby("author_id")
    .agg(
        retweet_to_tweet_ratio=("tweet_is_retweet", "mean"),
        reply_to_tweet_ratio=("is_reply", "mean"),
        quote_to_tweet_ratio=("tweet_is_quote", "mean"),
    )
    .fillna(0)
)

# --- Writing-style variation ---
user_readability_std = (
    test
    .groupby("author_id")["tweet_flesch_reading_ease"]
    .std()
    .rename("readability_std_over_time")
    .fillna(0)
)

user_sentiment_slope = (
    test
    .groupby("author_id")
    .apply(sentiment_slope)
    .rename("sentiment_time_slope")
    .fillna(0)
)

user_style_features = pd.concat(
    [user_readability_std, user_sentiment_slope],
    axis=1
)

# --- Tweets per active day within the observed tweets ---
tweet_rate = (
    test
    .groupby("author_id")
    .size()
    .rename("n_tweets")
)

# Inclusive number of days between an author's first and last observed tweet.
account_span_days = (
    test
    .groupby("author_id")["created_at"]
    .agg(lambda x: (x.max() - x.min()).days + 1)
    .rename("active_days")
)

# Named "observed_tweet_rate" because user_features already has a "tweet_rate"
# column; joining a second column with that name would fail on the overlap.
user_tweet_rate = (tweet_rate / account_span_days).rename("observed_tweet_rate")

user_last_tweet_date = (
    test
    .groupby("author_id")["created_at"]
    .max()
)

#user_features["account_age_days"] = (
#    user_last_tweet_date - user_features["account_created_at"]
#).dt.days

#user_features["tweet_rate_x_account_age"] = (
#    user_tweet_rate * user_features["account_age_days"]
#)

# --- Interaction features ---
# The per-author series are indexed by author_id, while user_features keeps the
# user id in its "id" column under a default integer index. Multiplying them
# directly aligns on the index and produces only NaN, so each per-author value
# is looked up through the "id" column first.
user_hashtag_ratio = (
    test
    .groupby("author_id")["tweet_hashtag_count"]
    .mean()
    .rename("avg_hashtag_count")
)

user_features["followers_x_hashtag_ratio"] = (
    user_features["followers_count"] * user_features["id"].map(user_hashtag_ratio)
)

user_emoji_ratio = (
    test
    .groupby("author_id")["tweet_emoji_ratio"]
    .mean()
    .rename("avg_emoji_ratio")
)

user_features["verified_x_emoji"] = (
    user_features["is_verified"] * user_features["id"].map(user_emoji_ratio)
)

# --- Attach all tweet-derived features to the user table ---
# Every block above is indexed by author_id, so they are combined first and
# then joined against the "id" column. Joining a list of frames aligns on the
# index only, which would never match user_features' integer index.
user_tweet_derived_features = pd.concat(
    [
        user_temporal_features,
        user_intertweet_features,
        user_semantic_features,
        user_embedding_features,
        user_activity,
        user_style_features,
        user_tweet_rate,
    ],
    axis=1
)

user_features = user_features.join(
    user_tweet_derived_features,
    on="id",
    how="left"
)

# Users without any tweet in `test` get 0 for the tweet-derived features.
# NOTE(review): this fills NaN in every column of user_features, including the
# pre-existing ones - restrict it to the new columns if that is unintended.
user_features = user_features.fillna(0)

In [None]:
import joblib

# Persist the tweet table in its current state as version 2 of the file.
joblib.dump(tweet_features, f"../../02_data/tweet_features_2.joblib")

In [None]:
# Copy of the user table without the description-embedding column.
user_features_test = user_features.drop("desc_embedding", axis=1)

In [19]:
# Show a single row to check which user-level columns remain.
user_features_test.head(1)

Unnamed: 0,id,name_length,username_length,username_name_length_ratio,description,has_name,has_username,has_url,has_location,has_pinned_tweet,has_bot_word_in_name,ratio_digits_in_name,ratio_digits_in_username,ratio_special_chars_in_name,ratio_special_chars_in_username,name_upper_to_lower_ratio,username_upper_to_lower_ratio,name_entropy,username_entropy,username_name_levenshtein,is_protected,is_verified,created_at,account_age_seconds,followers_count,following_count,listed_count,tweet_count,followers_over_following,double_followers_over_following,following_over_followers,following_over_followers_squared,following_over_total_connections,listed_over_followers,tweets_over_followers,listed_over_tweets,follower_rate,following_rate,listed_rate,tweet_rate,label,desc_is_present,desc_length,desc_num_words,desc_num_sentences,desc_avg_sentence_length,desc_avg_word_length,desc_std_word_length,desc_unique_word_ratio,desc_guiraud_index,desc_repetition_ratio,desc_hapax_ratio,desc_digit_ratio,desc_uppercase_ratio,desc_lowercase_ratio,desc_special_char_ratio,desc_punctuation_ratio,desc_whitespace_ratio,desc_emoji_count,desc_emoji_ratio,desc_mention_count,desc_contains_mention,desc_url_count,desc_contains_url,desc_hashtag_count,desc_cashtag_count,desc_email_count,desc_contains_bot_word_or_hashtag,desc_contains_ai_hashtag,desc_sentiment,desc_sentiment_abs,desc_sentiment_neutrality,desc_sentiment_subjectivity,desc_flesch_reading_ease,desc_flesch_kincaid_grade,desc_avg_syllables_per_word,desc_polysyllabic_word_ratio,desc_char_entropy,desc_word_entropy,desc_avg_word_repetition,desc_compression_ratio,desc_starts_with_emoji,desc_ends_with_emoji,desc_starts_with_url,desc_ends_with_url,desc_contains_pipe_or_bullet,desc_contains_call_to_action,desc_contains_ai_phrase,desc_function_word_ratio,desc_noun_ratio,desc_verb_ratio,desc_pronoun_ratio,desc_adjective_ratio,desc_contains_repeated_chars,desc_is_retweet,desc_is_quote,description_normalized
0,u1000115670657318912,4,6,1.5,"Open source tool for data & models versioning for ML projects. Join our stellar community https://t.co/vBp8rcV4bf for help, support and insights.",True,True,False,True,True,False,0.0,0.0,0.25,0.0,3.0,1.0,2.0,2.584963,0.666667,False,False,2018-05-25 20:45:31+00:00,241981329,3488,325,79,911,10.732308,21.464615,0.093177,2.7e-05,0.085235,0.022649,0.261181,0.086718,1.4e-05,1e-06,3.264715e-07,4e-06,0,True,127.0,20.0,2.0,10.0,5.05,2.312466,0.9,4.024922,0.1,0.85,0.0,0.069307,0.930693,0.047244,0.047244,0.15748,0.0,0.0,0.0,False,1.0,True,0.0,0.0,0.0,False,False,0.765,0.765,0.235,0.375,57.095,7.78,1.65,0.1,4.407766,4.084184,1.111111,0.889764,False,False,False,False,False,True,False,0.2,0.5,0.1,0.05,0.15,False,,,"Open source tool for data & models versioning for ML projects . Join our stellar community HTTPURL for help , support and insights ."


In [22]:
import pandas as pd

# Load the edge list; the preview below shows source_id, relation and
# target_id columns.
edges = pd.read_csv("../../02_data/edge.csv")

In [23]:
# Show a single edge to check the column layout.
edges.head(1)

Unnamed: 0,source_id,relation,target_id
0,u980749991491682304,followers,u1480979504696864775


In [24]:
# List the distinct relation types present in the edge list.
edges["relation"].unique()

array(['followers', 'following', 'own', 'pinned', 'post', 'contain',
       'discuss', 'mentioned', 'like', 'followed', 'replied_to',
       'retweeted', 'quoted', 'membership'], dtype=object)

In [None]:
# Keep only the edges whose source and target are both ids found in
# user_features.
known_user_ids = user_features["id"].unique()
both_endpoints_known = (
    edges["source_id"].isin(known_user_ids)
    & edges["target_id"].isin(known_user_ids)
)
edges_1 = edges[both_endpoints_known]
edges_1.shape