<a href="https://colab.research.google.com/github/julwdo/NLP-project/blob/main/NLP_Project_JW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# The manual JDK / Spark installation steps are kept for reference but are disabled;
# only the pip packages on the last line are installed.
#!apt-get install openjdk-17-jdk-headless -qq > /dev/null # OpenJDK 17
#!wget --show-progress https://dlcdn.apache.org/spark/spark-3.5.6/spark-3.5.6-bin-hadoop3.tgz # Apache Spark 3.5.6 with Hadoop 3 support
#!tar xf spark-3.5.6-bin-hadoop3.tgz
!pip install findspark sparknlp



In [2]:
import os
#import findspark
#from pyspark.sql import SparkSession
import sparknlp
from google.colab import auth
import pyspark.sql.functions as F
from pyspark.sql.types import FloatType
from nltk.sentiment import SentimentIntensityAnalyzer
import math
from collections import Counter
from pyspark.sql import Window
from pyspark.sql.types import StringType, NumericType
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import XlmRoBertaSentenceEmbeddings
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StandardScaler

In [3]:
#import nltk
#nltk.download('vader_lexicon')

In [4]:
# Set up Spark
# The manual JAVA_HOME / SPARK_HOME / findspark / SparkSession setup below is disabled
# and kept for reference only.
#os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-17-openjdk-amd64'
#os.environ['SPARK_HOME'] = '/content/spark-3.5.6-bin-hadoop3'

#findspark.init()
#findspark.find()

#spark = SparkSession.builder.appName('TwiBot22').getOrCreate()

# The session is started through Spark NLP instead, with GPU support switched off.
spark = sparknlp.start(gpu=False)

In [5]:
# Authenticate the Colab user; presumably needed so the gsutil download further down
# can read the bucket (TODO confirm the bucket is not public).
auth.authenticate_user()

In [6]:
#!gcloud init

In [7]:
bucket_name = "twibot-22"
file_names = ["user.jsonl", "label.csv", "tweet_0.jsonl"]

for file_name in file_names:
    local_path = f"/content/{file_name}"
    if not os.path.exists(local_path):
        !gsutil cp gs://{bucket_name}/{file_name} {local_path}
    else:
        print(f"{file_name} already exists locally, skipping download.")

user.jsonl already exists locally, skipping download.
label.csv already exists locally, skipping download.
tweet_0.jsonl already exists locally, skipping download.


In [8]:
# Load the raw user records (JSON Lines) and print the inferred schema.
users = spark.read.json(path="/content/user.jsonl")
users.printSchema()

root
 |-- created_at: string (nullable = true)
 |-- description: string (nullable = true)
 |-- entities: struct (nullable = true)
 |    |-- description: struct (nullable = true)
 |    |    |-- cashtags: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- end: long (nullable = true)
 |    |    |    |    |-- start: long (nullable = true)
 |    |    |    |    |-- tag: string (nullable = true)
 |    |    |-- hashtags: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- end: long (nullable = true)
 |    |    |    |    |-- start: long (nullable = true)
 |    |    |    |    |-- tag: string (nullable = true)
 |    |    |-- mentions: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- end: long (nullable = true)
 |    |    |    |    |-- start: long (nullable = true)
 |    |    |    |    |-- username: string (nullable = true)
 |    |

In [9]:
#users.show(5, truncate=False)

In [10]:
# Keep only the user fields used later on. Dotted paths lift nested fields to the
# top level; the resulting column is named after the last path component.
users_selected = users.select(
    "id",
    "name",
    "username",
    "created_at",
    "description",
    "url",
    "entities.description.cashtags",
    "entities.description.hashtags",
    "entities.description.mentions",
    "entities.description.urls",
    "location",
    "pinned_tweet_id",
    "profile_image_url",
    "protected",
    "public_metrics.followers_count",
    "public_metrics.following_count",
    "public_metrics.listed_count",
    "public_metrics.tweet_count",
    "verified",
)

In [11]:
# Print the schema of the flattened user columns.
users_selected.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- username: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- description: string (nullable = true)
 |-- url: string (nullable = true)
 |-- cashtags: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- end: long (nullable = true)
 |    |    |-- start: long (nullable = true)
 |    |    |-- tag: string (nullable = true)
 |-- hashtags: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- end: long (nullable = true)
 |    |    |-- start: long (nullable = true)
 |    |    |-- tag: string (nullable = true)
 |-- mentions: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- end: long (nullable = true)
 |    |    |-- start: long (nullable = true)
 |    |    |-- username: string (nullable = true)
 |-- urls: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- di

In [12]:
#users_selected.show(5, truncate=False)

In [13]:
# Read the label file: the first line is the header and column types are inferred.
labels = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/content/label.csv")
)

In [14]:
# Preview the first five label rows without truncating the cell contents.
labels.show(5, truncate=False)

+--------------------+-----+
|id                  |label|
+--------------------+-----+
|u1217628182611927040|human|
|u2664730894         |human|
|u1266703520205549568|human|
|u1089159225148882949|human|
|u36741729           |bot  |
+--------------------+-----+
only showing top 5 rows



In [15]:
# Left join: every selected user is kept, with or without a label row.
# The duplicate key column coming from `labels` is dropped afterwards.
users_labeled = (
    users_selected
    .join(labels, on=users_selected.id == labels.id, how="left")
    .drop(labels.id)
)

In [16]:
#users_labeled.show(5, truncate=False)

In [17]:
#print('Summary of missing values:')
#users_labeled.select([F.count(F.when(F.isnull(c), c)).alias(c) for c in users_labeled.columns]).show()

In [18]:
# Build the VADER analyzer once. If the lexicon has not been downloaded yet (fresh
# kernel), fetch it on demand instead of failing with a LookupError.
try:
    sia = SentimentIntensityAnalyzer()
except LookupError:
    import nltk
    nltk.download("vader_lexicon", quiet=True)
    sia = SentimentIntensityAnalyzer()


def vader_sentiment(text):
    """Return the VADER compound polarity score of ``text`` as a float.

    A missing value (``None``, i.e. a null column value passed to the UDF) is
    scored as neutral ``0.0`` instead of raising inside the analyzer.
    """
    if text is None:
        return 0.0
    # float(...) guarantees a Python float, which is what the FloatType UDF expects.
    return float(sia.polarity_scores(text)["compound"])


vader_udf = F.udf(vader_sentiment, FloatType())

In [19]:
def shannon_entropy(string):
    """Return the character-level Shannon entropy of ``string`` in bits.

    Args:
        string: Text to measure; may be ``None`` (null column value).

    Returns:
        ``0.0`` for ``None``, empty, or whitespace-only input; otherwise
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of each character.
    """
    # None has no .strip(); treat a missing value like an empty string.
    if string is None or string.strip() == "":
        return 0.0
    length = len(string)
    counts = Counter(string)
    return -sum((count / length) * math.log2(count / length) for count in counts.values())

# Spark UDF wrapper around shannon_entropy; the result column is of float type.
entropy_udf = F.udf(f=shannon_entropy, returnType=FloatType())

In [20]:
# Column expression for the current timestamp; the user feature selection uses it
# as the reference point for account age and the *_rate features.
now = F.current_timestamp()

In [21]:
def _is_present(col_name):
    """Return a boolean column: False when the value is null or an empty string, else True."""
    value = F.col(col_name)
    # A bare `value == ""` evaluates to NULL for null input and would fall through
    # to the otherwise(True) branch, so the null case is tested explicitly.
    return F.when(value.isNull() | (value == ""), False).otherwise(True)


def _kept_char_ratio(col_name, removal_pattern):
    """Return length(text with ``removal_pattern`` matches deleted) / greatest(length(text), 1)."""
    text = F.col(col_name)
    return F.length(F.regexp_replace(text, removal_pattern, "")) / F.greatest(F.length(text), F.lit(1))


# Description with t.co links, @mentions and e-mail addresses replaced by placeholder tokens.
description_clean = F.regexp_replace(F.col("description"), r"https?://t\.co/\S+", "<URL>")
description_clean = F.regexp_replace(description_clean, r"(?<=^|\s)@\w+", "<USER>")
description_clean = F.regexp_replace(description_clean, r"\b[\w\.-]+@[\w\.-]+\.\w+\b", "<EMAIL>")

# Seconds elapsed between account creation and `now`; shared by the *_rate features.
account_age = F.unix_timestamp(now) - F.unix_timestamp(F.to_timestamp("created_at"))

# Per-user profile features. All F.col(...) references resolve against the input
# columns of users_labeled, so e.g. description_length is measured on the raw description.
user_features = users_labeled.select(
    F.col("id"),
    F.length(F.col("name")).alias("name_length"),
    F.length(F.col("username")).alias("username_length"),
    (F.length(F.col("username")) / F.greatest(F.length(F.col("name")), F.lit(1))).alias("username_name_length_ratio"),
    description_clean.alias("description"),
    F.length(F.col("description")).alias("description_length"),
    # Presence flags: null and empty string both count as "missing".
    _is_present("name").alias("has_name"),
    _is_present("username").alias("has_username"),
    _is_present("description").alias("has_description"),
    _is_present("url").alias("has_url"),
    _is_present("location").alias("has_location"),
    F.when(F.col("pinned_tweet_id").isNull(), False).otherwise(True).alias("has_pinned_tweet"),
    F.col("name").rlike("(?i)\\bbot\\b").alias("has_bot_word_in_name"),
    F.col("description").rlike("(?i)\\bbot\\b").alias("has_bot_word_in_description"),
    # Character-composition ratios.
    _kept_char_ratio("name", "[^\\d]").alias("ratio_digits_in_name"),
    _kept_char_ratio("username", "[^\\d]").alias("ratio_digits_in_username"),
    _kept_char_ratio("description", "[^\\d]").alias("ratio_digits_in_description"),
    _kept_char_ratio("name", "[A-Za-z0-9 ]").alias("ratio_special_chars_in_name"),
    _kept_char_ratio("username", "[A-Za-z0-9 ]").alias("ratio_special_chars_in_username"),
    _kept_char_ratio("description", "[A-Za-z0-9 ]").alias("ratio_special_chars_in_description"),
    (F.length(F.regexp_replace(F.col("name"), "[^A-Z]", "")) / F.greatest(F.length(F.regexp_replace(F.col("name"), "[^a-z]", "")), F.lit(1))).alias("name_upper_to_lower_ratio"),
    (F.length(F.regexp_replace(F.col("username"), "[^A-Z]", "")) / F.greatest(F.length(F.regexp_replace(F.col("username"), "[^a-z]", "")), F.lit(1))).alias("username_upper_to_lower_ratio"),
    entropy_udf(F.col("name")).alias("name_entropy"),
    entropy_udf(F.col("username")).alias("username_entropy"),
    (F.levenshtein(F.col("username"), F.col("name")) / F.greatest(F.length(F.col("username")), F.length(F.col("name")), F.lit(1))).alias("username_name_levenshtein"),
    vader_udf(F.col("description")).alias("description_sentiment"),
    # Entity counts in the description; a missing array counts as zero.
    F.when(F.col("cashtags").isNotNull(), F.size(F.col("cashtags"))).otherwise(F.lit(0)).alias("cashtag_in_description_count"),
    F.when(F.col("hashtags").isNotNull(), F.size(F.col("hashtags"))).otherwise(F.lit(0)).alias("hashtag_in_description_count"),
    F.when(F.col("mentions").isNotNull(), F.size(F.col("mentions"))).otherwise(F.lit(0)).alias("mention_in_description_count"),
    F.when(F.col("urls").isNotNull(), F.size(F.col("urls"))).otherwise(F.lit(0)).alias("url_in_description_count"),
    F.col("protected").alias("is_protected"),
    F.col("verified").alias("is_verified"),
    account_age.alias("account_age_seconds"),
    F.col("followers_count"),
    F.col("following_count"),
    F.col("listed_count"),
    F.col("tweet_count"),
    # Ratios between the public metrics; denominators are floored at 1.
    (F.col("followers_count") / F.greatest(F.col("following_count"), F.lit(1))).alias("followers_over_following"),
    (2 * F.col("followers_count") / F.greatest(F.col("following_count"), F.lit(1))).alias("double_followers_over_following"),
    (F.col("following_count") / F.greatest(F.col("followers_count"), F.lit(1))).alias("following_over_followers"),
    (F.col("following_count") / F.greatest(F.col("followers_count") ** 2, F.lit(1))).alias("following_over_followers_squared"),
    (F.col("following_count") / F.greatest(F.col("followers_count") + F.col("following_count"), F.lit(1))).alias("following_over_total_connections"),
    (F.col("listed_count") / F.greatest(F.col("followers_count"), F.lit(1))).alias("listed_over_followers"),
    (F.col("tweet_count") / F.greatest(F.col("followers_count"), F.lit(1))).alias("tweets_over_followers"),
    (F.col("listed_count") / F.greatest(F.col("tweet_count"), F.lit(1))).alias("listed_over_tweets"),
    # Metrics per second of account age.
    (F.col("followers_count") / account_age).alias("follower_rate"),
    (F.col("following_count") / account_age).alias("following_rate"),
    (F.col("listed_count") / account_age).alias("listed_rate"),
    (F.col("tweet_count") / account_age).alias("tweet_rate"),
    F.col("label"),
)

In [22]:
#user_features.show(5, truncate=False)

In [23]:
# Load the tweet records (JSON Lines) and print the inferred schema.
tweets = spark.read.json(path="/content/tweet_0.jsonl")
tweets.printSchema()

root
 |-- attachments: struct (nullable = true)
 |    |-- media_keys: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- poll_ids: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |-- author_id: long (nullable = true)
 |-- context_annotations: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- domain: struct (nullable = true)
 |    |    |    |-- description: string (nullable = true)
 |    |    |    |-- id: string (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |    |    |-- entity: struct (nullable = true)
 |    |    |    |-- description: string (nullable = true)
 |    |    |    |-- id: string (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |-- conversation_id: long (nullable = true)
 |-- created_at: string (nullable = true)
 |-- entities: struct (nullable = true)
 |    |-- annotations: array (nullable = true)
 |    |    |-- element: struct (c

In [24]:
#tweets.show(5, truncate=False)

In [25]:
# Tweet text with t.co links, @mentions and e-mail addresses replaced by placeholder tokens.
tweet_text_clean = F.regexp_replace(F.col("text"), r"https?://t\.co/\S+", "<URL>")
tweet_text_clean = F.regexp_replace(tweet_text_clean, r"(?<=^|\s)@\w+", "<USER>")
tweet_text_clean = F.regexp_replace(tweet_text_clean, r"\b[\w\.-]+@[\w\.-]+\.\w+\b", "<EMAIL>")

tweets_selected = tweets.select(
    F.col("id"),
    tweet_text_clean.alias("text"),
    # Prefix with "u" so the value has the same form as the user ids in the label file.
    F.concat(F.lit("u"), F.col("author_id")).alias("author_id"),
    F.col("created_at"),
    # 1 when the tweet replies to some user, 0 otherwise.
    F.col("in_reply_to_user_id").isNotNull().cast("int").alias("is_reply"),
    F.col("lang"),
    F.col("possibly_sensitive").cast("int").alias("is_sensitive"),
    F.col("public_metrics.like_count"),
    F.col("public_metrics.quote_count"),
    F.col("public_metrics.reply_count"),
    F.col("public_metrics.retweet_count"),
)

In [26]:
# Print the schema of the selected tweet columns.
tweets_selected.printSchema()

root
 |-- id: string (nullable = true)
 |-- text: string (nullable = true)
 |-- author_id: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- is_reply: integer (nullable = false)
 |-- lang: string (nullable = true)
 |-- is_sensitive: integer (nullable = true)
 |-- like_count: long (nullable = true)
 |-- quote_count: long (nullable = true)
 |-- reply_count: long (nullable = true)
 |-- retweet_count: long (nullable = true)



In [27]:
#tweets_selected.show(5, truncate=False)

In [28]:
#print('Summary of missing values:')
#tweets_selected.select([F.count(F.when(F.isnull(c), c)).alias(c) for c in tweets_selected.columns]).show()

In [29]:
# Truncate each tweet to 20 tokens
def truncate_text(text, max_tokens=20):
    """Keep at most ``max_tokens`` whitespace-separated tokens of ``text``.

    Args:
        text: Input string, or ``None`` for a missing (null) value.
        max_tokens: Maximum number of tokens to keep.

    Returns:
        The kept tokens joined by single spaces, or ``None`` when ``text`` is ``None``.
    """
    # A null tweet text reaches the UDF as None, which has no .split().
    if text is None:
        return None
    return " ".join(text.split()[:max_tokens])

# Spark UDF applying truncate_text with a 20-token limit.
truncate_udf = F.udf(lambda tweet_text: truncate_text(tweet_text, 20), StringType())

# Add the shortened text as a new column; the original "text" column is kept.
tweets_truncated = tweets_selected.withColumn("text_truncated", truncate_udf("text"))

In [30]:
#tweets_truncated.show(1, truncate=False)

In [31]:
# Filter top tweets per author
# NOTE(review): the aggregated feature names further down say "last20" while top_n is 10 —
# confirm which of the two is intended.
top_n = 10

# Number each author's tweets from the largest created_at value downwards
# (created_at is a string column, so the ordering is lexicographic).
window = Window.partitionBy("author_id").orderBy(F.desc("created_at"))
tweets_truncated = tweets_truncated.withColumn("rank", F.row_number().over(window))

# Keep the first top_n tweets per author and discard the helper column.
tweets_filtered = tweets_truncated.where(F.col("rank") <= top_n).drop("rank")

In [32]:
# Column whose values are concatenated per author.
# NOTE(review): this is the full "text" column, not "text_truncated" — confirm that is intended.
text_column = "text"

# One row per author: joined tweet text plus averages of the per-tweet flags and
# engagement counts, and the number of distinct languages.
tweet_features = (
    tweets_filtered
    .groupBy("author_id")
    .agg(
        F.concat_ws(" ", F.collect_list(text_column)).alias("tweets_last20_concatenated_text"),
        F.avg("is_reply").alias("tweets_last20_reply_fraction"),
        F.countDistinct("lang").alias("tweets_last20_num_distinct_langs"),
        F.avg("is_sensitive").alias("tweets_last20_sensitive_fraction"),
        F.avg("like_count").alias("tweets_last20_avg_likes"),
        F.avg("quote_count").alias("tweets_last20_avg_quotes"),
        F.avg("reply_count").alias("tweets_last20_avg_replies"),
        F.avg("retweet_count").alias("tweets_last20_avg_retweets"),
    )
)

In [33]:
# Join user features with aggregated tweet features
# Inner join: only users with at least one row in tweet_features are kept.
enriched_user_features = (
    user_features
    .join(tweet_features, on=user_features.id == tweet_features.author_id, how="inner")
    .drop("author_id")
)

In [34]:
# Print the schema of the combined user + tweet feature table.
enriched_user_features.printSchema()

root
 |-- id: string (nullable = true)
 |-- name_length: integer (nullable = true)
 |-- username_length: integer (nullable = true)
 |-- username_name_length_ratio: double (nullable = true)
 |-- description: string (nullable = true)
 |-- description_length: integer (nullable = true)
 |-- has_name: boolean (nullable = false)
 |-- has_username: boolean (nullable = false)
 |-- has_description: boolean (nullable = false)
 |-- has_url: boolean (nullable = false)
 |-- has_location: boolean (nullable = false)
 |-- has_pinned_tweet: boolean (nullable = false)
 |-- has_bot_word_in_name: boolean (nullable = true)
 |-- has_bot_word_in_description: boolean (nullable = true)
 |-- ratio_digits_in_name: double (nullable = true)
 |-- ratio_digits_in_username: double (nullable = true)
 |-- ratio_digits_in_description: double (nullable = true)
 |-- ratio_special_chars_in_name: double (nullable = true)
 |-- ratio_special_chars_in_username: double (nullable = true)
 |-- ratio_special_chars_in_description: 

In [35]:
#enriched_user_features.show(5, truncate=False)

In [36]:
# Set the fraction of data to use, in the interval (0, 1]
# (e.g., 0.5 for 50%; 1 uses the entire dataset)
DATA_FRACTION = 0.01

# 1 is a valid value (it selects the whole dataset), so the message states the
# bounds exactly as the condition checks them.
assert 0 < DATA_FRACTION <= 1, 'ERROR: DATA_FRACTION must be greater than 0 and at most 1.'

In [37]:
# Sample the dataset according to DATA_FRACTION
if DATA_FRACTION >= 1:
    print('Using the entire dataset.')
else:
    print(f'Sampling {DATA_FRACTION * 100:.0f}% of the dataset.')
    # Sampling without replacement and with a fixed seed.
    # NOTE(review): the result overwrites enriched_user_features, so re-running this
    # cell samples the already-sampled frame again.
    enriched_user_features = enriched_user_features.sample(
        withReplacement=False, fraction=DATA_FRACTION, seed=42
    )

Sampling 1% of the dataset.


In [38]:
# Text inputs for the embedding pipeline, keyed by user id.
text_features = enriched_user_features.select(
    "id",
    "description",
    "tweets_last20_concatenated_text",
)

In [39]:
# Description branch: wrap the raw column as a document annotation, then embed it.
desc_doc = (
    DocumentAssembler()
    .setInputCol("description")
    .setOutputCol("description_doc")
)
desc_embed = (
    XlmRoBertaSentenceEmbeddings.pretrained("sent_xlm_roberta_base", "xx")
    .setInputCols(["description_doc"])
    .setOutputCol("description_embeddings")
)

# Tweets branch: same two steps applied to the concatenated tweet text.
tweets_doc = (
    DocumentAssembler()
    .setInputCol("tweets_last20_concatenated_text")
    .setOutputCol("tweets_doc")
)
tweets_embed = (
    XlmRoBertaSentenceEmbeddings.pretrained("sent_xlm_roberta_base", "xx")
    .setInputCols(["tweets_doc"])
    .setOutputCol("tweets_embeddings")
)

# Both branches run in one pipeline over the same input frame.
pipeline = Pipeline(stages=[desc_doc, desc_embed, tweets_doc, tweets_embed])

model = pipeline.fit(text_features)
text_embeddings = model.transform(text_features)

sent_xlm_roberta_base download started this may take some time.
Approximate size to download 619.5 MB
[OK!]
sent_xlm_roberta_base download started this may take some time.
Approximate size to download 619.5 MB
[OK!]


In [40]:
# Print the schema of the pipeline output, including the annotation structs.
text_embeddings.printSchema()

root
 |-- id: string (nullable = true)
 |-- description: string (nullable = true)
 |-- tweets_last20_concatenated_text: string (nullable = false)
 |-- description_doc: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- description_embeddings: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |

In [41]:
# From each embeddings column take the first annotation and keep only its
# "embeddings" float array.
text_module = text_embeddings.select(
    F.col("id"),
    F.col("description_embeddings").getItem(0).getField("embeddings").alias("description_vector"),
    F.col("tweets_embeddings").getItem(0).getField("embeddings").alias("tweets_vector"),
)

In [42]:
# Print the schema of the two embedding vectors per user.
text_module.printSchema()

root
 |-- id: string (nullable = true)
 |-- description_vector: array (nullable = true)
 |    |-- element: float (containsNull = false)
 |-- tweets_vector: array (nullable = true)
 |    |-- element: float (containsNull = false)



In [43]:
# All features except the two raw text columns, which were selected separately
# as input to the embedding pipeline.
user_module = enriched_user_features.drop("description", "tweets_last20_concatenated_text")

In [44]:
# Print the schema of the user feature table without the raw text columns.
user_module.printSchema()

root
 |-- id: string (nullable = true)
 |-- name_length: integer (nullable = true)
 |-- username_length: integer (nullable = true)
 |-- username_name_length_ratio: double (nullable = true)
 |-- description_length: integer (nullable = true)
 |-- has_name: boolean (nullable = false)
 |-- has_username: boolean (nullable = false)
 |-- has_description: boolean (nullable = false)
 |-- has_url: boolean (nullable = false)
 |-- has_location: boolean (nullable = false)
 |-- has_pinned_tweet: boolean (nullable = false)
 |-- has_bot_word_in_name: boolean (nullable = true)
 |-- has_bot_word_in_description: boolean (nullable = true)
 |-- ratio_digits_in_name: double (nullable = true)
 |-- ratio_digits_in_username: double (nullable = true)
 |-- ratio_digits_in_description: double (nullable = true)
 |-- ratio_special_chars_in_name: double (nullable = true)
 |-- ratio_special_chars_in_username: double (nullable = true)
 |-- ratio_special_chars_in_description: double (nullable = true)
 |-- name_upper_to

In [76]:
# Train, validation, test split
train_frac, val_frac, test_frac = 0.7, 0.2, 0.1

# Random split with a fixed seed; the weights are passed in train/val/test order.
train_df, val_df, test_df = user_module.randomSplit(
    weights=[train_frac, val_frac, test_frac],
    seed=42,
)

In [77]:
# Standardize numerical features
numeric_cols = [
    field.name
    for field in train_df.schema.fields
    if isinstance(field.dataType, NumericType)
]

# Frame spanning every row, so mean and stddev are taken over the whole training split.
# NOTE(review): the window has no partitionBy — check whether this scales to the full dataset.
w = Window.rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)

# Append a "<name>_scaled" z-score column for every numeric column.
# NOTE(review): only train_df is scaled here; val_df / test_df are left untouched in this cell.
for c in numeric_cols:
    mean_c, std_c = F.mean(c).over(w), F.stddev(c).over(w)
    z_score = (F.col(c) - mean_c) / std_c
    train_df = train_df.withColumn(f"{c}_scaled", z_score)

In [78]:
# Keep every column that is not one of the original numeric columns, i.e. the
# non-numeric columns plus the "*_scaled" ones.
cols = [name for name in train_df.columns if name not in numeric_cols]

train_df = train_df.select(*cols)

In [79]:
# Preview the first five rows of the standardized training split.
train_df.show(5)

+--------------------+--------+------------+---------------+-------+------------+----------------+--------------------+---------------------------+------------+-----------+-----+--------------------+----------------------+---------------------------------+-------------------------+---------------------------+-------------------------------+----------------------------------+----------------------------------+--------------------------------------+-----------------------------------------+--------------------------------+------------------------------------+--------------------+-----------------------+--------------------------------+----------------------------+-----------------------------------+-----------------------------------+-----------------------------------+-------------------------------+--------------------------+----------------------+----------------------+--------------------+--------------------+-------------------------------+--------------------------------------+----