In [1]:
# Mount Google Drive under /content/drive/ so the dataset files referenced
# later in the notebook are reachable from the Colab filesystem.
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


In [2]:
#!apt-get install openjdk-17-jdk-headless -qq > /dev/null # OpenJDK 17
#!wget --show-progress https://dlcdn.apache.org/spark/spark-3.5.6/spark-3.5.6-bin-hadoop3.tgz # Apache Spark 3.5.6 with Hadoop 3 support
#!tar xf spark-3.5.6-bin-hadoop3.tgz
#!pip install findspark

In [3]:
import os
import findspark
from pyspark.sql import SparkSession
import pandas as pd

In [4]:
# Spark bootstrap: tell the runtime where the JDK and the unpacked Spark
# distribution live, let findspark wire Spark onto sys.path, then start
# (or reuse) the session used by the rest of the notebook.
os.environ.update({
    'JAVA_HOME': '/usr/lib/jvm/java-17-openjdk-amd64',
    'SPARK_HOME': '/content/spark-3.5.6-bin-hadoop3',
})

findspark.init()
findspark.find()

spark = SparkSession.builder.appName('TwiBot22').getOrCreate()
spark

In [6]:
!pip install ijson

Collecting ijson
  Downloading ijson-3.4.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (21 kB)
Downloading ijson-3.4.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (148 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m148.3/148.3 kB[0m [31m620.1 kB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ijson
Successfully installed ijson-3.4.0


In [13]:
import json, ijson, decimal

# Source: the raw tweet file on the mounted Drive.
in_path = "/content/drive/MyDrive/TwiBot-22/tweet_0.json"
# Destination: a JSON Lines sample written to the Colab local disk.
out_path = "/content/tweet_0_small.jsonl"

# helper: convert Decimal to float (or int if safe)
def json_default(o):
    """Fallback serializer passed to ``json.dumps(default=...)``.

    Handles ``decimal.Decimal`` values only: a Decimal with no fractional
    part is returned as ``int``, any other Decimal as ``float``.

    Raises:
        TypeError: for any object that is not a ``decimal.Decimal``.
    """
    # Guard first: anything that is not a Decimal is rejected outright.
    if not isinstance(o, decimal.Decimal):
        raise TypeError(f"Object of type {o.__class__.__name__} is not JSON serializable")
    # Whole-valued decimals stay exact as ints; the rest fall back to float.
    return int(o) if o % 1 == 0 else float(o)

# Stream the top-level JSON array element by element and copy the first 10k
# objects to a JSON Lines file, one object per line.
# The input is opened in binary mode because ijson consumes bytes; a text-mode
# handle makes it re-encode every chunk (with a deprecation warning) and ties
# decoding to the platform's default encoding. The output encoding is pinned
# for the same reason.
with open(in_path, "rb") as f, open(out_path, "w", encoding="utf-8") as out:
    parser = ijson.items(f, "item")   # iterate array elements
    for i, obj in enumerate(parser):
        if i >= 10000:   # keep only first 10k
            break
        # json_default converts the Decimal numbers ijson yields into int/float
        out.write(json.dumps(obj, default=json_default) + "\n")

print("✅ Done. Written to", out_path)

✅ Done. Written to /content/tweet_0_small.jsonl


In [15]:
# Load the JSON Lines sample at out_path into a Spark DataFrame and preview
# five rows without truncating column values.
df = spark.read.json(out_path)
df.show(5, truncate=False)

+-----------+-------------------+-------------------+-------------------+-------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----+--------------------+-------------------+----+------------------+--------------------+-----------------+--------------+------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [5]:
# Read the full tweet file with Spark; multiLine=True is passed because the
# file is not line-delimited (the ijson cell reads it as one top-level array).
# NOTE(review): the saved run shows this cell being interrupted
# (KeyboardInterrupt), yet later cells use tweet_0 — confirm this read
# completes on a fresh kernel. Presumably it is slow because a multi-line JSON
# file cannot be split across tasks; converting to JSON Lines first may help —
# TODO confirm.
tweet_0 = spark.read.json("/content/drive/MyDrive/TwiBot-22/tweet_0.json", multiLine=True)

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/socket.py", line 720, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 

In [None]:
# Print the column names and types of the full tweet DataFrame.
tweet_0.printSchema()

In [None]:
#user = spark.read.json("/content/drive/MyDrive/TwiBot-22/user.json", multiLine=True)

In [None]:
#user.printSchema()

In [None]:
# Load the account labels into a pandas DataFrame.
label = pd.read_csv("/content/drive/MyDrive/TwiBot-22/label.csv")

In [None]:
# Summarise the label table: its size, a preview of the first rows, and the
# dtype of each column. n_rows is reused by the class-balance cell.
n_rows, n_columns = len(label.index), len(label.columns)
print(f"The dataset contains {n_rows} rows and {n_columns} columns.")
print("\nFirst few rows of the dataset:", label.head(), sep="\n")
print("\nColumn data types:", label.dtypes, sep="\n")

The dataset contains 1000000 rows and 2 columns.

First few rows of the dataset:
                     id  label
0  u1217628182611927040  human
1           u2664730894  human
2  u1266703520205549568  human
3  u1089159225148882949  human
4             u36741729    bot

Column data types:
id       object
label    object
dtype: object


In [None]:
# Class balance over the full label table. The bot share is taken as the
# complement of the human share, i.e. every non-'human' row counts as a bot.
human_perc = label['label'].eq('human').sum() / n_rows * 100
bot_perc = 100 - human_perc
print(
    f"Percentage of human accounts: {human_perc:.2f}",
    f"Percentage of bot accounts: {bot_perc:.2f}",
    sep="\n",
)

Percentage of human accounts: 86.01
Percentage of bot accounts: 13.99


In [None]:
# De-duplicate on author_id to get one row per account that appears in this
# tweet file, then report how many there are.
unique_authors = tweet_0.select("author_id").distinct()
print("Unique authors:", unique_authors.count())

Unique authors: 314813


In [None]:
# Pull the distinct author ids to the driver and prefix each with "u" so they
# match the id format shown in the label table preview (e.g. "u36741729").
author_ids = [f"u{row['author_id']}" for row in unique_authors.collect()]

In [None]:
# Keep only the labels of accounts that authored at least one tweet in tweet_0.
label_selected = label[label["id"].isin(author_ids)]

In [None]:
# Display the filtered label table (pandas abbreviates long frames).
label_selected

Unnamed: 0,id,label
0,u1217628182611927040,human
8,u15211869,human
16,u2465283662,bot
22,u284870222,human
27,u83389771,human
...,...,...
999995,u1151138281,human
999996,u1339035361,human
999997,u318636852,human
999998,u43443354,human


In [None]:
# Class balance restricted to accounts present in tweet_0. As with the full
# table, the bot share is the complement of the human share.
n_rows_selected = label_selected.shape[0]
n_columns_selected = label_selected.shape[1]

human_perc_selected = label_selected['label'].eq('human').sum() / n_rows_selected * 100
bot_perc_selected = 100 - human_perc_selected
print(
    f"Percentage of human accounts (selected): {human_perc_selected:.2f}",
    f"Percentage of bot accounts (selected): {bot_perc_selected:.2f}",
    sep="\n",
)

Percentage of human accounts (selected): 92.46
Percentage of bot accounts (selected): 7.54


In [None]:
# Narrow the tweet DataFrame to five columns: tweet id, author id, text,
# creation timestamp and language.
tweet_0_selected = tweet_0.select("id", "author_id", "text", "created_at", "lang")

In [None]:
# Print the column names and types of the narrowed tweet DataFrame.
tweet_0_selected.printSchema()

root
 |-- id: string (nullable = true)
 |-- author_id: long (nullable = true)
 |-- text: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- lang: string (nullable = true)

