In [1]:
# Data handling libraries
import json
import numpy as np
import pandas as pd
from pandas import json_normalize

# Natural Language Processing (NLP) libraries
from nltk.corpus import stopwords

# Scikit-learn modeling libraries
from sklearn.dummy import DummyClassifier # For baseline model
from sklearn.feature_extraction.text import TfidfVectorizer # To convert text to numbers
from sklearn.linear_model import LogisticRegression # The classifier model
from sklearn.metrics import accuracy_score, classification_report # For evaluation
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score # For splitting and validating
from sklearn.pipeline import Pipeline # To chain processing steps

# 1. Data Loading

In [2]:
# Load the training data from a JSON Lines file (one JSON object per line)
train_data = pd.read_json('../../train.jsonl', lines=True)
# The tweet data is nested. json_normalize flattens the nested JSON into columns.
train_data = json_normalize(train_data.to_dict(orient='records'))

# Load the Kaggle test data (which we will make predictions on)
kaggle_data = pd.read_json('../../kaggle_test.jsonl', lines=True)
# Also normalize the Kaggle data
kaggle_data = json_normalize(kaggle_data.to_dict(orient='records'))


# Separate features from the target variable for the training set
X_train = train_data.drop('label', axis=1)
y_train = train_data['label']

X_kaggle = kaggle_data

In [4]:
from pandas import json_normalize
import pandas as pd

# Load & normalize
train_data = pd.read_json('../../train.jsonl', lines=True)
train_data = json_normalize(train_data.to_dict(orient='records'))

kaggle_data = pd.read_json('../../kaggle_test.jsonl', lines=True)
kaggle_data = json_normalize(kaggle_data.to_dict(orient='records'))

# Convert to sets for easy comparison
train_cols  = set(train_data.columns)
kaggle_cols = set(kaggle_data.columns)

# Shared + exclusive
common_cols = sorted(train_cols & kaggle_cols)
only_train  = sorted(train_cols - kaggle_cols)
only_test   = sorted(kaggle_cols - train_cols)

print(f"Number of common columns: {len(common_cols)}")
print("=== Common columns ===")
for c in common_cols:
    print(c)

print(f"\nNumber of columns only in TRAIN: {len(only_train)}")
print("=== Columns only in TRAIN ===")
for c in only_train:
    print(c)

print(f"\nNumber of columns only in TEST: {len(only_test)}")
print("=== Columns only in TEST ===")
for c in only_test:
    print(c)

Number of common columns: 191
=== Common columns ===
challenge_id
contributors
coordinates
coordinates.coordinates
coordinates.type
created_at
display_text_range
entities.hashtags
entities.media
entities.symbols
entities.urls
entities.user_mentions
extended_entities
extended_entities.media
extended_tweet
extended_tweet.display_text_range
extended_tweet.entities.hashtags
extended_tweet.entities.media
extended_tweet.entities.symbols
extended_tweet.entities.urls
extended_tweet.entities.user_mentions
extended_tweet.extended_entities.media
extended_tweet.full_text
favorite_count
favorited
filter_level
geo
geo.coordinates
geo.type
id_str
in_reply_to_screen_name
in_reply_to_status_id
in_reply_to_status_id_str
in_reply_to_user_id
in_reply_to_user_id_str
is_quote_status
lang
place
place.bounding_box.coordinates
place.bounding_box.type
place.country
place.country_code
place.full_name
place.id
place.name
place.place_type
place.url
possibly_sensitive
quote_count
quoted_status
quoted_status.contrib

# Create author ids

In [5]:
import pandas as pd
import numpy as np
import re, ast

def parse_tweets(path, expect_label=True):
    # Load & flatten
    df = pd.read_json(path, lines=True)
    df = pd.json_normalize(df.to_dict(orient="records"), sep=".")

    # Ensure expected nested columns exist
    for col in [
        "text", "extended_tweet.full_text", "source",
        "entities.hashtags", "entities.user_mentions", "entities.urls",
        "extended_entities.media",
    ]:
        if col not in df.columns:
            df[col] = np.nan

    # Full text (vectorized, avoids apply/axis=1)
    df["full_text"] = df["extended_tweet.full_text"].fillna(df["text"]).fillna("")

    # Engagement (create if missing)
    for col in ["retweet_count", "favorite_count", "reply_count", "quote_count"]:
        if col not in df.columns:
            df[col] = 0

    # Safe length for list-like fields (sometimes lists, sometimes stringified)
    def safe_len(x):
        if isinstance(x, list):
            return len(x)
        if isinstance(x, str):
            try:
                v = ast.literal_eval(x)
                return len(v) if isinstance(v, (list, tuple)) else 1
            except Exception:
                return 0
        return 0

    df["n_hashtags"] = df["entities.hashtags"].apply(safe_len)
    df["n_mentions"] = df["entities.user_mentions"].apply(safe_len)
    df["n_urls"]     = df["entities.urls"].apply(safe_len)

    # Media presence (avoid .get(...).apply on a scalar)
    df["has_media"] = df["extended_entities.media"].apply(lambda x: safe_len(x) > 0)

    import hashlib

    def make_user_key(row):
        key = (
            str(row.get("user.created_at", "")) + "|" +
            str(row.get("user.description", "")) + "|" +
            str(row.get("user.url", "")) + "|" +
            str(row.get("user.location", ""))
        )
        return hashlib.md5(key.encode("utf-8")).hexdigest()

    df["author_pseudo_id"] = df.apply(make_user_key, axis=1)

    # Source app (extract readable name from HTML anchor)
    def extract_source(x):
        if not isinstance(x, str):
            return "Unknown"
        m = re.search(r'>([^<]+)<', x)
        return m.group(1) if m else x

    df["source_app"] = df["source"].apply(extract_source)
    print(f"Our columns: {df.columns.tolist()}")

    # User fields (create if missing)
    for col in [
        "user.description", "user.location",
        "user.favourites_count", "user.statuses_count", "user.listed_count"
    ]:
        
        if col not in df.columns:
            df[col] = np.nan
    df["user.description"] = df["user.description"].fillna("")

    # Keep relevant columns (only those that exist)
    #"lang" -> always french
    keep_cols = [
        "id_str", 
        "author_pseudo_id",
        "full_text", "source_app",
        #"retweet_count", "favorite_count", "reply_count", "quote_count",
        "n_hashtags", "n_mentions", "n_urls", "has_media",
        "user.description", "user.location", "user.favourites_count",
        "user.statuses_count", "user.listed_count",
    ]
    existing = [c for c in keep_cols if c in df.columns]
    out = df[existing].copy()

    # Attach label if expected and available
    if expect_label and "label" in df.columns:
        out["label"] = df["label"]
    elif expect_label and "label" not in df.columns:
        print("Warning: 'label' not found in this file; returning features only.")

    # Optional: show which expected columns were missing
    missing = sorted(set(keep_cols) - set(existing))
    if missing:
        print("Note: missing columns created or omitted:", missing)

    return out

In [None]:
train_clean  = parse_tweets("../../train.jsonl", expect_label=True)

col = "author_pseudo_id"

n_total   = len(train_clean)
n_ok      = train_clean[col].notna().sum()      # non-empty IDs
n_missing = train_clean[col].isna().sum()       # empty/NaN IDs

print("Total rows:        ", n_total)
print("With an ID:        ", n_ok)
print("Missing / empty ID:", n_missing)

# Count distinct authors (ignoring NaN)
n_authors = train_clean[col].nunique(dropna=True)

# Distribution of tweets per author (ignoring NaN)
author_counts = train_clean[col].value_counts(dropna=True)

max_tweets_same_author = author_counts.max() if not author_counts.empty else 0

print("Number of distinct authors:", n_authors)
print("Max tweets from a single author:", max_tweets_same_author)

# (Optional) show the top 5 most prolific authors
print("\nTop 5 authors by tweet count:")
print(author_counts.head(5))

Our columns: ['in_reply_to_status_id_str', 'in_reply_to_status_id', 'created_at', 'in_reply_to_user_id_str', 'source', 'quoted_status_id', 'retweet_count', 'retweeted', 'geo', 'filter_level', 'in_reply_to_screen_name', 'is_quote_status', 'id_str', 'in_reply_to_user_id', 'favorite_count', 'text', 'place', 'lang', 'quote_count', 'favorited', 'coordinates', 'truncated', 'timestamp_ms', 'reply_count', 'quoted_status_id_str', 'contributors', 'challenge_id', 'label', 'extended_tweet', 'display_text_range', 'possibly_sensitive', 'extended_entities', 'withheld_in_countries', 'quoted_status.extended_tweet.entities.urls', 'quoted_status.extended_tweet.entities.hashtags', 'quoted_status.extended_tweet.entities.user_mentions', 'quoted_status.extended_tweet.entities.symbols', 'quoted_status.extended_tweet.full_text', 'quoted_status.extended_tweet.display_text_range', 'quoted_status.in_reply_to_status_id_str', 'quoted_status.in_reply_to_status_id', 'quoted_status.created_at', 'quoted_status.in_reply

In [7]:
test_clean   = parse_tweets("../../kaggle_test.jsonl", expect_label=False)

col = "author_pseudo_id"

n_total   = len(test_clean)
n_ok      = test_clean[col].notna().sum()      # non-empty IDs
n_missing = test_clean[col].isna().sum()       # empty/NaN IDs

print("Total rows:        ", n_total)
print("With an ID:        ", n_ok)
print("Missing / empty ID:", n_missing)

# Count distinct authors (ignoring NaN)
n_authors = test_clean[col].nunique(dropna=True)

# Distribution of tweets per author (ignoring NaN)
author_counts = test_clean[col].value_counts(dropna=True)

max_tweets_same_author = author_counts.max() if not author_counts.empty else 0

print("Number of distinct authors:", n_authors)
print("Max tweets from a single author:", max_tweets_same_author)

# (Optional) show the top 5 most prolific authors
print("\nTop 5 authors by tweet count:")
print(author_counts.head(5))

Our columns: ['in_reply_to_status_id_str', 'in_reply_to_status_id', 'created_at', 'in_reply_to_user_id_str', 'source', 'retweet_count', 'retweeted', 'geo', 'filter_level', 'in_reply_to_screen_name', 'is_quote_status', 'id_str', 'in_reply_to_user_id', 'favorite_count', 'text', 'place', 'lang', 'quote_count', 'favorited', 'possibly_sensitive', 'coordinates', 'truncated', 'timestamp_ms', 'reply_count', 'contributors', 'challenge_id', 'extended_entities', 'display_text_range', 'quoted_status', 'quoted_status_id', 'quoted_status_permalink', 'quoted_status_id_str', 'extended_tweet.entities.urls', 'extended_tweet.entities.hashtags', 'extended_tweet.entities.user_mentions', 'extended_tweet.entities.symbols', 'extended_tweet.full_text', 'extended_tweet.display_text_range', 'entities.urls', 'entities.hashtags', 'entities.user_mentions', 'entities.symbols', 'user.utc_offset', 'user.profile_image_url_https', 'user.listed_count', 'user.profile_background_image_url', 'user.default_profile_image', 'u

In [8]:


col = "author_pseudo_id"

# --- Sets of authors (ignore NaN) ---
train_authors = set(train_clean[col].dropna())
test_authors  = set(test_clean[col].dropna())

common_authors      = train_authors & test_authors
only_train_authors  = train_authors - test_authors
only_test_authors   = test_authors - train_authors

print("=== Author overlap stats ===")
print(f"Train: distinct authors = {len(train_authors)}")
print(f"Test:  distinct authors = {len(test_authors)}")
print(f"Common authors         = {len(common_authors)}")
print(f"Only in train          = {len(only_train_authors)}")
print(f"Only in test           = {len(only_test_authors)}")

if len(train_authors) > 0:
    print(f"\n% of train authors also in test: {100 * len(common_authors) / len(train_authors):.2f}%")
if len(test_authors) > 0:
    print(f"% of test authors also in train: {100 * len(common_authors) / len(test_authors):.2f}%")

# --- Tweets per author in each set ---
train_counts = train_clean[col].value_counts(dropna=True)
test_counts  = test_clean[col].value_counts(dropna=True)

# DataFrame for common authors with tweet counts in each split
common_df = (
    pd.DataFrame({
        "train_tweets": train_counts,
        "test_tweets": test_counts,
    })
    .loc[list(common_authors)]          # keep only common authors
    .fillna(0)
    .astype(int)
    .sort_values(["train_tweets", "test_tweets"], ascending=False)
)

print("\n=== Top 10 overlapping authors (by train tweets) ===")
print(common_df.head(10))

=== Author overlap stats ===
Train: distinct authors = 49065
Test:  distinct authors = 32569
Common authors         = 0
Only in train          = 49065
Only in test           = 32569

% of train authors also in test: 0.00%
% of test authors also in train: 0.00%

=== Top 10 overlapping authors (by train tweets) ===
Empty DataFrame
Columns: [train_tweets, test_tweets]
Index: []
