In [5]:
# Data handling libraries
import json
import numpy as np
import pandas as pd
from pandas import json_normalize

# Natural Language Processing (NLP) libraries
from nltk.corpus import stopwords

# Scikit-learn modeling libraries
from sklearn.dummy import DummyClassifier # For baseline model
from sklearn.feature_extraction.text import TfidfVectorizer # To convert text to numbers
from sklearn.linear_model import LogisticRegression # The classifier model
from sklearn.metrics import accuracy_score, classification_report # For evaluation
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score # For splitting and validating
from sklearn.pipeline import Pipeline # To chain processing steps

# 1. Data Loading

In [6]:
# Load the training data from a JSON Lines file (one JSON object per line)
train_data = pd.read_json('train.jsonl', lines=True)
# The tweet data is nested. json_normalize flattens the nested JSON into columns.
train_data = json_normalize(train_data.to_dict(orient='records'))

# Load the Kaggle test data (which we will make predictions on)
kaggle_data = pd.read_json('kaggle_test.jsonl', lines=True)
# Also normalize the Kaggle data
kaggle_data = json_normalize(kaggle_data.to_dict(orient='records'))


# Separate features from the target variable for the training set
X_train = train_data.drop('label', axis=1)
y_train = train_data['label']

X_kaggle = kaggle_data

# 2. Transforming into DataFrame

In [7]:
# Define a function to get the full text from a tweet object.
# Tweets can be truncated, storing the full version in 'extended_tweet.full_text'.
def extract_full_text(tweet):
    # Start with the standard 'text' field
    text = tweet['text']
    # Check if the 'extended_tweet.full_text' field exists (is not NaN)
    if not pd.isna(tweet['extended_tweet.full_text']):
        # If it exists, it's the full text, so use it instead
        text = tweet['extended_tweet.full_text']
    return text

# Apply this function to every row (axis=1) in the training data
X_train['full_text'] = X_train.apply(lambda tweet: extract_full_text(tweet), axis=1)
# Apply the same function to the Kaggle test data
X_kaggle['full_text'] = X_kaggle.apply(lambda tweet: extract_full_text(tweet), axis=1)

In [None]:
import pandas as pd
import numpy as np
import re, ast

def parse_tweets(path, expect_label=True):
    # Load & flatten
    df = pd.read_json(path, lines=True)
    df = pd.json_normalize(df.to_dict(orient="records"), sep=".")

    # Ensure expected nested columns exist
    for col in [
        "text", "extended_tweet.full_text", "source",
        "entities.hashtags", "entities.user_mentions", "entities.urls",
        "extended_entities.media",
    ]:
        if col not in df.columns:
            df[col] = np.nan

    # Full text (vectorized, avoids apply/axis=1)
    df["full_text"] = df["extended_tweet.full_text"].fillna(df["text"]).fillna("")

    # Engagement (create if missing)
    for col in ["retweet_count", "favorite_count", "reply_count", "quote_count"]:
        if col not in df.columns:
            df[col] = 0

    # Safe length for list-like fields (sometimes lists, sometimes stringified)
    def safe_len(x):
        if isinstance(x, list):
            return len(x)
        if isinstance(x, str):
            try:
                v = ast.literal_eval(x)
                return len(v) if isinstance(v, (list, tuple)) else 1
            except Exception:
                return 0
        return 0

    df["n_hashtags"] = df["entities.hashtags"].apply(safe_len)
    df["n_mentions"] = df["entities.user_mentions"].apply(safe_len)
    df["n_urls"]     = df["entities.urls"].apply(safe_len)

    # Media presence (avoid .get(...).apply on a scalar)
    df["has_media"] = df["extended_entities.media"].apply(lambda x: safe_len(x) > 0)

    # Source app (extract readable name from HTML anchor)
    def extract_source(x):
        if not isinstance(x, str):
            return "Unknown"
        m = re.search(r'>([^<]+)<', x)
        return m.group(1) if m else x

    df["source_app"] = df["source"].apply(extract_source)

    # User fields (create if missing)
    for col in [
        "user.description", "user.location",
        "user.favourites_count", "user.statuses_count", "user.listed_count"
    ]:
        if col not in df.columns:
            df[col] = np.nan
    df["user.description"] = df["user.description"].fillna("")

    # Keep relevant columns (only those that exist)
    #"lang" -> always french
    keep_cols = [
        "id_str", "full_text", "source_app",
        #"retweet_count", "favorite_count", "reply_count", "quote_count",
        "n_hashtags", "n_mentions", "n_urls", "has_media",
        "user.description", "user.location", "user.favourites_count",
        "user.statuses_count", "user.listed_count",
    ]
    existing = [c for c in keep_cols if c in df.columns]
    out = df[existing].copy()

    # Attach label if expected and available
    if expect_label and "label" in df.columns:
        out["label"] = df["label"]
    elif expect_label and "label" not in df.columns:
        print("Warning: 'label' not found in this file; returning features only.")

    # Optional: show which expected columns were missing
    missing = sorted(set(keep_cols) - set(existing))
    if missing:
        print("Note: missing columns created or omitted:", missing)

    return out

# Usage
train_clean = parse_tweets("train.jsonl", expect_label=True)
#print(train_clean)#.head())
# "text" -> add as an input text, with a LoRa layer to finetune
print(set(train_clean['user.listed_count'])) # Apply log1p transformation
print(set(train_clean['user.description'])) # Add as a second text input, with its own LoRa
print(set(train_clean['user.statuses_count']))
print(set(train_clean['source_app'])) # Create 15 buckets for the 15 most common locations, and an "other" bucket. Add a trainable embedding for this feature.
print(set(train_clean['n_mentions'])) 


# print(set(train_clean['has_media'])) # Optional
# print(set(train_clean['n_hashtags'])) # Optional






# print(set(train_clean['n_urls']))
#print(set(train_clean['user.favourites_count']))
#print(set(train_clean['user.location'])) # Create 20 buckets for the 20 most common locations, and an "other" bucket. Add a trainable embedding for this feature.
print(train_clean['source_app'].value_counts().head(20))
print(train_clean['user.location'].value_counts().head(20))

# For Kaggle test:
# test_clean = parse_tweets("kaggle_test.jsonl", expect_label=False)

{'', 'Pause Fun', 'LinkedIn', 'Integromat', 'Le Journal de Joliette', 'Khoros', 'Javascript Newss', 'Eauto Check', 'Dynamic Signal', 'Twitter for iPad', 'theglobe', 'Melody HCR', 'Cheap Bots, Done Quick!', 'OverBlog Kiwi', 'ahlam ahlam', 'Zoho Social', 'Neutron Jimm', 'Accelerate Twitter Demo1', 'PreProd app senti', 'Nouvelles sur RIMQ', 'Twitter for Android', 'WP to Twitter Pro', 'newsnet-app', 'Alertes SNCF Transilien', 'Neatly For BlackBerry 10', 'TVMag Méthode App', 'Salesforce - Social Studio', "L'Observateur V5", 'Wildmoka', 'Figaro Economie Méthode App', 'Mashup Web', 'Post Planner Inc.', 'Djib_s', 'avmtest', 'Malivox.net', 'TAG.FR PRODUCTION', 'eClincher', 'Senejournal Tweets', 'Docteur imago', 'Picta Presse', 'TwidereX-Android', 'quarantineOpportunity', 'Chambly Express', 'yoyoyo_v3', 'Moutons Enragés', 'Sharee Advocacy', 'Twitter Media Studio', 'http://www.gerontonews.com', 'WP Auto T', 'Scopalto', 'La Commère 43', 'Twitter for Mac', 'iOS', 'Linky for iOS', 'fdesouche.com', '

In [22]:
import numpy as np
import pandas as pd

from scipy.stats import pointbiserialr, spearmanr, mannwhitneyu, chi2_contingency, fisher_exact
from sklearn.metrics import roc_auc_score
import statsmodels.api as sm

def _safe_series(df, name):
    return name in df.columns

def _group_topk(s: pd.Series, k: int):
    vc = s.value_counts(dropna=False)
    keep = set(vc.head(k).index)
    return s.where(s.isin(keep), other="Other")

def analyze_influencer_correlations(
    df: pd.DataFrame,
    label_col: str = "label",
    numeric_feats=("n_hashtags","n_mentions","n_urls","user.statuses_count","user.listed_count"),
    categorical_feats=("source_app","user.location","has_media"),
    topk_map={"source_app": 10, "user.location": 20}
):
    assert label_col in df.columns, f"'{label_col}' not in df"

    # Ensure binary ints
    y = df[label_col].astype(int)

    # --- NUMERIC FEATURES ---
    num_rows = []
    for feat in numeric_feats:
        if not _safe_series(df, feat): 
            continue
        x = df[feat]
        # drop missing and non-finite
        mask = x.notna() & np.isfinite(x) & y.notna()
        x = x[mask].astype(float)
        y_ = y[mask].astype(int)
        if len(x) < 30 or y_.nunique() < 2:
            continue

        # Basic group stats
        g0 = x[y_==0]
        g1 = x[y_==1]
        mean0, mean1 = g0.mean(), g1.mean()
        std0, std1   = g0.std(ddof=1), g1.std(ddof=1)

        # Correlations
        try:
            pb_r, pb_p = pointbiserialr(y_, x)
        except Exception:
            pb_r, pb_p = np.nan, np.nan

        try:
            sp_rho, sp_p = spearmanr(y_, x)
        except Exception:
            sp_rho, sp_p = np.nan, np.nan

        # Mann-Whitney (non-parametric)
        try:
            mw_u, mw_p = mannwhitneyu(g0, g1, alternative="two-sided")
        except Exception:
            mw_u, mw_p = np.nan, np.nan

        # 1-feature logistic (coef p-value) + AUC
        try:
            X = sm.add_constant(x.values, has_constant="add")
            model = sm.Logit(y_.values, X, missing="drop").fit(disp=False)
            # coef for the feature is at index 1 (after const)
            logit_p = model.pvalues[1] if len(model.pvalues) > 1 else np.nan
            auc = roc_auc_score(y_, x)
        except Exception:
            logit_p, auc = np.nan, np.nan

        num_rows.append({
            "feature": feat,
            "n": int(len(x)),
            "mean_0": mean0, "mean_1": mean1,
            "std_0": std0,   "std_1": std1,
            "pointbiserial_r": pb_r, "pointbiserial_p": pb_p,
            "spearman_rho": sp_rho,  "spearman_p": sp_p,
            "mannwhitney_U": mw_u,   "mannwhitney_p": mw_p,
            "logit_p": logit_p,
            "auc_univariate": auc
        })

    numeric_results = pd.DataFrame(num_rows).sort_values(["logit_p","pointbiserial_p","mannwhitney_p"], na_position="last")

    # --- CATEGORICAL / BOOLEAN FEATURES ---
    cat_rows = []
    for feat in categorical_feats:
        if not _safe_series(df, feat):
            continue

        s = df[feat]
        # Group high-cardinality
        if feat in topk_map:
            s = _group_topk(s.astype(object).fillna("Unknown"), topk_map[feat])
        else:
            s = s.astype(object).fillna("Unknown")

        # Drop rows with missing label
        mask = y.notna() & s.notna()
        s = s[mask]
        y_ = y[mask].astype(int)
        if len(s) < 30 or y_.nunique() < 2:
            continue

        # Contingency: rows = label(0/1), cols = category levels
        ct = pd.crosstab(y_, s)
        if ct.shape[1] < 2:  # only one category after grouping
            continue

        n = ct.values.sum()
        r, c = ct.shape

        if r == 2 and c == 2:
            # Prefer Fisher for 2x2
            odds, p_fisher = fisher_exact(ct.values)
            chi2, p_chi2, dof, _ = chi2_contingency(ct, correction=False)
            # Cramér's V
            cramers_v = np.sqrt((chi2 / n) / (min(r-1, c-1)))
            cat_rows.append({
                "feature": feat, "k_levels": c, "n": int(n),
                "test": "Fisher (2x2) + Chi2",
                "chi2": chi2, "df": dof, "p_value": p_fisher,
                "cramers_v": cramers_v,
                "top_levels_by_rate": (
                    (ct.loc[1] / ct.sum(axis=0))
                    .sort_values(ascending=False)
                    .head(5)
                    .round(3)
                    .to_dict()
                )
            })
        else:
            chi2, p, dof, _ = chi2_contingency(ct, correction=False)
            cramers_v = np.sqrt((chi2 / n) / (min(r-1, c-1)))
            cat_rows.append({
                "feature": feat, "k_levels": c, "n": int(n),
                "test": "Chi-square",
                "chi2": chi2, "df": dof, "p_value": p,
                "cramers_v": cramers_v,
                "top_levels_by_rate": (
                    (ct.loc[1] / ct.sum(axis=0))
                    .sort_values(ascending=False)
                    .head(5)
                    .round(3)
                    .to_dict()
                )
            })

    categorical_results = pd.DataFrame(cat_rows).sort_values(["p_value"], na_position="last")

    return numeric_results, categorical_results

# ---- RUN (expects train_clean with a binary 'label') ----
numeric_results, categorical_results = analyze_influencer_correlations(train_clean)
print("\n=== NUMERIC FEATURES (sorted by significance) ===")
display(numeric_results)

print("\n=== CATEGORICAL / BOOLEAN FEATURES (sorted by p-value) ===")
display(categorical_results)


=== NUMERIC FEATURES (sorted by significance) ===


  s = s.astype(object).fillna("Unknown")


Unnamed: 0,feature,n,mean_0,mean_1,std_0,std_1,pointbiserial_r,pointbiserial_p,spearman_rho,spearman_p,mannwhitney_U,mannwhitney_p,logit_p,auc_univariate
1,n_mentions,154914,0.861214,0.397135,1.298493,0.889542,-0.201323,0.0,-0.227116,0.0,3657434000.0,0.0,0.0,0.387607
3,user.statuses_count,154914,10693.365145,43771.451426,29118.885351,76408.895209,0.28105,0.0,0.45556,0.0,1411770000.0,0.0,0.0,0.763616
4,user.listed_count,154914,6.79101,132.786559,29.500263,1167.234005,0.078584,1.075484e-210,0.615137,0.0,896907400.0,0.0,0.0,0.849824
0,n_hashtags,154914,0.248022,0.373048,0.815563,0.882041,0.073421,4.1373389999999996e-184,0.111075,0.0,2737765000.0,0.0,6.3959860000000005e-177,0.541595
2,n_urls,154914,0.614885,0.62709,0.509241,0.511597,0.01193,2.657094e-06,0.011172,1.1e-05,2953254000.0,1.1e-05,2.661511e-06,0.505514



=== CATEGORICAL / BOOLEAN FEATURES (sorted by p-value) ===


Unnamed: 0,feature,k_levels,n,test,chi2,df,p_value,cramers_v,top_levels_by_rate
0,source_app,11,154914,Chi-square,10809.050569,10,0.0,0.264149,"{'Hootsuite Inc.': 0.91, 'dlvr.it': 0.901, 'Bu..."
1,user.location,21,154914,Chi-square,6626.322052,20,0.0,0.206819,"{'Paris': 0.705, 'Canada': 0.666, 'Montréal': ..."
2,has_media,2,154914,Fisher (2x2) + Chi2,652.519396,1,1.600364e-144,0.064901,"{True: 0.64, False: 0.46}"


# 3. Logistic Regression Classifier

In [4]:
# Load a list of common French stop words (e.g., 'le', 'la', 'de')
french_stop_words = stopwords.words('french')

print("\nBuilding model pipeline...")

# Create a scikit-learn Pipeline. This chains steps together.
# Data will flow from 'tfidf' (text to numbers) to 'clf' (classifier).
model_pipeline = Pipeline([
    # Step 1: TfidfVectorizer - converts text into a matrix of TF-IDF features
    ('tfidf', TfidfVectorizer(
        stop_words=french_stop_words, # Remove French stop words
        max_df=0.7,       # Ignore words that appear in > 70% of tweets (too common)
        min_df=3,         # Ignore words that appear in < 3 tweets (too rare)
        max_features=1000, # Keep only the top 1000 features
        ngram_range=(1, 2)  # Include 1-word (unigrams) and 2-word (bigrams) sequences
    )),
    # Step 2: Classifier - Logistic Regression
    ('clf', LogisticRegression(
        random_state=42,    # For reproducible results
        solver='liblinear'  # Good solver for this type of problem
    ))
])

print("\nRunning 5-Fold Cross-Validation on training data...")

# Use StratifiedKFold to ensure class proportions are maintained in each fold
# This is important for datasets that might be imbalanced
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# cross_val_score will train and test the pipeline 5 times
# using the K-fold splits of the *training data*
scores = cross_val_score(
    model_pipeline,          # The pipeline to evaluate
    X_train['full_text'],  # Features from training set
    y_train,               # Labels from training set
    cv=kfold,              # The stratified 5-fold splitter
    scoring='accuracy'     # The metric to evaluate
)

# Print the cross-validation results
print(f"K-Fold Accuracy Scores: {scores}")
print(f"Mean K-Fold Accuracy: {np.mean(scores) * 100:.2f}%")
print(f"Std Dev K-Fold Accuracy: {np.std(scores) * 100:.2f}%")


print("\nTraining final model on all training data...")
# Now that we've validated the model, train it on ALL available training data
model_pipeline.fit(X_train['full_text'], y_train)
print("Training complete.")

print("\n--- Final Model Evaluation on Held-Out Test Set ---")
# Use the trained pipeline to make predictions on the unseen Kaggle data
# The pipeline automatically applies the TF-IDF transform and then predicts
y_pred_test = model_pipeline.predict(X_kaggle['full_text'])

# Prepare the submission file
# Combine the 'challenge_id' from the Kaggle data with our predictions
output = pd.concat([X_kaggle['challenge_id'], pd.DataFrame(y_pred_test)], axis=1,ignore_index=True)
# Rename columns to match the required submission format
output.columns = ['ID', "Prediction"]
# Save the submission file as a CSV
output.to_csv('logistic_regression.csv', index=False)


Building model pipeline...

Running 5-Fold Cross-Validation on training data...
K-Fold Accuracy Scores: [0.62589162 0.6281832  0.62066294 0.62337411 0.63036602]
Mean K-Fold Accuracy: 62.57%
Std Dev K-Fold Accuracy: 0.34%

Training final model on all training data...
Training complete.

--- Final Model Evaluation on Held-Out Test Set ---


# 4. Dummy Classifier

In [5]:
print("\nTraining Dummy (Most Frequent)...")
# Create a DummyClassifier that always predicts the most frequent class
# This is a baseline to see if our Logistic Regression model is actually learning anything
dummy_mf = DummyClassifier(strategy="most_frequent")

# "Train" the dummy model (it just finds the most frequent class in y_train)
dummy_mf.fit(X_train['full_text'], y_train)

# Make predictions on the Kaggle data (it will predict the same class for all rows)
y_pred_test = dummy_mf.predict(X_kaggle['full_text'])

# Prepare and save the dummy submission file
output = pd.concat([X_kaggle['challenge_id'], pd.DataFrame(y_pred_test)], axis=1,ignore_index=True)
output.columns = ['ID', "Prediction"]
output.to_csv('dummy.csv', index=False)


Training Dummy (Most Frequent)...
