# ODSB-15998: ITCT Analysis - All Clicks by IP

**Objective:** Analyze Impression-to-Click Time (ITCT) patterns for ALL clicks and identify if abnormal behaviors come from specific IPs.

**Advertiser:** `h4khOQuVmBR7OV2S`  
**Period:** 2026-01-01 to 2026-01-20

**Approach:** Analyze IP patterns at user level (not limited to 30+ click MTIDs)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/)

**Note:** Run the authentication cell below first before executing any BigQuery queries.

In [None]:
# ===========================================
# Google Colab Authentication (Run this first)
# ===========================================
from google.colab import auth
auth.authenticate_user()
print('✓ Authenticated successfully!')

In [53]:
# Setup (Authentication handled in cell above)
from google.cloud import bigquery
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

client = bigquery.Client(project='moloco-ods')

# Configuration
ADVERTISER_ID = 'h4khOQuVmBR7OV2S'
START_DATE = '2026-01-01'
END_DATE = '2026-01-20'
MIN_CLICK_THRESHOLD = 30

print(f'Advertiser: {ADVERTISER_ID}')
print(f'Period: {START_DATE} to {END_DATE}')
print(f'Click Threshold: {MIN_CLICK_THRESHOLD}+ clicks per MTID')

Advertiser: h4khOQuVmBR7OV2S
Period: 2026-01-01 to 2026-01-20
Click Threshold: 30+ clicks per MTID


In [54]:
#@title Step 1: Q1 - Compare click vs click_surplus tables (clicks per MTID)

# Q1: Does click table have only 1 click per MTID? vs click_surplus has multiple?
query_click_comparison = f"""
WITH click_table AS (
    SELECT
        bid.mtid,
        COUNT(*) AS click_count
    FROM `focal-elf-631.prod_stream_view.click`
    WHERE api.advertiser.id = '{ADVERTISER_ID}'
        AND timestamp BETWEEN '{START_DATE}' AND '{END_DATE}'
    GROUP BY bid.mtid
),
click_surplus_table AS (
    SELECT
        bid.mtid,
        COUNT(*) AS click_count
    FROM `focal-elf-631.prod_stream_view.click_surplus`
    WHERE api.advertiser.id = '{ADVERTISER_ID}'
        AND timestamp BETWEEN '{START_DATE}' AND '{END_DATE}'
    GROUP BY bid.mtid
)

SELECT
    'click' AS table_name,
    COUNT(*) AS total_mtids,
    SUM(click_count) AS total_clicks,
    AVG(click_count) AS avg_clicks_per_mtid,
    MAX(click_count) AS max_clicks_per_mtid,
    COUNTIF(click_count = 1) AS mtids_with_1_click,
    COUNTIF(click_count > 1) AS mtids_with_multiple_clicks
FROM click_table

UNION ALL

SELECT
    'click_surplus' AS table_name,
    COUNT(*) AS total_mtids,
    SUM(click_count) AS total_clicks,
    AVG(click_count) AS avg_clicks_per_mtid,
    MAX(click_count) AS max_clicks_per_mtid,
    COUNTIF(click_count = 1) AS mtids_with_1_click,
    COUNTIF(click_count > 1) AS mtids_with_multiple_clicks
FROM click_surplus_table
"""

df_click_comparison = client.query(query_click_comparison).to_dataframe()
print("=" * 70)
print("Q1: Click vs Click_Surplus Table Comparison")
print("=" * 70)
print(df_click_comparison.to_string(index=False))
print("\n→ If click table has mostly 1 click per MTID, it logs only the FIRST valid click.")

query_suspicious_mtid = f"""
WITH event AS (
    SELECT
        req.device.ifa,
        req.exchange,
        req.app.bundle,
        req.app.publisher.id AS publisher_id,
        req.app.publisher.name AS publisher_name,
        req.imp.tagid,
        cv.pb.event.name as event_name,
        cv.received_at as event_timestamp,
        bid.timestamp as bid_timestamp,
        imp.received_at as imp_timestamp,
        click.received_at as click_timestamp,
        REGEXP_EXTRACT(cv.pb.payload.raw, r'&moloco_mtid=([^&]*)') AS mtid_value,
    FROM `focal-elf-631.prod_stream_view.cv`
    WHERE api.advertiser.id = '{ADVERTISER_ID}'
        AND timestamp BETWEEN '{START_DATE}' AND '{END_DATE}'
)

SELECT
    ifa,
    exchange,
    bundle,
    tagid,
    mtid_value,
    event_name,
    COUNT(DISTINCT click_timestamp) AS click_cnt,
    COUNT(DISTINCT event_timestamp) AS event_cnt
FROM event
WHERE event_name = 'deeplink'
GROUP BY ALL
HAVING event_cnt > {MIN_CLICK_THRESHOLD}
ORDER BY event_cnt DESC
"""

df_suspicious_mtid = client.query(query_suspicious_mtid).to_dataframe()
print(f"Suspicious MTIDs with {MIN_CLICK_THRESHOLD}+ clicks: {len(df_suspicious_mtid)}")
print(f"Unique IFAs: {df_suspicious_mtid['ifa'].nunique()}")
print(f"Unique Publishers: {df_suspicious_mtid['bundle'].nunique()}")
df_suspicious_mtid.head(20)

Q1: Click vs Click_Surplus Table Comparison
   table_name  total_mtids  total_clicks  avg_clicks_per_mtid  max_clicks_per_mtid  mtids_with_1_click  mtids_with_multiple_clicks
        click       757177        757179             1.000003                    2              757175                           2
click_surplus       183465        387988             2.114779                  903              115022                       68443

→ If click table has mostly 1 click per MTID, it logs only the FIRST valid click.
Suspicious MTIDs with 30+ clicks: 22
Unique IFAs: 5
Unique Publishers: 3


Unnamed: 0,ifa,exchange,bundle,tagid,mtid_value,event_name,click_cnt,event_cnt
0,739e2e23-4d76-44b9-9067-3a8676c6e2ae,ADPIE,com.ltlk,56b4002f7174ea39c74df8aa:6628a58e8c8d1d780dbb6ffb,ChD8thKOosFLo4Prqygk1MwwEJ-t-coGGhQIARoQc54uI0...,deeplink,1,96
1,739e2e23-4d76-44b9-9067-3a8676c6e2ae,ADPIE,com.ltlk,56b4002f7174ea39c74df8aa:6628a58e8c8d1d780dbb6ffb,ChD4ENFwgKJL0Z10t6KEz-HFEPH1qMsGGhQIARoQc54uI0...,deeplink,1,79
2,cb16bfcd-608f-42a6-abe4-738ba75bdb1e,ADPIE,com.ltlk,56b4002f7174ea39c74df8aa:6628a58e8c8d1d780dbb6ffb,ChBXmcUT7AdKkp55mndOhCCjELnnh8sGGhQIARoQyxa_zW...,deeplink,1,74
3,cb16bfcd-608f-42a6-abe4-738ba75bdb1e,ADPIE,com.ltlk,56b4002f7174ea39c74df8aa:6618f23e8c8d1d6afd52744e,ChAYOrqG-9hDt6j_RSmX9j54EIai3soGGhQIARoQyxa_zW...,deeplink,1,70
4,5b260e61-b287-4fa6-b85d-7206dc51192c,ADPIE,com.ltlk,56b4002f7174ea39c74df8aa:6618f23e8c8d1d6afd52744e,ChDfwQk0xkVMibkE9AzZxlaCEIjS_MoGGhQIARoQWyYOYb...,deeplink,1,62
5,cb16bfcd-608f-42a6-abe4-738ba75bdb1e,ADPIE,com.ltlk,56b4002f7174ea39c74df8aa:6618f2638c8d1d6a54cdc3ae,ChA3B5QBdQNC8ZY8RAjU_oxfEN3u_coGGhQIARoQyxa_zW...,deeplink,1,61
6,cb16bfcd-608f-42a6-abe4-738ba75bdb1e,ADPIE,com.ltlk,56b4002f7174ea39c74df8aa:6618f23e8c8d1d6afd52744e,ChAkgf4DrjpC6pCJqxj1KK6VENXph8sGGhQIARoQyxa_zW...,deeplink,1,60
7,cb16bfcd-608f-42a6-abe4-738ba75bdb1e,ADPIE,com.ltlk,56b4002f7174ea39c74df8aa:6628a58e8c8d1d780dbb6ffb,ChC60Xox4LVH2YqjgVKIE23jEP2h3soGGhQIARoQyxa_zW...,deeplink,1,59
8,739e2e23-4d76-44b9-9067-3a8676c6e2ae,ADPIE,com.ltlk,56b4002f7174ea39c74df8aa:6618f23e8c8d1d6afd52744e,ChCnX1CR6xdG4oI93PF0t_QlEKet-coGGhQIARoQc54uI0...,deeplink,1,54
9,739e2e23-4d76-44b9-9067-3a8676c6e2ae,ADPIE,com.ltlk,56b4002f7174ea39c74df8aa:6628a58e8c8d1d780dbb6ffb,ChBSpImwJ_xOvoNACMPrcbySEI-45MoGGhQIARoQc54uI0...,deeplink,1,47


In [55]:
#@title Step 1-1: Q1-1 - Deeplink Event Time vs Click Time Comparison

query_deeplink_vs_click = f"""
WITH deeplink_events AS (
    SELECT
        REGEXP_EXTRACT(cv.pb.payload.raw, r'&moloco_mtid=([^&]*)') AS mtid_value,
        cv.received_at AS deeplink_timestamp,
        req.device.ifa
    FROM `focal-elf-631.prod_stream_view.cv`
    WHERE api.advertiser.id = '{ADVERTISER_ID}'
        AND timestamp BETWEEN '{START_DATE}' AND '{END_DATE}'
        AND cv.pb.event.name = 'deeplink'
),
click_surplus_events AS (
    SELECT
        bid.mtid,
        click.happened_at AS click_timestamp,
        req.device.ifa
    FROM `focal-elf-631.prod_stream_view.click_surplus`
    WHERE api.advertiser.id = '{ADVERTISER_ID}'
        AND timestamp BETWEEN '{START_DATE}' AND '{END_DATE}'
)

SELECT
    d.mtid_value,
    d.ifa,
    d.deeplink_timestamp,
    c.click_timestamp,
    TIMESTAMP_DIFF(d.deeplink_timestamp, c.click_timestamp, SECOND) AS deeplink_minus_click_sec
FROM deeplink_events d
JOIN click_surplus_events c
    ON d.mtid_value = c.mtid AND d.ifa = c.ifa
WHERE d.mtid_value IS NOT NULL
ORDER BY ABS(TIMESTAMP_DIFF(d.deeplink_timestamp, c.click_timestamp, SECOND)) DESC
LIMIT 1000
"""

df_deeplink_vs_click = client.query(query_deeplink_vs_click).to_dataframe()

print("=" * 70)
print("Q1-1: Deeplink Event Time vs Click Time (click_surplus)")
print("=" * 70)
print(f"\nSample size: {len(df_deeplink_vs_click)}")
print(f"\nTime Difference (deeplink - click) in seconds:")
print(df_deeplink_vs_click['deeplink_minus_click_sec'].describe())

print(f"\nDistribution of time difference:")
bins = [-float('inf'), -60, -10, -1, 0, 1, 10, 60, float('inf')]
labels = ['<-60s', '-60 to -10s', '-10 to -1s', '-1 to 0s', '0 to 1s', '1 to 10s', '10 to 60s', '>60s']
df_deeplink_vs_click['diff_bucket'] = pd.cut(df_deeplink_vs_click['deeplink_minus_click_sec'], bins=bins, labels=labels)
print(df_deeplink_vs_click['diff_bucket'].value_counts().sort_index())


query_itct = f"""
SELECT
    req.device.ifa,
    req.device.ip,
    bid.mtid,
    req.exchange,
    req.app.bundle,
    req.app.publisher.id AS publisher_id,
    req.app.publisher.name AS publisher_name,
    req.imp.tagid,
    imp.happened_at AS imp_timestamp,
    click.happened_at AS click_timestamp,
    TIMESTAMP_DIFF(click.happened_at, imp.happened_at, MILLISECOND) AS itct_ms,
    TIMESTAMP_DIFF(click.happened_at, imp.happened_at, SECOND) AS itct_sec,
    click_surplus.reason AS click_surplus_reason
FROM `focal-elf-631.prod_stream_view.click_surplus`
WHERE api.advertiser.id = '{ADVERTISER_ID}'
    AND timestamp BETWEEN '{START_DATE}' AND '{END_DATE}'
ORDER BY req.device.ifa, imp.happened_at, click.happened_at
"""

df_itct = client.query(query_itct).to_dataframe()
print(f"Total click records: {len(df_itct):,}")
print(f"Unique IFAs: {df_itct['ifa'].nunique():,}")
print(f"Unique IPs: {df_itct['ip'].nunique():,}")
print(f"Unique MTIDs: {df_itct['mtid'].nunique():,}")
df_itct.head(20)

Q1-1: Deeplink Event Time vs Click Time (click_surplus)

Sample size: 1000

Time Difference (deeplink - click) in seconds:
count         1000.0
mean       66539.397
std      14871.32364
min        -225656.0
25%          57656.0
50%          65367.5
75%         75966.25
max         111364.0
Name: deeplink_minus_click_sec, dtype: Float64

Distribution of time difference:
diff_bucket
<-60s            2
-60 to -10s      0
-10 to -1s       0
-1 to 0s         0
0 to 1s          0
1 to 10s         0
10 to 60s        0
>60s           998
Name: count, dtype: int64
Total click records: 387,988
Unique IFAs: 77,177
Unique IPs: 80,106
Unique MTIDs: 183,465


Unnamed: 0,ifa,ip,mtid,exchange,bundle,publisher_id,publisher_name,tagid,imp_timestamp,click_timestamp,itct_ms,itct_sec,click_surplus_reason
0,,118.235.74,ChCwstJd-aFA3a7QnHjpL3lYEJHT2coGGhQIBRoQs33h20...,KAKAO,com.kakao.talk,0,Daum Kakao Corp.,DAN-9PaWqQNQpqtf5FFM,NaT,2026-01-01 12:33:27.815026+00:00,,,IGNORE_REPORT_TO_MMP
1,,211.235.90,ChBmnX1rCXdI6pu2fxuMk2AKELSl7coGGhQIBRoQtb8txE...,KAKAO,net.daum.android.daum,0,Daum Kakao Corp.,DAN-fu8OO1UnClBkAZJI,NaT,2026-01-05 05:57:09.314092+00:00,,,IGNORE_REPORT_TO_MMP
2,,106.101.196,ChCYGPN4kHlLQYgx3Yg9oNRYENDu7coGGhQIBRoQOflZ0D...,KAKAO,net.daum.android.daum,0,Daum Kakao Corp.,DAN-fXGthnQXmT8bskWu,NaT,2026-01-05 08:33:21.933582+00:00,,,IGNORE_REPORT_TO_MMP
3,,211.235.80,ChCmIAhi7tVEXqpYouE2FiNsEILRicsGGhQIBRoQW3-a4v...,KAKAO,com.kakao.talk,0,Daum Kakao Corp.,DAN-eIoAe5gVGnPuUvys,NaT,2026-01-10 14:56:03.319808+00:00,,,IGNORE_REPORT_TO_MMP
4,,118.235.92,ChAawPwaLtxMW7oyCNtWzdyjEK312MoGGhQIBRoQq43vKB...,KAKAO,net.daum.android.daum,0,Daum Kakao Corp.,DAN-1ib3qn9ggkhtc,2026-01-01 09:13:17.514568+00:00,2026-01-01 09:13:28.069705+00:00,10555.0,10.0,"IGNORE_REPORT_TO_MMP,REDUNDANT"
5,,182.231.26,ChAhFGMTzRlNJLsujzL3GFNQEJac2coGGhQIBRoQgJE3iZ...,KAKAO,com.jobkorea.app,vOc,GENERAL,DAN-BjCQma5Oay7mt7HU,2026-01-01 10:36:08.365337+00:00,2026-01-01 10:36:37.841352+00:00,29476.0,29.0,"IGNORE_REPORT_TO_MMP,REDUNDANT"
6,,106.101.80,ChC2xoYX5u1E15ImgrnZh9NuEJPa2soGGhQIBRoQouqYpb...,KAKAO,com.kakao.talk,0,Daum Kakao Corp.,DAN-kUTB5NrFaPR9bAKq,2026-01-01 17:21:24.625163+00:00,2026-01-01 17:21:25.822350+00:00,1197.0,1.0,"IGNORE_REPORT_TO_MMP,REDUNDANT"
7,,223.39.80,ChAaee1rrnJOHJ40NO0yqwv2EIan3MoGGhQIBRoQaqvwaz...,KAKAO,com.skt.prod.dialer,leN,GENERAL,DAN-D765XmBR7qyQcGdU,2026-01-02 00:38:32.572828+00:00,2026-01-02 00:38:40.328680+00:00,7755.0,7.0,"IGNORE_REPORT_TO_MMP,REDUNDANT"
8,,106.101.69,ChDAJwzuh7VCrJw7w6jNmKU4EO2i3soGGhQIBRoQYmVQ0l...,KAKAO,net.daum.android.daum,0,Daum Kakao Corp.,DAN-fXGthnQXmT8bskWu,2026-01-02 09:35:42.012723+00:00,2026-01-02 09:35:43.421316+00:00,1408.0,1.0,"IGNORE_REPORT_TO_MMP,REDUNDANT"
9,,211.235.88,ChAVJ43pDaZHgK8GPgDvHHeIEJjR4coGGhQIBRoQIzbIzb...,KAKAO,net.daum.android.daum,0,Daum Kakao Corp.,DAN-1iykzv03dd4rq,2026-01-03 00:53:45.344225+00:00,2026-01-03 00:53:51.888636+00:00,6544.0,6.0,"IGNORE_REPORT_TO_MMP,REDUNDANT"


In [56]:
#@title Step 2: Get ITCT (Impression to Click Time) with IP for ALL Clicks

query_itct = f"""
SELECT
    req.device.ifa,
    req.device.ip,
    bid.mtid,
    req.exchange,
    req.app.bundle,
    req.app.publisher.id AS publisher_id,
    req.app.publisher.name AS publisher_name,
    req.imp.tagid,
    imp.happened_at AS imp_timestamp,
    click.happened_at AS click_timestamp,
    TIMESTAMP_DIFF(click.happened_at, imp.happened_at, MILLISECOND) AS itct_ms,
    TIMESTAMP_DIFF(click.happened_at, imp.happened_at, SECOND) AS itct_sec,
    click_surplus.reason AS click_surplus_reason
FROM `focal-elf-631.prod_stream_view.click_surplus`
WHERE api.advertiser.id = '{ADVERTISER_ID}'
    AND timestamp BETWEEN '{START_DATE}' AND '{END_DATE}'
ORDER BY req.device.ifa, imp.happened_at, click.happened_at
"""

df_itct = client.query(query_itct).to_dataframe()
print(f"Total click records: {len(df_itct):,}")
print(f"Unique IFAs: {df_itct['ifa'].nunique():,}")
print(f"Unique IPs: {df_itct['ip'].nunique():,}")
print(f"Unique MTIDs: {df_itct['mtid'].nunique():,}")
df_itct.head(20)

Total click records: 387,988
Unique IFAs: 77,177
Unique IPs: 80,106
Unique MTIDs: 183,465


Unnamed: 0,ifa,ip,mtid,exchange,bundle,publisher_id,publisher_name,tagid,imp_timestamp,click_timestamp,itct_ms,itct_sec,click_surplus_reason
0,,118.235.74,ChCwstJd-aFA3a7QnHjpL3lYEJHT2coGGhQIBRoQs33h20...,KAKAO,com.kakao.talk,0,Daum Kakao Corp.,DAN-9PaWqQNQpqtf5FFM,NaT,2026-01-01 12:33:27.815026+00:00,,,IGNORE_REPORT_TO_MMP
1,,211.235.90,ChBmnX1rCXdI6pu2fxuMk2AKELSl7coGGhQIBRoQtb8txE...,KAKAO,net.daum.android.daum,0,Daum Kakao Corp.,DAN-fu8OO1UnClBkAZJI,NaT,2026-01-05 05:57:09.314092+00:00,,,IGNORE_REPORT_TO_MMP
2,,106.101.196,ChCYGPN4kHlLQYgx3Yg9oNRYENDu7coGGhQIBRoQOflZ0D...,KAKAO,net.daum.android.daum,0,Daum Kakao Corp.,DAN-fXGthnQXmT8bskWu,NaT,2026-01-05 08:33:21.933582+00:00,,,IGNORE_REPORT_TO_MMP
3,,211.235.80,ChCmIAhi7tVEXqpYouE2FiNsEILRicsGGhQIBRoQW3-a4v...,KAKAO,com.kakao.talk,0,Daum Kakao Corp.,DAN-eIoAe5gVGnPuUvys,NaT,2026-01-10 14:56:03.319808+00:00,,,IGNORE_REPORT_TO_MMP
4,,118.235.92,ChAawPwaLtxMW7oyCNtWzdyjEK312MoGGhQIBRoQq43vKB...,KAKAO,net.daum.android.daum,0,Daum Kakao Corp.,DAN-1ib3qn9ggkhtc,2026-01-01 09:13:17.514568+00:00,2026-01-01 09:13:28.069705+00:00,10555.0,10.0,"IGNORE_REPORT_TO_MMP,REDUNDANT"
5,,182.231.26,ChAhFGMTzRlNJLsujzL3GFNQEJac2coGGhQIBRoQgJE3iZ...,KAKAO,com.jobkorea.app,vOc,GENERAL,DAN-BjCQma5Oay7mt7HU,2026-01-01 10:36:08.365337+00:00,2026-01-01 10:36:37.841352+00:00,29476.0,29.0,"IGNORE_REPORT_TO_MMP,REDUNDANT"
6,,106.101.80,ChC2xoYX5u1E15ImgrnZh9NuEJPa2soGGhQIBRoQouqYpb...,KAKAO,com.kakao.talk,0,Daum Kakao Corp.,DAN-kUTB5NrFaPR9bAKq,2026-01-01 17:21:24.625163+00:00,2026-01-01 17:21:25.822350+00:00,1197.0,1.0,"IGNORE_REPORT_TO_MMP,REDUNDANT"
7,,223.39.80,ChAaee1rrnJOHJ40NO0yqwv2EIan3MoGGhQIBRoQaqvwaz...,KAKAO,com.skt.prod.dialer,leN,GENERAL,DAN-D765XmBR7qyQcGdU,2026-01-02 00:38:32.572828+00:00,2026-01-02 00:38:40.328680+00:00,7755.0,7.0,"IGNORE_REPORT_TO_MMP,REDUNDANT"
8,,106.101.69,ChDAJwzuh7VCrJw7w6jNmKU4EO2i3soGGhQIBRoQYmVQ0l...,KAKAO,net.daum.android.daum,0,Daum Kakao Corp.,DAN-fXGthnQXmT8bskWu,2026-01-02 09:35:42.012723+00:00,2026-01-02 09:35:43.421316+00:00,1408.0,1.0,"IGNORE_REPORT_TO_MMP,REDUNDANT"
9,,211.235.88,ChAVJ43pDaZHgK8GPgDvHHeIEJjR4coGGhQIBRoQIzbIzb...,KAKAO,net.daum.android.daum,0,Daum Kakao Corp.,DAN-1iykzv03dd4rq,2026-01-03 00:53:45.344225+00:00,2026-01-03 00:53:51.888636+00:00,6544.0,6.0,"IGNORE_REPORT_TO_MMP,REDUNDANT"


In [57]:
#@title Step 2-1: Q2 - ITCT Comparison: Users with 30+ click MTIDs vs Normal Users (<5 clicks)

print("=" * 70)
print("Q2: Do users with suspicious MTIDs (30+ clicks) show delayed clicks?")
print("=" * 70)

# Calculate clicks per MTID
df_mtid_stats = df_itct.groupby(['mtid', 'ifa']).agg({
    'click_timestamp': 'count',
    'itct_sec': 'mean'
}).reset_index()
df_mtid_stats.columns = ['mtid', 'ifa', 'clicks_per_mtid', 'avg_itct_sec']

# Categorize MTIDs
df_mtid_stats['mtid_category'] = pd.cut(
    df_mtid_stats['clicks_per_mtid'],
    bins=[0, 1, 5, 30, float('inf')],
    labels=['1 click', '2-5 clicks', '6-30 clicks', '30+ clicks']
)

# Get unique IFAs per category
suspicious_ifas = set(df_mtid_stats[df_mtid_stats['clicks_per_mtid'] >= 30]['ifa'].unique())
normal_ifas = set(df_mtid_stats[df_mtid_stats['clicks_per_mtid'] < 5]['ifa'].unique())

# IFAs that are ONLY suspicious (never have normal MTIDs)
suspicious_only_ifas = suspicious_ifas - normal_ifas
normal_only_ifas = normal_ifas - suspicious_ifas

print(f"\nUser (IFA) Categories:")
print(f"  Users with 30+ click MTIDs: {len(suspicious_ifas):,}")
print(f"  Users with <5 click MTIDs: {len(normal_ifas):,}")
print(f"  Users with ONLY 30+ click MTIDs: {len(suspicious_only_ifas):,}")
print(f"  Users with ONLY <5 click MTIDs: {len(normal_only_ifas):,}")

# Compare ITCT by user category
df_itct['user_category'] = df_itct['ifa'].apply(
    lambda x: 'Suspicious Only' if x in suspicious_only_ifas 
    else ('Normal Only' if x in normal_only_ifas else 'Mixed')
)

print(f"\nITCT by User Category:")
itct_by_user = df_itct.groupby('user_category')['itct_sec'].agg(['count', 'mean', 'median', 'std']).round(2)
print(itct_by_user)

# ITCT distribution comparison
print(f"\nITCT Percentiles by User Category:")
for cat in ['Suspicious Only', 'Normal Only', 'Mixed']:
    subset = df_itct[df_itct['user_category'] == cat]['itct_sec']
    if len(subset) > 0:
        print(f"\n  {cat}:")
        for p in [0.25, 0.5, 0.75, 0.9, 0.95]:
            print(f"    P{int(p*100)}: {subset.quantile(p):.2f} sec")

Q2: Do users with suspicious MTIDs (30+ clicks) show delayed clicks?

User (IFA) Categories:
  Users with 30+ click MTIDs: 110
  Users with <5 click MTIDs: 64,989
  Users with ONLY 30+ click MTIDs: 63
  Users with ONLY <5 click MTIDs: 64,942

ITCT by User Category:
                  count       mean  median        std
user_category                                        
Mixed            112755   30298.13   202.0  180862.68
Normal Only      231039    2974.36    21.0   97068.17
Suspicious Only    4908  175852.14  4502.5  743159.73

ITCT Percentiles by User Category:

  Suspicious Only:
    P25: 219.00 sec
    P50: 4502.50 sec
    P75: 29638.00 sec
    P90: 194325.30 sec
    P95: 876620.45 sec

  Normal Only:
    P25: 6.00 sec
    P50: 21.00 sec
    P75: 48.00 sec
    P90: 162.00 sec
    P95: 636.00 sec

  Mixed:
    P25: 68.00 sec
    P50: 202.00 sec
    P75: 12473.50 sec
    P90: 38003.60 sec
    P95: 129196.90 sec


In [58]:
#@title Step 2-2: Q3 - Patterns in Highly Delayed Clicks (ITCT > X seconds)
# Counting DISTINCT MTIDs (not clicks) - one MTID can have both delayed and normal clicks

print("=" * 70)
print("Q3: Patterns in Highly Delayed Clicks (by distinct MTID)")
print("=" * 70)

# Define delay thresholds
DELAY_THRESHOLD_SEC = 150  # Consider clicks >150 seconds as "highly delayed"

df_itct['is_delayed'] = df_itct['itct_sec'] > DELAY_THRESHOLD_SEC

# Get MTIDs with delayed/normal clicks
delayed_mtids = set(df_itct[df_itct['is_delayed']]['mtid'].unique())
normal_mtids = set(df_itct[~df_itct['is_delayed']]['mtid'].unique())
all_mtids = set(df_itct['mtid'].unique())

print(f"\nDelay Threshold: {DELAY_THRESHOLD_SEC} seconds")
print(f"Total unique MTIDs: {len(all_mtids):,}")
print(f"MTIDs with delayed clicks (>{DELAY_THRESHOLD_SEC}s): {len(delayed_mtids):,} ({len(delayed_mtids)/len(all_mtids)*100:.1f}%)")
print(f"MTIDs with normal clicks: {len(normal_mtids):,}")
print(f"MTIDs with BOTH delayed & normal: {len(delayed_mtids & normal_mtids):,}")

# Helper function to count distinct MTIDs by dimension
def count_mtids_by_dimension(df, dimension_cols):
    """Count distinct MTIDs with delayed/normal clicks per dimension"""
    if isinstance(dimension_cols, str):
        dimension_cols = [dimension_cols]
    
    # Get unique MTIDs per dimension for delayed and normal
    delayed_df = df[df['is_delayed']].groupby(dimension_cols)['mtid'].nunique().reset_index()
    delayed_df.columns = dimension_cols + ['delayed_mtids']
    
    normal_df = df[~df['is_delayed']].groupby(dimension_cols)['mtid'].nunique().reset_index()
    normal_df.columns = dimension_cols + ['normal_mtids']
    
    total_df = df.groupby(dimension_cols)['mtid'].nunique().reset_index()
    total_df.columns = dimension_cols + ['total_mtids']
    
    # Merge all
    result = total_df.merge(delayed_df, on=dimension_cols, how='left').merge(normal_df, on=dimension_cols, how='left')
    result = result.fillna(0)
    result['delayed_mtids'] = result['delayed_mtids'].astype(int)
    result['normal_mtids'] = result['normal_mtids'].astype(int)
    result['delayed_pct'] = (result['delayed_mtids'] / result['total_mtids'] * 100).round(1)
    
    return result

# By Exchange (top 30 by delayed_mtids, sorted by delayed_pct)
print(f"\n--- By Exchange (Top 30 by delayed_mtids, sorted by delayed_pct DESC) ---")
exchange_comparison = count_mtids_by_dimension(df_itct, 'exchange')
exchange_top30 = exchange_comparison.nlargest(30, 'delayed_mtids').sort_values('delayed_pct', ascending=False)
print(exchange_top30.to_string(index=False))

# By Publisher (top 30 by delayed_mtids, sorted by delayed_pct)
print(f"\n--- By Publisher (Top 30 by delayed_mtids, sorted by delayed_pct DESC) ---")
pub_comparison = count_mtids_by_dimension(df_itct, ['bundle', 'publisher_name'])
pub_top30 = pub_comparison.nlargest(30, 'delayed_mtids').sort_values('delayed_pct', ascending=False)
print(pub_top30.to_string(index=False))

# By IP (top 30 by delayed_mtids, sorted by delayed_pct)
print(f"\n--- By IP (Top 30 by delayed_mtids, sorted by delayed_pct DESC) ---")
ip_comparison = count_mtids_by_dimension(df_itct, 'ip')
ip_top30 = ip_comparison.nlargest(30, 'delayed_mtids').sort_values('delayed_pct', ascending=False)
print(ip_top30.to_string(index=False))

# IPs with 100% delayed MTIDs (all their MTIDs have delayed clicks)
print(f"\n--- IPs with 100% Delayed MTIDs (min 10 MTIDs) ---")
ip_100pct_delayed = ip_comparison[(ip_comparison['delayed_pct'] == 100) & (ip_comparison['total_mtids'] >= 10)]
ip_100pct_delayed = ip_100pct_delayed.sort_values('total_mtids', ascending=False)
print(f"Count: {len(ip_100pct_delayed)}")
if len(ip_100pct_delayed) > 0:
    print(ip_100pct_delayed.head(20).to_string(index=False))

Q3: Patterns in Highly Delayed Clicks (by distinct MTID)

Delay Threshold: 150 seconds
Total unique MTIDs: 183,465
MTIDs with delayed clicks (>150s): 27,305 (14.9%)
MTIDs with normal clicks: 151,524
MTIDs with BOTH delayed & normal: 14,447

--- By Exchange (Top 30 by delayed_mtids, sorted by delayed_pct DESC) ---
                  exchange  total_mtids  delayed_mtids  normal_mtids  delayed_pct
                   ADMIXER            2              2             0        100.0
                       APS          441            318           134         72.1
      MOLOCO_SDK_LEVELPLAY          234            143           181         61.1
                  PUBMATIC          504            271           169         53.8
            MOLOCO_SDK_MAX        28962          15062         24911         52.0
                   EXELBID          325            117           229         36.0
                    XIAOMI          572            126           265         22.0
                     NAVER   

In [59]:
#@title Step 3: ITCT Distribution Analysis

print("=" * 60)
print("ITCT (Impression to Click Time) Distribution")
print("=" * 60)

# Basic stats
print(f"\nITCT Statistics (in seconds):")
print(df_itct['itct_sec'].describe())

# Percentiles
print(f"\nITCT Percentiles (in seconds):")
percentiles = [0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]
for p in percentiles:
    val = df_itct['itct_sec'].quantile(p)
    print(f"  P{int(p*100):02d}: {val:.2f} sec")

# ITCT buckets
print(f"\nITCT Distribution by Bucket:")
df_itct['itct_bucket'] = pd.cut(
    df_itct['itct_sec'], 
    bins=[-float('inf'), 0, 1, 5, 10, 30, 60, 300, float('inf')],
    labels=['<0s (invalid)', '0-1s', '1-5s', '5-10s', '10-30s', '30-60s', '1-5min', '>5min']
)
bucket_counts = df_itct['itct_bucket'].value_counts().sort_index()
bucket_prop = df_itct['itct_bucket'].value_counts(normalize=True).sort_index()
bucket_df = pd.DataFrame({'count': bucket_counts, 'proportion': bucket_prop})
bucket_df['proportion'] = bucket_df['proportion'].apply(lambda x: f"{x:.2%}")
print(bucket_df)

ITCT (Impression to Click Time) Distribution

ITCT Statistics (in seconds):
count         348702.0
mean      14242.938888
std      158515.619492
min               -3.0
25%               12.0
50%               38.0
75%              167.0
max          7727092.0
Name: itct_sec, dtype: Float64

ITCT Percentiles (in seconds):
  P10: 3.00 sec
  P25: 12.00 sec
  P50: 38.00 sec
  P75: 167.00 sec
  P90: 10315.70 sec
  P95: 32310.95 sec
  P99: 298776.99 sec

ITCT Distribution by Bucket:
itct_bucket
<0s (invalid)     5893
0-1s             13438
1-5s             35303
5-10s            22440
10-30s           73428
30-60s           57537
1-5min           69680
>5min            70983
Name: count, dtype: int64


In [60]:
#@title Step 4: IP Analysis - Click Volume by IP

print("=" * 60)
print("IP Analysis: Click Volume Distribution")
print("=" * 60)

# Clicks per IP
df_ip_clicks = df_itct.groupby('ip').agg({
    'mtid': 'count',  # total clicks
    'ifa': 'nunique',  # unique IFAs
    'itct_sec': ['mean', 'median', 'std']
}).reset_index()
df_ip_clicks.columns = ['ip', 'click_count', 'unique_ifas', 'avg_itct_sec', 'median_itct_sec', 'std_itct_sec']
df_ip_clicks = df_ip_clicks.sort_values('click_count', ascending=False)

print(f"\nTotal unique IPs: {len(df_ip_clicks)}")
print(f"\nTop 20 IPs by click count:")
print(df_ip_clicks.head(20).to_string(index=False))

# IP concentration
total_clicks = df_ip_clicks['click_count'].sum()
df_ip_clicks['cumulative_pct'] = df_ip_clicks['click_count'].cumsum() / total_clicks * 100

print(f"\nIP Concentration:")
for threshold in [50, 70, 80, 90, 95]:
    num_ips = (df_ip_clicks['cumulative_pct'] <= threshold).sum() + 1
    print(f"  {threshold}% of clicks from top {num_ips} IPs ({num_ips/len(df_ip_clicks)*100:.1f}% of all IPs)")

IP Analysis: Click Volume Distribution

Total unique IPs: 80106

Top 20 IPs by click count:
             ip  click_count  unique_ifas  avg_itct_sec  median_itct_sec   std_itct_sec
  220.76.99.215        20464           26  76392.979986          28616.0  160587.258606
    14.52.3.170         8160            7  73697.226498          22107.0  141528.145034
                        8034         2369   25922.42444             13.0  390309.336372
 220.118.43.250         5896           15  126186.42027          30020.0  229467.920331
  58.145.79.165         2433            1    282.120253            152.0     355.347561
  211.62.59.161         2104          717     11.306905              3.0       27.92835
  211.62.59.166         1985          739     65.371658              3.0     614.614859
  211.62.59.163         1982          703     27.631854              3.0     208.327566
  211.62.59.165         1978          753     20.735556              3.0     117.363364
  211.62.59.164         1922

In [61]:
#@title Step 5: Suspicious IP Patterns - High Click Count + Low ITCT

print("=" * 60)
print("Suspicious IP Patterns")
print("=" * 60)

# Flag suspicious IPs: high click count AND low average ITCT
CLICK_THRESHOLD = 50
ITCT_THRESHOLD = 5  # seconds

df_suspicious_ips = df_ip_clicks[
    (df_ip_clicks['click_count'] >= CLICK_THRESHOLD) & 
    (df_ip_clicks['avg_itct_sec'] <= ITCT_THRESHOLD)
].copy()

print(f"\nSuspicious IPs (>={CLICK_THRESHOLD} clicks AND avg ITCT <={ITCT_THRESHOLD}s):")
print(f"Count: {len(df_suspicious_ips)}")
if len(df_suspicious_ips) > 0:
    print(df_suspicious_ips.to_string(index=False))
    
    # What % of total clicks come from suspicious IPs?
    suspicious_clicks = df_suspicious_ips['click_count'].sum()
    print(f"\nSuspicious IP contribution:")
    print(f"  Clicks from suspicious IPs: {suspicious_clicks:,} ({suspicious_clicks/total_clicks*100:.1f}%)")

Suspicious IP Patterns

Suspicious IPs (>=50 clicks AND avg ITCT <=5s):
Count: 18
             ip  click_count  unique_ifas  avg_itct_sec  median_itct_sec  std_itct_sec  cumulative_pct
  112.170.97.57          152            2      4.591837              4.0      6.691597       19.687722
  1.232.170.131          122            1      2.322314              1.0      5.834116       19.951648
  112.151.4.162          120            1       2.87395              3.0      1.866541       20.013763
211.237.100.102          117            1      3.401709              1.0     12.447171       20.043919
   59.12.88.155          103            1      3.621359              2.0      7.352626       20.240574
  115.140.57.11          102            1      4.931373              2.0      7.070031       20.293153
  125.189.76.35           99            2      2.363636              2.0      1.854143       20.397538
  211.62.59.167           94           59      2.045455              1.5      1.463111       2

In [62]:
#@title Step 6: Clicks per MTID by IP

print("=" * 60)
print("Clicks per MTID Analysis by IP")
print("=" * 60)

# How many clicks per MTID, grouped by IP
df_mtid_clicks = df_itct.groupby(['ip', 'mtid']).agg({
    'click_timestamp': 'count',
    'itct_sec': 'mean'
}).reset_index()
df_mtid_clicks.columns = ['ip', 'mtid', 'clicks_per_mtid', 'avg_itct_sec']

# Aggregate to IP level: avg clicks per MTID
df_ip_mtid_pattern = df_mtid_clicks.groupby('ip').agg({
    'mtid': 'count',  # unique MTIDs
    'clicks_per_mtid': ['mean', 'max'],
    'avg_itct_sec': 'mean'
}).reset_index()
df_ip_mtid_pattern.columns = ['ip', 'unique_mtids', 'avg_clicks_per_mtid', 'max_clicks_per_mtid', 'avg_itct_sec']
df_ip_mtid_pattern = df_ip_mtid_pattern.sort_values('avg_clicks_per_mtid', ascending=False)

print(f"\nTop 20 IPs by avg clicks per MTID:")
print(df_ip_mtid_pattern.head(20).to_string(index=False))

# Flag IPs with abnormally high clicks per MTID
MTID_CLICK_THRESHOLD = 10
df_abnormal_ips = df_ip_mtid_pattern[df_ip_mtid_pattern['avg_clicks_per_mtid'] >= MTID_CLICK_THRESHOLD]
print(f"\nIPs with avg >={MTID_CLICK_THRESHOLD} clicks per MTID: {len(df_abnormal_ips)}")

Clicks per MTID Analysis by IP

Top 20 IPs by avg clicks per MTID:
             ip  unique_mtids  avg_clicks_per_mtid  max_clicks_per_mtid    avg_itct_sec
  59.16.130.110             1           269.000000                  269     2962.672862
  112.151.4.162             1           120.000000                  120         2.87395
 220.85.185.187             4           104.000000                  179   191699.999102
    14.52.3.170            88            92.727273                  853    83466.268158
  1.254.164.227             1            79.000000                   79    33226.974684
121.181.112.129             1            76.000000                   76     8801.144737
 58.141.235.170             1            71.000000                   71  5438861.239437
   123.99.67.38             1            64.000000                   64            <NA>
   222.236.3.33             1            62.000000                   62    22081.483871
  115.138.62.15             1            59.000000   

In [63]:
#@title Step 7: Publisher + IP Cross Analysis

print("=" * 60)
print("Publisher + IP Cross Analysis")
print("=" * 60)

# Clicks by publisher and IP
df_pub_ip = df_itct.groupby(['bundle', 'publisher_name', 'ip']).agg({
    'mtid': 'count',
    'ifa': 'nunique',
    'itct_sec': 'mean'
}).reset_index()
df_pub_ip.columns = ['bundle', 'publisher_name', 'ip', 'click_count', 'unique_ifas', 'avg_itct_sec']
df_pub_ip = df_pub_ip.sort_values('click_count', ascending=False)

print(f"\nTop 30 Publisher + IP combinations:")
print(df_pub_ip.head(30).to_string(index=False))

# Publishers with concentrated IP patterns
df_pub_ip_concentration = df_itct.groupby('bundle').agg({
    'ip': 'nunique',
    'mtid': 'count'
}).reset_index()
df_pub_ip_concentration.columns = ['bundle', 'unique_ips', 'total_clicks']
df_pub_ip_concentration['clicks_per_ip'] = df_pub_ip_concentration['total_clicks'] / df_pub_ip_concentration['unique_ips']
df_pub_ip_concentration = df_pub_ip_concentration.sort_values('clicks_per_ip', ascending=False)

print(f"\nPublishers by clicks per unique IP:")
print(df_pub_ip_concentration.to_string(index=False))

Publisher + IP Cross Analysis

Top 30 Publisher + IP combinations:
                   bundle         publisher_name              ip  click_count  unique_ifas   avg_itct_sec
          me.cash.time.v3                          220.76.99.215         6919           18   63925.584804
              me.timecash                            14.52.3.170         5474            6   65124.749726
              me.timecash                          220.76.99.215         5191           20   74796.567899
            cash.money.v1                          220.76.99.215         4135           21   86539.143168
              me.timecash                         220.118.43.250         2672           12   55399.017623
            com.cashslide Delight Room Co., Ltd.                         2268          519    1457.590909
          net.cashplus.b3                          220.76.99.215         1771           16   44445.018089
                 com.ltlk                          58.145.79.165         1431        

In [64]:
#@title Step 8: Compare IPs - Suspicious vs Normal MTIDs

print("=" * 60)
print("IP Comparison: Suspicious (30+ clicks) vs Normal MTIDs")
print("=" * 60)

# Get clicks per MTID
df_mtid_click_count = df_itct.groupby('mtid').agg({
    'click_timestamp': 'count',
    'ip': 'first',
    'ifa': 'first'
}).reset_index()
df_mtid_click_count.columns = ['mtid', 'click_count', 'ip', 'ifa']

# Flag suspicious MTIDs (30+ clicks)
df_mtid_click_count['is_suspicious'] = df_mtid_click_count['click_count'] >= MIN_CLICK_THRESHOLD

print(f"\nMTID Distribution:")
print(f"  Normal MTIDs (<{MIN_CLICK_THRESHOLD} clicks): {(~df_mtid_click_count['is_suspicious']).sum():,}")
print(f"  Suspicious MTIDs (>={MIN_CLICK_THRESHOLD} clicks): {df_mtid_click_count['is_suspicious'].sum():,}")

# Compare IPs between suspicious and normal
suspicious_ips = set(df_mtid_click_count[df_mtid_click_count['is_suspicious']]['ip'].unique())
normal_ips = set(df_mtid_click_count[~df_mtid_click_count['is_suspicious']]['ip'].unique())
overlap_ips = suspicious_ips & normal_ips
suspicious_only_ips = suspicious_ips - normal_ips

print(f"\nIP Analysis:")
print(f"  IPs with suspicious MTIDs: {len(suspicious_ips):,}")
print(f"  IPs with normal MTIDs only: {len(normal_ips - suspicious_ips):,}")
print(f"  IPs with BOTH suspicious & normal: {len(overlap_ips):,}")
print(f"  IPs with ONLY suspicious MTIDs: {len(suspicious_only_ips):,}")

# Show IPs that ONLY have suspicious MTIDs (high-risk IPs)
if len(suspicious_only_ips) > 0:
    print(f"\nHigh-Risk IPs (only suspicious MTIDs):")
    df_high_risk = df_itct[df_itct['ip'].isin(suspicious_only_ips)].groupby('ip').agg({
        'mtid': 'nunique',
        'ifa': 'nunique',
        'itct_sec': 'mean'
    }).reset_index()
    df_high_risk.columns = ['ip', 'unique_mtids', 'unique_ifas', 'avg_itct_sec']
    df_high_risk = df_high_risk.sort_values('unique_mtids', ascending=False)
    print(df_high_risk.head(20).to_string(index=False))

IP Comparison: Suspicious (30+ clicks) vs Normal MTIDs

MTID Distribution:
  Normal MTIDs (<30 clicks): 183,157
  Suspicious MTIDs (>=30 clicks): 308

IP Analysis:
  IPs with suspicious MTIDs: 75
  IPs with normal MTIDs only: 80,031
  IPs with BOTH suspicious & normal: 29
  IPs with ONLY suspicious MTIDs: 46

High-Risk IPs (only suspicious MTIDs):
             ip  unique_mtids  unique_ifas    avg_itct_sec
  1.254.164.227             1            1    33226.974684
 223.39.214.172             1            1      7202.69697
   183.97.3.134             1            1      115.342857
   183.97.30.22             1            1       96.514286
  211.185.5.144             1            1      112.097561
 211.215.127.89             1            1     1175.268293
 211.234.226.82             1            1       82.050847
211.251.122.132             1            1         106.575
 211.251.169.50             1            1     1038.257143
   222.236.3.33             1            1    22081.483871
 

In [65]:
#@title Step 9: Summary & Export

print("=" * 60)
print("SUMMARY")
print("=" * 60)

print(f"\n1. Total clicks analyzed: {len(df_itct):,}")
print(f"2. Unique IFAs: {df_itct['ifa'].nunique():,}")
print(f"3. Unique IPs: {df_itct['ip'].nunique():,}")
print(f"4. Unique MTIDs: {df_itct['mtid'].nunique():,}")
print(f"5. Median ITCT: {df_itct['itct_sec'].median():.2f} seconds")
print(f"6. Suspicious MTIDs ({MIN_CLICK_THRESHOLD}+ clicks): {df_mtid_click_count['is_suspicious'].sum():,}")

# Export to Excel
output_file = 'ODSB-15998_ITCT_analysis_results.xlsx'
with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
    df_ip_clicks.to_excel(writer, sheet_name='IP_Click_Summary', index=False)
    df_ip_mtid_pattern.to_excel(writer, sheet_name='IP_MTID_Pattern', index=False)
    df_pub_ip.head(100).to_excel(writer, sheet_name='Publisher_IP_Top100', index=False)
    df_mtid_click_count.to_excel(writer, sheet_name='MTID_Click_Counts', index=False)
    if len(suspicious_only_ips) > 0:
        df_high_risk.to_excel(writer, sheet_name='High_Risk_IPs', index=False)

print(f"\nResults exported to: {output_file}")

SUMMARY

1. Total clicks analyzed: 387,988
2. Unique IFAs: 77,177
3. Unique IPs: 80,106
4. Unique MTIDs: 183,465
5. Median ITCT: 38.00 seconds
6. Suspicious MTIDs (30+ clicks): 308

Results exported to: ODSB-15998_ITCT_analysis_results.xlsx
