In [3]:
import time
from pathlib import Path
from typing import Optional

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests

## Load data

In [4]:
# Load the user metadata export and preview its first rows.
user_metadata_csv = Path("account-sharing-export") / "user_metadata.csv"
meta_data_df = pd.read_csv(user_metadata_csv)
meta_data_df.head()

Unnamed: 0,user_id,registration_country,registration_region,registration_lat,registration_lng,timezone,registered_at,first_seen_at,acquisition_source
0,8e6fa8be-bc27-40b7-8e37-f85e706a4a79,US,MI,42.808361,-85.14312,,2025-10-08 17:00:31,,google-ads
1,f3ebdaaf-470c-4964-8a08-0b20174d4b04,CA,ON,43.01088,-81.277458,,2025-10-15 16:44:55,,google-ads
2,4b7a49f9-27b8-44a7-bfd3-3e64815a6ebe,US,CA,34.0476,-118.292267,,2024-11-10 05:45:27,,a10
3,e8722f11-ad5d-4911-8798-240e8e2e7503,US,PA,40.429089,-75.346222,,2025-10-02 02:47:05,,
4,7cdf5b27-914c-42be-8753-46ae17f148a1,US,NH,43.230541,-71.547981,,2023-05-05 07:18:53,2023-05-05 07:14:19,


In [5]:
path = Path("account-sharing-export/auth_events.csv")
expected_header = ["user_id", "event_type", "event_timestamp", "ip_address", "user_agent"]

# Parsed by hand instead of with pd.read_csv: only the first four commas are
# treated as separators, so any further commas stay inside user_agent.
rows = []
with path.open("r", encoding="utf-8", errors="replace") as f:
    header = f.readline().rstrip("\n").split(",")
    assert header == expected_header, header

    # Data rows start on file line 2 (line 1 is the header).
    for line_no, line in enumerate(f, start=2):
        line = line.rstrip("\n")
        if not line:
            continue

        # split on first 4 commas only
        fields = line.split(",", 4)
        if len(fields) != len(expected_header):
            # Fail with the offending location instead of an opaque
            # tuple-unpacking error.
            raise ValueError(
                f"{path}: line {line_no} has {len(fields)} field(s), "
                f"expected {len(expected_header)}: {line!r}"
            )
        rows.append(tuple(fields))

auth_events_df = pd.DataFrame(rows, columns=header)

auth_events_df.head()

Unnamed: 0,user_id,event_type,event_timestamp,ip_address,user_agent
0,a6d983a8-133f-4287-a47d-aabb167bdd49,forgotPassword,2025-12-09 04:31:49,130.65.254.15|10.3.3.254,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7...
1,e79ea4fd-84a0-49e9-a923-74100e822801,forgotPassword,2025-07-18 16:13:49,171.158.162.34|10.3.2.180,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...
2,8b7826d1-9011-4deb-8351-b324a0531589,forgotPassword,2025-08-01 14:16:01,47.17.253.179|10.3.2.77,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...
3,7f620264-b87d-4c81-acf6-ff1f1375e951,forgotPassword,2025-10-12 22:27:46,70.113.153.4|10.3.3.22,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...
4,942159f6-673d-4d8b-9ec9-5ca4276ae4f7,forgotPassword,2025-09-16 03:19:42,68.2.160.173|10.3.3.80,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...


In [6]:
# Coerce the registration coordinates to numeric; values that cannot be parsed
# become NaN instead of raising.
for coord_col in ("registration_lat", "registration_lng"):
    meta_data_df[coord_col] = pd.to_numeric(meta_data_df[coord_col], errors="coerce")

In [7]:
path = Path("account-sharing-export/content_activity.csv")
expected_header = ["user_id", "activity_timestamp", "ip_address", "user_agent"]

# Parsed by hand instead of with pd.read_csv because user_agent may contain
# commas; only the first three commas are treated as separators.
rows = []
with path.open("r", encoding="utf-8", errors="replace") as f:
    header = f.readline().rstrip("\n").split(",")
    assert header == expected_header, header

    # Data rows start on file line 2 (line 1 is the header).
    for line_no, line in enumerate(f, start=2):
        line = line.rstrip("\n")
        if not line:
            continue

        # split on first 3 commas only (user_agent may contain commas)
        fields = line.split(",", 3)
        if len(fields) != len(expected_header):
            # Fail with the offending location instead of an opaque
            # tuple-unpacking error.
            raise ValueError(
                f"{path}: line {line_no} has {len(fields)} field(s), "
                f"expected {len(expected_header)}: {line!r}"
            )
        rows.append(tuple(fields))

content_activity_df = pd.DataFrame(rows, columns=header)
# Unparseable timestamps become NaT rather than raising.
content_activity_df["activity_timestamp"] = pd.to_datetime(
    content_activity_df["activity_timestamp"], errors="coerce"
)

content_activity_df.head()

Unnamed: 0,user_id,activity_timestamp,ip_address,user_agent
0,fb65cf6f-bb87-4165-8397-f97353971bad,2025-11-08 22:18:47,24.12.79.10|10.3.2.78,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...
1,95b27553-ded6-46b8-a030-d2a8ded1c499,2025-08-29 20:04:49,23.234.103.241|10.3.2.45,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7...
2,c1c27c21-b61c-4051-9144-a9738477ba7f,2025-06-17 04:13:19,69.112.167.87|10.3.3.158,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...
3,54136626-f7f5-41e1-bbd4-3d2f971814c4,2025-05-30 23:20:10,76.34.105.150|10.3.2.140,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...
4,826d9a5a-4d06-4bc9-b1fa-4722965ce300,2025-09-11 05:50:53,72.89.26.28|10.3.2.16,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7...


In [8]:
def public_ip(ip_str: str) -> Optional[str]:
    """Return the first ``|``-separated component of ``ip_str``, stripped.

    In the previews in this notebook the ``ip_address`` values look like
    ``"130.65.254.15|10.3.3.254"``; this keeps the part before the first ``|``.

    Args:
        ip_str: Raw ``ip_address`` value. Missing values (``None``/NaN) are accepted.

    Returns:
        The stripped first component, or ``None`` when the input is missing or
        the first component is empty, so that a later ``dropna`` on the result
        removes such rows instead of carrying an empty string forward.
    """
    if pd.isna(ip_str):
        return None
    first_component = str(ip_str).split("|", 1)[0].strip()
    return first_component or None

# Derive a public_ip column on both event tables from their raw ip_address.
for events_df in (auth_events_df, content_activity_df):
    events_df["public_ip"] = events_df["ip_address"].apply(public_ip)

In [9]:
# Bring both event sources to a common (user_id, timestamp, public_ip, source)
# shape so they can be stacked into a single table.
auth_geo = (
    auth_events_df[["user_id", "event_timestamp", "public_ip"]]
    .rename(columns={"event_timestamp": "timestamp"})
    .assign(
        timestamp=lambda df: pd.to_datetime(df["timestamp"], errors="coerce"),
        source="auth",
    )
)

content_geo = (
    content_activity_df[["user_id", "activity_timestamp", "public_ip"]]
    .rename(columns={"activity_timestamp": "timestamp"})
    .assign(
        timestamp=lambda df: pd.to_datetime(df["timestamp"], errors="coerce"),
        source="content",
    )
)

# Stack the two sources and keep only rows where all three key fields are present.
geo_events = (
    pd.concat([auth_geo, content_geo], ignore_index=True)
    .dropna(subset=["user_id", "timestamp", "public_ip"])
)
geo_events.head()

Unnamed: 0,user_id,timestamp,public_ip,source
0,a6d983a8-133f-4287-a47d-aabb167bdd49,2025-12-09 04:31:49,130.65.254.15,auth
1,e79ea4fd-84a0-49e9-a923-74100e822801,2025-07-18 16:13:49,171.158.162.34,auth
2,8b7826d1-9011-4deb-8351-b324a0531589,2025-08-01 14:16:01,47.17.253.179,auth
3,7f620264-b87d-4c81-acf6-ff1f1375e951,2025-10-12 22:27:46,70.113.153.4,auth
4,942159f6-673d-4d8b-9ec9-5ca4276ae4f7,2025-09-16 03:19:42,68.2.160.173,auth


In [10]:
# Distinct non-null public IPs across both event sources.
has_public_ip = geo_events["public_ip"].notna()
unique_ips = geo_events.loc[has_public_ip, "public_ip"].drop_duplicates()
print("unique public IPs:", len(unique_ips))

unique public IPs: 91104


## Resolve public IPs to country codes

In [11]:
import geoip2.database
import geoip2.errors
import pandas as pd

MMDB_PATH = "./GeoLite2-Country_20260224/GeoLite2-Country.mmdb"

# Look up an ISO country code for every distinct public IP.
# The reader is opened as a context manager so the database file is closed
# once the loop finishes, even if a lookup raises unexpectedly.
ip_rows = []
with geoip2.database.Reader(MMDB_PATH) as reader:
    for ip in unique_ips:
        try:
            cc = reader.country(ip).country.iso_code
        except (geoip2.errors.AddressNotFoundError, ValueError):
            # Address not in the database, or not a parseable IP address:
            # record the country as unknown. Any other error (e.g. a wrong or
            # corrupt database file) is allowed to surface.
            cc = None
        ip_rows.append((ip, cc))

ip_geo_df = pd.DataFrame(ip_rows, columns=["public_ip", "country_code"])
ip_geo_df.head()

Unnamed: 0,public_ip,country_code
0,130.65.254.15,US
1,171.158.162.34,US
2,47.17.253.179,US
3,70.113.153.4,US
4,68.2.160.173,US


In [12]:
# ip_geo_df columns: public_ip, country_code
# ip_geo_df columns: public_ip, country_code
# Drop any country_code left over from a previous run of this cell first;
# otherwise a second run would produce country_code_x / country_code_y and the
# dropna below would fail with a KeyError.
geo_events = (
    geo_events
    .drop(columns=["country_code"], errors="ignore")
    .merge(
        ip_geo_df[["public_ip", "country_code"]],
        on="public_ip",
        how="left",
        # Each IP must map to at most one row, or events would be duplicated.
        validate="many_to_one",
    )
    # Events whose IP could not be resolved to a country are excluded.
    .dropna(subset=["country_code"])
)

## Country entropy feature

In [15]:
import numpy as np
import pandas as pd

# geo_events must have: user_id, timestamp, country_code
# The null filter and timestamp parsing are re-applied here so this cell does
# not rely on the exact state left behind by earlier cells.
geo_events = (
    geo_events
    .dropna(subset=["user_id", "timestamp", "country_code"])
    .assign(timestamp=lambda df: pd.to_datetime(df["timestamp"], errors="coerce"))
    .dropna(subset=["timestamp"])
    # --- 1) DAILY AGGREGATION (key for content_activity volume) ---
    .assign(day=lambda df: df["timestamp"].dt.date)
)

# one record per user-country-day, so a country is weighted by the number of
# days it was seen on rather than by raw event count
daily = geo_events.drop_duplicates(["user_id", "country_code", "day"]).copy()

# --- 2) COUNTRY DISTRIBUTION PER USER ---
counts = (
    daily.groupby(["user_id", "country_code"])
    .size()
    .reset_index(name="n_days")
)

# total_days is the sum of a user's country-days (a day with two countries
# counts twice), so the p values of one user sum to 1.
totals = counts.groupby("user_id")["n_days"].sum().reset_index(name="total_days")
counts = counts.merge(totals, on="user_id", how="left")

# n_days and total_days are integer group sizes >= 1, so p is already numeric
# and non-null; no further coercion or NaN filtering is needed.
counts["p"] = counts["n_days"] / counts["total_days"]

# Shannon entropy H = -sum(p log p)
counts["p_log_p"] = counts["p"] * np.log(counts["p"])
entropy = (
    counts.groupby("user_id")["p_log_p"]
    .sum()
    .mul(-1)
    # For single-country users the sum is 0.0 and negating it yields -0.0;
    # adding 0.0 normalises that to +0.0 without changing any other value.
    .add(0.0)
    .reset_index(name="geo_entropy")
)

unique = (
    counts.groupby("user_id")["country_code"]
    .nunique()
    .reset_index(name="unique_countries")
)

features = entropy.merge(unique, on="user_id", how="left")

# normalized entropy in [0,1]
# H is divided by log(k), the maximum entropy for k countries. For k == 1 the
# divisor is log(1) == 0, so that branch is replaced by 0.0.
features["geo_entropy_norm"] = np.where(
    features["unique_countries"] > 1,
    features["geo_entropy"] / np.log(features["unique_countries"]),
    0.0
)

# effective number of countries (entropy -> "equivalent evenly-used countries")
features["effective_countries"] = np.exp(features["geo_entropy"])

# --- 3) SWITCHING / BACK-AND-FORTH PATTERNS (daily sequence) ---
# Collapse to a single country per user-day: count events per
# (user, day, country), order so the largest count comes first within each
# user-day, and keep that first row.
events_per_day_country = (
    geo_events.groupby(["user_id", "day", "country_code"])
    .size()
    .reset_index(name="k")
)
day_country = (
    events_per_day_country
    .sort_values(["user_id", "day", "k"], ascending=[True, True, False])
    .drop_duplicates(["user_id", "day"])
    .sort_values(["user_id", "day"])
)

# Country on the user's previous observed day, and on the one before that.
country_by_user = day_country.groupby("user_id")["country_code"]
prev_country = country_by_user.shift(1)
prev2_country = country_by_user.shift(2)
day_country["prev"] = prev_country
day_country["prev2"] = prev2_country

# A switch is a day whose country differs from the previous observed day.
# Days with no predecessor are NaN so they are ignored by the mean below.
changed_from_prev = day_country["country_code"].ne(day_country["prev"])
day_country["is_switch"] = changed_from_prev.astype(float).where(
    day_country["prev"].notna()
)

# An alternation is an A -> B -> A pattern: same country as two days back but
# different from the previous day. Undefined (NaN) without two predecessors.
same_as_prev2 = day_country["country_code"].eq(day_country["prev2"])
day_country["is_alt"] = (same_as_prev2 & changed_from_prev).astype(float).where(
    day_country["prev2"].notna()
)

# Per-user means; NaN entries are skipped, and users with no defined entries
# get NaN here.
switch_rate = (
    day_country.groupby("user_id", as_index=False)["is_switch"]
    .mean()
    .rename(columns={"is_switch": "country_switch_rate"})
)
alt_rate = (
    day_country.groupby("user_id", as_index=False)["is_alt"]
    .mean()
    .rename(columns={"is_alt": "country_alternation_rate"})
)

features = features.merge(switch_rate, on="user_id", how="left")
features = features.merge(alt_rate, on="user_id", how="left")

# Users without enough observed days to define a rate are treated as rate 0.
features = features.fillna(
    value={"country_switch_rate": 0.0, "country_alternation_rate": 0.0}
)

features.head()

Unnamed: 0,user_id,geo_entropy,unique_countries,geo_entropy_norm,effective_countries,country_switch_rate,country_alternation_rate
0,0002211b-6e0c-4bd2-96b9-33c200b9efc4,-0.0,1,0.0,1.0,0.0,0.0
1,00022407-82a1-441a-8925-70c416026784,-0.0,1,0.0,1.0,0.0,0.0
2,0006d178-29aa-4668-bcb5-efefd319f056,-0.0,1,0.0,1.0,0.0,0.0
3,00083cfe-429c-4d7e-8f59-3b0ceee503e8,-0.0,1,0.0,1.0,0.0,0.0
4,000aee50-8ff0-4f5e-a15c-98dcbcdf52b9,-0.0,1,0.0,1.0,0.0,0.0


In [16]:
# Summary statistics of the per-user distinct-country count.
features.loc[:, "unique_countries"].describe()

count    19801.000000
mean         1.046816
std          0.274837
min          1.000000
25%          1.000000
50%          1.000000
75%          1.000000
max          9.000000
Name: unique_countries, dtype: float64

In [17]:
# Number of users for each distinct-country count, ordered by that count.
features["unique_countries"].value_counts(sort=False).sort_index()

unique_countries
1    19071
2      590
3      109
4       18
5        8
6        1
7        1
8        2
9        1
Name: count, dtype: int64

In [19]:
# Show the user(s) with the largest distinct-country count. The maximum is
# taken from the data rather than hard-coded, so the cell keeps returning rows
# when the export changes.
features[features["unique_countries"] == features["unique_countries"].max()]

Unnamed: 0,user_id,geo_entropy,unique_countries,geo_entropy_norm,effective_countries,country_switch_rate,country_alternation_rate
9707,7cbba682-9b28-40a6-8ac0-2205d15372e3,1.989435,9,0.905431,7.3114,0.439394,0.092308


In [20]:
# Pick the (first) user with the largest distinct-country count. The name
# user_9 is kept from the original run, where that maximum was 9; the lookup
# itself no longer depends on that value.
top_row = features["unique_countries"].idxmax()
user_9 = features.loc[top_row, "user_id"]

# Event count per country for that user, most frequent first.
(
    geo_events.loc[geo_events["user_id"] == user_9]
    .groupby("country_code")
    .size()
    .sort_values(ascending=False)
)

country_code
ES    159
GB    101
CA     88
ZA     21
MX     17
US     11
DK      9
MU      4
AE      2
dtype: int64

In [21]:
# Weighted blend of three signals (weights 0.5 / 0.3 / 0.2 sum to 1):
# normalised entropy, distinct-country count relative to the largest count in
# this dataset, and the alternation rate.
entropy_term = 0.5 * features["geo_entropy_norm"]
breadth_term = 0.3 * (features["unique_countries"] / features["unique_countries"].max())
alternation_term = 0.2 * features["country_alternation_rate"]
features["geo_score"] = entropy_term + breadth_term + alternation_term

features.sort_values("geo_score", ascending=False).head(20)

Unnamed: 0,user_id,geo_entropy,unique_countries,geo_entropy_norm,effective_countries,country_switch_rate,country_alternation_rate,geo_score
9707,7cbba682-9b28-40a6-8ac0-2205d15372e3,1.989435,9,0.905431,7.3114,0.439394,0.092308,0.771177
213,027e0acb-df57-480f-be74-458303109224,0.693147,2,1.0,2.0,1.0,1.0,0.766667
4761,3d5acbc4-5328-432b-b0ca-ca810814e553,0.636514,2,0.918296,1.889882,1.0,1.0,0.725815
12665,a345701d-702c-4183-9c14-bd50f37748d0,0.636514,2,0.918296,1.889882,1.0,1.0,0.725815
8014,67ae46d9-5642-4497-967d-ae8c77a3bc9b,0.636514,2,0.918296,1.889882,1.0,1.0,0.725815
2631,214836df-6cb7-46a3-979b-e5290eb4bb5f,1.53253,6,0.855321,4.629876,0.647059,0.3125,0.690161
9801,7dd2d410-d66b-4fd2-83c5-7572f878df9f,0.636514,2,0.918296,1.889882,0.8,0.75,0.675815
3345,2acc087b-f777-4f84-899f-36e2db381f58,0.682908,2,0.985228,1.979626,0.666667,0.5,0.659281
4096,34c91d7e-598e-4074-99b8-b761793eee94,1.608853,8,0.773695,4.997075,0.190476,0.0,0.653514
11375,927ffdc4-9e30-4fc0-90ea-1762c6791b66,0.9557,3,0.869916,2.60049,0.666667,0.5,0.634958


In [18]:
# Top 20 users by normalised country entropy.
features.sort_values(by="geo_entropy_norm", ascending=False).iloc[:20]

Unnamed: 0,user_id,geo_entropy,unique_countries,geo_entropy_norm,effective_countries,country_switch_rate,country_alternation_rate
14932,c0afa16d-d0d0-4ab2-a5f0-e34701aff93c,0.693147,2,1.0,2.0,0.333333,0.0
5572,481a62ae-9e8d-417a-82d2-2c0b3fcc8d53,0.693147,2,1.0,2.0,0.266667,0.071429
17442,e22fa3f9-ef70-4c61-bc39-abb3ad9f4028,0.693147,2,1.0,2.0,0.5,0.0
5215,436269dc-00ce-4a45-bbb7-10cbb1826c2b,0.693147,2,1.0,2.0,1.0,0.0
15627,c9bbbf01-4832-45ec-8f39-dc3eeb3c394c,0.693147,2,1.0,2.0,1.0,0.0
16657,d7a8126c-7818-4f47-ba50-fbec4e539733,0.693147,2,1.0,2.0,1.0,0.0
18229,ec4f0ad2-a428-4ecc-8f48-9c9c8e3e4a8a,0.693147,2,1.0,2.0,0.0,0.0
7803,64d82630-e9e0-45d0-9f95-e744917bfbe0,0.693147,2,1.0,2.0,1.0,0.0
6483,541a1a60-7207-46ad-9e4b-8929c9895473,0.693147,2,1.0,2.0,0.333333,0.0
483,05fb74d0-b7d8-4a8e-83f5-7cece8a29790,0.693147,2,1.0,2.0,1.0,0.0


In [25]:
# A user is flagged when any one of three rules holds.
n_countries = features["unique_countries"]

# Rule 1: four or more distinct countries.
many_countries = n_countries >= 4

# Rule 2: three or more countries with normalised entropy of at least 0.7.
several_evenly_used = (n_countries >= 3) & (features["geo_entropy_norm"] >= 0.7)

# Rule 3: two or more countries with a day-to-day switch rate of at least 0.7.
frequent_switcher = (features["country_switch_rate"] >= 0.7) & (n_countries >= 2)

features["geo_flag"] = many_countries | several_evenly_used | frequent_switcher

In [26]:
# Fraction of users flagged (mean of the boolean flag column).
features.loc[:, "geo_flag"].mean()

np.float64(0.004646229988384425)

In [27]:
# Columns kept in the flagged-user report, in display order.
report_columns = [
    "user_id",
    "unique_countries",
    "geo_entropy_norm",
    "effective_countries",
    "country_switch_rate",
    "country_alternation_rate",
]

# Flagged users only, highest normalised entropy first, with a fresh 0..n-1 index.
flagged_users = (
    features.loc[features["geo_flag"], report_columns]
    .sort_values("geo_entropy_norm", ascending=False)
    .reset_index(drop=True)
)

flagged_users.head(20)

Unnamed: 0,user_id,unique_countries,geo_entropy_norm,effective_countries,country_switch_rate,country_alternation_rate
0,024af674-2b8a-4a55-bc03-fd0e1b1ad6ab,2,1.0,2.0,1.0,0.0
1,3f059fd8-1a1a-4bef-b24c-931f582bd2bb,2,1.0,2.0,1.0,0.0
2,fb070549-3460-42c1-8d33-9e6bf30dbed5,2,1.0,2.0,1.0,0.0
3,f85cc520-8a9a-4646-90a3-a49ccb0a41d6,2,1.0,2.0,1.0,0.0
4,e805185e-f0dd-4956-8021-b5a1af62d8d1,2,1.0,2.0,1.0,0.0
5,e2472d10-d354-4cbc-8dbb-7b0812b976c3,2,1.0,2.0,1.0,0.0
6,d7a8126c-7818-4f47-ba50-fbec4e539733,2,1.0,2.0,1.0,0.0
7,c9bbbf01-4832-45ec-8f39-dc3eeb3c394c,2,1.0,2.0,1.0,0.0
8,bfb0cac7-e64b-4376-a1bc-5342ce1c427d,2,1.0,2.0,1.0,0.0
9,a32f77d6-2a21-4b8f-8d54-1a0289137dad,2,1.0,2.0,1.0,0.0


In [28]:
# Report how many users were flagged and what share of all scored users that is.
n_flagged = len(flagged_users)
pct_flagged = round(n_flagged / len(features) * 100, 2)
print("Flagged users:", n_flagged)
print("Percent of total:", pct_flagged, "%")

Flagged users: 92
Percent of total: 0.46 %


In [29]:
# Write the flagged-user report to the working directory, without the index column.
flagged_users.to_csv(path_or_buf="flagged_geo_entropy_users.csv", index=False)