<a href="https://colab.research.google.com/github/inbalv/tictactoe/blob/master/churn_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. User Demographics and Acquisition Data
User Demographics:
• Age, gender, location, language, and other available profile information.

# Acquisition Details:
• Acquisition channel (e.g., organic, paid, referral), campaign identifier, signup date/time, and initial user segment.

# 2. Behavioral Metrics
Usage Frequency:
• Number of sessions per day/week, average session frequency over a set period.

# Session Duration:
• Average session length and distribution of session durations.

# Recency:
• Time elapsed since the last session or interaction.

# Engagement Patterns:
• Time between sessions, consistency of usage (e.g., variance in daily or weekly activity).

# 3. In-App Engagement Features
Feature-Specific Interactions:
• Counts of interactions with key app features (e.g., tutorial completions, level-ups, social interactions, or game-specific actions).

# Event Metrics:
• Number of specific in-app events (e.g., clicks, swipes, achievements) that correlate with engagement.

# Conversion Events:
• Occurrence of meaningful actions (e.g., in-app purchases, ad clicks) if applicable.

# 4. Device and Technical Attributes
Device Information:
• Device type (mobile, tablet), operating system, app version, and hardware specifications.

# Technical Performance:
• App load times, crash logs, and error frequencies that might affect user experience.

# 5. Temporal and Contextual Features
Time-Related Patterns:
• Day of week or time of day usage trends, seasonality effects, and changes in user behavior around specific events or promotions.

# Environmental Context:
• External factors like network conditions or concurrent marketing campaigns.

# 6. Additional Behavioral and Social Indicators
Social Interactions:
• In-app messaging, friend referrals, or interactions with community features.

# Support Engagement:
• Frequency of customer support contacts or reported issues, which can be an early indicator of dissatisfaction.





features list :

age, location, language, country, device_type (phone, tablet), os, device_pos (vertical or horizontal), is_cc_inapp, acquisition_type (organic, referral, paid), campaign_id, sign_up_date,
n_sessions_per_day, n_sessions_week, average_session_length, std_session_durations, time_since_last_session, social_interactions, playing_with_friends, playing_with_close_friends, friend_leaving, average_time_between_weekly_sessions, total_in_app_purchases, clicks_events, seasonality (morning, evenings, weekends), n_tutorial_completions, level_in_game, special_points, special_gifts, achievements, app_rank, app_review.


In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def label_churn(df: pd.DataFrame, last_active_col: str, threshold_days: int = 30, ref_date: str = None) -> pd.DataFrame:
    """
    Label users as churned when their inactivity exceeds a threshold.

    The input DataFrame is never modified; all work happens on a copy.

    Parameters:
        df (pd.DataFrame): DataFrame containing user activity data.
        last_active_col (str): Name of the column with last active dates
            (anything ``pd.to_datetime`` can parse).
        threshold_days (int, optional): Number of days without activity after which
            a user is considered churned. Defaults to 30.
        ref_date (str, optional): Reference date in 'YYYY-MM-DD' format. If None,
            the max date in last_active_col is used.

    Returns:
        pd.DataFrame: A copy of the DataFrame in which last_active_col is converted
                      to datetime and two columns are added:
                      - 'days_since_last_active': Whole days between the reference
                        date and the user's last activity.
                      - 'churn': Binary flag (1 for churned, 0 for active). A user is
                        churned only when days_since_last_active is strictly greater
                        than threshold_days.
    """
    # Work on a copy so the caller's DataFrame keeps its original columns and dtypes
    labeled = df.copy()

    # Ensure the date column is in datetime format
    labeled[last_active_col] = pd.to_datetime(labeled[last_active_col])

    # Determine the reference date (defaults to the most recent date in the data)
    if ref_date is None:
        ref_date_dt = labeled[last_active_col].max()
    else:
        ref_date_dt = pd.to_datetime(ref_date)

    # Calculate days since last active
    labeled['days_since_last_active'] = (ref_date_dt - labeled[last_active_col]).dt.days

    # Label churn: 1 if the days since last active exceed the threshold, otherwise 0
    labeled['churn'] = np.where(labeled['days_since_last_active'] > threshold_days, 1, 0)

    return labeled

def evaluate_churn_model(y_true, y_pred, y_prob):
    """
    Score a churn classifier with the standard binary-classification metrics.

    Parameters:
        y_true (array-like): True binary labels (0 for active, 1 for churned).
        y_pred (array-like): Predicted binary labels.
        y_prob (array-like): Predicted probabilities for the positive class (churn).

    Returns:
        dict: Metric name -> value, with the keys (in this order)
              'accuracy', 'precision', 'recall', 'f1_score', 'roc_auc'.
    """
    # Metrics that compare the hard 0/1 predictions against the true labels
    label_scorers = (
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1_score', f1_score),
    )
    results = {key: scorer(y_true, y_pred) for key, scorer in label_scorers}

    # ROC AUC is the only metric computed from the predicted probabilities
    results['roc_auc'] = roc_auc_score(y_true, y_prob)
    return results

# Example usage:
if __name__ == "__main__":
    # Four demo users, each with the date of their most recent activity
    data = {
        'user_id': [1, 2, 3, 4],
        'last_active_date': ['2025-03-01', '2025-03-15', '2025-04-01', '2025-02-28'],
    }
    df_users = pd.DataFrame(data)

    # Flag users inactive for more than 30 days as of 2025-04-15
    df_labeled = label_churn(
        df_users,
        last_active_col='last_active_date',
        threshold_days=30,
        ref_date='2025-04-15',
    )
    print(df_labeled)

    # Hand-written labels and model outputs, used only to demonstrate the metrics call
    y_true = [0, 0, 0, 1]  # Actual labels
    y_pred = [0, 1, 0, 1]  # Predicted labels
    y_prob = [0.2, 0.7, 0.1, 0.9]  # Predicted probabilities for churn

    # Compute and display the evaluation metrics
    metrics = evaluate_churn_model(y_true, y_pred, y_prob)
    print(metrics)


In [None]:
1. User Demographics and Acquisition Data
User Demographics:
• Age, gender, location, language, and other available profile information.

Acquisition Details:
• Acquisition channel (e.g., organic, paid, referral), campaign identifier, signup date/time, and initial user segment.

2. Behavioral Metrics
Usage Frequency:
• Number of sessions per day/week, average session frequency over a set period.

Session Duration:
• Average session length and distribution of session durations.

Recency:
• Time elapsed since the last session or interaction.

Engagement Patterns:
• Time between sessions, consistency of usage (e.g., variance in daily or weekly activity).

3. In-App Engagement Features
Feature-Specific Interactions:
• Counts of interactions with key app features (e.g., tutorial completions, level-ups, social interactions, or game-specific actions).

Event Metrics:
• Number of specific in-app events (e.g., clicks, swipes, achievements) that correlate with engagement.

Conversion Events:
• Occurrence of meaningful actions (e.g., in-app purchases, ad clicks) if applicable.

4. Device and Technical Attributes
Device Information:
• Device type (mobile, tablet), operating system, app version, and hardware specifications.

Technical Performance:
• App load times, crash logs, and error frequencies that might affect user experience.

5. Temporal and Contextual Features
Time-Related Patterns:
• Day of week or time of day usage trends, seasonality effects, and changes in user behavior around specific events or promotions.

Environmental Context:
• External factors like network conditions or concurrent marketing campaigns.

6. Additional Behavioral and Social Indicators
Social Interactions:
• In-app messaging, friend referrals, or interactions with community features.

Support Engagement:
• Frequency of customer support contacts or reported issues, which can be an early indicator of dissatisfaction.

In [None]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

def count_events_before_session(df, user_col="user_id", time_col="event_time", event_type_col="event_type",
                                session_value="session_start"):
    """
    For each event, computes how many of the same user's events precede it (ordered by
    time_col). Then, for session start events, exposes that number as the count of events
    that occurred before the session.

    Parameters:
        df (DataFrame): Input Spark DataFrame with event records.
        user_col (str): Column name representing the user. Defaults to "user_id".
        time_col (str): Column name containing the event timestamp. Defaults to "event_time".
        event_type_col (str): Column name representing the event type. Defaults to "event_type".
        session_value (str): The value in event_type_col that indicates a session start. Defaults to "session_start".

    Returns:
        DataFrame: The original DataFrame with two additional columns:
                   - 'cum_count': Running count of the user's events up to the current row,
                     minus one (i.e. excluding the current event itself).
                   - 'events_before_session': For session start events, the value of 'cum_count'.
                     For non-session events, this column will be null.
    """
    # Window over all of a user's events (every event type) in chronological order.
    # Partitioning by event type as well would only count earlier events of the same type.
    window_spec = Window.partitionBy(user_col).orderBy(F.col(time_col))

    # count(*) over an ordered window includes the current row, so subtract 1 to get
    # the number of other events seen so far.
    # NOTE(review): with Spark's default frame for ordered windows, events that share the
    # exact same timestamp are counted together — confirm this is acceptable.
    df_with_cum = df.withColumn("cum_count", F.count("*").over(window_spec) - 1)

    # Only session-start rows carry the value; F.when without otherwise() yields null elsewhere
    df_with_events_before = df_with_cum.withColumn(
        "events_before_session",
        F.when(F.col(event_type_col) == session_value, F.col("cum_count")),
    )

    return df_with_events_before




In [None]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.window import Window
from functools import reduce
from pyspark.sql import DataFrame

# Obtain the Spark session used by the rest of this cell (reuses one if already running)
spark = (
    SparkSession.builder
    .appName("FeatureEngineering")
    .getOrCreate()
)

# -----------------------------------------------------------------------------
# Example: Base User Data (Demographics, Device, Acquisition, etc.)
# -----------------------------------------------------------------------------
# Column names, in the same order as the values in each sample row below
columns_users = [
    "user_id", "age", "location", "language", "country", "device_type", "os", "device_pos",
    "is_cc_inapp", "acquisition_type", "campaign_id", "sign_up_date", "app_rank", "app_review",
]
data_users = [
    (1, 25, "New York", "English", "USA", "phone", "iOS", "vertical", True, "organic", "camp1", "2021-01-01", 10, 4.5),
    (2, 30, "Los Angeles", "English", "USA", "tablet", "Android", "horizontal", False, "paid", "camp2", "2021-02-01", 8, 4.2),
]

# Build the users DataFrame and parse the sign-up date string into a date column
df_users = spark.createDataFrame(data_users, columns_users).withColumn(
    "sign_up_date", F.to_date(F.col("sign_up_date"), "yyyy-MM-dd")
)

# -----------------------------------------------------------------------------
# Example: Raw Event Data (Session, Clicks, Purchases, Social, etc.)
# -----------------------------------------------------------------------------
# For simplicity, we assume that the events dataframe contains a column 'event_time' (timestamp),
# 'event_type' (string identifying the type), and additional columns when needed.
data_events = [
    # user_id, event_time, event_type, session_start_time, session_end_time, points, level
    (1, "2021-03-01 10:00:00", "session_start", "2021-03-01 10:00:00", "2021-03-01 10:30:00", None, None),
    (1, "2021-03-01 10:05:00", "click", None, None, None, None),
    (1, "2021-03-01 11:00:00", "session_start", "2021-03-01 11:00:00", "2021-03-01 11:25:00", None, None),
    (1, "2021-03-01 11:30:00", "social_interaction", None, None, None, None),
    (1, "2021-03-01 12:00:00", "tutorial_completion", None, None, None, None),
    (2, "2021-03-02 09:00:00", "session_start", "2021-03-02 09:00:00", "2021-03-02 09:45:00", None, None),
    (2, "2021-03-02 09:50:00", "in_app_purchase", None, None, None, None),
    (2, "2021-03-02 10:15:00", "session_start", "2021-03-02 10:15:00", "2021-03-02 10:35:00", None, None),
    (2, "2021-03-02 10:40:00", "tutorial_completion", None, None, None, None)
]
columns_events = ["user_id", "event_time", "event_type", "session_start_time", "session_end_time", "points", "level"]

# 'points' and 'level' are None on every sample row, so Spark cannot infer their types
# from the data and createDataFrame fails when given only column names. Declare the
# types explicitly instead; the fields are listed in the same order as columns_events.
events_schema = (
    "user_id: bigint, event_time: string, event_type: string, "
    "session_start_time: string, session_end_time: string, "
    "points: double, level: int"
)
df_events = spark.createDataFrame(data_events, schema=events_schema)

# Parse the three timestamp strings into real timestamp columns
df_events = df_events.withColumn("event_time", F.to_timestamp("event_time", "yyyy-MM-dd HH:mm:ss"))
df_events = df_events.withColumn("session_start_time", F.to_timestamp("session_start_time", "yyyy-MM-dd HH:mm:ss"))
df_events = df_events.withColumn("session_end_time", F.to_timestamp("session_end_time", "yyyy-MM-dd HH:mm:ss"))

# -----------------------------------------------------------------------------
# Feature Engineering: Event-Based Aggregations
# -----------------------------------------------------------------------------

# (A) Sessions-Related Features ------------------------------------------------

# Filter session start events (one row per session)
df_sessions = df_events.filter(F.col("event_type") == "session_start")
df_sessions = df_sessions.withColumn("session_date", F.to_date("event_time"))
df_sessions = df_sessions.withColumn("session_week", F.weekofyear("session_date"))
# The week number alone repeats every year, so weekly grouping uses the start of the
# week the session falls in; this keeps the same week number in different years apart.
df_sessions = df_sessions.withColumn("session_week_start", F.date_trunc("week", F.col("event_time")))

# 1. Average sessions per day
#    (averaged over days on which the user had at least one session)
sessions_per_day = df_sessions.groupBy("user_id", "session_date") \
                              .agg(F.count("*").alias("sessions_count"))
avg_sessions_per_day = sessions_per_day.groupBy("user_id") \
                                       .agg(F.avg("sessions_count").alias("n_sessions_per_day"))

# 2. Average sessions per week
#    (averaged over weeks in which the user had at least one session)
sessions_per_week = df_sessions.groupBy("user_id", "session_week_start") \
                               .agg(F.count("*").alias("sessions_count"))
avg_sessions_per_week = sessions_per_week.groupBy("user_id") \
                                         .agg(F.avg("sessions_count").alias("n_sessions_week"))

# 3. Average and standard deviation of session lengths (in seconds)
df_sessions = df_sessions.withColumn("session_length",
                                     F.unix_timestamp("session_end_time") - F.unix_timestamp("session_start_time"))
session_length_stats = df_sessions.groupBy("user_id") \
                                  .agg(F.avg("session_length").alias("average_session_length"),
                                       F.stddev("session_length").alias("std_session_durations"))

# 4. Time since last session (in seconds)
#    Measured against the time the job runs, so the value changes from run to run.
last_session = df_sessions.groupBy("user_id") \
                          .agg(F.max("event_time").alias("last_session_time"))
last_session = last_session.withColumn("time_since_last_session",
                                       F.unix_timestamp(F.current_timestamp()) - F.unix_timestamp("last_session_time"))

# 5. Average time between sessions per user (in seconds)
#    lag() gives each session the start time of the user's previous session; the first
#    session has no predecessor, so its time_diff is null.
window_spec = Window.partitionBy("user_id").orderBy("event_time")
df_sessions = df_sessions.withColumn("prev_session_time", F.lag("event_time").over(window_spec))
df_sessions = df_sessions.withColumn("time_diff",
                                     F.unix_timestamp("event_time") - F.unix_timestamp("prev_session_time"))
avg_time_between_sessions = df_sessions.groupBy("user_id") \
                                       .agg(F.avg("time_diff").alias("average_time_between_weekly_sessions"))
# (B) Other Event-Based Features -----------------------------------------------

def _count_events_of_type(events: DataFrame, event_type: str, feature_name: str) -> DataFrame:
    """Return one row per user with the number of *events* rows whose event_type equals *event_type*."""
    matching = events.filter(F.col("event_type") == event_type)
    return matching.groupBy("user_id").agg(F.count("*").alias(feature_name))


# 6. Social interactions
social_interactions = _count_events_of_type(df_events, "social_interaction", "social_interactions")

# 7. Playing with friends (assuming event_type indicates such events)
playing_with_friends = _count_events_of_type(df_events, "playing_with_friends", "playing_with_friends")

# 8. Playing with close friends
playing_with_close_friends = _count_events_of_type(
    df_events, "playing_with_close_friends", "playing_with_close_friends"
)

# 9. Friend leaving events
friend_leaving = _count_events_of_type(df_events, "friend_leaving", "friend_leaving")

# 10. Total in-app purchases (number of purchase events)
total_in_app_purchases = _count_events_of_type(df_events, "in_app_purchase", "total_in_app_purchases")

# 11. Click events
clicks_events = _count_events_of_type(df_events, "click", "clicks_events")

# 12. Seasonality features: counts of sessions during morning, evening, and weekends.
df_sessions = df_sessions.withColumn("hour", F.hour("event_time"))
df_sessions = df_sessions.withColumn("day_of_week", F.date_format("event_time", "E"))

# Morning = hours 05..11, evening = hours 18..23 (between() is inclusive on both ends)
is_morning = F.col("hour").between(5, 11)
is_evening = F.col("hour").between(18, 23)
is_weekend = F.col("day_of_week").isin("Sat", "Sun")

morning_sessions = df_sessions.filter(is_morning) \
                              .groupBy("user_id").agg(F.count("*").alias("morning_sessions"))
evening_sessions = df_sessions.filter(is_evening) \
                              .groupBy("user_id").agg(F.count("*").alias("evening_sessions"))
weekend_sessions = df_sessions.filter(is_weekend) \
                              .groupBy("user_id").agg(F.count("*").alias("weekend_sessions"))

# 13. Number of tutorial completions
tutorial_completions = _count_events_of_type(df_events, "tutorial_completion", "n_tutorial_completions")

# 14. Level in game (e.g., maximum level reached via level_up events)
level_up_events = df_events.filter(F.col("event_type") == "level_up")
level_in_game = level_up_events.groupBy("user_id").agg(F.max("level").alias("level_in_game"))

# 15. Special points (sum over events carrying a 'points' value for special_points events)
special_points_events = df_events.filter(F.col("event_type") == "special_points")
special_points = special_points_events.groupBy("user_id").agg(F.sum("points").alias("special_points"))

# 16. Special gifts count
special_gifts = _count_events_of_type(df_events, "special_gift", "special_gifts")

# 17. Achievements count
achievements = _count_events_of_type(df_events, "achievement", "achievements")

# -----------------------------------------------------------------------------
# Join All Features Together
# -----------------------------------------------------------------------------

# List of DataFrames to join on 'user_id'.
# df_users comes first so that every user is kept by the left joins below.
dfs = [
    df_users,
    avg_sessions_per_day,
    avg_sessions_per_week,
    session_length_stats,
    last_session.select("user_id", "time_since_last_session"),
    social_interactions,
    playing_with_friends,
    playing_with_close_friends,
    friend_leaving,
    avg_time_between_sessions,
    total_in_app_purchases,
    clicks_events,
    morning_sessions,
    evening_sessions,
    weekend_sessions,
    tutorial_completions,
    level_in_game,
    special_points,
    special_gifts,
    achievements
]

def join_dfs(df1: DataFrame, df2: DataFrame) -> DataFrame:
    """Left-join df2 onto df1 by 'user_id', keeping every row of df1."""
    return df1.join(df2, on="user_id", how="left")

# Fold the list into a single wide DataFrame, one left join at a time
final_features = reduce(join_dfs, dfs)

# A user with no events of a given type has no row in that feature table, so the left
# join leaves NULL there. For pure event counts the correct value is 0, not "unknown".
# Averages, maxima and sums are left as NULL on purpose.
count_feature_cols = [
    "social_interactions",
    "playing_with_friends",
    "playing_with_close_friends",
    "friend_leaving",
    "total_in_app_purchases",
    "clicks_events",
    "morning_sessions",
    "evening_sessions",
    "weekend_sessions",
    "n_tutorial_completions",
    "special_gifts",
    "achievements",
]
final_features = final_features.fillna(0, subset=count_feature_cols)

# Display the final features DataFrame
final_features.orderBy("user_id").show(truncate=False)
