In [1]:
import pandas as pd
import numpy as np

# Read the processed customer table (v2) into the working frame used by every later cell.
df = pd.read_csv(
    filepath_or_buffer='../data/processed/processed_customer_data_v2.csv',
)

In [2]:
# Overview of the loaded frame: column dtypes / non-null counts, then the first five rows.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
df.info()
print(df.head(5))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 19 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   customer_id          40000 non-null  object 
 1   signup_date          40000 non-null  object 
 2   last_active_date     40000 non-null  object 
 3   subscription_plan    40000 non-null  object 
 4   monthly_spend        40000 non-null  float64
 5   support_tickets      40000 non-null  int64  
 6   logins_last_30_days  40000 non-null  int64  
 7   feature_usage_score  40000 non-null  float64
 8   industry             40000 non-null  object 
 9   team_size            40000 non-null  int64  
 10  is_churned           40000 non-null  bool   
 11  recency_days         40000 non-null  int64  
 12  frequency_score      40000 non-null  float64
 13  months_active        40000 non-null  int64  
 14  ltv_amount           40000 non-null  float64
 15  frequency_scaled     40000 non-null 

In [3]:
# Customer segmentation.
#
# Every customer receives exactly one base segment, which is then prefixed with
# the plan tier ('Enterprise - ' or 'Self-Serve - ').
#
# The rules below are listed from highest to lowest priority and np.select
# keeps the first rule that matches each row, so precedence is stated once and
# explicitly instead of depending on which overwrite happened to run last.
# Rows matching no rule keep the 'Unclassified' label.

# Thresholds referenced by the rules.
POWER_USER_MIN_ENGAGEMENT = 0.75      # strictly above -> "high" engagement for Power User
AT_RISK_MIN_ENGAGEMENT = 0.6          # strictly above -> engaged enough to be worth saving
SLEEPING_GIANT_MAX_ENGAGEMENT = 0.5   # strictly below -> low engagement
NURTURE_MIN_ENGAGEMENT = 0.3          # strictly above -> some engagement
RECENT_DAYS = 30                      # recency below this counts as recently active
AT_RISK_MAX_DAYS = 120                # upper bound (exclusive) of the At Risk window
NURTURE_MAX_DAYS = 60                 # upper bound (exclusive) for Nurture and Grow
NEW_USER_MAX_MONTHS = 2               # months_active at or below this is a New User
HIGH_LTV_QUANTILE = 0.75              # LTV above this quantile counts as high value

engagement = df['engagement_score']
recency = df['recency_days']
# Computed once up front: the LTV value at the chosen quantile of the whole frame.
high_ltv_cutoff = df['ltv_amount'].quantile(HIGH_LTV_QUANTILE)

# (label, condition) pairs in priority order -- first match wins.
segment_rules = [
    # Already Churned: taken directly from the existing column (bool dtype per
    # the df.info() output), so no comparison against True is needed.
    ('Already Churned', df['is_churned']),
    # New Users: active for at most two months.
    ('New User', df['months_active'] <= NEW_USER_MAX_MONTHS),
    # Sleeping Giants: high monetary value but low engagement.
    (
        'Sleeping Giant',
        (engagement < SLEEPING_GIANT_MAX_ENGAGEMENT) & (df['ltv_amount'] > high_ltv_cutoff),
    ),
    # At Risk: engaged, but last activity was 30 to <120 days ago.
    (
        'At Risk',
        (engagement > AT_RISK_MIN_ENGAGEMENT)
        & (recency >= RECENT_DAYS)
        & (recency < AT_RISK_MAX_DAYS),
    ),
    # Power Users: high engagement and recently active.
    (
        'Power User',
        (engagement > POWER_USER_MIN_ENGAGEMENT) & (recency < RECENT_DAYS),
    ),
    # Nurture and Grow: lowest priority, so it only reaches rows that no rule
    # above claimed -- some engagement and active within the last 60 days.
    (
        'Nurture and Grow',
        (engagement > NURTURE_MIN_ENGAGEMENT) & (recency < NURTURE_MAX_DAYS),
    ),
]

df['segment'] = np.select(
    [condition for _, condition in segment_rules],
    [label for label, _ in segment_rules],
    default='Unclassified',
)

# Enterprise vs Self-Serve: prefix the base segment with the plan tier.
# Any plan other than 'Enterprise' is reported as Self-Serve.
df['segment'] = np.where(
    df['subscription_plan'] == 'Enterprise',
    'Enterprise - ' + df['segment'],
    'Self-Serve - ' + df['segment'],
)

In [4]:
# Spot-check a few assigned labels, then count customers per segment.
# - random_state pins the sample so a fresh Restart & Run All shows the same rows.
# - n is capped at the frame length so a small frame does not raise.
# - The heading is printed on its own: passing it and the Series to a single
#   print() inserts a separator space that misaligns the first sampled row.
print("Segment")
print(df['segment'].sample(n=min(10, len(df)), random_state=42))
print(df['segment'].value_counts())


Segment
 33339    Self-Serve - Nurture and Grow
12496    Self-Serve - Nurture and Grow
2208     Self-Serve - Nurture and Grow
4228     Self-Serve - Nurture and Grow
13047    Self-Serve - Nurture and Grow
28953    Self-Serve - Nurture and Grow
21148     Enterprise - Already Churned
26030    Self-Serve - Nurture and Grow
31333    Self-Serve - Nurture and Grow
28396     Self-Serve - Already Churned
Name: segment, dtype: object
segment
Self-Serve - Nurture and Grow    25436
Self-Serve - Already Churned      6885
Enterprise - Nurture and Grow     4400
Enterprise - Already Churned      1156
Self-Serve - New User              785
Self-Serve - At Risk               504
Self-Serve - Power User            351
Enterprise - New User              146
Enterprise - Sleeping Giant        116
Enterprise - At Risk                85
Self-Serve - Sleeping Giant         78
Enterprise - Power User             58
Name: count, dtype: int64
