# 📊 Retention Analysis

This notebook performs data cleaning and cohort-based retention analysis using simulated mobile app usage data.

## 📁 Step 1: Load Raw Data

In [4]:
import pandas as pd

# Load the raw event log.
# The path is relative to the project root (the directory that contains
# `data/`) rather than an absolute path inside one user's home directory,
# so the notebook can be re-run on any machine or checkout.
# NOTE(review): assumes the kernel's working directory is the project root,
# which the relative `data/` output path used later in this notebook also
# relies on — confirm.
RAW_DATA_PATH = "data/raw_user_events.csv"

raw_df = pd.read_csv(RAW_DATA_PATH)
raw_df.head()

Unnamed: 0,user_id,install_date,event_date,event_type
0,U00001,2024-02-21,2024-03-04,app_open
1,U00001,2024-02-21,2024-02-28,app_open
2,U00001,2024-02-21,2024-03-14,app_open
3,U00002,2024-01-15,2024-01-28,app_open
4,U00003,2024-03-12,2024-03-30,app_open


## 🧹 Step 2: Data Cleaning & Transformation

### 🔸 2.1 Check the structure of the raw data

In [None]:
# Column names, dtypes and non-null counts; info() prints its report directly.
raw_df.info()

# Only the last expression of a cell is rendered automatically, so the
# summary statistics have to be displayed explicitly or they are silently
# discarded. `display` is provided by the IPython kernel.
display(raw_df.describe(include='all'))

# First rows, rendered as the cell's output.
raw_df.head()

### 🔸 2.2 Check for missing or duplicated data

In [None]:
# Count missing values per column.
print("Missing values:\n", raw_df.isnull().sum())

# The frame carries an "Unnamed: 0" column that looks like a row index saved
# into the CSV (it differs on every previewed row). Comparing it would make
# every row look distinct, so the duplicate count would always be 0.
# Drop it for the comparison only; errors="ignore" keeps the cell working
# if the column is absent.
event_rows = raw_df.drop(columns="Unnamed: 0", errors="ignore")
print("Duplicates:", event_rows.duplicated().sum())

### 🔸 2.3 Convert date columns to datetime

In [None]:
# Run both date columns through pd.to_datetime, overwriting them in place on raw_df.
for date_col in ('install_date', 'event_date'):
    raw_df[date_col] = pd.to_datetime(raw_df[date_col])

### 🔸 2.4 Sanity check: No event before install

In [None]:
# Select rows whose event is dated earlier than the install and report how
# many there are. This cell only counts them; it does not remove them.
invalid_events = raw_df.loc[raw_df['event_date'].lt(raw_df['install_date'])]
print(f"Invalid events found: {len(invalid_events)}")

### 🔸 2.5 Add `cohort_date`

In [None]:
# The cohort key is a straight copy of the install date (one cohort per install day).
raw_df['cohort_date'] = raw_df.loc[:, 'install_date']

### 🔸 2.6 Calculate `days_since_install`

In [None]:
# Whole days elapsed between install and event (event_date minus install_date).
# The `.dt.days` accessor needs both columns to be datetime-typed already.
raw_df['days_since_install'] = raw_df['event_date'].sub(raw_df['install_date']).dt.days

### 🔸 2.7 Sort data by user and event date

In [None]:
# Order the events by user, then by event date within each user.
# sort_values returns a new frame, so raw_df keeps its original row order.
cleaned_df = raw_df.sort_values(['user_id', 'event_date'], ascending=True)
cleaned_df.head(5)

### 🔸 2.8 Save cleaned data

In [None]:
# Write the sorted frame to CSV; index=False leaves the DataFrame index out
# of the file.
cleaned_df.to_csv(path_or_buf='data/cleaned_retention_data.csv', index=False)
print("Cleaned data saved.")