# Importing dependencies

In [60]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
import numpy as np 
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV


In [61]:
data = pd.read_csv('data.csv')
data.head()

Unnamed: 0,user_id,date,vertical,turnover_cash_num,turnover_cash_sum,deposit_approved_num,deposit_approved_sum,withdrawal_approved_num,withdrawal_approved_sum,NGR_sum,session_num,session_sum
0,-4233848618748012971,2020-06-19,sports,4,41.662354,0,0.0,0,0.0,59.875489,0,0.0
1,2814934526452133975,2020-09-14,casino_live,123,847.111687,0,0.0,0,0.0,81.68577,0,0.0
2,3718332242005503191,2020-09-06,casino_live,399,21111.275179,0,0.0,0,0.0,2731.476087,0,0.0
3,-5271561423861853461,2020-09-06,casino_live,60,825.512773,0,0.0,0,0.0,-364.196812,0,0.0
4,-3697770787649913393,2020-07-25,casino_live,13,105.128915,0,0.0,0,0.0,105.128915,0,0.0


In [62]:
data.describe()

Unnamed: 0,user_id,turnover_cash_num,turnover_cash_sum,deposit_approved_num,deposit_approved_sum,withdrawal_approved_num,withdrawal_approved_sum,NGR_sum,session_num,session_sum
count,158216.0,158216.0,158216.0,158216.0,158216.0,158216.0,158216.0,158216.0,158216.0,158216.0
mean,1.503073e+17,517.18349,696.5408,3.052845,110.030905,0.588164,89.435863,26.190897,4.512578,8461.784323
std,5.183782e+18,1425.576709,7700.157,6.477548,481.580646,2.006541,1690.318934,1492.255301,7.462878,16237.851636
min,-9.221952e+18,0.0,0.0,0.0,0.0,0.0,0.0,-432169.795773,0.0,0.0
25%,-4.28708e+18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2.441136e+17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1039.867168
75%,4.478151e+18,290.0,271.3875,4.0,61.248321,0.0,0.0,42.705519,7.0,9572.432815
max,9.221475e+18,36798.0,2420897.0,180.0,60143.763871,66.0,557180.923178,309173.160776,154.0,662316.846415


In [63]:
data.columns

Index(['user_id', 'date', 'vertical', 'turnover_cash_num', 'turnover_cash_sum',
       'deposit_approved_num', 'deposit_approved_sum',
       'withdrawal_approved_num', 'withdrawal_approved_sum', 'NGR_sum',
       'session_num', 'session_sum'],
      dtype='object')

# Data preprocessing

## missing

In [64]:
# Checking missing values
data.isnull().sum()

user_id                        0
date                           0
vertical                   82659
turnover_cash_num              0
turnover_cash_sum              0
deposit_approved_num           0
deposit_approved_sum           0
withdrawal_approved_num        0
withdrawal_approved_sum        0
NGR_sum                        0
session_num                    0
session_sum                    0
dtype: int64

## outlier

In [65]:
# Convert relevant columns to numeric
columns_to_convert = ['turnover_cash_num', 'turnover_cash_sum', 'deposit_approved_num', 'deposit_approved_sum',
                      'withdrawal_approved_num', 'withdrawal_approved_sum', 'NGR_sum', 'session_num', 'session_sum']

for column in columns_to_convert:
    data[column] = pd.to_numeric(data[column], errors='coerce')

q1 = data[columns_to_convert].quantile(.25)
q3 = data[columns_to_convert].quantile(.75)
iqr =q3-q1

outliers = ((data[columns_to_convert] < (q1 - 1.5 * iqr)) | (data[columns_to_convert] > (q3 +1.5 * iqr))).sum
outliers

<bound method DataFrame.sum of         turnover_cash_num  turnover_cash_sum  deposit_approved_num  \
0                   False              False                 False   
1                   False               True                 False   
2                   False               True                 False   
3                   False               True                 False   
4                   False              False                 False   
...                   ...                ...                   ...   
158211              False              False                 False   
158212               True               True                 False   
158213              False              False                 False   
158214              False              False                 False   
158215              False              False                 False   

        deposit_approved_sum  withdrawal_approved_num  \
0                      False                    False   
1             

## One-hot encoding

- we need to encode the categorical columns

In [66]:
data = pd.get_dummies(data, columns=['vertical'])

In [67]:
data

Unnamed: 0,user_id,date,turnover_cash_num,turnover_cash_sum,deposit_approved_num,deposit_approved_sum,withdrawal_approved_num,withdrawal_approved_sum,NGR_sum,session_num,session_sum,vertical_bingo,vertical_casino_classic,vertical_casino_live,vertical_sports
0,-4233848618748012971,2020-06-19,4,41.662354,0,0.0,0,0.0,59.875489,0,0.0,False,False,False,True
1,2814934526452133975,2020-09-14,123,847.111687,0,0.0,0,0.0,81.685770,0,0.0,False,False,True,False
2,3718332242005503191,2020-09-06,399,21111.275179,0,0.0,0,0.0,2731.476087,0,0.0,False,False,True,False
3,-5271561423861853461,2020-09-06,60,825.512773,0,0.0,0,0.0,-364.196812,0,0.0,False,False,True,False
4,-3697770787649913393,2020-07-25,13,105.128915,0,0.0,0,0.0,105.128915,0,0.0,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
158211,-7117022959737387082,2020-06-20,158,216.515085,0,0.0,0,0.0,2.965960,0,0.0,False,False,True,False
158212,2811336782031493717,2020-03-12,899,1951.778557,0,0.0,0,0.0,567.065974,0,0.0,False,True,False,False
158213,-3328374110339852023,2020-02-27,0,0.000000,0,0.0,0,0.0,2.370862,0,0.0,False,False,True,False
158214,3047797546794493486,2020-03-25,4,0.287459,0,0.0,0,0.0,0.287459,0,0.0,False,False,True,False


# Feature engineering

- we will create some aditional features for this model

In [68]:
data['avg_turnover'] = data['turnover_cash_sum'] / data['turnover_cash_num']
data['total_transactions'] = data['deposit_approved_num'] + data['withdrawal_approved_num']
data['date'] = pd.to_datetime(data['date'])
data['recency'] = (data['date'].max() - data['date']).dt.days
data['frequency'] = data['session_num']
data['monetary_value'] = data['NGR_sum']
# Convert transaction_date to a timestamp
data['timestamp'] = data['date'].astype(int) / 10**9

# Alternatively, extract date components
data['year'] = data['date'].dt.year
data['month'] = data['date'].dt.month
data['day'] = data['date'].dt.day

In [69]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158216 entries, 0 to 158215
Data columns (total 24 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   user_id                  158216 non-null  int64         
 1   date                     158216 non-null  datetime64[ns]
 2   turnover_cash_num        158216 non-null  int64         
 3   turnover_cash_sum        158216 non-null  float64       
 4   deposit_approved_num     158216 non-null  int64         
 5   deposit_approved_sum     158216 non-null  float64       
 6   withdrawal_approved_num  158216 non-null  int64         
 7   withdrawal_approved_sum  158216 non-null  float64       
 8   NGR_sum                  158216 non-null  float64       
 9   session_num              158216 non-null  int64         
 10  session_sum              158216 non-null  float64       
 11  vertical_bingo           158216 non-null  bool          
 12  vertical_casino_

# Model Selection and Training


In [70]:
# Aggregate the data to get the most recent date per user
recent_date_per_user = data.groupby('user_id')['date'].max().reset_index()

# Merge this back to the original data
data = data.merge(recent_date_per_user, on='user_id', suffixes=('', '_most_recent'))

# Define churn as users with no sessions or transactions in the last 30 days
data['churn_flag'] = (data['date_most_recent'].max() - data['date_most_recent']).dt.days > 30


In [71]:
data_indexed = data.set_index('user_id')
#split unique users
train_users, test_users = train_test_split(data_indexed.index.unique(), test_size=.2, random_state=24)
train_data = data_indexed.loc[train_users].reset_index()
test_data = data_indexed.loc[test_users].reset_index()

#check for common users between train and test
common_users = set(train_data['user_id']).intersection(set(test_data['user_id']))
if common_users:
    print("overlap", common_users)
else:
    print("No overlap")

No overlap


In [72]:
X_train = train_data.drop(columns=['user_id', 'date','date_most_recent','churn_flag'])
y_train = train_data['churn_flag']

X_test = test_data.drop(columns=['user_id', 'date','date_most_recent','churn_flag'])
y_test = test_data['churn_flag']


In [73]:
# Replace inf with NaN
X_train.replace([np.inf, -np.inf], np.nan, inplace=True)
X_test.replace([np.inf, -np.inf], np.nan, inplace=True)

# Check for excessively large values (e.g., larger than a threshold like 1e10)
threshold = 1e10
X_train[X_train > threshold] = np.nan
X_test[X_test > threshold] = np.nan

# Now reapply the imputer
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_test = imputer.fit_transform(X_test)

#X = pd.DataFrame(X_imputed, columns=X.columns)


## Logistic Regression

In [74]:
# Initialize the model
model = LogisticRegression()

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Logistic Regression - Accuracy: {accuracy}')
print(f'Logistic Regression - Precision: {precision}')
print(f'Logistic Regression - Recall: {recall}')
print(f'Logistic Regression - F1 Score: {f1}')


Logistic Regression - Accuracy: 0.6865723488915556
Logistic Regression - Precision: 0.6666666666666666
Logistic Regression - Recall: 0.00020149103364900262
Logistic Regression - F1 Score: 0.00040286030818813577


- Let's try to scale the values for the Logistic model

In [75]:
# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the model again
model = LogisticRegression()
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Scaled Logistic - Accuracy: {accuracy}')
print(f'Scaled Logistic - Precision: {precision}')
print(f'Scaled Logistic - Recall: {recall}')
print(f'Scaled Logistic - F1 Score: {f1}')


Scaled Logistic - Accuracy: 0.7348575759489674
Scaled Logistic - Precision: 0.642511177347243
Scaled Logistic - Recall: 0.347471287527705
Scaled Logistic - F1 Score: 0.4510265463580489


## Random Forest

In [76]:
# Initialize and train Random Forest
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

# Make predictions and evaluate
y_pred_rf = rf_model.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)

print(f'Random Forest - Accuracy: {accuracy_rf}')
print(f'Random Forest - Precision: {precision_rf}')
print(f'Random Forest - Recall: {recall_rf}')
print(f'Random Forest - F1 Score: {f1_rf}')


Random Forest - Accuracy: 0.6941830354323248
Random Forest - Precision: 0.5172168468981218
Random Forest - Recall: 0.36620995365706227
Random Forest - F1 Score: 0.4288073610947269


- Hyperparameter tuning

In [77]:
# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# Perform grid search
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Get the best parameters
print(grid_search.best_params_)


Fitting 3 folds for each of 27 candidates, totalling 81 fits
{'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 100}


In [79]:
# Initialize the Random Forest with the best parameters
optimized_rf_model = RandomForestClassifier(max_depth=10, min_samples_split=5, n_estimators=100, random_state=42)

# Train the optimized model
optimized_rf_model.fit(X_train, y_train)

# Make predictions and evaluate
y_pred_optimized_rf = optimized_rf_model.predict(X_test)
accuracy_optimized_rf = accuracy_score(y_test, y_pred_optimized_rf)
precision_optimized_rf = precision_score(y_test, y_pred_optimized_rf)
recall_optimized_rf = recall_score(y_test, y_pred_optimized_rf)
f1_optimized_rf = f1_score(y_test, y_pred_optimized_rf)

print(f'Optimized Random Forest - Accuracy: {accuracy_optimized_rf}')
print(f'Optimized Random Forest - Precision: {precision_optimized_rf}')
print(f'Optimized Random Forest - Recall: {recall_optimized_rf}')
print(f'Optimized Random Forest - F1 Score: {f1_optimized_rf}')


Optimized Random Forest - Accuracy: 0.7281311185498642
Optimized Random Forest - Precision: 0.6700232378001549
Optimized Random Forest - Recall: 0.2614346161595809
Optimized Random Forest - F1 Score: 0.3761142111747228
