In [5]:
from datetime import datetime

import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, roc_auc_score, confusion_matrix

In [6]:
# Load the model-input table from the working directory and show its
# (rows, columns) dimensions as the cell output.
df = pd.read_csv(filepath_or_buffer='2-Model_Input.csv')
df.shape

(50, 6)

In [7]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   user_id                50 non-null     object 
 1   age                    50 non-null     int64  
 2   gender                 50 non-null     int64  
 3   past_engagement_score  50 non-null     float64
 4   ID                     50 non-null     int64  
 5   Engagement             50 non-null     int64  
dtypes: float64(1), int64(4), object(1)
memory usage: 2.5+ KB


In [8]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,user_id,age,gender,past_engagement_score,ID,Engagement
0,U1,24,0,0.61,0,1
1,U2,32,0,0.93,3,1
2,U3,28,-1,0.4,0,0
3,U4,25,1,0.53,5,0
4,U5,24,1,0.8,5,1


In [9]:
# Split the frame into the feature matrix `x` and the target vector `y`.
# `user_id` is a string key and `Engagement` is the target, so neither is a feature.
# NOTE(review): `ID` stays in as a feature - confirm it is a meaningful code
# and not a row/user identifier.
x = df.drop(columns=['user_id', 'Engagement'])
y = df['Engagement']

In [10]:
# Exhaustive grid search over tree depth, minimum leaf size and number of
# trees for a random forest, scored by ROC AUC with 3-fold cross-validation
# on the full (x, y) data.
parameters = {
    "max_depth": [2, 5, 10],
    "min_samples_leaf": [2, 5, 10],
    "n_estimators": [50, 100, 200],
}
model = RandomForestClassifier(max_features='log2', random_state=0)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=parameters,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
)
grid_search.fit(x, y)  # fit() returns the search object itself, no re-binding needed
# Best mean cross-validated score, then the parameter combination that achieved it.
print(grid_search.best_score_, grid_search.best_params_, sep='\n')



0.9675925925925926
{'max_depth': 5, 'min_samples_leaf': 2, 'n_estimators': 100}


In [11]:
# Refit a forest on all rows with the hyper-parameters chosen by the grid
# search, then rank the input columns by the forest's feature importances.
# The values are read from `grid_search.best_params_` rather than copied by
# hand from the printed output, so this cell stays in sync with the search if
# the data or the parameter grid changes.
model = RandomForestClassifier(**grid_search.best_params_,
                               max_features='log2', random_state=0)
model.fit(x, y)
importances = model.feature_importances_
# One row per feature, most important first.
gb = (
    pd.DataFrame({'Feature': x.columns, 'Gini': importances})
    .sort_values('Gini', ascending=False)
)
print(gb)

                 Feature      Gini
2  past_engagement_score  0.712263
0                    age  0.189860
3                     ID  0.080593
1                 gender  0.017283


In [12]:
# Keep only the three columns ranked highest in the importance table above;
# `gender`, the lowest-ranked feature, is dropped.
selected_features = ['past_engagement_score', 'age', 'ID']
x = x[selected_features]

In [13]:
# Repeat the same hyper-parameter search, now on the reduced feature matrix:
# same grid, ROC AUC scoring, 3-fold cross-validation.
parameters = dict(
    max_depth=[2, 5, 10],
    min_samples_leaf=[2, 5, 10],
    n_estimators=[50, 100, 200],
)
model = RandomForestClassifier(max_features='log2', random_state=0)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=parameters,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
)
grid_search.fit(x, y)  # fit() returns the search object itself
# Best mean cross-validated score, then the winning parameter combination.
print(grid_search.best_score_, grid_search.best_params_, sep='\n')


0.9652777777777778
{'max_depth': 5, 'min_samples_leaf': 5, 'n_estimators': 50}


In [14]:
# Refit on all rows of the reduced feature matrix with the hyper-parameters
# selected by the preceding grid search. They are taken from
# `grid_search.best_params_` instead of being copied from the printed output,
# so a change in data or grid cannot leave stale values here.
model = RandomForestClassifier(**grid_search.best_params_,
                               max_features='log2', random_state=0)
model.fit(x, y)

In [15]:

# Score the refit model.
# NOTE: `x` / `y` are the same rows the model was fitted on in the previous
# cell, so every number below is an in-sample (optimistic) figure, not an
# estimate of performance on unseen data.
print(x.shape)

pred1 = model.predict_proba(x)[:, 1]  # probability of the second class (label 1 here)
pred2 = model.predict(x)              # hard class predictions
c1 = confusion_matrix(y, pred2)       # rows = true label, columns = predicted label
print(c1)

# Precision and recall of class 1, guarded so an empty denominator gives 0.0
# instead of NaN plus a RuntimeWarning.
tn, fp, fn, tp = c1.ravel()
p = tp / (fp + tp) if (fp + tp) else 0.0
r = tp / (fn + tp) if (fn + tp) else 0.0
# Use the library implementation (already imported) rather than the
# hand-rolled 2*p*r/(p+r), which is undefined when p + r == 0.
f1 = f1_score(y, pred2)
a1 = roc_auc_score(y, pred1)
print('AUC ROC:', np.round(a1, 3))
print('F1 Score:', np.round(f1, 3))



(50, 3)
[[22  1]
 [ 3 24]]
AUC ROC: 0.992
F1 Score: 0.923


In [16]:
# Pull the winning hyper-parameters out of the most recent grid search:
# md = max depth, ml = min samples per leaf, ne = number of trees.
best_params = grid_search.best_params_
md, ml, ne = (best_params[key] for key in ('max_depth', 'min_samples_leaf', 'n_estimators'))


In [17]:
# Train the final forest on all rows with the grid-search winners (md, ml, ne)
# and persist it with joblib. The file name carries today's date, so a re-run
# on the same day overwrites that day's file.
# `joblib` and `datetime` are imported in the notebook's top import cell.
model = RandomForestClassifier(max_depth=md, min_samples_leaf=ml, n_estimators=ne,
                               max_features='log2', random_state=0)
model.fit(x, y)
stamp = datetime.today().strftime('%Y-%m-%d')
joblib.dump(model, f'3-Model RF {stamp}.pkl')

['3-Model RF 2025-09-30.pkl']

In [18]:
# Print the columns and dtypes of the feature matrix the saved model was fitted on.
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 3 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   past_engagement_score  50 non-null     float64
 1   age                    50 non-null     int64  
 2   ID                     50 non-null     int64  
dtypes: float64(1), int64(2)
memory usage: 1.3 KB


In [19]:
# Display the final fitted estimator (the one written to disk with joblib above).
model