In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('Facebook_metrics_Clustering.csv')

In [3]:
df

Unnamed: 0,post_type,category,post_month,post_weekday,post_hour,paid,total_interactions,post_ratio_engaged_consumers,post_ratio_imp_reach_log,post_ratio_imp_reach_like_log,post_ratio_engaged_consumption_log
0,Photo,2,12,4,3,0.0,100,1.633028,1.047293,1.056689,0.848498
1,Status,2,12,3,10,0.0,164,1.070536,1.037408,1.070180,0.920614
2,Photo,3,12,3,3,0.0,80,1.566372,1.033991,1.054634,0.773190
3,Photo,2,12,2,10,1.0,1777,2.798734,1.013536,1.066171,0.591867
4,Photo,2,12,2,3,0.0,393,1.636585,1.056605,1.080533,0.902048
...,...,...,...,...,...,...,...,...,...,...,...
491,Photo,3,1,7,2,0.0,84,1.035311,0.958922,0.975163,1.256401
492,Photo,2,1,5,8,0.0,75,1.057087,1.026021,1.058694,1.188572
493,Photo,1,1,5,2,0.0,115,1.092657,1.068155,1.093855,1.160047
494,Photo,3,1,4,11,0.0,136,1.090592,1.036744,1.047004,1.178239


#### PreProcessing Data
1. `Total_interactions` I have to handle outliers in it by applying `log` or `Sqrt` Scale
2. Split Data So `Encoding` dont result a `DataLeakge` to Our Model
3. Post_Type & Category Will be converted into `LabelEncoder`
4. `Post_month`, `post_weekday`, `post_hour` all of them i will converte them into `OneHotEncoding`

In [4]:
df.describe()

Unnamed: 0,category,post_month,post_weekday,post_hour,paid,total_interactions,post_ratio_engaged_consumers,post_ratio_imp_reach_log,post_ratio_imp_reach_like_log,post_ratio_engaged_consumption_log
count,496.0,496.0,496.0,496.0,496.0,496.0,496.0,496.0,496.0,496.0
mean,1.883065,7.086694,4.143145,7.832661,0.278226,211.606855,1.187947,1.037933,1.065713,1.105461
std,0.851876,3.276261,2.034469,4.378347,0.448578,381.252265,0.22823,0.163646,0.171456,0.363115
min,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.775999,0.861094,0.476016
25%,1.0,4.0,2.0,3.0,0.0,71.0,1.048937,0.980253,1.002351,0.910202
50%,2.0,7.0,4.0,9.0,0.0,122.5,1.10407,1.015905,1.03543,1.029461
75%,3.0,10.0,6.0,11.0,1.0,228.5,1.228392,1.056337,1.080672,1.182131
max,3.0,12.0,7.0,23.0,1.0,6334.0,2.798734,3.151465,3.174056,3.553239


#### Converting Total Interaction to logarithm
1. in `total_interactions` mean `212.120000` while std `380.233118` std > mean it will indicate a `High-Variance`
2. With A high-Variance or high `std` model will get confused and for Sure Error Will be huge esspicially in `r2`
3. Before Doing the `log` as we see `75%` was `228.5` and Max was `63334` which means it's `28x` from the max
4. After Doing the `log` as we see `75%` is `5.4` and Max is `8.75` but now it's `1.6x` more But the Order of them will stay the Same But we just Dicreased Effection of Highest Number



In [5]:
df['total_interactions'] = np.log1p(df['total_interactions'])

#### Define X (Features) and Y (Target)

In [6]:
X = df.drop('total_interactions',axis=1)
y = df['total_interactions']

In [7]:
X

Unnamed: 0,post_type,category,post_month,post_weekday,post_hour,paid,post_ratio_engaged_consumers,post_ratio_imp_reach_log,post_ratio_imp_reach_like_log,post_ratio_engaged_consumption_log
0,Photo,2,12,4,3,0.0,1.633028,1.047293,1.056689,0.848498
1,Status,2,12,3,10,0.0,1.070536,1.037408,1.070180,0.920614
2,Photo,3,12,3,3,0.0,1.566372,1.033991,1.054634,0.773190
3,Photo,2,12,2,10,1.0,2.798734,1.013536,1.066171,0.591867
4,Photo,2,12,2,3,0.0,1.636585,1.056605,1.080533,0.902048
...,...,...,...,...,...,...,...,...,...,...
491,Photo,3,1,7,2,0.0,1.035311,0.958922,0.975163,1.256401
492,Photo,2,1,5,8,0.0,1.057087,1.026021,1.058694,1.188572
493,Photo,1,1,5,2,0.0,1.092657,1.068155,1.093855,1.160047
494,Photo,3,1,4,11,0.0,1.090592,1.036744,1.047004,1.178239


#### ColumnsTransfer and Pipeline

In [8]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer,make_column_transformer
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.model_selection import cross_val_score,KFold
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

In [9]:
column_transfer = make_column_transformer(
    (OneHotEncoder(),['post_type']),
    remainder='passthrough')

In [13]:
pipe = make_pipeline(
    column_transfer,RandomForestRegressor(max_depth=100,n_estimators=300)
)
pipe2 = make_pipeline(
    column_transfer,DecisionTreeRegressor(max_depth=20,ccp_alpha=0.001)
)
pipe3 = make_pipeline(
    column_transfer,GradientBoostingRegressor(
       subsample=0.8, 
    n_estimators=300, 
    min_samples_split=10, 
    max_depth=3, 
    learning_rate=0.05, 
    random_state=42
))
pipe4 =  make_pipeline(
    column_transfer,
XGBRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=3,
    min_child_weight=5,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    random_state=42,
    n_jobs=-1
))
dict_model = {0:'RandomForest',
              1:'DecisionTree',
              2:'GradientBoost',
              3:'XGBoost'
             }

piplines = [pipe,pipe2,pipe3,pipe4]

In [14]:

for i,j in enumerate(piplines):
    kf = KFold(n_splits=5, shuffle=True, random_state=2000)
    cv_results = cross_val_score(j, X, y, cv=kf, scoring='r2').mean()
    print(f'Model: {dict_model[i]} Score is: {cv_results}')

Model: RandomForest Score is: 0.7670481676773525
Model: DecisionTree Score is: 0.6136718401925267
Model: GradientBoost Score is: 0.7977293979513661
Model: XGBoost Score is: 0.7677925556319043


#### Results Of Cross-Validation
* As we see Decision Tree is not making 

#### Fitting Model

In [46]:
df.head()

Unnamed: 0,post_month,post_weekday,post_hour,paid,total_interactions,post_ratio_engaged_consumers,post_ratio_imp_reach_log,post_ratio_imp_reach_like_log,post_ratio_engaged_consumption_log,category_1,category_2,category_3,post_type_Link,post_type_Photo,post_type_Status,post_type_Video
0,12,4,3,0.0,4.615121,1.633028,1.047293,1.056689,0.848498,False,True,False,False,True,False,False
1,12,3,10,0.0,5.105945,1.070536,1.037408,1.07018,0.920614,False,True,False,False,False,True,False
2,12,3,3,0.0,4.394449,1.566372,1.033991,1.054634,0.77319,False,False,True,False,True,False,False
3,12,2,10,1.0,7.483244,2.798734,1.013536,1.066171,0.591867,False,True,False,False,True,False,False
4,12,2,3,0.0,5.976351,1.636585,1.056605,1.080533,0.902048,False,True,False,False,True,False,False


In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score,KFold

In [53]:
best_gb_model = GradientBoostingRegressor(
    subsample=0.8, 
    n_estimators=300, 
    min_samples_split=10, 
    max_depth=3, 
    learning_rate=0.05, 
    random_state=42
)

# Explicit Cross-Validation
kf = KFold(n_splits=5, shuffle=True, random_state=2000)
cv_results = cross_val_score(best_gb_model, X, y, cv=kf, scoring='r2')

print(f"Individual CV Scores (R2): {cv_results}")
print(f"Mean Cross-Validation R2: {np.mean(cv_results):.4f}")
print(f"Standard Deviation: {np.std(cv_results):.4f}")

Individual CV Scores (R2): [0.80083844 0.82699594 0.79289613 0.73292841 0.81350709]
Mean Cross-Validation R2: 0.7934
Standard Deviation: 0.0324


In [54]:
best_gb_model.fit(X_train,y_train)

0,1,2
,loss,'squared_error'
,learning_rate,0.05
,n_estimators,300
,subsample,0.8
,criterion,'friedman_mse'
,min_samples_split,10
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,3
,min_impurity_decrease,0.0


In [55]:
best_gb_model.score(X_test,y_test)

0.7880539899451597

In [211]:
xgb_param_grid = {
    'xgbregressor__n_estimators': [300, 500, 700],
    'xgbregressor__learning_rate': [0.03, 0.05, 0.1],
    'xgbregressor__max_depth': [2, 3, 4],
    'xgbregressor__min_child_weight': [3, 5, 7],
    'xgbregressor__subsample': [0.7, 0.8, 0.9],
    'xgbregressor__colsample_bytree': [0.7, 0.8, 0.9],
    'xgbregressor__reg_lambda': [1, 5, 10],
    'xgbregressor__reg_alpha': [0, 0.1, 1]
}


In [212]:
from sklearn.model_selection import train_test_split,GridSearchCV

In [213]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=2000)

In [214]:
grid_search =GridSearchCV(estimator=pipe4,param_grid=xgb_param_grid,cv=kf,scoring='r2')

In [215]:
grid_search.fit(X_train,y_train)

0,1,2
,estimator,"Pipeline(step...=None, ...))])"
,param_grid,"{'xgbregressor__colsample_bytree': [0.7, 0.8, ...], 'xgbregressor__learning_rate': [0.03, 0.05, ...], 'xgbregressor__max_depth': [2, 3, ...], 'xgbregressor__min_child_weight': [3, 5, ...], ...}"
,scoring,'r2'
,n_jobs,
,refit,True
,cv,KFold(n_split... shuffle=True)
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('onehotencoder', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.9
,device,
,early_stopping_rounds,
,enable_categorical,False


In [216]:
grid_search.best_params_

{'xgbregressor__colsample_bytree': 0.9,
 'xgbregressor__learning_rate': 0.05,
 'xgbregressor__max_depth': 3,
 'xgbregressor__min_child_weight': 3,
 'xgbregressor__n_estimators': 700,
 'xgbregressor__reg_alpha': 0.1,
 'xgbregressor__reg_lambda': 10,
 'xgbregressor__subsample': 0.8}

In [221]:
grid_search.score(X_test,y_test)

0.7784128147628122

In [192]:
from sklearn.model_selection import ShuffleSplit, cross_val_score

ss = ShuffleSplit(n_splits=10, test_size=0.2, random_state=42)

scores = cross_val_score(
    pipe4,
    X,
    y,
    cv=ss,
    scoring='r2'
)

scores, scores.mean(), scores.std()


(array([0.75149195, 0.71947674, 0.71741159, 0.64897024, 0.7598501 ,
        0.71260726, 0.77009726, 0.78019785, 0.7642113 , 0.73615554]),
 np.float64(0.7360469827115976),
 np.float64(0.03668886417102805))

In [193]:
from sklearn.model_selection import RepeatedKFold

rkf = RepeatedKFold(n_splits=5, n_repeats=3, random_state=42)

scores = cross_val_score(
    pipe4,
    X,
    y,
    cv=rkf,
    scoring='r2'
)

scores.mean(), scores.std()


(np.float64(0.7300077548193574), np.float64(0.043764951187733436))