In [1]:
import pandas as pd
import numpy as np
from IPython.display import display
import ipywidgets as widgets
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score
from numpy import absolute
import xgboost as xgb
%matplotlib inline

In [2]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

print("Training DataSet")
display(train_df.head())
print("Shape of training dataset {}".format(train_df.shape))
print("\n\nTesting DataSet")
display(test_df.head())
print("Shape of testing dataset {}".format(test_df.shape))

Training DataSet


Unnamed: 0,row_id,user_id,category_id,video_id,age,gender,profession,followers,views,engagement_score
0,1,19990,37,128,24,Male,Student,180,1000,4.33
1,2,5304,32,132,14,Female,Student,330,714,1.79
2,3,1840,12,24,19,Male,Student,180,138,4.35
3,4,12597,23,112,19,Male,Student,220,613,3.77
4,5,13626,23,112,27,Male,Working Professional,220,613,3.13


Shape of training dataset (89197, 10)


Testing DataSet


Unnamed: 0,row_id,user_id,category_id,video_id,age,gender,profession,followers,views
0,89198,7986,12,42,14,Male,Student,180,138
1,89199,11278,34,115,14,Male,Student,230,840
2,89200,17245,8,110,44,Female,Working Professional,280,628
3,89201,9851,16,137,18,Male,Student,270,462
4,89202,16008,34,96,47,Female,Other,230,840


Shape of testing dataset (11121, 9)


In [3]:
profLabel = {'Student': 0, 'Working Professional':1, 'Other':2}
genderLabel = {'Male': 0, 'Female':1}
train_df.profession = train_df.profession.map(profLabel)
train_df.gender = train_df.gender.map(genderLabel)

train_df.head()

Unnamed: 0,row_id,user_id,category_id,video_id,age,gender,profession,followers,views,engagement_score
0,1,19990,37,128,24,0,0,180,1000,4.33
1,2,5304,32,132,14,1,0,330,714,1.79
2,3,1840,12,24,19,0,0,180,138,4.35
3,4,12597,23,112,19,0,0,220,613,3.77
4,5,13626,23,112,27,0,1,220,613,3.13


In [4]:
temp_df = train_df[['video_id', 'category_id', 'followers']].groupby(['category_id', 'video_id']).count().reset_index().rename(columns = {'followers':'weight'})
display(temp_df.head())
weighted_train_df = train_df.merge(temp_df, on = ['video_id', 'category_id'])

display(weighted_train_df.head())

Unnamed: 0,category_id,video_id,weight
0,1,1,591
1,1,2,622
2,1,15,597
3,2,3,167
4,3,4,932


Unnamed: 0,row_id,user_id,category_id,video_id,age,gender,profession,followers,views,engagement_score,weight
0,1,19990,37,128,24,0,0,180,1000,4.33,503
1,244,1684,37,128,37,0,2,180,1000,2.48,503
2,655,26609,37,128,21,0,0,180,1000,4.62,503
3,934,15281,37,128,45,1,1,180,1000,1.15,503
4,1029,20227,37,128,30,0,2,180,1000,4.08,503


In [5]:
scaler = StandardScaler()
weight_tranform_data = scaler.fit_transform(weighted_train_df[['followers', 'views']], sample_weight=weighted_train_df.weight)
weight_tranform_df = pd.DataFrame(weight_tranform_data, columns=['followers', 'views'])
weight_tranform_df.head()

weighted_train_df['followers'] = weight_tranform_df['followers']
weighted_train_df['views'] = weight_tranform_df['views']
display(weighted_train_df.head())

Unnamed: 0,row_id,user_id,category_id,video_id,age,gender,profession,followers,views,engagement_score,weight
0,1,19990,37,128,24,0,0,-1.54655,1.913246,4.33,503
1,244,1684,37,128,37,0,2,-1.54655,1.913246,2.48,503
2,655,26609,37,128,21,0,0,-1.54655,1.913246,4.62,503
3,934,15281,37,128,45,1,1,-1.54655,1.913246,1.15,503
4,1029,20227,37,128,30,0,2,-1.54655,1.913246,4.08,503


In [6]:
scaler = StandardScaler()
weight_tranform_data = scaler.fit_transform(weighted_train_df[['age']])
weight_tranform_df = pd.DataFrame(weight_tranform_data, columns=['age'])

weighted_train_df['age'] = weight_tranform_df['age']

display(weighted_train_df.head())

Unnamed: 0,row_id,user_id,category_id,video_id,age,gender,profession,followers,views,engagement_score,weight
0,1,19990,37,128,-0.094759,0,0,-1.54655,1.913246,4.33,503
1,244,1684,37,128,1.356865,0,2,-1.54655,1.913246,2.48,503
2,655,26609,37,128,-0.42975,0,0,-1.54655,1.913246,4.62,503
3,934,15281,37,128,2.250172,1,1,-1.54655,1.913246,1.15,503
4,1029,20227,37,128,0.575221,0,2,-1.54655,1.913246,4.08,503


In [7]:
training_df = weighted_train_df[['age', 'gender', 'profession', 'followers', 'views', 'engagement_score']]
training_df.head()

Unnamed: 0,age,gender,profession,followers,views,engagement_score
0,-0.094759,0,0,-1.54655,1.913246,4.33
1,1.356865,0,2,-1.54655,1.913246,2.48
2,-0.42975,0,0,-1.54655,1.913246,4.62
3,2.250172,1,1,-1.54655,1.913246,1.15
4,0.575221,0,2,-1.54655,1.913246,4.08


In [8]:
X, y = (training_df[['age', 'gender', 'profession', 'followers', 'views']], training_df[['engagement_score']])


display(X.head())
display(y.head())

Unnamed: 0,age,gender,profession,followers,views
0,-0.094759,0,0,-1.54655,1.913246
1,1.356865,0,2,-1.54655,1.913246
2,-0.42975,0,0,-1.54655,1.913246
3,2.250172,1,1,-1.54655,1.913246
4,0.575221,0,2,-1.54655,1.913246


Unnamed: 0,engagement_score
0,4.33
1,2.48
2,4.62
3,1.15
4,4.08


In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=7)

In [None]:
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.1, learning_rate = 0.3, tree_method='hist', num_parallel_tree=3, max_depth = 5, alpha = 0.3, n_estimators = 1000)

# reg:logistic -> MSE -> 0.713, r2 -> 0.312
# reg:pseudohubererror -> MSE 0.71, r2 -> 0.31
# reg:tweedie -> MSE ->, r2 ->
xg_reg

In [None]:
cv = RepeatedKFold(n_splits=5, n_repeats=10, random_state=1)  # default 5, 10

In [None]:
scores_cv = cross_val_score(xg_reg, X, y, scoring='r2', cv=cv, n_jobs=-1)


In [None]:
scores = absolute(scores_cv)
print('Mean MAE: %.3f (%.3f)' % (scores.mean(), scores.std()) )

display(scores_cv)
display(scores)

In [None]:
xg_reg

In [None]:
xg_reg.fit(X_train,y_train)

xg_reg

In [None]:
preds = xg_reg.predict(X_test)

In [None]:
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))

In [None]:
r2_score(y_test, preds)

In [None]:
xg_ada_reg = AdaBoostRegressor(base_estimator = xg_reg,  n_estimators=100)
print(xg_ada_reg)

In [None]:
xg_ada_reg.fit(X_train,y_train)

In [None]:
xg_ada_pred = xg_ada_reg.predict(X_test)

In [None]:
r2_score(y_test, xg_ada_pred)

In [11]:
from sklearn.ensemble import AdaBoostRegressor
from lightgbm import LGBMRegressor

In [None]:
ada_reg = AdaBoostRegressor(base_estimator = lgb_reg,  n_estimators=20, learning_rate=0.03)
print(ada_reg)


### - cross validataion 
#scores = cross_val_score(ada_reg, xtrain,ytrain,cv=5)
#print("Mean cross-validataion score: %.2f" % scores.mean())

# k-fold cross validataion 
#kfold = KFold(n_splits=10, shuffle=True)
#kf_cv_scores = cross_val_score(ada_reg, xtrain, ytrain, cv=kfold, scoring='r2' )
#print("K-fold CV average score: %.2f" % kf_cv_scores.mean())

In [None]:

ada_reg.fit(X_train,y_train)

In [None]:
ada_pred = ada_reg.predict(X_test)

In [None]:
r2_score(y_test, ada_pred)

In [None]:
ada_reg.estimator_errors_

In [98]:
#lgb_reg = LGBMRegressor(n_estimators=10000, objective='regression', learning_rate=0.01)
#"boosting_type": ['gbdt','dart', 'goss', 'rf'],
##"learning_rate":[0.01, 0.03, 0.1, 0.3],
#"n_estimators":[1000,10000],
#"objective":'regression',

#, metric="mape"
lgb_reg = LGBMRegressor(n_estimators=5000, max_depth=10, objective='regression', learning_rate=0.01, boosting_type='dart', xgboost_dart_mode=True, metric="mape")
lgb_reg

LGBMRegressor(boosting_type='dart', learning_rate=0.01, max_depth=10,
              metric='mape', n_estimators=5000, objective='regression',
              xgboost_dart_mode=True)

In [99]:
lgb_reg.fit(X_train,y_train, eval_metric='r2')

LGBMRegressor(boosting_type='dart', learning_rate=0.01, max_depth=10,
              metric='mape', n_estimators=5000, objective='regression',
              xgboost_dart_mode=True)

In [100]:
lgb_reg.learning_rate

0.01

In [101]:
lgb_pred = lgb_reg.predict(X_test)

In [104]:
r2_score(y_test, lgb_pred)

0.3389968034510652

In [103]:
lgb_reg.score(X, y)

0.37042170338200897

In [None]:
without mape -> 0.3389968034510652

possion => 0.3384065948650786
tweedie => 0.3385146980285705

In [93]:
ada_reg = AdaBoostRegressor(base_estimator = lgb_reg,  n_estimators=20, learning_rate=0.03, loss='square')
print(ada_reg)

AdaBoostRegressor(base_estimator=LGBMRegressor(boosting_type='dart',
                                               learning_rate=0.01, max_depth=10,
                                               n_estimators=5000,
                                               objective='regression',
                                               xgboost_dart_mode=True),
                  learning_rate=0.03, loss='square', n_estimators=20)


In [94]:
ada_reg.fit(X_train,y_train)

  return f(*args, **kwargs)


AdaBoostRegressor(base_estimator=LGBMRegressor(boosting_type='dart',
                                               learning_rate=0.01, max_depth=10,
                                               n_estimators=5000,
                                               objective='regression',
                                               xgboost_dart_mode=True),
                  learning_rate=0.03, loss='square', n_estimators=20)

In [95]:
ada_pred = ada_reg.predict(X_test)

In [96]:
r2_score(y_test, ada_pred)

0.33882021164481413

In [97]:
ada_reg.estimator_errors_

array([0.03299871, 0.03129085, 0.0344539 , 0.03427867, 0.03339027,
       0.03560762, 0.03521575, 0.03705942, 0.03747329, 0.03849859,
       0.03811355, 0.03900515, 0.039584  , 0.03960465, 0.0404152 ,
       0.04241623, 0.04280423, 0.04314803, 0.04387373, 0.04354808])

In [None]:
ada loss square => 0.33882021164481413


In [None]:
dart -> 0.335
goss -> 0.3303

In [None]:
from sklearn.linear_model import ElasticNet

In [None]:
#alpha=1.0, l1_ratio=0.5
en_reg = ElasticNet()
en_reg

In [None]:
en_reg.fit(X_train,y_train)

In [None]:
en_pred = en_reg.predict(X_test)

In [None]:
r2_score(y_test, en_pred)

In [None]:
from sklearn.svm import SVR

svr_reg = SVR(C=1.0, epsilon=0.2, kernel='poly', degree=5)
print(svr_reg)

svr_reg.fit(X_train,y_train)


In [None]:
svr_pred = svr_reg.predict(X_test)
r2_score(y_test, svr_pred)

In [None]:
rmse = np.sqrt(mean_squared_error(y_test, svr_pred))
print("RMSE: %f" % (rmse))

In [None]:
from sklearn.linear_model import SGDRegressor

sgd_reg = SGDRegressor(max_iter=100, tol=1e-3)

In [None]:
sgd_reg.fit(X_train,y_train)

In [None]:
svr_pred = svr_reg.predict(X_test)
r2_score(y_test, svr_pred)

In [None]:
from scipy.stats import uniform
uniform(loc=0, scale=4).rvs()

In [None]:
# "max_dept":[1,3,5,-1],
# "eval_metric": ['r2', 'neg_mean_absolute_error']


param = {

       }
param

In [113]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [114]:
from sklearn.model_selection import ShuffleSplit
ss = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)

In [116]:
# scores_cv = cross_val_score(xg_reg, X, y, scoring='r2', cv=cv, n_jobs=-1)
# lgb_reg -> LGBM
# ada_reg -> adaboost
new_lgb_reg = GridSearchCV(lgb_reg, {}, scoring='r2', cv=ss)

In [117]:
new_lgb_reg.fit(X,y)

GridSearchCV(cv=ShuffleSplit(n_splits=5, random_state=0, test_size=0.3, train_size=None),
             estimator=LGBMRegressor(boosting_type='dart', learning_rate=0.01,
                                     max_depth=10, metric='mape',
                                     n_estimators=5000, objective='regression',
                                     xgboost_dart_mode=True),
             param_grid={}, scoring='r2')

In [118]:
new_lgb_reg.best_estimator_

LGBMRegressor(boosting_type='dart', learning_rate=0.01, max_depth=10,
              metric='mape', n_estimators=5000, objective='regression',
              xgboost_dart_mode=True)

In [119]:
new_lgb_reg.cv_results_

{'mean_fit_time': array([160.88321033]),
 'std_fit_time': array([2.54883768]),
 'mean_score_time': array([2.29816771]),
 'std_score_time': array([0.12444894]),
 'params': [{}],
 'split0_test_score': array([0.35075803]),
 'split1_test_score': array([0.3573977]),
 'split2_test_score': array([0.35243162]),
 'split3_test_score': array([0.34971424]),
 'split4_test_score': array([0.35268182]),
 'mean_test_score': array([0.35259668]),
 'std_test_score': array([0.00263717]),
 'rank_test_score': array([1])}

In [120]:
new_lgb_pred = new_lgb_reg.predict(X_test)

In [121]:
r2_score(y_test, new_lgb_pred)

0.36244766896511993

In [128]:
new_lgb_reg.refit_time_


211.65329575538635

In [3]:
from xgboost import XGBClassifier
xgBoost = XGBClassifier()
xgBoost.fit(fs_X, y)

In [None]:
y_pred = clf.predict(fs_X_test)


In [None]:
out_df = pd.DataFrame(list(zip(custo_id_df, y_pred)), columns=['customer_id', 'Y'])
out_df.to_csv("xgBoostBase.csv", index=False)