In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score,mean_squared_log_error,mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from xgboost import XGBRegressor

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor

In [2]:
train = pd.read_csv('train.csv')
test  = pd.read_csv('test.csv')

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89197 entries, 0 to 89196
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   row_id            89197 non-null  int64  
 1   user_id           89197 non-null  int64  
 2   category_id       89197 non-null  int64  
 3   video_id          89197 non-null  int64  
 4   age               89197 non-null  int64  
 5   gender            89197 non-null  object 
 6   profession        89197 non-null  object 
 7   followers         89197 non-null  int64  
 8   views             89197 non-null  int64  
 9   engagement_score  89197 non-null  float64
dtypes: float64(1), int64(7), object(2)
memory usage: 6.8+ MB


In [4]:
train.shape

(89197, 10)

In [5]:
train.info()  #no null values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89197 entries, 0 to 89196
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   row_id            89197 non-null  int64  
 1   user_id           89197 non-null  int64  
 2   category_id       89197 non-null  int64  
 3   video_id          89197 non-null  int64  
 4   age               89197 non-null  int64  
 5   gender            89197 non-null  object 
 6   profession        89197 non-null  object 
 7   followers         89197 non-null  int64  
 8   views             89197 non-null  int64  
 9   engagement_score  89197 non-null  float64
dtypes: float64(1), int64(7), object(2)
memory usage: 6.8+ MB


In [6]:
test.shape

(11121, 9)

In [7]:
# Segregating categorical and numeric columns

categorical_cols = []
numeric_cols = []
for col in train.columns:
#     print(col)
    if train[col].dtype == 'object':
#         print(col, train_data[col].dtype)
        categorical_cols.append(col)
    elif col == 'engagement_score':
        target_col = ['engagement_score']
    else:
        numeric_cols.append(col)
print('The categorical cols are: ', categorical_cols)
print('The numeric cols are:', numeric_cols)
print('The target cols are:', target_col)


The categorical cols are:  ['gender', 'profession']
The numeric cols are: ['row_id', 'user_id', 'category_id', 'video_id', 'age', 'followers', 'views']
The target cols are: ['engagement_score']


In [8]:
# Getting the count distribution of different categorical variables.

for col in categorical_cols:
    print('Column:', col)
    print(train[col].value_counts())

Column: gender
Male      52397
Female    36800
Name: gender, dtype: int64
Column: profession
Student                 44638
Other                   26840
Working Professional    17719
Name: profession, dtype: int64


# Encode categorical features:-

In [9]:
for i in categorical_cols:
    le = LabelEncoder()
    train[i]=le.fit_transform(train[i])
    test[i]=le.transform(test[i])

In [10]:
train.corr()['engagement_score'].sort_values(ascending= False) #checking correlation

engagement_score    1.000000
gender              0.408702
followers           0.000855
user_id            -0.002900
row_id             -0.003368
video_id           -0.030552
profession         -0.049821
views              -0.084679
category_id        -0.094288
age                -0.232626
Name: engagement_score, dtype: float64

In [11]:
#Splitting the dataset into train test and validation.
# 
y = train[target_col]
y

Unnamed: 0,engagement_score
0,4.33
1,1.79
2,4.35
3,3.77
4,3.13
...,...
89192,3.91
89193,3.56
89194,4.23
89195,3.77


In [None]:
# dropping 'row_id' column as rules

In [12]:
X = train.drop(columns=['engagement_score', 'row_id'],axis=1)

In [13]:
X_test = test.drop('row_id', axis=1)

In [None]:
# Splitting to training and testing data

In [16]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2,
                                                            random_state=42)

In [17]:
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

((71357, 8), (17840, 8), (71357, 1), (17840, 1))

In [18]:
X_train.head()

Unnamed: 0,user_id,category_id,video_id,age,gender,profession,followers,views
78167,2351,18,58,19,1,1,180,444
34090,17171,8,67,31,1,0,280,628
78320,4511,4,99,31,1,0,230,781
26288,21389,11,22,37,0,2,250,180
38994,21395,5,109,33,1,2,240,229


# Model:-

In [None]:
import xgboost as xgb

In [None]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Create the params
params = { 'max_depth': max_depth,
           'learning_rate': [0.01, 0.1, 0.2, 0.3],
           'subsample': np.arange(0.5, 1.0, 0.1),
           'colsample_bytree': np.arange(0.4, 1.0, 0.1),
           'colsample_bylevel': np.arange(0.4, 1.0, 0.1),
           'n_estimators': n_estimators}
xgbr = xgb.XGBRegressor(seed = 20)
xgb_rcv = RandomizedSearchCV(estimator=xgbr,
                         param_distributions=params,
                         scoring='neg_mean_squared_error',
                         n_iter=25,
                         verbose=1)
xgb_rcv.fit(X, y)

Fitting 5 folds for each of 25 candidates, totalling 125 fits


In [141]:
print("Best parameters:", xgb_rcv.best_params_)

print("Lowest RMSE: ", (-xgb_rcv.best_score_)**(1/2.0))

Best parameters: {'subsample': 0.8999999999999999, 'n_estimators': 800, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 10, 'learning_rate': 0.01, 'bootstrap': True}
-0.4539080413669886


In [142]:
# Model prediction on train data
y_pred = xgb_rcv.predict(X_train)
y_pred


array([3.1192436, 3.5351958, 3.4986644, ..., 2.8230906, 4.584946 ,
       3.328687 ], dtype=float32)

In [143]:
# predict on the test set
pred = xgb_rcv.predict(X_test)

pred

array([4.1567364, 3.4421568, 2.806923 , ..., 3.3249047, 3.785227 ,
       3.1787293], dtype=float32)

In [144]:
# Model Evaluation
print('R^2:             ',r2_score(y_train, y_pred))
print('Adjusted R^2:    ',1 - (1-r2_score(y_train, y_pred))*(len(y_train)-1)/(len(y_train)-X_train.shape[1]-1))
print('MAE:             ',mean_absolute_error(y_train,y_pred))
print('MSE:             ',mean_squared_error(y_train, y_pred))
print('RMSE:            ',np.sqrt(mean_squared_error(y_train, y_pred)))

R^2:              0.8786556502292476
Adjusted R^2:     0.8786420443146016
MAE:              0.22009363530889414
MSE:              0.09017666729110854
RMSE:             0.30029430112992245


# Submission:-

In [145]:
submission= pd.DataFrame(columns=['row_id','engagement_score'])
submission

submission['row_id'] = test['row_id']

submission

submission['engagement_score'] = pred

submission


#creating the submission csv file
submission.to_csv('submission.csv', index=False)
submission

Unnamed: 0,row_id,engagement_score
0,89198,4.156736
1,89199,3.442157
2,89200,2.806923
3,89201,3.591595
4,89202,2.127187
...,...,...
11116,100314,3.352441
11117,100315,3.105995
11118,100316,3.324905
11119,100317,3.785227
