In [1]:
#Importing required libraries
!pip install numpy
!pip install pandas
!pip install scikit-learn
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, RepeatedStratifiedKFold
from sklearn.ensemble import GradientBoostingRegressor
import warnings
warnings.filterwarnings("ignore")

#mount the data from google drive
from google.colab import drive
drive.mount('/content/drive/')
# load the bioresponse data as data
df=pd.read_csv("/content/drive/My Drive/Colab Notebooks/predict.csv")


Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [2]:
# To  check the null values in the data
df.isnull().sum()

recency                         0
topic_rank                      0
diversity                       0
authors_mean_rank               6
authors_mean_hindex             6
authors_mean_gindex             6
authors_mean_sociality          6
authors_mean_pagerank         479
authors_mean_productivity     479
journal_pagerank              479
journal_rank                   39
title_len                       0
abstract_len                    0
n_authors                       0
c5                              0
log_authors_mean_sociality      6
dtype: int64

In [3]:
# Dropping the feature colums which have no significance.
# authors_mean_sociality is removed as it is completly corelated to log_authors_mean_sociality.
df = df.drop(['authors_mean_pagerank', 'authors_mean_productivity', 'journal_pagerank', 'authors_mean_sociality'], axis=1)

In [4]:
df.head()

Unnamed: 0,recency,topic_rank,diversity,authors_mean_rank,authors_mean_hindex,authors_mean_gindex,journal_rank,title_len,abstract_len,n_authors,c5,log_authors_mean_sociality
0,16,1.0,-0.425436,500.0,1.0,1.0,101.0,66,1653,7,1.0,1.94591
1,8,7.0,-0.571967,134.0,1.0,1.0,24.0,59,1527,7,32.0,1.94591
2,15,19.0,-0.859644,235.0,1.0,1.0,49.0,169,1554,6,5.0,1.791759
3,6,4.0,-1.054999,1124.0,1.0,1.0,185.0,130,2121,11,7.0,2.397895
4,14,2.0,-0.476697,346.0,1.0,1.0,72.0,100,432,3,7.0,1.098612


In [5]:
df.describe()

Unnamed: 0,recency,topic_rank,diversity,authors_mean_rank,authors_mean_hindex,authors_mean_gindex,journal_rank,title_len,abstract_len,n_authors,c5,log_authors_mean_sociality
count,479.0,479.0,479.0,473.0,473.0,473.0,440.0,479.0,479.0,479.0,479.0,473.0
mean,13.419624,10.125261,-0.675481,741.769615,0.844051,0.844668,146.184091,94.983299,1162.283925,4.36952,6.713987,1.264066
std,5.860415,5.748655,0.606013,581.948922,0.363961,0.364351,111.398299,38.524238,657.597003,2.800775,12.547968,0.703466
min,5.0,1.0,-2.995732,1.0,0.0,0.0,1.0,16.0,0.0,1.0,0.0,0.0
25%,8.0,5.0,-1.087146,265.0,1.0,1.0,53.75,68.0,758.5,2.0,0.0,0.693147
50%,13.0,10.0,-0.526332,596.0,1.0,1.0,122.0,91.0,1197.0,4.0,2.0,1.386294
75%,18.0,15.0,-0.150066,1141.0,1.0,1.0,216.75,116.5,1587.5,6.0,8.0,1.791759
max,25.0,20.0,-0.056685,2336.0,1.125,1.166667,436.0,279.0,3655.0,16.0,154.0,2.772589


In [6]:
# Filling the nan values with median or mode value of the feature.
df['authors_mean_gindex'].fillna(1, inplace=True)
df['authors_mean_hindex'].fillna(1, inplace=True)
df['log_authors_mean_sociality'].fillna(df['log_authors_mean_sociality'].median(), inplace=True)
df['authors_mean_rank'].fillna(df['authors_mean_rank'].median(), inplace=True)
df['journal_rank'].fillna(df['journal_rank'].median(), inplace=True)

In [7]:
# separagtin the labels column
y = df['c5']
X = df.drop('c5', axis=1)

In [8]:
# Building gradiend boosting regression model
# As we have very few data sample, boosting should be used to overcome underfitting conditions.

grad_model=GradientBoostingRegressor(random_state=1)

# Using crossvalidation to for better estimation
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=0)

# Training the model using for cv data.
grad_scores= cross_validate(grad_model, X, y, cv=cv, scoring=('r2', 'neg_mean_squared_error'), return_train_score=True)

In [9]:
# Caluclating the test and train accuracy
print('Test R2 value: {}' .format(grad_scores['test_r2'].mean()))
print('Train R2 value: {}' .format(grad_scores['train_r2'].mean()))

Test R2 value: 0.7956819728169684
Train R2 value: 0.9821895328261278
