### Predicts 10, 20 and 50 blogs out of test data which are going to receive most number of comments.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

Load training data

In [2]:
df = pd.read_csv('/Users/gauravbishnoi/datas/blog_comments/blogData_train.csv', header=None)

In [3]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,271,272,273,274,275,276,277,278,279,280
0,40.30467,53.845657,0.0,401.0,15.0,15.52416,32.44188,0.0,377.0,3.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,40.30467,53.845657,0.0,401.0,15.0,15.52416,32.44188,0.0,377.0,3.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,40.30467,53.845657,0.0,401.0,15.0,15.52416,32.44188,0.0,377.0,3.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,40.30467,53.845657,0.0,401.0,15.0,15.52416,32.44188,0.0,377.0,3.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,40.30467,53.845657,0.0,401.0,15.0,15.52416,32.44188,0.0,377.0,3.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,27.0


---

Split data into training and test sets.
x_train, x_test, y_train and y_test split all 281 columns of data.
xtrain2, xtest2, ytrain2 and ytest2 split all columns except 'bag of words' of data. The removed columns were found to be not related to target data in analysis.

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
x_train, x_test, y_train, y_test = train_test_split(df.iloc[:,:-1], df.iloc[:,-1], test_size=0.1, random_state=42)

In [8]:
xtrain2, xtest2, ytrain2, ytest2 = train_test_split(df.iloc[:,:61], df.iloc[:,-1], test_size=0.1, random_state=42)

---

Scoring Function. It prints fraction score for number of matches between predicted and actual test values. Scores for top 10, 20 and 50 posts according to number of comments received are generated. 

In [88]:
def score_it(pred, test):
    top50_pred = pred.argsort()[-50:][::-1]
    top50_test = test.values.argsort()[-50:][::-1]
    print("top 10 score: {}".format(sum(np.in1d(top50_pred[:10],top50_test[:10]))/10))
    print("top 20 score: {}".format(sum(np.in1d(top50_pred[:20],top50_test[:20]))/20))
    print("top 50 score: {}".format(sum(np.in1d(top50_pred,top50_test))/50))

---

Fitting and evaluating linear regression model.

In [89]:
# Function to predict scores using linear regression model
def try_ols(xtrain, xtest, ytrain, ytest):
    from sklearn.linear_model import LinearRegression
    ols = LinearRegression()
    ols.fit(xtrain, ytrain)
    ols_predictions = ols.predict(xtest)
    score_it(ols_predictions, ytest)

In [90]:
try_ols(x_train, x_test, y_train, y_test)

top 10 score: 0.2
top 20 score: 0.35
top 50 score: 0.48


Removing bag of words features.

In [91]:
try_ols(xtrain2, xtest2, ytrain2, ytest2)

top 10 score: 0.2
top 20 score: 0.35
top 50 score: 0.48


There is no effect of bag of words features on output

Try Ridge Regression, it may perform better than ols because of collinearity among features (as shown in heatmap in 'blog_analysis_quick.ipynb').

In [92]:
# Function for ridge regression.
def try_ridge(xtrain, xtest, ytrain, ytest):
    
    from sklearn.linear_model import Ridge
    from sklearn.model_selection import GridSearchCV
    
    # Search optimum parameters to tune ridge model using GridSearchCV
    alps = [0.01, 0.1, 1, 10]
    ridgegrid_param = {'alpha': alps}
    ridge_grid = GridSearchCV(estimator=Ridge(), param_grid=ridgegrid_param, verbose=3)
    ridge_grid.fit(xtrain, ytrain)
    alp = ridge_grid.best_estimator_.alpha # Best parameter for ridge model
    
    #fit ridge regression for optimum parameter and score it
    ridge = Ridge(alpha=alp)
    ridge.fit(xtrain, ytrain)
    ridge_predictions = ridge.predict(xtest)
    score_it(ridge_predictions, ytest)

In [93]:
try_ridge(xtrain2, xtest2, ytrain2, ytest2)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] alpha=0.01 ......................................................
[CV] ....................... alpha=0.01, score=0.401313, total=   0.1s
[CV] alpha=0.01 ......................................................
[CV] ....................... alpha=0.01, score=0.349296, total=   0.0s
[CV] alpha=0.01 ......................................................
[CV] ....................... alpha=0.01, score=0.351982, total=   0.0s
[CV] alpha=0.1 .......................................................
[CV] ........................ alpha=0.1, score=0.401317, total=   0.0s
[CV] alpha=0.1 .......................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s


[CV] ........................ alpha=0.1, score=0.349300, total=   0.0s
[CV] alpha=0.1 .......................................................
[CV] ........................ alpha=0.1, score=0.351993, total=   0.0s
[CV] alpha=1 .........................................................
[CV] .......................... alpha=1, score=0.401355, total=   0.0s
[CV] alpha=1 .........................................................
[CV] .......................... alpha=1, score=0.349331, total=   0.0s
[CV] alpha=1 .........................................................
[CV] .......................... alpha=1, score=0.352048, total=   0.0s
[CV] alpha=10 ........................................................
[CV] ......................... alpha=10, score=0.401501, total=   0.0s
[CV] alpha=10 ........................................................
[CV] ......................... alpha=10, score=0.349422, total=   0.0s
[CV] alpha=10 ........................................................
[CV] .

[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:    0.5s finished


Ridge regression is giving similar results to OLS

---

Decision Tree Regressor

In [94]:
# Function for Decision Tree Regressor
def try_dtree(xtrain, xtest, ytrain, ytest):
    from sklearn.tree import DecisionTreeRegressor
    dtree = DecisionTreeRegressor()
    dtree.fit(xtrain, ytrain)
    dtree_predictions = dtree.predict(xtest)
    score_it(dtree_predictions, ytest)

In [95]:
try_dtree(xtrain2, xtest2, ytrain2, ytest2)

top 10 score: 0.5
top 20 score: 0.4
top 50 score: 0.48


In [96]:
try_dtree(x_train, x_test, y_train, y_test)

top 10 score: 0.3
top 20 score: 0.4
top 50 score: 0.46


Data without 'bag of words' features is performing better.

---

Random Forest Regressor

In [97]:
# Function for random forest
def try_rfr(xtrain, xtest, ytrain, ytest):
    from sklearn.ensemble import RandomForestRegressor
    rfr = RandomForestRegressor(n_estimators=20)
    rfr.fit(xtrain, ytrain)
    rfr_predictions = rfr.predict(xtest)
    score_it(rfr_predictions, ytest)

In [98]:
try_rfr(xtrain2, xtest2, ytrain2, ytest2)

top 10 score: 0.4
top 20 score: 0.55
top 50 score: 0.5


In [99]:
try_rfr(x_train, x_test, y_train, y_test)

top 10 score: 0.4
top 20 score: 0.55
top 50 score: 0.5


No effect of 'bag of 'words' columns.

---

Ada Boost for Decision Tree.

In [126]:
def try_abtree(xtrain, xtest, ytrain, ytest):
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.ensemble import AdaBoostRegressor
    aboost =  AdaBoostRegressor(DecisionTreeRegressor(),n_estimators=20)
    aboost.fit(xtrain, ytrain)
    aboost_predictions = aboost.predict(xtest)
    score_it(aboost_predictions, ytest)

In [127]:
try_abtree(xtrain2, xtest2, ytrain2, ytest2)

top 10 score: 0.4
top 20 score: 0.6
top 50 score: 0.5


It improves the performance.

In [100]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor

In [56]:
aboost =  AdaBoostRegressor(DecisionTreeRegressor(),n_estimators=90)

In [57]:
aboost.fit(xtrain2, ytrain2)

AdaBoostRegressor(base_estimator=DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='best'),
         learning_rate=1.0, loss='linear', n_estimators=90,
         random_state=None)

In [49]:
aboost.score(xtest2, ytest2)

0.26788585647645358

In [58]:
aboost_predictions = aboost.predict(xtest2)
aboost50_pred = aboost_predictions.argsort()[-50:][::-1]
aboost50_test = ytest2.argsort()[-50:][::-1]
print("top 10 score: {}".format(sum(np.in1d(aboost50_pred[:10],aboost50_test[:10]))/10))
print("top 20 score: {}".format(sum(np.in1d(aboost50_pred[:20],aboost50_test[:20]))/20))
print("top 50 score: {}".format(sum(np.in1d(aboost50_pred,aboost50_test))/50))

top 10 score: 0.5
top 20 score: 0.6
top 50 score: 0.56


In [59]:
from sklearn.linear_model import LinearRegression

In [84]:
aboost2 =  AdaBoostRegressor(LinearRegression(),n_estimators=400)

In [85]:
aboost2.fit(xtrain2, ytrain2)

AdaBoostRegressor(base_estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False),
         learning_rate=1.0, loss='linear', n_estimators=400,
         random_state=None)

In [87]:
aboost2.score(xtest2, ytest2)

-0.33119830210966805

In [86]:
aboost2_predictions = aboost2.predict(xtest2)
aboost250_pred = aboost2_predictions.argsort()[-50:][::-1]
aboost250_test = ytest2.argsort()[-50:][::-1]
print("top 10 score: {}".format(sum(np.in1d(aboost250_pred[:10],aboost250_test[:10]))/10))
print("top 20 score: {}".format(sum(np.in1d(aboost250_pred[:20],aboost250_test[:20]))/20))
print("top 50 score: {}".format(sum(np.in1d(aboost250_pred,aboost250_test))/50))

top 10 score: 0.3
top 20 score: 0.4
top 50 score: 0.42


In [21]:
from sklearn.ensemble import RandomForestRegressor

In [31]:
rfr = RandomForestRegressor(n_estimators=25)

In [None]:
rfr.fit(xtrain2, ytrain2)

In [30]:
rfr_predictions = rfr.predict(xtest2)
rfr50_pred = rfr_predictions.argsort()[-50:][::-1]
rfr50_test = ytest2.argsort()[-50:][::-1]
print("top 10 score: {}".format(sum(np.in1d(rfr50_pred[:10],rfr50_test[:10]))/10))
print("top 20 score: {}".format(sum(np.in1d(rfr50_pred[:20],rfr50_test[:20]))/20))
print("top 50 score: {}".format(sum(np.in1d(rfr50_pred,rfr50_test))/50))

top 10 score: 0.4
top 20 score: 0.55
top 50 score: 0.52


In [100]:
'''
dtree_scaler = StandardScaler()
xtrain_dtree = dtree_scaler.fit_transform(x_train)
xtest_dtree = dtree_scaler.transform(x_test)
dtree_scaler_y = StandardScaler()
ytrain_dtree = dtree_scaler_y.fit_transform(ytrain2.values.reshape(-1,1))
ytest_dtree = dtree_scaler_y.transform(ytest2.values.reshape(-1,1))
'''

'\ndtree_scaler = StandardScaler()\nxtrain_dtree = dtree_scaler.fit_transform(x_train)\nxtest_dtree = dtree_scaler.transform(x_test)\ndtree_scaler_y = StandardScaler()\nytrain_dtree = dtree_scaler_y.fit_transform(ytrain2.values.reshape(-1,1))\nytest_dtree = dtree_scaler_y.transform(ytest2.values.reshape(-1,1))\n'