In [1]:
# Import needed libraries
import pandas as pd
import math, datetime
import numpy as np
from sklearn import preprocessing, svm
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from matplotlib import style
import pickle
import re
from sklearn.cluster import KMeans

style.use('ggplot')
%matplotlib inline

In [2]:
# Load the exported per-page table (Page, Clicks, Impressions, CTR, Position).
DATA_PATH = 'data.csv'
df = pd.read_csv(DATA_PATH)

In [3]:
# Display the freshly loaded frame to check its columns and the raw CTR text format.
df

Unnamed: 0,Page,Clicks,Impressions,CTR,Position
0,https://www.beautydiaries.gr/omorfoxamos/takal...,747,8158,9.16%,21.99
1,https://www.beautydiaries.gr/,683,3166,21.57%,12.21
2,https://www.beautydiaries.gr/omorfoxamos/takal...,430,3785,11.36%,10.81
3,https://www.beautydiaries.gr/omorfoxamos/takal...,248,8620,2.88%,29.88
4,https://www.beautydiaries.gr/omorfoxamos/takal...,135,1134,11.9%,9.73
5,https://www.beautydiaries.gr/beauty-news-ta-ne...,90,879,10.24%,15.15
6,https://www.beautydiaries.gr/beauty-news-ta-ne...,68,1172,5.8%,9.70
7,https://www.beautydiaries.gr/beauty-news-ta-ne...,59,617,9.56%,8.73
8,https://www.beautydiaries.gr/omorfoxamos/takal...,59,477,12.37%,16.99
9,https://www.beautydiaries.gr/apopsi/tade-efi/t...,51,845,6.04%,8.55


In [4]:
# Cleaning the data: CTR arrives as text such as '9.16%'; convert it to a float
# percentage (9.16).
# - Casting to str first keeps the cell safe to re-run once the column is
#   already numeric (the per-row lambda raised AttributeError on floats/NaN).
# - The vectorised .str accessor replaces the Python-level apply.
df['CTR'] = (
    df['CTR']
    .astype(str)
    .str.replace('%', '', regex=False)
    .astype(float)
)

In [5]:
# Feature matrix: the three numeric columns used as model inputs.
# NOTE(review): in the rows displayed above CTR equals Clicks / Impressions
# (e.g. 747 / 8158 ≈ 9.16%), so the target is derivable from two of the
# features — confirm this is intended before trusting the model scores.
X = df[['Clicks', 'Impressions', 'Position']]

# Target: read CTR without removing it from df. The previous df.pop('CTR')
# deleted the column as a side effect, so running this cell a second time
# raised KeyError and df no longer matched what earlier cells displayed.
y = df['CTR'].copy()

In [6]:
# Split into train and held-out test partitions: 33% of the rows go to the
# test set, and the fixed seed keeps the partition identical across runs.
TEST_FRACTION = 0.33
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_FRACTION, random_state=SPLIT_SEED
)

In [7]:
# Standardise The Data 
from sklearn.preprocessing import StandardScaler

In [8]:
# Fit the scaler on the raw training rows only, so the test rows do not
# influence the mean/variance used for standardisation.
# The rows are looked up in the untouched `X` by index label because a later
# cell rebinds X_train to its scaled version; fitting on X_train directly meant
# that re-running this cell fitted the scaler on already-standardised data.
scaler = StandardScaler().fit(X.loc[X_train.index])

In [9]:
# Standardise the training features, keeping the original row labels and
# column names so rows can still be matched back to df.
# The raw values are re-read from `X` instead of from X_train itself: the
# previous X_train = f(X_train) form scaled already-scaled data whenever the
# cell was executed more than once.
X_train = pd.DataFrame(
    scaler.transform(X.loc[X_train.index]),
    columns=X_train.columns,
    index=X_train.index,
)

In [10]:
# Standardise the held-out features with the scaler fitted on the training
# rows, keeping the original row labels and column names.
# The raw values are re-read from `X` instead of from X_test itself: the
# previous X_test = f(X_test) form scaled already-scaled data whenever the
# cell was executed more than once.
X_test = pd.DataFrame(
    scaler.transform(X.loc[X_test.index]),
    columns=X_test.columns,
    index=X_test.index,
)

In [11]:
# Import different Algorithms to see differences between their predictions
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [12]:
# Baseline model: linear regression with scikit-learn's default settings.
Linear_regression = LinearRegression()

In [13]:
# Fit the baseline on the standardised training features.
Linear_regression.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [14]:
# Score the baseline on the rows it was trained on (R^2 for scikit-learn regressors).
Linear_regression.score(X_train, y_train)

0.04587699725091532

In [15]:
# Score the baseline on the held-out rows. The recorded value is negative,
# i.e. the linear fit does worse than always predicting the mean CTR.
Linear_regression.score(X_test, y_test)

-0.3732922110193282

In [16]:
# Random forest with 50 trees. random_state pins the bootstrap sampling and
# feature selection so the two scores printed below are reproducible; without
# it every execution trained a different forest and printed different numbers.
# NOTE(review): a very high held-out score is expected here if CTR is computed
# from Clicks and Impressions, both of which are features — confirm before
# reading it as genuine predictive power.
RandomForestRegressorModel = RandomForestRegressor(n_estimators=50, random_state=42)
RandomForestRegressorModel.fit(X_train, y_train)
prediction_score = RandomForestRegressorModel.score(X_train, y_train)  # score on training rows
test_score = RandomForestRegressorModel.score(X_test, y_test)  # score on held-out rows

print(prediction_score, test_score)

0.994667472140154 0.9552351062920202


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

# Single-step pipeline. make_pipeline names the step 'randomforestregressor',
# which is the prefix the hyperparameter keys below must use.
# random_state makes every candidate forest (and so the search result) repeatable.
pipeline = make_pipeline(RandomForestRegressor(n_estimators=200, random_state=42))

# Declare hyperparameters to tune
# max_features=None considers every feature at each split, which is what 'auto'
# meant for regressors. 'auto' itself is no longer accepted by recent
# scikit-learn releases, where it makes the whole search fail.
hyperparameters = {
    'randomforestregressor__max_features': [None, 'sqrt', 'log2'],
    'randomforestregressor__max_depth': [5, 3, None],
}

# Tune model using cross-validation pipeline (5 folds over the training rows)
RandomForestRegressorModel = GridSearchCV(pipeline, hyperparameters, cv=5)

RandomForestRegressorModel.fit(X_train, y_train)
# Score of the refitted best estimator on the rows it was trained on.
prediction_score = RandomForestRegressorModel.score(X_train, y_train)
print("The score of prediction for RandomForestRegressorModel is: {}".format(prediction_score))

### Let's combine the predictions for X_train and X_test, keeping each row's index so that we can match them back up with the original dataframe.

In [None]:
X_list = [X_train, X_test]
y_list = y_train, y_test  # kept for compatibility; predicting does not need the targets

# Train once, before the loop. The fit used to sit inside the loop, so the
# forest was retrained on identical data for every partition — and, being
# unseeded, the train rows and the test rows were scored by two different models.
RandomForestRegressorModel = RandomForestRegressor(n_estimators=50, random_state=42)
RandomForestRegressorModel.fit(X_train, y_train)

# Parallel lists: data_dict['Predictions'][i] is the prediction for the row
# whose index label is data_dict['Indexes'][i].
data_dict = {
    'Indexes': [],
    'Predictions': [],
}

# The loop variable is deliberately not named `y`: the old `for x, y in zip(...)`
# silently rebound the notebook-level target `y` to y_test.
# NOTE(review): predictions for the X_train rows are in-sample (the model was
# fitted on exactly those rows), so they will sit closer to the true CTR than
# the X_test predictions — keep that in mind when comparing rows later.
for features in X_list:
    data_dict['Indexes'].extend(list(features.index))
    data_dict['Predictions'].extend(RandomForestRegressorModel.predict(features))

------------------------------------------------------------------------------------------------------------------------

In [None]:
# Preview only the first few entries per key: echoing the whole dict prints one
# index and one prediction for every row of the dataset.
{key: values[:5] for key, values in data_dict.items()}

------------------------------------------------------------------------------------------------------------------------------

### Merging X_train + X_test with y_train + y_test

In [None]:
# Stack the train and test partitions back together, row-wise: one frame of
# (standardised) features and one series of actual CTR values.
merged_X = pd.concat(objs=[X_train, X_test], axis=0)
merged_Y = pd.concat(objs=[y_train, y_test], axis=0)

In [None]:
# Put features and target side by side, aligned on the row index.
merged_df = pd.concat(objs=[merged_X, merged_Y], axis='columns')

In [None]:
# Spot-check the first 12 recombined rows (features are the standardised values).
merged_df.head(12)

----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

### Predicted Click-Through Rates For All Pages

In [None]:
# Turn the collected predictions into a frame keyed by the original row index,
# with a single column holding the predicted CTR in percent.
predicted_data = (
    pd.DataFrame(data_dict)
    .set_index('Indexes')
    .rename(columns={"Predictions": "Predicted_CTR_%"})
)

In [None]:
# Spot-check the first six predictions and their index labels.
predicted_data.head(6)

----------------------------------------------------------------------------------------------------------------------------------------

### Merging the recombined features and actual CTR with the predicted CTR (%).

In [86]:
# Attach the predicted CTR column to the features + actual CTR, aligned on the row index.
final_df = pd.concat(objs=[merged_df, predicted_data], axis='columns')

In [87]:
# Display actual vs. predicted CTR next to the standardised features.
final_df

Unnamed: 0,Clicks,Impressions,Position,CTR,Predicted_CTR_%
595,-0.133582,-0.314960,2.022484,0.00,0.0000
140,-0.123166,0.099501,-0.561524,0.15,0.1642
173,-0.128374,0.105610,-0.715761,0.07,0.0716
6,5.562623,3.851732,-0.269951,8.29,6.8456
585,-0.133582,-0.314679,0.371825,0.00,0.0000
...,...,...,...,...,...
835,-0.133582,-0.317242,-0.314321,0.00,0.0000
193,-0.130110,0.083983,-0.706254,0.05,0.0550
316,-0.133003,-0.228662,-0.663468,0.04,0.0672
581,-0.133582,-0.314504,-0.694633,0.00,0.0000


In [91]:
# Signed gap, predicted minus actual: positive means the model expected a
# higher CTR than the page actually achieved.
final_df['CTR_Difference'] = final_df['Predicted_CTR_%'].sub(final_df['CTR'])

In [None]:
### Now let's subset the data and remove any rows whose CTR was 0.00

In [96]:
# Keep only the rows whose actual CTR is non-zero.
nonzero_ctr_mask = final_df['CTR'].ne(0.00)
final_df = final_df.loc[nonzero_ctr_mask]

Unnamed: 0,Clicks,Impressions,Position,CTR,Predicted_CTR_%,CTR_Difference
140,-0.123166,0.099501,-0.561524,0.15,0.1642,0.0142
173,-0.128374,0.105610,-0.715761,0.07,0.0716,0.0016
6,5.562623,3.851732,-0.269951,8.29,6.8456,-1.4444
73,-0.058937,0.002530,-0.496554,1.42,1.5926,0.1726
145,-0.123745,-0.231015,-0.700971,0.69,0.6940,0.0040
...,...,...,...,...,...,...
164,-0.127217,-0.228311,1.867718,0.43,0.3766,-0.0534
28,0.515660,0.300499,-0.278931,6.38,5.3966,-0.9834
193,-0.130110,0.083983,-0.706254,0.05,0.0550,0.0050
316,-0.133003,-0.228662,-0.663468,0.04,0.0672,0.0272


In [101]:
# Rank rows from largest over-prediction to largest under-prediction
# (display only; final_df itself is left unsorted).
final_df.sort_values('CTR_Difference', ascending=False)

Unnamed: 0,Clicks,Impressions,Position,CTR,Predicted_CTR_%,CTR_Difference
44,0.118130,0.697265,-0.255161,1.51,4.7008,3.1908
8,2.818098,4.847947,-0.530887,3.47,5.5242,2.0542
4,7.562428,11.679617,-0.099339,3.89,5.5130,1.6230
2,9.412364,12.421082,0.527647,4.55,6.1140,1.5640
350,-0.133003,-0.312081,0.465846,0.67,2.1184,1.4484
...,...,...,...,...,...,...
5,5.599078,3.320639,-0.486518,9.56,6.3590,-3.2010
40,0.196826,-0.201488,-0.559939,17.30,13.9258,-3.3742
30,0.424813,0.072502,0.185366,8.69,5.2566,-3.4334
79,-0.067038,-0.276446,0.021621,9.87,5.0988,-4.7712


In [132]:
# Drop the standardised feature columns; only the CTR comparison columns are
# carried into the final report.
ctr_columns = ['CTR', 'Predicted_CTR_%', 'CTR_Difference']
final_df = final_df.loc[:, ctr_columns]

In [133]:
# Page URL plus the raw (unscaled) metrics from the loaded data.
page_columns = ['Page', 'Clicks', 'Impressions', 'Position']
original_data = df.loc[:, page_columns]

In [135]:
# Join the raw page metrics with the CTR comparison on the shared row index.
# merge defaults to an inner join, so only index labels present in both frames survive.
results = original_data.merge(final_df, left_index=True, right_index=True)

In [137]:
# Persist the report next to the notebook (row index is written as the first column).
RESULTS_PATH = 'results.csv'
results.to_csv(RESULTS_PATH)

In [138]:
# Display the exported report.
# NOTE(review): the saved output below lists rows whose CTR is 0.00 although an
# earlier cell filters those out of final_df, and its URLs differ from the ones
# in the first `df` preview — this looks like stale / out-of-order output.
# Re-run from a fresh kernel (Restart & Run All) to confirm what is exported.
results

Unnamed: 0,Page,Clicks,Impressions,Position,CTR,Predicted_CTR_%,CTR_Difference
0,https://buffert.se/basta-aktierna-och-fonderna...,29585,331619,8.86,8.92,8.2136,-0.7064
1,https://buffert.se/loner-i-sverige/,19890,181742,6.31,10.94,9.5772,-1.3628
2,https://buffert.se/hur-mycket-far-jag-lana/,16497,362826,26.61,4.55,6.1140,1.5640
3,https://buffert.se/basta-bankerna-privatpersoner/,13347,147683,22.57,9.04,7.5192,-1.5208
4,https://buffert.se/vaxla-pengar-utomlands-bill...,13300,341707,14.74,3.89,5.5130,1.6230
...,...,...,...,...,...,...,...
904,https://buffert.se/author/svante-hansson/page/5/,0,1,17.00,0.00,0.0000,0.0000
905,https://buffert.se/rantor/#Drojsmalsranta,0,1,17.00,0.00,0.0000,0.0000
906,https://buffert.se/buffert-reser-till-japan-20...,0,1,30.00,0.00,0.0000,0.0000
907,https://buffert.se/fackforbund/#Vanliga-fragor,0,1,82.00,0.00,0.0000,0.0000
