In [1]:
# Import needed libraries
import pandas as pd
import math, datetime
import numpy as np
from sklearn import preprocessing, svm
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from matplotlib import style
import pickle
import re
from sklearn.cluster import KMeans

style.use('ggplot')
%matplotlib inline

In [2]:
df = pd.read_csv('data.csv')

In [3]:
df

Unnamed: 0,Page,Clicks,Impressions,CTR,Position
0,https://www.beautydiaries.gr/omorfoxamos/takal...,747,8158,9.16%,21.99
1,https://www.beautydiaries.gr/,683,3166,21.57%,12.21
2,https://www.beautydiaries.gr/omorfoxamos/takal...,430,3785,11.36%,10.81
3,https://www.beautydiaries.gr/omorfoxamos/takal...,248,8620,2.88%,29.88
4,https://www.beautydiaries.gr/omorfoxamos/takal...,135,1134,11.9%,9.73
5,https://www.beautydiaries.gr/beauty-news-ta-ne...,90,879,10.24%,15.15
6,https://www.beautydiaries.gr/beauty-news-ta-ne...,68,1172,5.8%,9.70
7,https://www.beautydiaries.gr/beauty-news-ta-ne...,59,617,9.56%,8.73
8,https://www.beautydiaries.gr/omorfoxamos/takal...,59,477,12.37%,16.99
9,https://www.beautydiaries.gr/apopsi/tade-efi/t...,51,845,6.04%,8.55


In [4]:
# Cleaning The Data
df['CTR'] = df['CTR'].apply(lambda x: float(x.replace('%', '')))

In [5]:
X = df[['Clicks', 'Impressions', 'Position']]
y = df.pop('CTR')

In [6]:
# Train Test Split 
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.33, random_state=42)

In [7]:
# Standardise The Data 
from sklearn.preprocessing import StandardScaler

In [8]:
scaler = StandardScaler()
scaler = scaler.fit(X_train)

In [9]:
X_train = pd.DataFrame(scaler.transform(X_train), columns = X_train.columns, index = X_train.index)

In [10]:
X_test = pd.DataFrame(scaler.transform(X_test), columns = X_test.columns, index = X_test.index)

In [11]:
# Import different Algorithms to see differences between their predictions
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [12]:
Linear_regression = LinearRegression()

In [13]:
Linear_regression.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [14]:
Linear_regression.score(X_train, y_train)

0.04587699725091532

In [15]:
Linear_regression.score(X_test, y_test)

-0.3732922110193282

In [16]:
RandomForestRegressorModel = RandomForestRegressor(n_estimators=50)
RandomForestRegressorModel.fit(X_train, y_train)
prediction_score = RandomForestRegressorModel.score(X_train, y_train)
test_score = RandomForestRegressorModel.score(X_test, y_test)

print(prediction_score, test_score)

0.994667472140154 0.9552351062920202


In [17]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
pipeline = make_pipeline(RandomForestRegressor(n_estimators=200))
 
# Declare hyperparameters to tune
hyperparameters = { 'randomforestregressor__max_features' : ['auto', 'sqrt', 'log2'],
                  'randomforestregressor__max_depth': [5, 3, None]}
 
# Tune model using cross-validation pipeline
RandomForestRegressorModel = GridSearchCV(pipeline, hyperparameters, cv=5)
 
RandomForestRegressorModel.fit(X_train, y_train)
prediction_score = RandomForestRegressorModel.score(X_train, y_train)
print("The score of prediction for RandomForestRegressorModel is: {}".format(prediction_score))

The score of prediction for RandomForestRegressorModel is: 0.9967465853173039


### Let's Create A Function That Combines The Predictions For X_train, X_test and also contains the index number so that we can match it back up with the original dataframe.

In [18]:
X_list = [X_train, X_test]
y_list = y_train, y_test

RandomForestRegressorModel = RandomForestRegressor(n_estimators=50)

data_dict = {
    
    'Indexes': [],
    'Predictions': []    
    
}

for x, y in zip(X_list, y_list):
    
    indexes = list(x.index)
    RandomForestRegressorModel.fit(X_train, y_train)
    predictions = RandomForestRegressorModel.predict(x)
    
    data_dict['Indexes'].extend(indexes)
    data_dict['Predictions'].extend(predictions)

------------------------------------------------------------------------------------------------------------------------

In [19]:
data_dict

{'Indexes': [703,
  311,
  722,
  629,
  0,
  316,
  706,
  547,
  872,
  532,
  477,
  404,
  172,
  125,
  394,
  420,
  552,
  903,
  90,
  939,
  181,
  274,
  895,
  69,
  291,
  131,
  300,
  424,
  326,
  144,
  423,
  580,
  135,
  450,
  164,
  28,
  773,
  193,
  388,
  852,
  169,
  705,
  140,
  173,
  6,
  745,
  478,
  73,
  910,
  813,
  238,
  145,
  792,
  234,
  220,
  923,
  500,
  132,
  990,
  774,
  185,
  41,
  696,
  108,
  588,
  56,
  405,
  442,
  757,
  997,
  24,
  467,
  539,
  531,
  618,
  694,
  926,
  338,
  51,
  507,
  516,
  920,
  781,
  264,
  817,
  710,
  682,
  832,
  518,
  447,
  18,
  715,
  483,
  568,
  433,
  367,
  83,
  61,
  638,
  272,
  285,
  360,
  354,
  456,
  278,
  12,
  182,
  368,
  881,
  615,
  223,
  572,
  970,
  653,
  545,
  582,
  633,
  176,
  665,
  673,
  585,
  873,
  393,
  163,
  248,
  634,
  885,
  669,
  375,
  412,
  74,
  113,
  598,
  961,
  390,
  104,
  114,
  417,
  525,
  457,
  409,
  92,
  930,
  89,


------------------------------------------------------------------------------------------------------------------------------

### Merging X_train + X_test with Y_train + Y_test

In [20]:
merged_X = pd.concat([X_train, X_test])
merged_Y = pd.concat([y_train, y_test])

In [21]:
merged_df = pd.concat([merged_X, merged_Y], axis = 1)

In [22]:
merged_df.head(12)

Unnamed: 0,Clicks,Impressions,Position,CTR
703,-0.125275,-0.237756,-0.93101,0.0
311,-0.100183,-0.051416,-0.256306,1.15
722,-0.125275,-0.240418,0.568934,0.0
629,-0.125275,-0.205812,-0.714698,0.0
0,18.618823,21.433572,0.025781,9.16
316,-0.100183,-0.078036,-0.101023,1.3
706,-0.125275,-0.237756,0.145803,0.0
547,-0.125275,-0.144586,3.809544,0.0
872,-0.125275,-0.264376,0.491631,0.0
532,-0.125275,-0.125952,0.269216,0.0


----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

### Predicted Data For Click Through Ratio's For All Pages

In [23]:
predicted_data = pd.DataFrame(data_dict)
predicted_data.set_index(predicted_data['Indexes'], inplace = True)
predicted_data.drop(columns=['Indexes'], inplace = True)
predicted_data.rename(columns={"Predictions": "Predicted_CTR_%"}, inplace = True)

In [24]:
predicted_data.head(6)

Unnamed: 0_level_0,Predicted_CTR_%
Indexes,Unnamed: 1_level_1
703,0.0
311,1.1372
722,0.0
629,0.0
0,10.6292
316,1.2842


----------------------------------------------------------------------------------------------------------------------------------------

### Merging The Original Dataframe to the predicted CTR (%).

In [27]:
final_df = pd.concat([merged_df, predicted_data], axis=1)

In [28]:
final_df

Unnamed: 0,Clicks,Impressions,Position,CTR,Predicted_CTR_%
703,-0.125275,-0.237756,-0.931010,0.00,0.0000
311,-0.100183,-0.051416,-0.256306,1.15,1.1372
722,-0.125275,-0.240418,0.568934,0.00,0.0000
629,-0.125275,-0.205812,-0.714698,0.00,0.0000
0,18.618823,21.433572,0.025781,9.16,10.6292
316,-0.100183,-0.078036,-0.101023,1.30,1.2842
706,-0.125275,-0.237756,0.145803,0.00,0.0000
547,-0.125275,-0.144586,3.809544,0.00,0.0000
872,-0.125275,-0.264376,0.491631,0.00,0.0000
532,-0.125275,-0.125952,0.269216,0.00,0.0000


In [29]:
final_df['CTR_Difference'] = final_df['Predicted_CTR_%'] - final_df['CTR']

In [30]:
### Now let's subset on the data and remove any CTR's that were 0.00

In [31]:
final_df = final_df[final_df['CTR'] != 0.00]

In [32]:
final_df.sort_values(by='CTR_Difference', ascending=False)

Unnamed: 0,Clicks,Impressions,Position,CTR,Predicted_CTR_%,CTR_Difference
469,-0.100183,-0.277686,-1.397538,50.00,72.0000,22.0000
3,6.097665,22.663415,0.560797,2.88,11.2788,8.3988
2,10.664500,9.792653,-0.732328,11.36,16.9944,5.6344
9,1.154442,1.966378,-0.885577,6.04,8.6016,2.5616
6,1.581015,2.836852,-0.807597,5.80,7.9446,2.1446
455,-0.100183,-0.267038,0.026459,16.67,18.4542,1.7842
17,0.778055,2.227254,-0.781151,3.82,5.2978,1.4778
0,18.618823,21.433572,0.025781,9.16,10.6292,1.4692
10,1.104257,1.827954,-0.533647,6.18,7.5774,1.3974
440,-0.100183,-0.251066,0.088843,8.33,9.5070,1.1770


In [33]:
final_df = final_df[['CTR', 'Predicted_CTR_%', 'CTR_Difference']]

In [34]:
original_data = df[['Page', 'Clicks', 'Impressions', 'Position']]

In [35]:
results = pd.merge(original_data, final_df, left_index=True, right_index=True)

In [36]:
results.to_csv('results.csv')

In [138]:
results

Unnamed: 0,Page,Clicks,Impressions,Position,CTR,Predicted_CTR_%,CTR_Difference
0,https://buffert.se/basta-aktierna-och-fonderna...,29585,331619,8.86,8.92,8.2136,-0.7064
1,https://buffert.se/loner-i-sverige/,19890,181742,6.31,10.94,9.5772,-1.3628
2,https://buffert.se/hur-mycket-far-jag-lana/,16497,362826,26.61,4.55,6.1140,1.5640
3,https://buffert.se/basta-bankerna-privatpersoner/,13347,147683,22.57,9.04,7.5192,-1.5208
4,https://buffert.se/vaxla-pengar-utomlands-bill...,13300,341707,14.74,3.89,5.5130,1.6230
...,...,...,...,...,...,...,...
904,https://buffert.se/author/svante-hansson/page/5/,0,1,17.00,0.00,0.0000,0.0000
905,https://buffert.se/rantor/#Drojsmalsranta,0,1,17.00,0.00,0.0000,0.0000
906,https://buffert.se/buffert-reser-till-japan-20...,0,1,30.00,0.00,0.0000,0.0000
907,https://buffert.se/fackforbund/#Vanliga-fragor,0,1,82.00,0.00,0.0000,0.0000
