In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.pyplot import figure
import math

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.model_selection import RandomizedSearchCV 
from sklearn.linear_model import LinearRegression
from sklearn.metrics import make_scorer
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn import model_selection
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn import tree, metrics
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_percentage_error

In [2]:
# Load the full dataset from a CSV file given by a path relative to the working directory.
# NOTE(review): the truncated output under this cell looks like the tail of a pandas
# warning (possibly a mixed-dtype warning) — confirm, and consider passing explicit dtypes.
df = pd.read_csv('df_full.csv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [3]:
# Print a summary of the frame: row count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85417 entries, 0 to 85416
Data columns (total 40 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   channelGrouping                 85417 non-null  object 
 1   fullVisitorId                   85417 non-null  object 
 2   visitNumber                     85417 non-null  int64  
 3   browser                         85417 non-null  object 
 4   operatingSystem                 85417 non-null  object 
 5   isMobile                        85417 non-null  int64  
 6   deviceCategory                  85417 non-null  object 
 7   continent                       85417 non-null  object 
 8   subContinent                    85417 non-null  object 
 9   country                         85417 non-null  object 
 10  region                          85417 non-null  object 
 11  metro                           85417 non-null  object 
 12  city                            

In [4]:
# Remove the visitor identifier and the transaction / revenue columns before modelling.
# NOTE(review): presumably dropped because they would leak the target — confirm.
columns_to_drop = [
    'fullVisitorId',
    'transactions',
    'total_transactions',
    'transactionRevenue',
    'totalTransactionRevenue',
]
df = df.drop(columns=columns_to_drop)

In [5]:
# Names of the columns that the label-encoding cell iterates over.
categorical_feat = [
    'channelGrouping',
    'browser_category',
    'browser_operatingSystem',
    'continent',
    'subContinent',
    'source_country',
    'region',
    'metro',
    'city',
    'campaign',
    'medium',
    'keyword',
    'referralPath',
    'adContent',
    'adwordsClickInfo.slot',
    'adwordsClickInfo.adNetworkType',
]

In [6]:
# One-hot encode the frame: with no `columns` argument, pandas converts every
# object / string / category column into indicator columns and leaves the rest as is.
# NOTE(review): this runs before the label-encoding cell, so any column listed in
# `categorical_feat` that was non-numeric no longer exists under its original name — confirm
# which of the two encodings is intended.
df = pd.get_dummies(df)

## Label Encoder

In [None]:
# Label-encode each column named in `categorical_feat`, replacing its values in `df`
# with integer codes.
#
# Columns that are not present in `df` are skipped with a message instead of raising
# KeyError: `pd.get_dummies(df)` runs earlier in the notebook and replaces non-numeric
# columns with indicator columns, so a listed name may no longer exist.
for feature in categorical_feat:
    if feature not in df.columns:
        print("for this feature : {0} label-encoding was skipped (column not in df)".format(feature))
        continue
    label_encoder = preprocessing.LabelEncoder()
    # Cast to str once so mixed-type values (including NaN) are encoded as plain labels,
    # then fit and transform in a single pass.
    values_as_str = list(df[feature].values.astype('str'))
    df[feature] = label_encoder.fit_transform(values_as_str)
    print("for this feature : {0} label-encoding was done succesfully".format(feature))

In [8]:
# Separate the target column from the feature matrix.
y = df['transaction_Complete']
X = df.drop(columns=['transaction_Complete'])

In [None]:
# Hold out 30% of the rows as a test split; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=47)

## Standardize

In [None]:
# Fit the scaler on the training split only, then apply that same transformation
# to both splits so no test-set statistics are used for scaling.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

## Linear Regression

In [None]:
# feature selection
def select_features(X_train, y_train, X_test, max_features=5, n_estimators=1000,
                    random_state=None, n_jobs=None):
    """Select features by random-forest importance and apply the selection to both splits.

    A ``RandomForestRegressor`` is fitted on the training split inside a
    ``SelectFromModel`` wrapper; the fitted selector is then used to reduce both
    the training and the test matrix to the selected columns.

    Parameters
    ----------
    X_train, y_train : training features and target used to fit the selector.
    X_test : test features; only transformed, never used for fitting.
    max_features : upper bound on the number of selected features (default 5).
        ``SelectFromModel`` is left at its default threshold, so fewer features
        may be kept if fewer pass that threshold.
    n_estimators : number of trees in the forest (default 1000).
    random_state : seed forwarded to the forest; ``None`` (default) keeps the
        original unseeded behaviour, pass an int for reproducible selections.
    n_jobs : parallelism forwarded to the forest (default ``None``).

    Returns
    -------
    (X_train_fs, X_test_fs, fs) : the reduced training matrix, the reduced test
        matrix and the fitted ``SelectFromModel`` instance.
    """
    forest = RandomForestRegressor(
        n_estimators=n_estimators,
        random_state=random_state,
        n_jobs=n_jobs,
    )
    fs = SelectFromModel(forest, max_features=max_features)
    fs.fit(X_train, y_train)
    X_train_fs = fs.transform(X_train)
    X_test_fs = fs.transform(X_test)
    return X_train_fs, X_test_fs, fs

# Fit the selector on the training split and reduce both splits to the selected columns.
X_train_fs, X_test_fs, fs = select_features(X_train, y_train, X_test)

In [None]:
# Ordinary least squares on the selected features; predictions are made for the test split.
model = LinearRegression().fit(X_train_fs, y_train)
y_pred = model.predict(X_test_fs)

In [None]:
# Score the test-set predictions with three regression metrics and print each one.
mse, mae, r2 = (
    scorer(y_test, y_pred)
    for scorer in (mean_squared_error, mean_absolute_error, r2_score)
)
for label, value in (('MSE: ', mse), ('MAE: ', mae), ('r2 Score: ', r2)):
    print(label, value)

In [None]:
# Predicted vs. measured values; the dashed diagonal marks perfect agreement.
low, high = y_test.min(), y_test.max()
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred, edgecolors=(0, 0, 0))
ax.plot([low, high], [low, high], 'k--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
plt.show()

## Decision Tree

In [None]:
# Base estimator and the hyper-parameter grid sampled by the randomized search.
# NOTE(review): 'auto' (max_features) and 'mse' / 'mae' (criterion) are legacy option
# names that newer scikit-learn releases reject — confirm against the installed version.
model = DecisionTreeRegressor()

parameters = dict(
    max_features=['auto', 'sqrt', 'log2'],
    max_depth=[3, 4, 5, 10, 15, 20, 25, 50, 75, 100, 125, 150, 200, None],
    min_samples_split=[3, 10, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700],
    min_samples_leaf=[1, 2, 3, 4, 5, 10, 20, 25, 30, 50, 75, 100],
    criterion=['mse', 'friedman_mse', 'mae', 'poisson'],
    splitter=['best', 'random'],
)

In [None]:
dt_random = RandomizedSearchCV(estimator = model, param_distributions = parameters, n_iter = 10, cv = 5, 
                               verbose=False, random_state=42, n_jobs = -1)
dt_random.fit(X, y)

print(" Results from Random Search " )
print("\n The best estimator across ALL searched params:\n", dt_random.best_estimator_)

In [None]:
y_pred = dt_random.predict(X)

In [None]:
mse = mean_squared_error(y, y_pred)
mae = mean_absolute_error(y, y_pred)
r2 = r2_score(y, y_pred)
print('MSE: ', mse)
print('MAE: ', mae)
print('r2 Score: ', r2)

In [None]:
fig, ax = plt.subplots()
ax.scatter(y, y_pred, edgecolors=(0, 0, 0))
ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
plt.show()

## Random Forest Regression

In [None]:
# Base estimator and the hyper-parameter grid sampled by the randomized search.
# NOTE(review): 'auto' (max_features) and 'mse' / 'mae' (criterion) are legacy option
# names that newer scikit-learn releases reject — confirm against the installed version.
model = RandomForestRegressor()

parameters = dict(
    n_estimators=[5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 35, 40, 45, 50, 75, 100, 200],
    max_features=['auto', 'sqrt', 'log2'],
    max_depth=[3, 4, 5, 10, 15, 20, 25, 50, 75, 100, 125, 150, 200, None],
    min_samples_split=[3, 10, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700],
    min_samples_leaf=[1, 2, 3, 4, 5, 10, 20, 25, 30, 50, 75, 100],
    criterion=['mse', 'mae'],
)


In [None]:
# Sample 10 hyper-parameter combinations, score each with 5-fold CV on the training
# split using all cores, and keep the fitted search object.
rf_random = RandomizedSearchCV(
    model,
    parameters,
    n_iter=10,
    cv=5,
    verbose=False,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

print(" Results from Random Search " )
print("\n The best estimator across ALL searched params:\n", rf_random.best_estimator_)

In [None]:
# Predict the target for the held-out test rows with the fitted random-forest search.
y_pred = rf_random.predict(X_test)

In [None]:
# Score the test-set predictions with three regression metrics and print each one.
mse, mae, r2 = (
    scorer(y_test, y_pred)
    for scorer in (mean_squared_error, mean_absolute_error, r2_score)
)
for label, value in (('MSE: ', mse), ('MAE: ', mae), ('r2 Score: ', r2)):
    print(label, value)

In [None]:
# Predicted vs. measured values; the dashed diagonal marks perfect agreement.
low, high = y_test.min(), y_test.max()
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred, edgecolors=(0, 0, 0))
ax.plot([low, high], [low, high], 'k--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
plt.show()

## Gradient Boosting Regression

In [None]:
# Base estimator and the hyper-parameter grid sampled by the randomized search.
# NOTE(review): 'ls' / 'lad' (loss), 'auto' (max_features) and 'mse' / 'mae' (criterion)
# are legacy option names that newer scikit-learn releases reject — confirm against the
# installed version.
model = GradientBoostingRegressor()

parameters = dict(
    loss=['ls', 'lad', 'huber', 'quantile'],
    learning_rate=[0.001, 0.01, 0.05, 0.1, 0.2, 0.3],
    n_estimators=[5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 35, 40, 45, 50, 75, 100, 200],
    max_features=['auto', 'sqrt', 'log2'],
    max_depth=[3, 4, 5, 10, 15, 20, 25, 50, 75, 100, 125, 150, 200, None],
    min_samples_split=[3, 10, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700],
    min_samples_leaf=[1, 2, 3, 4, 5, 10, 20, 25, 30, 50, 75, 100],
    criterion=['friedman_mse', 'mse', 'mae'],
)

In [None]:
# Sample 10 hyper-parameter combinations, score each with 5-fold CV on the training
# split using all cores, and keep the fitted search object.
gb_random = RandomizedSearchCV(
    model,
    parameters,
    n_iter=10,
    cv=5,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

print(" Results from Random Search " )
print("\n The best estimator across ALL searched params:\n", gb_random.best_estimator_)

In [None]:
# Predict the target for the held-out test rows with the fitted gradient-boosting search.
y_pred = gb_random.predict(X_test)

In [None]:
# Score the test-set predictions with three regression metrics and print each one.
mse, mae, r2 = (
    scorer(y_test, y_pred)
    for scorer in (mean_squared_error, mean_absolute_error, r2_score)
)
for label, value in (('MSE: ', mse), ('MAE: ', mae), ('r2 Score: ', r2)):
    print(label, value)

In [None]:
# Predicted vs. measured values; the dashed diagonal marks perfect agreement.
low, high = y_test.min(), y_test.max()
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred, edgecolors=(0, 0, 0))
ax.plot([low, high], [low, high], 'k--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
plt.show()

## MLPRegressor

In [None]:
# Base network (capped at 100 iterations) and the hyper-parameter grid sampled by the
# randomized search.
model = MLPRegressor(max_iter=100)

parameters = dict(
    hidden_layer_sizes=[(50, 50, 50), (50, 100, 50), (100,)],
    alpha=[0.001, 0.01, 0.05, 0.1, 0.2, 0.3],
    activation=['identity', 'logistic', 'tanh', 'relu'],
    solver=['lbfgs', 'sgd', 'adam'],
    learning_rate=['constant', 'invscaling', 'adaptive'],
)

In [None]:
# Randomized search with 5-fold CV on the training split using all cores; the number
# of sampled candidates is left at the RandomizedSearchCV default.
mlp_random = RandomizedSearchCV(
    model,
    parameters,
    cv=5,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

print(" Results from Random Search " )
print("\n The best estimator across ALL searched params:\n", mlp_random.best_estimator_)

In [None]:
# Predict the target for the held-out test rows with the fitted MLP search.
y_pred = mlp_random.predict(X_test)

In [None]:
# Score the test-set predictions with three regression metrics and print each one.
mse, mae, r2 = (
    scorer(y_test, y_pred)
    for scorer in (mean_squared_error, mean_absolute_error, r2_score)
)
for label, value in (('MSE: ', mse), ('MAE: ', mae), ('r2 Score: ', r2)):
    print(label, value)

In [None]:
# Predicted vs. measured values; the dashed diagonal marks perfect agreement.
low, high = y_test.min(), y_test.max()
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred, edgecolors=(0, 0, 0))
ax.plot([low, high], [low, high], 'k--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
plt.show()

# PCA

In [None]:
from sklearn.decomposition import PCA

# A float n_components between 0 and 1 asks PCA to keep as many components as are needed
# to explain that fraction of the variance (here 95%), per the scikit-learn documentation.
pca = PCA(n_components=0.95)
# Fit the projection on the training split only and reuse it for the test split.
# NOTE: X_train / X_test are overwritten here, so re-running this cell applies PCA to the
# already-projected data, and the earlier non-PCA sections can no longer be re-run
# without first re-creating the scaled splits.
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

## Decision Tree Regressor With PCA

In [None]:
# Base estimator and the hyper-parameter grid sampled by the randomized search.
# NOTE(review): 'auto' (max_features) and 'mse' / 'mae' (criterion) are legacy option
# names that newer scikit-learn releases reject — confirm against the installed version.
model = DecisionTreeRegressor()

parameters = dict(
    max_features=['auto', 'sqrt', 'log2'],
    max_depth=[3, 4, 5, 10, 15, 20, 25, 50, 75, 100, 125, 150, 200, None],
    min_samples_split=[3, 10, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700],
    min_samples_leaf=[1, 2, 3, 4, 5, 10, 20, 25, 30, 50, 75, 100],
    criterion=['mse', 'friedman_mse', 'mae', 'poisson'],
    splitter=['best', 'random'],
)

In [None]:
# Sample 10 hyper-parameter combinations, score each with 5-fold CV on the training
# split using all cores (progress logged at verbosity 2), and keep the fitted search object.
dt_random = RandomizedSearchCV(
    model,
    parameters,
    n_iter=10,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

print(" Results from Random Search " )
print("\n The best estimator across ALL searched params:\n", dt_random.best_estimator_)

In [None]:
# Predict the target for the held-out test rows with the fitted decision-tree search.
y_pred = dt_random.predict(X_test)

In [None]:
# Score the test-set predictions with three regression metrics and print each one.
mse, mae, r2 = (
    scorer(y_test, y_pred)
    for scorer in (mean_squared_error, mean_absolute_error, r2_score)
)
for label, value in (('MSE: ', mse), ('MAE: ', mae), ('r2 Score: ', r2)):
    print(label, value)

In [None]:
# Predicted vs. measured values; the dashed diagonal marks perfect agreement.
low, high = y_test.min(), y_test.max()
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred, edgecolors=(0, 0, 0))
ax.plot([low, high], [low, high], 'k--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
plt.show()

## Random Forest Regression With PCA

In [None]:
# Base estimator and the hyper-parameter grid sampled by the randomized search.
# NOTE(review): 'auto' (max_features) and 'mse' / 'mae' (criterion) are legacy option
# names that newer scikit-learn releases reject — confirm against the installed version.
model = RandomForestRegressor()

parameters = dict(
    n_estimators=[5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 35, 40, 45, 50, 75, 100, 200],
    max_features=['auto', 'sqrt', 'log2'],
    max_depth=[3, 4, 5, 10, 15, 20, 25, 50, 75, 100, 125, 150, 200, None],
    min_samples_split=[3, 10, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700],
    min_samples_leaf=[1, 2, 3, 4, 5, 10, 20, 25, 30, 50, 75, 100],
    criterion=['mse', 'mae'],
)


In [None]:
# Sample 10 hyper-parameter combinations, score each with 5-fold CV on the training
# split using all cores (progress logged at verbosity 2), and keep the fitted search object.
rf_random = RandomizedSearchCV(
    model,
    parameters,
    n_iter=10,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

print(" Results from Random Search " )
print("\n The best estimator across ALL searched params:\n", rf_random.best_estimator_)

In [None]:
# Predict the target for the held-out test rows with the fitted random-forest search.
y_pred = rf_random.predict(X_test)

In [None]:
# Score the test-set predictions with three regression metrics and print each one.
mse, mae, r2 = (
    scorer(y_test, y_pred)
    for scorer in (mean_squared_error, mean_absolute_error, r2_score)
)
for label, value in (('MSE: ', mse), ('MAE: ', mae), ('r2 Score: ', r2)):
    print(label, value)

In [None]:
# Predicted vs. measured values; the dashed diagonal marks perfect agreement.
low, high = y_test.min(), y_test.max()
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred, edgecolors=(0, 0, 0))
ax.plot([low, high], [low, high], 'k--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
plt.show()

## Gradient Boosting Regression With PCA

In [None]:
# Base estimator and the hyper-parameter grid sampled by the randomized search.
# NOTE(review): 'ls' / 'lad' (loss), 'auto' (max_features) and 'mse' / 'mae' (criterion)
# are legacy option names that newer scikit-learn releases reject — confirm against the
# installed version.
model = GradientBoostingRegressor()

parameters = dict(
    loss=['ls', 'lad', 'huber', 'quantile'],
    learning_rate=[0.001, 0.01, 0.05, 0.1, 0.2, 0.3],
    n_estimators=[5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 35, 40, 45, 50, 75, 100, 200],
    max_features=['auto', 'sqrt', 'log2'],
    max_depth=[3, 4, 5, 10, 15, 20, 25, 50, 75, 100, 125, 150, 200, None],
    min_samples_split=[3, 10, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700],
    min_samples_leaf=[1, 2, 3, 4, 5, 10, 20, 25, 30, 50, 75, 100],
    criterion=['friedman_mse', 'mse', 'mae'],
)

In [None]:
# Sample 10 hyper-parameter combinations, score each with 5-fold CV on the training
# split using all cores, and keep the fitted search object.
gb_random = RandomizedSearchCV(
    model,
    parameters,
    n_iter=10,
    cv=5,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

print(" Results from Random Search " )
print("\n The best estimator across ALL searched params:\n", gb_random.best_estimator_)

In [None]:
# Predict the target for the held-out test rows with the fitted gradient-boosting search.
y_pred = gb_random.predict(X_test)

In [None]:
# Score the test-set predictions with three regression metrics and print each one.
mse, mae, r2 = (
    scorer(y_test, y_pred)
    for scorer in (mean_squared_error, mean_absolute_error, r2_score)
)
for label, value in (('MSE: ', mse), ('MAE: ', mae), ('r2 Score: ', r2)):
    print(label, value)

In [None]:
# Predicted vs. measured values; the dashed diagonal marks perfect agreement.
low, high = y_test.min(), y_test.max()
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred, edgecolors=(0, 0, 0))
ax.plot([low, high], [low, high], 'k--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
plt.show()

## MLPRegressor With PCA

In [None]:
# Base network (capped at 100 iterations) and the hyper-parameter grid sampled by the
# randomized search.
model = MLPRegressor(max_iter=100)

parameters = dict(
    hidden_layer_sizes=[(50, 50, 50), (50, 100, 50), (100,)],
    alpha=[0.001, 0.01, 0.05, 0.1, 0.2, 0.3],
    activation=['identity', 'logistic', 'tanh', 'relu'],
    solver=['lbfgs', 'sgd', 'adam'],
    learning_rate=['constant', 'invscaling', 'adaptive'],
)

In [None]:
# Randomized search with 5-fold CV on the training split using all cores; the number
# of sampled candidates is left at the RandomizedSearchCV default.
mlp_random = RandomizedSearchCV(
    model,
    parameters,
    cv=5,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

print(" Results from Random Search " )
print("\n The best estimator across ALL searched params:\n", mlp_random.best_estimator_)

In [None]:
# Predict the target for the held-out test rows with the fitted MLP search.
y_pred = mlp_random.predict(X_test)

In [None]:
# Score the test-set predictions with three regression metrics and print each one.
mse, mae, r2 = (
    scorer(y_test, y_pred)
    for scorer in (mean_squared_error, mean_absolute_error, r2_score)
)
for label, value in (('MSE: ', mse), ('MAE: ', mae), ('r2 Score: ', r2)):
    print(label, value)

In [None]:
# Predicted vs. measured values; the dashed diagonal marks perfect agreement.
low, high = y_test.min(), y_test.max()
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred, edgecolors=(0, 0, 0))
ax.plot([low, high], [low, high], 'k--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
plt.show()

## Best Model Selection