# Project - Google Analytics Customer Revenue Preprocessing

## Presenting the initial data: 

<b>Data Fields: </b>

<b>fullVisitorId</b> - A unique identifier for each user of the Google Merchandise Store. <br>
<b>channelGrouping</b> - The channel via which the user came to the Store.<br>
<b>date</b> - The date on which the user visited the Store.<br>
<b>device </b>- The specifications for the device used to access the Store.<br>
<b>geoNetwork</b> - This section contains information about the geography of the user.<br>
<b>sessionId</b> - A unique identifier for this visit to the store.<br>
<b>socialEngagementType</b> - Engagement type, either "Socially Engaged" or "Not Socially Engaged".<br>
<b>totals</b> - This section contains aggregate values across the session.<br>
<b>trafficSource</b> - This section contains information about the Traffic Source from which the session originated.<br>
<b>visitId</b> - An identifier for this session. This is part of the value usually stored as the _utmb cookie. This is only unique to the user. For a completely unique ID, you should use a combination of fullVisitorId and visitId.<br>
<b>visitNumber</b> - The session number for this user. If this is the first session, then this is set to 1.<br>
<b>visitStartTime</b> - The timestamp (expressed as POSIX time).<br>

# Objectives: 

The main objectives of this project are :

* Load the data so everything is in tabular format (some columns contain JSON, so you will need to find ways to separate those into independent columns)
* Identify the variables that need special processing (removing or inferring missing values, removing columns that don't contain useful information)
* Run visualizations to better understand the data

## Importing necessary libraries

In [None]:
#import libraries
import os
import random 
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json 

from pandas.io.json import json_normalize 

from datetime import datetime

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge

from statsmodels.api import OLS


plt.style.use('fivethirtyeight')

In [None]:
#load data
# fullVisitorId is an identifier, not a quantity: read it as text so pandas
# never coerces it to a numeric dtype (which could drop leading zeros or lose
# precision and merge distinct visitors in the later groupby).
df = pd.read_csv(
    "s3://full-stack-bigdata-datasets/Machine Learning Supervisé/projects/preprocessing_linear_models/Google_dataset_sample.csv",
    dtype={"fullVisitorId": str},
)

In [None]:
# Let pandas display up to 500 columns so wide frames are not truncated in cell output.
pd.set_option('display.max_columns', 500)

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
#deal with date
def date_process(df):
    """Add calendar features derived from the visit date and start time.

    ``date`` is parsed from ``YYYYMMDD`` into a datetime column, and
    ``_weekday``, ``_day``, ``_month`` and ``_year`` are extracted from it.
    ``_visitHour`` is the hour of ``visitStartTime`` as returned by
    ``datetime.fromtimestamp``, i.e. expressed in the local timezone of the
    machine running the code.

    The frame is modified in place and the same object is returned.
    """
    parsed = pd.to_datetime(df["date"], format="%Y%m%d")
    df["date"] = parsed

    # Insertion order of this mapping fixes the order of the new columns.
    calendar_parts = {
        "_weekday": parsed.dt.weekday,
        "_day": parsed.dt.day,
        "_month": parsed.dt.month,
        "_year": parsed.dt.year,
    }
    for name, values in calendar_parts.items():
        df[name] = values

    start_hours = df["visitStartTime"].apply(
        lambda stamp: datetime.fromtimestamp(stamp).hour
    )
    df["_visitHour"] = start_hours.astype(int)

    return df

In [None]:
# date_process adds its columns to `df` in place and returns that same object,
# so `df_train` and `df` refer to the same DataFrame from here on.
df_train = date_process(df)

In [None]:
#deal with missing values
def NumericalColumns(df):
    """Fill missing values and fix the dtypes of the numeric session columns.

    Missing values are replaced by a per-column default (1 page view,
    0 new visits, 0 bounces, ``False`` for ``isTrueDirect``, ``True`` for
    ``isVideoAd``, 0.0 revenue); the count columns are then cast to ``int``
    and ``totals.hits`` / ``totals.transactionRevenue`` to ``float``.

    The frame is modified in place and the same object is returned.
    """
    fill_defaults = {
        "totals.pageviews": 1,
        "totals.newVisits": 0,
        "totals.bounces": 0,
        "trafficSource.isTrueDirect": False,
        "trafficSource.adwordsClickInfo.isVideoAd": True,
        "totals.transactionRevenue": 0.0,
    }
    # Assign the filled column back instead of `df[col].fillna(..., inplace=True)`:
    # the in-place form works on a temporary selection, which pandas warns
    # about and which leaves `df` untouched once Copy-on-Write is enabled.
    for column, default in fill_defaults.items():
        df[column] = df[column].fillna(default)

    target_dtypes = {
        "totals.transactionRevenue": float,
        "totals.pageviews": int,
        "totals.newVisits": int,
        "totals.bounces": int,
        "totals.hits": float,
        "totals.visits": int,
    }
    for column, dtype in target_dtypes.items():
        df[column] = df[column].astype(dtype)

    return df

In [None]:
#normalize data
def Normalizing(df):
    """Rescale ``totals.hits`` to [0, 1] and log-transform the revenue.

    ``totals.hits`` is min-max scaled; when every row has the same value the
    column is set to 0.0 instead of dividing by zero. The revenue column is
    replaced by ``log10(revenue + 1)``.

    The frame is modified in place and the same object is returned.
    """
    hits = df["totals.hits"]
    lowest = hits.min()
    span = hits.max() - lowest
    if span == 0:
        # Constant column: 0 / 0 would fill the feature with NaN.
        df["totals.hits"] = 0.0
    else:
        df["totals.hits"] = (hits - lowest) / span

    # Use the frame passed in, not the notebook-level `df_train` global, so
    # the function works on any frame and does not need that global to exist.
    df["totals.transactionRevenue"] = np.log10(df["totals.transactionRevenue"] + 1)

    return df

In [None]:
# Impute and cast the numeric columns first, then rescale hits and
# log-transform revenue on the result.
df_train = Normalizing(NumericalColumns(df_train))

In [None]:
# Preview the frame after NumericalColumns and Normalizing have been applied.
df_train.head()

In [None]:
# clean up the data
# Columns removed before modelling.
columns_to_drop = [
    "date",
    "sessionId",
    "visitId",
    "visitNumber",
    "visitStartTime",
    "geoNetwork.region",
    "geoNetwork.metro",
    "geoNetwork.city",
    "geoNetwork.networkDomain",
    "trafficSource.source",
    "trafficSource.medium",
    "trafficSource.isTrueDirect",
    "trafficSource.adwordsClickInfo.isVideoAd",
    "trafficSource.campaignCode",
    "geoNetwork.continent",
    "geoNetwork.subContinent",
    "_day",
]
df_clean = df_train.drop(columns=columns_to_drop)

# Calendar features are cast to strings so they are one-hot encoded below
# rather than used as numbers.
transform_to_string = ["_weekday", "_month", "_year", "_visitHour"]
df_clean = df_clean.astype({name: str for name in transform_to_string})

# The visitor id is set aside so it is not dummy-encoded; it is re-attached at the end.
df_id = df_clean["fullVisitorId"]
df_no_id = df_clean.drop(columns="fullVisitorId")

object_variables = df_no_id.select_dtypes(include="object")
non_object_variables = df_no_id.select_dtypes(exclude="object")

# For each categorical column, list the levels present in more than 1 % of the rows.
row_count = len(object_variables)
category_to_replace = [
    [
        level
        for level, count in object_variables[name].value_counts().items()
        if count / row_count > 0.01
    ]
    for name in object_variables
]

# Every other level is lumped into a single "others" category.
for name, frequent_levels in zip(object_variables.columns, category_to_replace):
    is_frequent = object_variables[name].isin(frequent_levels)
    object_variables[name] = np.where(is_frequent, object_variables[name], "others")

df_no_id = pd.concat([object_variables, non_object_variables], axis=1)

df_clean = pd.get_dummies(df_no_id, drop_first=True)
df_clean["fullVisitorId"] = df_id
df_clean.head()

In [None]:
#group and aggregate
# Collapse the session-level rows into one row per visitor by summing every column.
sessions_by_visitor = df_clean.groupby("fullVisitorId")
df_agg = sessions_by_visitor.sum()
df_agg.head()

In [None]:
#separate target and training variables
# The aggregated revenue column is the regression target; every other column is a feature.
target_column = "totals.transactionRevenue"
y = df_agg[target_column]
X = df_agg.drop(columns=[target_column])

In [None]:
#split data
# Hold out 30 % of the rows for evaluation. The fixed seed makes the split,
# and therefore every score reported below, reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [None]:
#normalize data
# Learn the mean and variance on the training rows only and reuse them for
# the test rows. Re-fitting on X_test would standardise the two sets with
# different statistics and leak test information into preprocessing.
sc = StandardScaler()

X_train = pd.DataFrame(sc.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(sc.transform(X_test), columns=X_test.columns, index=X_test.index)

X_train.head()

In [None]:
#train model
# OLS does not add an intercept by itself, so append a column of ones to
# both design matrices before building the model on the training rows.
for design_matrix in (X_train, X_test):
    design_matrix["constant"] = 1
model = OLS(endog=y_train, exog=X_train)

In [None]:
# Estimate the coefficients of the OLS model built from y_train and X_train.
model_fit = model.fit()

In [None]:
# Display the regression summary of the fitted model.
model_fit.summary()

In [None]:
#remove highly correlated values
# Flag every pair of distinct features whose correlation exceeds 0.95.
# Pairs are (row position, column position) in X's column order, listed
# row by row, so each correlated pair appears as both (i, j) and (j, i).
corr = X.corr()
high_corr = corr > 0.95
row_positions, col_positions = np.nonzero(high_corr.to_numpy())
high_corr_list = [
    (int(i), int(j))
    for i, j in zip(row_positions, col_positions)
    if i != j
]
high_corr_list

In [None]:
# For every correlated pair keep the first column and mark the second one for
# removal. high_corr_list contains both (i, j) and (j, i); the mirrored couple
# is skipped so only one column per pair is dropped.
unique_couples = []
seen_couples = set()  # set lookup instead of scanning the list for every couple
flagged = []
for first, second in high_corr_list:
    if (second, first) in seen_couples:
        continue
    seen_couples.add((first, second))
    unique_couples.append((first, second))
    flagged.append(second)

# A column correlated with several others is flagged several times; keep each
# position once, in first-seen order.
no_keep = list(dict.fromkeys(flagged))

# The positions were computed on X, so translate them to labels through X and
# drop by label. errors="ignore" keeps the cell safe to re-run, whereas a
# positional drop on the already-reduced frames would remove the wrong columns.
redundant_columns = X.columns[no_keep]
X_train = X_train.drop(columns=redundant_columns, errors="ignore")
X_test = X_test.drop(columns=redundant_columns, errors="ignore")

In [None]:
# Show the names of the columns flagged as redundant; the positions in
# no_keep refer to X's column order.
print(X.columns[no_keep])

In [None]:
# Refit OLS on the current training design matrix and display its summary.
model = OLS(endog=y_train, exog=X_train)
model_fit = model.fit()
model_fit.summary()

In [None]:
#lasso and grid search
# Candidate penalties are the powers of ten from 1 down to 1e-9, compared
# with 3-fold cross-validation.
params = {"alpha": [10 ** (-exponent) for exponent in range(10)]}
lasso = Lasso()
grid = GridSearchCV(estimator=lasso, param_grid=params, cv=3, verbose=1)

grid.fit(X_train, y_train)

In [None]:
# Show the alpha selected by the grid search.
print(grid.best_params_)

In [None]:
# Report the score of the best estimator (R^2 for scikit-learn regressors)
# on the training rows and on the held-out rows.
best_model = grid.best_estimator_
for label, features, target in (
    ("Score on the train set :", X_train, y_train),
    ("Score on the test set :", X_test, y_test),
):
    print(label, best_model.score(features, target))

In [None]:
# Features whose coefficient in the best model is exactly zero.
print("columns that have been removed with lasso : ", X_train.columns[best_model.coef_==0])

In [None]:
# Features whose coefficient in the best model is non-zero.
print("columns that have been kept with lasso : ", X_train.columns[best_model.coef_!=0])

In [None]:
#try ridge
# Candidate penalties are 0, 100, ..., 900, compared with 3-fold cross-validation.
alpha_values = np.arange(0, 1000, 100)
params = {"alpha": alpha_values}
ridge = Ridge()

grid = GridSearchCV(estimator=ridge, param_grid=params, cv=3, verbose=1)
grid_fit = grid.fit(X_train, y_train)

In [None]:
# Show the alpha selected by the grid search.
print(grid.best_params_)

In [None]:
# Report the score of the best estimator (R^2 for scikit-learn regressors)
# on the training rows and on the held-out rows.
best_model = grid.best_estimator_
for label, features, target in (
    ("Score on the train set :", X_train, y_train),
    ("Score on the test set :", X_test, y_test),
):
    print(label, best_model.score(features, target))