# Data Preparation

In [1]:
# Data wrangling
import pandas as pd
# Data splitting and normalization
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import warnings
# Suppress every warning category for the rest of the session so cell outputs stay clean.
# NOTE(review): this is a blanket filter, so it also hides deprecation and
# chained-assignment warnings from pandas / scikit-learn — consider narrowing it.
warnings.filterwarnings('ignore')

In [2]:
# Load the conversion sample from disk (the file name suggests it was written
# by the data-understanding notebook — TODO confirm provenance).
input_csv_path = "conversion_sample_data_understanding.csv"
conversion_sample_df = pd.read_csv(input_csv_path)

In [3]:
# Show column dtypes and non-null counts: confirms which columns are
# categorical (object) and whether any values are missing before encoding.
conversion_sample_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20398 entries, 0 to 20397
Data columns (total 6 columns):
country                20398 non-null object
age                    20398 non-null int64
new_user               20398 non-null int64
source                 20398 non-null object
total_pages_visited    20398 non-null int64
converted              20398 non-null int64
dtypes: int64(4), object(2)
memory usage: 956.3+ KB


## Dummyfication

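The categorical variables __country__ and __source__ are one-hot encoded into dummy variables; the first level of each variable is dropped and serves as the reference category.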

In [4]:
# One-hot encode the two categorical columns. The first level of each is
# dropped (drop_first=True) so the dummies are not perfectly collinear.
# `columns` is spelled out so a future text column is not encoded by accident,
# and `dtype` is pinned because pandas >= 2.0 emits bool dummies by default,
# whereas the rest of this notebook (and the exported CSVs) expects 0/1 integers.
conversion_sample_dummyfied_df = pd.get_dummies(
    conversion_sample_df,
    columns=["country", "source"],
    drop_first=True,
    dtype="uint8",
)

In [5]:
# Preview the first five encoded rows to check the new dummy columns.
conversion_sample_dummyfied_df.head(n=5)

Unnamed: 0,age,new_user,total_pages_visited,converted,country_Germany,country_UK,country_US,source_Direct,source_Seo
0,17,1,8,0,1,0,0,1,0
1,52,1,6,0,0,0,1,0,1
2,35,1,6,0,0,0,0,1,0
3,24,1,1,0,0,0,1,1,0
4,32,1,5,0,0,0,1,0,1


## Partition

We separate the target variable from the other variables.

In [6]:
# Split the encoded frame into the label vector (y) and the feature matrix (X).
target_column = "converted"
y = conversion_sample_dummyfied_df[target_column]
X = conversion_sample_dummyfied_df.drop(columns=[target_column])

We split the data into a training set (80%) and a test set (20%).

In [7]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Preview the first five training rows (features are still on their raw scale here).
X_train.head(n=5)

Unnamed: 0,age,new_user,total_pages_visited,country_Germany,country_UK,country_US,source_Direct,source_Seo
2950,27,0,6,0,0,0,0,1
9093,30,1,4,0,1,0,0,1
1393,38,1,5,0,0,1,0,1
5561,18,0,9,0,0,1,1,0
17276,31,0,15,0,1,0,0,1


In [9]:
# Preview the first five test rows (features are still on their raw scale here).
X_test.head(n=5)

Unnamed: 0,age,new_user,total_pages_visited,country_Germany,country_UK,country_US,source_Direct,source_Seo
3482,46,1,3,1,0,0,0,0
9387,28,1,5,1,0,0,0,1
4269,52,1,4,1,0,0,1,0
20133,32,0,10,0,0,1,1,0
18788,33,0,11,0,0,1,0,1


## Standardization

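We standardize the numerical features __age__ and __total_pages_visited__ (zero mean, unit variance). The scaler is fitted on the training set only and then applied unchanged to the test set, so no information from the test set leaks into the preprocessing.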

In [10]:
# Fit the scaler on the training rows only and standardize the two numeric features.
#
# The raw values are read from `X` (which no cell modifies) rather than from
# `X_train`, so re-running this cell never re-fits the scaler on columns that
# were already standardized by a previous run.
conversion_scaler = StandardScaler()

features_train_scaled = conversion_scaler.fit_transform(
    X.loc[X_train.index, ["age", "total_pages_visited"]]
)

# `assign` returns a new frame instead of writing into the object returned by
# train_test_split, which avoids chained-assignment (SettingWithCopy) issues
# and keeps the original column order.
X_train = X_train.assign(
    age=features_train_scaled[:, 0],
    total_pages_visited=features_train_scaled[:, 1],
)

In [11]:
# Preview the training rows again: age and total_pages_visited now hold standardized values.
X_train.head(n=5)

Unnamed: 0,age,new_user,total_pages_visited,country_Germany,country_UK,country_US,source_Direct,source_Seo
2950,-0.204283,0,-0.588531,0,0,0,0,1
9093,0.176354,1,-0.918939,0,1,0,0,1
1393,1.191385,1,-0.753735,0,0,1,0,1
5561,-1.346193,0,-0.092918,0,0,1,1,0
17276,0.303233,0,0.898306,0,1,0,0,1


In [12]:
# Apply the scaler fitted on the training set to the test rows (transform only,
# never fit, so test-set statistics do not influence the preprocessing).
#
# The raw values are read from `X` (which no cell modifies) rather than from
# `X_test`, so re-running this cell cannot scale the test set twice.
features_test_scaled = conversion_scaler.transform(
    X.loc[X_test.index, ["age", "total_pages_visited"]]
)

# `assign` returns a new frame instead of writing into the object returned by
# train_test_split, which avoids chained-assignment (SettingWithCopy) issues
# and keeps the original column order.
X_test = X_test.assign(
    age=features_test_scaled[:, 0],
    total_pages_visited=features_test_scaled[:, 1],
)

In [13]:
# Preview the test rows again: age and total_pages_visited now hold values scaled with the training statistics.
X_test.head(n=5)

Unnamed: 0,age,new_user,total_pages_visited,country_Germany,country_UK,country_US,source_Direct,source_Seo
3482,2.206417,1,-1.084143,1,0,0,0,0
9387,-0.077404,1,-0.753735,1,0,0,0,1
4269,2.96769,1,-0.918939,1,0,0,1,0
20133,0.430112,0,0.072286,0,0,1,1,0
18788,0.556991,0,0.23749,0,0,1,0,1


## Next step : Modeling

We export the prepared training and test sets to CSV files so that the modeling step can load them.

In [14]:
# Persist the prepared splits for the modeling notebook. The row index is
# dropped (index=False) in all four files.
#
# y_train / y_test are Series: the default of `header` for Series.to_csv has
# differed between pandas versions, so it is passed explicitly to guarantee
# that the target files always start with a "converted" header row, like the
# feature files do.
# NOTE(review): if a downstream reader loads the y files with header=None,
# update it to expect the header row.
X_train.to_csv("X_train.csv", index=False)
y_train.to_csv("y_train.csv", index=False, header=True)
X_test.to_csv("X_test.csv", index=False)
y_test.to_csv("y_test.csv", index=False, header=True)