In [8]:
import pandas as pd
import numpy as np
import warnings
# NOTE(review): this silences every warning category for the rest of the
# session, which also hides pandas' warnings about chained assignment and
# deprecations; consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [9]:
# Paths of the input CSV files, relative to the notebook's working directory.
train_url, test_url = "train2.csv", "test2.csv"

# Read both datasets into DataFrames with pandas' default parsing options.
train = pd.read_csv(train_url)
test = pd.read_csv(test_url)

# <font color='red'>Numeric and Categorical columns</font>

In [10]:
# Partition the training columns by dtype: numeric and boolean columns go to
# `numerics`, every other column to `categoricals`. Both are plain lists
# because later cells add and remove entries as features are derived.
# select_dtypes() is the public API for what the private
# DataFrame._get_numeric_data() did (number + bool columns, in frame order).
numerics = train.select_dtypes(include=["number", "bool"]).columns.tolist()
numeric_lookup = set(numerics)  # constant-time membership test for the filter below
categoricals = [col for col in train.columns if col not in numeric_lookup]

print('numeric columns:')
print(numerics)
print()
print('categorical columns:')
print(categoricals)

numeric columns:
['isMobile', 'hits', 'pageviews', 'bounces', 'newVisits', 'transactionRevenue', 'isTrueDirect', 'isVideoAd', 'date', 'visitNumber', 'visitStartTime']

categorical columns:
['browser', 'operatingSystem', 'deviceCategory', 'continent', 'subContinent', 'country', 'region', 'metro', 'city', 'networkDomain', 'campaign', 'source', 'medium', 'keyword', 'referralPath', 'channelGrouping', 'sessionId']


# <font color='red'>Making valid types for boolean variables</font>

Before going further, we need to check if all columns are correctly assigned to their categories based on their types:

In [11]:
# Print each categorical column's name followed by its describe() summary,
# leaving one blank line between consecutive columns.
for name in categoricals:
    print(name, train[name].describe(), sep="\n", end="\n\n")

browser
count     902755
unique        54
top       Chrome
freq      619699
Name: browser, dtype: object

operatingSystem
count      902755
unique         20
top       Windows
freq       349711
Name: operatingSystem, dtype: object

deviceCategory
count      902755
unique          3
top       desktop
freq       663814
Name: deviceCategory, dtype: object

continent
count       902755
unique           6
top       Americas
freq        449996
Name: continent, dtype: object

subContinent
count               902755
unique                  23
top       Northern America
freq                390295
Name: subContinent, dtype: object

country
count            902755
unique              222
top       United States
freq             364401
Name: country, dtype: object

region
count                            902755
unique                              376
top       not available in demo dataset
freq                             507780
Name: region, dtype: object

metro
count                            9

For categoricals, everything looks fine so far. <font size="+2">&#128077;</font>

Now we repeat the same procedure for the numeric columns:

In [12]:
# Print each numeric column's name followed by its describe() summary,
# leaving one blank line between consecutive columns.
for name in numerics:
    summary = train[name].describe()
    print(f"{name}\n{summary}\n")

isMobile
count     902755
unique         2
top        False
freq      663866
Name: isMobile, dtype: object

hits
count    902755.000000
mean          4.591720
std           9.634079
min           1.000000
25%           1.000000
50%           2.000000
75%           4.000000
max         500.000000
Name: hits, dtype: float64

pageviews
count    902755.000000
mean          3.846008
std           7.019751
min           1.000000
25%           1.000000
50%           1.000000
75%           4.000000
max         469.000000
Name: pageviews, dtype: float64

bounces
count    902755.000000
mean          0.498892
std           0.499999
min           0.000000
25%           0.000000
50%           0.000000
75%           1.000000
max           1.000000
Name: bounces, dtype: float64

newVisits
count    902755.000000
mean          0.778101
std           0.415523
min           0.000000
25%           1.000000
50%           1.000000
75%           1.000000
max           1.000000
Name: newVisits, dtype: float64

Some of the variables in numerics are still boolean and must be converted to integer:

In [13]:
# Cast the boolean flag columns to 0/1 integers in both datasets.
# A single whole-column cast replaces the chained-indexing assignments
# (frame[col][mask] = value): pandas does not guarantee that those write back
# to the parent frame, and they have no effect under Copy-on-Write.
# NOTE(review): assumes these columns contain no missing values; astype(int)
# raises on NaN instead of leaving it in place — confirm against the data.
boolean_columns = ["isMobile", "isVideoAd", "isTrueDirect"]
for frame in (train, test):
    frame[boolean_columns] = frame[boolean_columns].astype(int)

-------------------------------------------------------

# <font color='red'>Dealing with times and dates</font>

The columns *date* and *visitStartTime*, which represent date and time, must be broken down into simpler components.

### <font color='blue'>Date</font>

First we split *date* into day, month and year:

In [14]:
# Turn the date column into text and cut it by character position into
# year (chars 0-3), month (chars 4-5) and day (chars 6-7), for both datasets.
# The new columns hold strings, exactly as sliced.
for frame in (train, test):
    frame["date"] = frame["date"].apply(str)
    date_text = frame["date"]
    frame["year"] = date_text.str[0:4]
    frame["month"] = date_text.str[4:6]
    frame["day"] = date_text.str[6:8]

Now we can remove *date* from the dataset:

In [15]:
# Remove the now-redundant date column from both datasets (in place).
for frame in (train, test):
    frame.drop(columns='date', inplace=True)

# Keep the bookkeeping lists in step with the frame: date is gone from the
# numeric columns, and its three components are tracked as categoricals.
numerics.remove('date')
categoricals.extend(['year', 'month', 'day'])

### <font color='blue'>Time</font>

Now we are going to deal with *visitStartTime*. This variable will be split into two variables:
<ol>
<li><i>weekday</i>, which indicates the day of the week</li>
<li><i>startHour</i>, which indicates the hour at which the visit started</li>
</ol>

The next cell only imports the `datetime` module and shows how it converts a timestamp:

In [16]:
from datetime import datetime

# Decode the first row's visitStartTime (epoch seconds) as a UTC date-time
# and print it in a human-readable form.
first_visit_start = datetime.utcfromtimestamp(train["visitStartTime"][0])
print(first_visit_start.strftime('%Y-%m-%d %H:%M:%S'))

2016-09-02 15:33:05


Now the weekday is extracted from *visitStartTime* and attached to the dataframe:

In [17]:
# Derive the day of the week (Monday=0 ... Sunday=6) from the epoch-seconds
# visit start time, interpreted as UTC.
# pd.to_datetime(..., unit="s") converts the whole column in one vectorized
# call; it replaces two element-wise Python passes and the
# datetime.utcfromtimestamp API, which is deprecated since Python 3.12.
for frame in (train, test):
    frame["weekday"] = pd.to_datetime(frame["visitStartTime"], unit="s").dt.weekday

And the hour:

In [19]:
# Derive the hour of day (0-23, UTC) from the epoch-seconds visit start time.
# pd.to_datetime(..., unit="s") converts the whole column in one vectorized
# call; it replaces the element-wise datetime.utcfromtimestamp conversion,
# which is deprecated since Python 3.12.
for frame in (train, test):
    frame["startHour"] = pd.to_datetime(frame["visitStartTime"], unit="s").dt.hour

Now we can remove *visitStartTime* from the dataset and add our new features:

In [21]:
# Remove the raw timestamp column from both datasets (in place).
for frame in (train, test):
    frame.drop(columns='visitStartTime', inplace=True)

# Update the numeric column list: the raw timestamp is replaced by the two
# features derived from it.
numerics.remove('visitStartTime')
numerics.extend(['weekday', 'startHour'])

<font color='green' size="+1"><b>We could have extracted all the new columns from just one of the two variables, <i>date</i> or <i>visitStartTime</i>!</b> &#x1F60E;</font>

# <font color='red'>Writing the final dataframe to .csv</font>

In [22]:
# Write both processed datasets to CSV as UTF-8, without the row index.
for frame, out_path in ((train, 'train3.csv'), (test, 'test3.csv')):
    frame.to_csv(out_path, index=False, encoding='utf-8')