In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
train_url = "train1.csv"
train = pd.read_csv(train_url)
test_url = "test1.csv"
test = pd.read_csv(test_url)

--------------------------------------------------------------------------------------------------------------------------------

# <font color='red'>Numeric and Categorical columns

In [3]:
numerics = train._get_numeric_data().columns.values.tolist()
categoricals = [col for col in train.columns.values if col not in numerics]

print('numeric columns:')
print(numerics)
print()
print('categorical columns:')
print(categoricals)

numeric columns:
['isMobile', 'hits', 'pageviews', 'bounces', 'newVisits', 'transactionRevenue', 'page', 'date', 'visitNumber', 'visitStartTime']

categorical columns:
['browser', 'operatingSystem', 'deviceCategory', 'continent', 'subContinent', 'country', 'region', 'metro', 'city', 'networkDomain', 'campaign', 'source', 'medium', 'keyword', 'isTrueDirect', 'referralPath', 'slot', 'gclId', 'adNetworkType', 'isVideoAd', 'adContent', 'channelGrouping', 'sessionId']


--------------------------------------------------------------------------------------------------------------------------------

# <font color='red'>Showing all missing values

In [4]:
d = {}
d_test = {}
types = []
for col in train.columns.values:
    d[col] = len(train[col][train[col].isna() == True])
    if col in numerics:
        types.append('numeric')
    else:
        types.append('categorical')
    #if len(train[col].isna())/train.shape[0] < 0.1:
        #train.drop(col,axis=1, inplace=True)
    if col in  test.columns.values:
        d_test[col] = len(test[col][test[col].isna() == True])
    else:
        d_test[col] = test.shape[0]
df = pd.DataFrame([], columns=['Column name', 'Missing values for train',"Missing values for test"])
df["Column name"] = d.keys()
df["Missing values for train"] = d.values()
df["Missing values for test"] = d_test.values()
df["Type"] = types
df["Train rate"] = df["Missing values for train"]/train.shape[0]
df["Test rate"] = df["Missing values for test"]/test.shape[0]
df.sort_values(by=['Missing values for train'], ascending=False)

Unnamed: 0,Column name,Missing values for train,Missing values for test,Type,Train rate,Test rate
27,adContent,891824,750085,categorical,0.987892,0.933101
8,transactionRevenue,891256,803863,numeric,0.987262,1.0
22,page,881328,750073,numeric,0.976265,0.933086
23,slot,881328,750073,categorical,0.976265,0.933086
26,isVideoAd,881328,750073,categorical,0.976265,0.933086
25,adNetworkType,881328,750073,categorical,0.976265,0.933086
24,gclId,881227,750025,categorical,0.976153,0.933026
20,isTrueDirect,629094,543675,categorical,0.69686,0.676328
21,referralPath,572048,568740,categorical,0.633669,0.707509
19,keyword,502486,390615,categorical,0.556614,0.485922


------------------------

# <font color='red'>Overview of columns

In [5]:
for col in train.columns.values:
    print(col,":")
    print(train[col].describe())
    print()

browser :
count     902755
unique        54
top       Chrome
freq      619699
Name: browser, dtype: object

operatingSystem :
count      902755
unique         20
top       Windows
freq       349711
Name: operatingSystem, dtype: object

isMobile :
count     902755
unique         2
top        False
freq      663866
Name: isMobile, dtype: object

deviceCategory :
count      902755
unique          3
top       desktop
freq       663814
Name: deviceCategory, dtype: object

hits :
count    902755.000000
mean          4.591720
std           9.634079
min           1.000000
25%           1.000000
50%           2.000000
75%           4.000000
max         500.000000
Name: hits, dtype: float64

pageviews :
count    902655.000000
mean          3.846323
std           7.020076
min           1.000000
25%           1.000000
50%           1.000000
75%           4.000000
max         469.000000
Name: pageviews, dtype: float64

bounces :
count    450377.0
mean          1.0
std           0.0
min           1.

---------------------------

# <font color='red'>Variables with only one value and NaN

In [6]:
for col in train.columns.values:
    if len(train[col].unique()) == 2 and train[col].isnull().values.any():
        print(col)
        print(train[col].unique())

bounces
[  1.  nan]
newVisits
[  1.  nan]
isTrueDirect
[nan True]
isVideoAd
[nan False]


It seems that for numeric columns *bounces* and *newVisits*, the missing values indicate 0, and for boolean columns *isTrueDirect* and *isVideoAd* the missing values indcate __False__ and __True__ respectively. Therefore instead of removing them, we impute their missing values:

In [7]:
train["bounces"].fillna(0,inplace=True)
train["newVisits"].fillna(0,inplace=True)
train["isTrueDirect"].fillna(False,inplace=True)
train["isVideoAd"].fillna(True,inplace=True)

test["bounces"].fillna(0,inplace=True)
test["newVisits"].fillna(0,inplace=True)
test["isTrueDirect"].fillna(False,inplace=True)
test["isVideoAd"].fillna(True,inplace=True)

Now it's better to convert boolean columns to numerics, 0 represents __False__ and 1 represents __True__:

In [8]:
train["isTrueDirect"][train["isTrueDirect"] == False] = 0.0
train["isTrueDirect"][train["isTrueDirect"] == True] = 1.0

train["isVideoAd"][train["isVideoAd"] == False] = 0.0
train["isVideoAd"][train["isVideoAd"] == True] = 1.0

test["isTrueDirect"][test["isTrueDirect"] == False] = 0.0
test["isTrueDirect"][test["isTrueDirect"] == True] = 1.0

test["isVideoAd"][test["isVideoAd"] == False] = 0.0
test["isVideoAd"][test["isVideoAd"] == True] = 1.0

It's worth to check if there are other boolean columns:

In [9]:
for col in train.columns.values:
    if False in train[col].unique() and len(train[col].unique()) == 2 :
        print(col)
        print(train[col].unique())
        print()

isMobile
[False  True]

bounces
[ 1.  0.]

newVisits
[ 1.  0.]

isTrueDirect
[False  True]

isVideoAd
[ True False]



Now we convert *isMobile* to numeric:

In [10]:
train["isMobile"][train["isMobile"] == False] = 0.0
train["isMobile"][train["isMobile"] == True] = 1.0

test["isMobile"][test["isMobile"] == False] = 0.0
test["isMobile"][test["isMobile"] == True] = 1.0

----------------------------------------------------------------------

# <font color='red'>A brief look into the target

*transactionRevenue* is the target and missing values probably indicate 0 revenue

In [11]:
print("Number of non-NaN values: ", len(train["transactionRevenue"][train["transactionRevenue"].isna() == False]))

Number of non-NaN values:  11499


### <font color='blue'>Converting target missings to 0</font>

In [12]:
train["transactionRevenue"].fillna(0,inplace=True)

----------------------------------------------------------------------

# <font color='red'>Imputing missing values

Let's see how many columns still contain missing values:

In [13]:
d = {}
d_test = {}
types = []
for col in train.columns.values:
    d[col] = len(train[col][train[col].isna() == True])
    if col in numerics:
        types.append('numeric')
    else:
        types.append('categorical')
    #if len(train[col].isna())/train.shape[0] < 0.1:
        #train.drop(col,axis=1, inplace=True)
    if col in  test.columns.values:
        d_test[col] = len(test[col][test[col].isna() == True])
    else:
        d_test[col] = test.shape[0]
df = pd.DataFrame([], columns=['Column name', 'Missing values for train',"Missing values for test"])
df["Column name"] = d.keys()
df["Missing values for train"] = d.values()
df["Missing values for test"] = d_test.values()
df["Type"] = types
df["Train rate"] = df["Missing values for train"]/train.shape[0]
df["Test rate"] = df["Missing values for test"]/test.shape[0]
df[df['Missing values for train'] > 0].sort_values(by=['Missing values for train'], ascending=False)

Unnamed: 0,Column name,Missing values for train,Missing values for test,Type,Train rate,Test rate
27,adContent,891824,750085,categorical,0.987892,0.933101
22,page,881328,750073,numeric,0.976265,0.933086
23,slot,881328,750073,categorical,0.976265,0.933086
25,adNetworkType,881328,750073,categorical,0.976265,0.933086
24,gclId,881227,750025,categorical,0.976153,0.933026
21,referralPath,572048,568740,categorical,0.633669,0.707509
19,keyword,502486,390615,categorical,0.556614,0.485922
5,pageviews,100,138,numeric,0.000111,0.000172


In [14]:
for col in df["Column name"][df['Missing values for train'] > 0]:
    print(col)
    print(train[col].describe())
    print()

pageviews
count    902655.000000
mean          3.846323
std           7.020076
min           1.000000
25%           1.000000
50%           1.000000
75%           4.000000
max         469.000000
Name: pageviews, dtype: float64

keyword
count             400269
unique              3658
top       (not provided)
freq              365950
Name: keyword, dtype: object

referralPath
count     330707
unique      1475
top            /
freq       75435
Name: referralPath, dtype: object

page
count    21427.000000
mean         1.008121
std          0.173717
min          1.000000
25%          1.000000
50%          1.000000
75%          1.000000
max         14.000000
Name: page, dtype: float64

slot
count     21427
unique        2
top         Top
freq      20924
Name: slot, dtype: object

gclId
count                                                 21528
unique                                                17774
top       Cj0KEQjwmIrJBRCRmJ_x7KDo-9oBEiQAuUPKMufMpuG3Zd...
freq                        

For those categorical variables which have less than 70% missing values, missings are replaced with mode (categoricals) and median (numerics):

In [15]:
train["pageviews"].fillna(train["pageviews"].median(),inplace=True)
train["keyword"].fillna(train["keyword"].mode()[0],inplace=True)
train["referralPath"].fillna(train["referralPath"].mode()[0],inplace=True)

test["pageviews"].fillna(test["pageviews"].median(),inplace=True)
test["keyword"].fillna(test["keyword"].mode()[0],inplace=True)
test["referralPath"].fillna(test["referralPath"].mode()[0],inplace=True)

The other variables contain more than 97% missing values, so they can be removed:

In [16]:
for col in train.columns.values:
    if len(train[col][train[col].isna()==True])/train.shape[0] > 0.90:
        print(col)
        train.drop(col,axis=1, inplace=True)
        test.drop(col,axis=1, inplace=True)
        if col in categoricals:
            categoricals.remove(col)
        if col in numerics:
            numerics.remove(col)

page
slot
gclId
adNetworkType
adContent


Checking if there are still missing values in the dataset:

In [17]:
train.isnull().values.any()

False

# <font color='red'> Writing the final dataframe to .csv

In [18]:
train.to_csv('train2.csv', index=False, encoding='utf-8')
test.to_csv('test2.csv', index=False, encoding='utf-8')