In [1]:
import pandas as pd
import numpy as np
import warnings
from collections import Counter
# Install a global "ignore" filter: every warning raised after this line is suppressed.
warnings.filterwarnings('ignore')

In [2]:
# Locations of the two input CSV files (relative to the working directory).
train_url, test_url = "train4.csv", "test4.csv"

# Read each file into its own DataFrame using pandas' default parser settings.
train = pd.read_csv(train_url)
test = pd.read_csv(test_url)

--------------------------------------

# <font color='red'>Numeric and Categorical columns</font>

In [3]:
# Column names that pandas' `_get_numeric_data()` retains for the train frame.
numerics = list(train._get_numeric_data().columns)

# Every remaining train column is treated as categorical.
categoricals = [name for name in train.columns if name not in numerics]

# Show both lists, separated by a blank line.
print(
    'numeric columns:',
    numerics,
    '',
    'categorical columns:',
    categoricals,
    sep='\n',
)

numeric columns:
['isMobile', 'hits', 'pageviews', 'bounces', 'newVisits', 'transactionRevenue', 'isTrueDirect', 'isVideoAd', 'visitNumber', 'year', 'month', 'day']

categorical columns:
['browser', 'operatingSystem', 'deviceCategory', 'continent', 'subContinent', 'country', 'region', 'metro', 'city', 'campaign', 'source', 'medium', 'channelGrouping', 'sessionId', 'weekday', 'partOfDay', 'domain']


### <font color='blue'>Removal of less-informative columns</font>

To mitigate the curse of dimensionality, we remove columns that have a large number of categories but carry little information:

In [4]:
# Drop the same three columns from both frames (in place) ...
for frame in (train, test):
    frame.drop(columns=['continent', 'region', 'metro'], inplace=True)

# ... and take them out of the categorical-column list as well.
for dropped in ('continent', 'region', 'metro'):
    categoricals.remove(dropped)

### <font color='blue'>Replacing categories with frequencies</font>

Some columns contain values that occur in only one of the train and test sets; we replace those values with _others_ (e.g. in _**country**_, _**source**_, _**city**_, ...):

In [5]:
# 'sessionId' is excluded from the harmonisation below. The membership guard
# keeps this cell re-runnable (a bare .remove() raises ValueError the second time).
if 'sessionId' in categoricals:
    categoricals.remove('sessionId')

for col in categoricals:
    print(col)

    # Snapshot the distinct non-missing values of each frame once, before
    # either frame is modified, instead of recomputing them for every value.
    train_values = set(train[col].dropna().unique())
    test_values = set(test[col].dropna().unique())

    # Rows whose category does not occur in the other frame. Missing values
    # are deliberately excluded so that NaN stays NaN rather than becoming
    # 'others'.
    train_only = train[col].notna() & ~train[col].isin(test_values)
    test_only = test[col].notna() & ~test[col].isin(train_values)

    # Single-step .loc assignment writes into the frame itself; chained
    # indexing (frame[col][mask] = ...) may write to a temporary copy and is
    # a silent no-op under pandas Copy-on-Write.
    train.loc[train_only, col] = 'others'
    test.loc[test_only, col] = 'others'

browser
operatingSystem
deviceCategory
subContinent
country
city
campaign
source
medium
channelGrouping
weekday
partOfDay
domain


All categorical columns can be converted to numeric values by replacing each category with its frequency (the number of rows in which it occurs): <font color='green'>__(WITHOUT THIS, WE WILL GET A MEMORY ERROR DURING PCA AND XGBoost)__</font>

In [6]:
for col in categoricals:
    print(col)

    # Occurrence count of every category, computed once per frame.
    # value_counts() skips missing values, so NaN maps to NaN below.
    train_counts = train[col].value_counts()
    test_counts = test[col].value_counts()

    # Replace each category by its frequency within its own frame.
    # Unlike a loop over train's unique values only, this also encodes
    # categories that appear solely in test (e.g. 'others' when train had
    # nothing to replace) instead of silently leaving them as NaN.
    # astype(float) keeps the column float64 whether or not NaN is present.
    train[col] = train[col].map(train_counts).astype(float)
    test[col] = test[col].map(test_counts).astype(float)

browser
operatingSystem
deviceCategory
subContinent
country
city
campaign
source
medium
channelGrouping
weekday
partOfDay
domain


------------------------------------

# <font color='red'>Writing the final dataframe to .csv</font>

In [7]:
# Persist both processed frames as UTF-8 CSV files without the index column.
for frame, path in ((train, 'train5.csv'), (test, 'test5.csv')):
    frame.to_csv(path, index=False, encoding='utf-8')