In [None]:
#import libraries
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
import time
import pickle
import timestamp
from datetime import datetime
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestClassifier
from keras.models import Sequential
from keras.layers import Dense, Activation,Dropout, BatchNormalization
from sklearn.model_selection import KFold
import lightgbm as lgb
from sklearn.model_selection import  train_test_split
from sklearn.model_selection import GridSearchCV   #Performing grid search
from sklearn.model_selection import validation_curve
import gc
# Notebook-wide pandas settings: turn off the chained-assignment warning and
# allow up to 999 columns to be shown when a frame is displayed.
pd.set_option('mode.chained_assignment', None)
pd.set_option('display.max_columns', 999)

In [None]:
# Input configuration for the loading step
chunk_size = 10000               # rows per chunk, passed to load_data
file_test = "../test_v2.csv"     # raw test set
file_train = "../train_v2.csv"   # raw training set


In [None]:
def load_data(file, chunk_size, nrows_load=None, test_data=False):
    """Read a CSV in chunks, clean each chunk with process_df, and return one frame.

    Parameters
    ----------
    file : str or file-like
        Anything pandas.read_csv accepts as its first argument.
    chunk_size : int
        Number of rows read (and processed) per chunk.
    nrows_load : int, optional
        Stop reading once at least this many rows have been loaded. Whole
        chunks are kept, so the result can hold more than nrows_load rows.
        None (or 0) loads the entire file.
    test_data : bool
        Forwarded unchanged to process_df.

    Returns
    -------
    pandas.DataFrame
        All processed chunks stacked row-wise with a fresh 0..n-1 index
        (an empty DataFrame when the reader yields no chunk).
    """
    # 'date' and 'fullVisitorId' are read as str so pandas does not parse them as numbers.
    df_reader = pd.read_csv(file, dtype={'date': str, 'fullVisitorId': str}, chunksize=chunk_size)

    # Collect the chunks and concatenate once at the end; concatenating inside
    # the loop re-copies every previously loaded row on each iteration.
    chunks = []
    rows_loaded = 0
    try:
        for cidx, df in enumerate(df_reader):
            df.reset_index(drop=True, inplace=True)
            process_df(df, test_data)  # cleans the chunk in place
            chunks.append(df)
            rows_loaded += df.shape[0]
            # print every 20 iterations
            if cidx % 20 == 0:
                print('{}: rows loaded: {}'.format(cidx, rows_loaded))
            if nrows_load and rows_loaded >= nrows_load:
                break
    finally:
        # Release the underlying file handle even when we stop early or fail.
        df_reader.close()

    if not chunks:
        return pd.DataFrame()
    return pd.concat(chunks, axis=0).reset_index(drop=True)

In [None]:
#JSON helpers: parse_json pulls one key out of a JSON string; process_df lists, per JSON column, which keys to extract as features
def parse_json(x,s):
    """Decode the JSON text x and return the value stored under s.

    Returns float('NaN') when the decoded value has no entry for s (missing
    key, or a decoded value that cannot be indexed by s at all, e.g. null).
    Malformed JSON or a non-string x still raises from json.loads, exactly as
    before, so corrupt input is not silently turned into NaN.
    """
    res = json.loads(x)
    try:
        return res[s]
    # Only lookup failures mean "value absent"; a bare except would also
    # swallow KeyboardInterrupt / SystemExit raised while the notebook runs.
    except (KeyError, IndexError, TypeError):
        return float('NaN')

    
def process_df(df,test_data):
    """Flatten and clean one raw chunk in place.

    Steps, in order: derive day/month/year from 'date', convert
    'visitStartTime' to datetime, copy selected keys of the JSON columns into
    plain columns, sort rows by visit start time, label-encode the categorical
    columns that are kept, fill and cast the numeric totals, and drop the raw
    JSON columns plus a few unused ones.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw chunk; it is modified in place.
    test_data : bool
        When true, 'fullVisitorId' is kept; otherwise it is dropped.

    Returns
    -------
    None
    """
    #process date
    # The slices below imply a YYYYMMDD string -- TODO confirm against the raw file.
    df['days'] = df['date'].str[-2:].astype(int)
    df['month'] = df['date'].str[-4:-2].astype(int)
    df['year'] = df['date'].str[:4].astype(int)
    # presumably epoch seconds -- confirm against the raw file
    df['visitStartTime'] = df['visitStartTime'].astype('datetime64[s]')

    #process json fields: JSON column -> keys to extract from it
    process_dict = {
        'totals':['transactionRevenue','newVisits','pageviews','hits'] ,
        'trafficSource':['campaign','source','medium'] ,
        'device':['browser'],
        'geoNetwork': ['country','city','continent','region','subContinent']
    }

    #add new columns from json in df (NaN where a key is absent)
    for c,l in process_dict.items():
        for it in l:
            df[it] = df[c].apply(lambda x : parse_json(x,it))

    #process time: order the rows chronologically
    df.sort_values(["visitStartTime"], axis=0, ascending=True, inplace=True)

    #label-encode the categorical columns that survive the drop below
    # ('channelGrouping', 'subContinent', 'continent' and 'date' are removed
    # at the end of this function, so encoding them would be wasted work).
    # NOTE(review): the encoder is refit on every call, i.e. per chunk when
    # driven by load_data, so the same category can get different integer
    # codes in different chunks and between train and test. Fitting on the
    # full vocabulary needs a change outside this function.
    cols = ['country','campaign','source','medium','city','region','socialEngagementType','browser']
    labelencoder_X=LabelEncoder()
    for c in cols:
        # Plain column assignment replaces the column, so it takes the integer dtype.
        df[c] = labelencoder_X.fit_transform(df[c])

    #Dealing with missing values and casting the str totals to numbers
    #transactionRevenue, newVisits, pageviews:  nans ->  0
    # The result is assigned back because fillna(inplace=True) on a column
    # selection is deprecated and has no effect under pandas copy-on-write.
    # NOTE(review): float32 carries ~7 significant digits; check that this is
    # enough for the revenue values.
    filled_dtypes = {
        'transactionRevenue': 'float32',
        'newVisits': 'uint16',
        'pageviews': 'uint16',
    }
    for col, dtype in filled_dtypes.items():
        df[col] = pd.to_numeric(df[col]).fillna(0).astype(dtype)
    # 'hits' is not filled: a missing value here still raises on the cast.
    df['hits'] = pd.to_numeric(df['hits']).astype('uint32')

    #remove json field columns and some unwanted columns
    #(some removed for saving memory)
    rm_col = ['subContinent','channelGrouping','date','continent','customDimensions']
    if not test_data:
        # the visitor id is kept for test data only
        rm_col.append('fullVisitorId')
    df.drop(list(process_dict.keys()) + rm_col, axis=1,inplace=True)
    
# Build the processed training and test frames from the raw CSV files
df = load_data(file=file_train, chunk_size=chunk_size)
df_test = load_data(file=file_test, chunk_size=chunk_size, test_data=True)

In [None]:
# Preview the first rows only instead of rendering the whole training frame
df.head()

In [None]:
# Preview the first rows only instead of rendering the whole test frame
df_test.head()

In [None]:
def checkmissingvalues(df):
    """Return a one-element list holding the per-column count of missing values in df."""
    null_counts = df.isna().sum()
    return [null_counts]
# Print the per-column missing-value counts of the processed training frame
print(checkmissingvalues(df))

In [None]:
# checkmissingvalues is already defined above; reuse it for the test frame
# rather than redefining an identical function under the same name.
print(checkmissingvalues(df_test))

In [None]:
# Write the processed training frame to ../train_cleaned.csv,
# 10000 rows at a time and without the index column
df.to_csv("../train_cleaned.csv", index=False, index_label=False, chunksize=10000)

In [None]:
# Write the processed test frame to ../test_cleaned.csv,
# 10000 rows at a time and without the index column
df_test.to_csv("../test_cleaned.csv", index=False, index_label=False, chunksize=10000)