In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ga-customer-revenue-prediction/sample_submission.csv
/kaggle/input/ga-customer-revenue-prediction/test_v2.csv
/kaggle/input/ga-customer-revenue-prediction/sample_submission_v2.csv
/kaggle/input/ga-customer-revenue-prediction/train_v2.csv
/kaggle/input/ga-customer-revenue-prediction/test.csv
/kaggle/input/ga-customer-revenue-prediction/train.csv


In [2]:
import numpy as np 
import pandas as pd 
import json
import os
from pandas.io.json import json_normalize
import matplotlib.pyplot as plt
%matplotlib inline

from plotly import tools
import plotly.offline as py
import plotly.graph_objs as go

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import TimeSeriesSplit, KFold
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split


import datetime
pd.options.display.max_columns = None

In [3]:
import torch 
from torch import nn
import seaborn as sns

# Google Analytics Customer Revenue Prediction Outline

1. Extract and load data

2. Exploratory Data analysis (EDA)

3. Data Preprocessing
    + 3.1 Data Cleaning and Transformation
    + 3.2 Data Pipeline (**Note: we can start from this step to load and transform data directly and skip step1~3.1。 This can help us save memory and speedup tunning models**)

4. Split dataset into training set and validation set

5. Modeling and Training
    + LGBM
    + Wide and Deep



## 1. Load Data and Format data

In [None]:
data_path = "../input/ga-customer-revenue-prediction/"
# train_df = pd.read_csv(data_path+ "train.csv")
# test_df = pd.read_csv(data_path+ "test.csv")

Extract json data from csv

In [None]:
def load_df(csv_path='kaggle/train.csv', nrows=None):
    JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']
    df = pd.read_csv(csv_path,converters={column: json.loads for column in JSON_COLUMNS},dtype={'fullVisitorId': 'str'},nrows=nrows)
    for column in JSON_COLUMNS:
        column_as_df = json_normalize(df[column])
        column_as_df.columns = [f"{column}.{subcolumn}" for subcolumn in column_as_df.columns]
        df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)
    print(f"Loaded {os.path.basename(csv_path)}. Shape: {df.shape}")
    return df

train = load_df(csv_path = data_path+ "train.csv")


In [None]:
train.head()

In [None]:
train.info()

# 2. Exploratory Data Analysis (EDA)

## 2.1 Visualize transaction revenue and visitor Id (index)

Since the goal is to predict the total revenue per user, so we want to explore the total_revenue grouped by each user

In [None]:
train["totals.transactionRevenue"] = train["totals.transactionRevenue"].astype('float')
gdf = train.groupby("fullVisitorId")["totals.transactionRevenue"].sum().reset_index()

plt.figure(figsize=(8,6))
plt.scatter(range(gdf.shape[0]), np.sort(np.log1p(gdf["totals.transactionRevenue"].values)))
plt.xlabel('index', fontsize=12)
plt.ylabel('TransactionRevenue', fontsize=12)
plt.show()

In [None]:
print("Max transaction Revenue",train["totals.transactionRevenue"].max(),"Min transaction Revenue",train["totals.transactionRevenue"].min())

In [None]:
not_none_transaction = sum(gdf["totals.transactionRevenue"]>0)
print("There are %d transactions and %d transactions have consumptions, %.3f%% are transactions are >0"%( 
                                                                                len(gdf), not_none_transaction,
                                                                                100*not_none_transaction/len(gdf)))
not_none_records = pd.notnull(train["totals.transactionRevenue"]).sum()
print("There are %d out of %d records with NaN values. %.3f%% transactions are Nan "%(not_none_records, len(train["totals.transactionRevenue"]),
                                                                                      100*not_none_records/len(train["totals.transactionRevenue"])))

## 2.2 Visualize Relationship between Total Hits and Visitor Id (index of visitor)

In [None]:
# Compute the count of hits of each visitor
# convert object type to numerical data type
train["totals.hits"] = train["totals.hits"].astype("float", copy=False)
hits_cnt = train.groupby("fullVisitorId")[["totals.hits"]].sum().reset_index()

In [None]:
hits_cnt.info()

In [None]:
hits_cnt.info()

In [None]:
hits_cnt.max()

 Pick the top 100 users who contribute to hit most frequently

In [None]:
df = hits_cnt.sort_values(by="totals.hits", ascending=False).iloc[:100]
plt.figure(figsize=(8,6))
plt.scatter(range(df.shape[0]), df["totals.hits"])
plt.xlabel('visitor-Id index', fontsize=12)
plt.ylabel('hits', fontsize=12)
plt.show()

Use histogram plot to see the distribution of hits

In [None]:
fig, ax = plt.subplots(1,1, figsize=(20,8))
_ = plt.hist( x= df["totals.hits"],orientation= "vertical",bins=50)

we can see most of users hit and visit website with hits <= 1000

In [None]:
train["totals.visits"] = train["totals.visits"].astype("float", copy=False)
visits_cnt = train.groupby("fullVisitorId")[["totals.transactionRevenue","totals.visits"]].sum().reset_index()
df = visits_cnt.sort_values(by="totals.visits", ascending=False).iloc[:100]
plt.figure(figsize=(8,6))
plt.scatter(range(df.shape[0]), df["totals.visits"])
plt.xlabel('visitor-Id index', fontsize=12)
plt.ylabel('Visits', fontsize=12)
plt.show()

In [None]:
fig, ax = plt.subplots(1,1, figsize=(15,8))
ax = plt.hist( x= df["totals.visits"],orientation= "vertical",bins=50)
plt.xlabel("Visit amount")
plt.ylabel("Count")

In [None]:
df = visits_cnt.sort_values(by="totals.visits", ascending=False)[:1000]
plt.figure(figsize=(8,6))
plt.scatter( df["totals.visits"],df["totals.transactionRevenue"])
plt.ylabel('Total Revenue', fontsize=12)
plt.xlabel('Visits', fontsize=12)
plt.show()

## Visualize relationship between non-zero revenue and date

In [None]:


def scatter_plot(cnt_srs, color):
    trace = go.Scatter(
        x=cnt_srs.index[::-1],
        y=cnt_srs.values[::-1],
        showlegend=False,
        marker=dict(
            color=color,
        ),
    )
    return trace

train['date'] = train['date'].apply(lambda x: datetime.date(int(str(x)[:4]), int(str(x)[4:6]), int(str(x)[6:])))
cnt_srs = train.groupby('date')['totals.transactionRevenue'].agg(['size', 'count'])
cnt_srs.columns = ["count", "count of non-zero revenue"]
cnt_srs = cnt_srs.sort_index()
#cnt_srs.index = cnt_srs.index.astype('str')
trace1 = scatter_plot(cnt_srs["count"], 'red')
trace2 = scatter_plot(cnt_srs["count of non-zero revenue"], 'blue')

fig = tools.make_subplots(rows=2, cols=1, vertical_spacing=0.08,
                          subplot_titles=["Date - Count", "Date - Non-zero Revenue count"])
fig.append_trace(trace1, 1, 1)
fig.append_trace(trace2, 2, 1)
fig['layout'].update(height=800, width=800, paper_bgcolor='rgb(233,233,233)', title="Date Plots")
py.iplot(fig, filename='date-plots')

In [None]:
fig, ax = plt.subplots(1,1, figsize=(20,8))
geoNetwork.country                                 903653 non-null  object
 27  
train[["geoNetwork.subContinent", "totals.transactionRevenue"]].astype("float", copy=False)
_ = plt.hist( x=  train["totals.visits"],orientation= "vertical",bins=50)

In [None]:
train.head()

In [None]:
train.info()

# 3. Data Preprocessing and Transformation

## 3.1 Data Preprocessing

In [None]:
train['trafficSource.adwordsClickInfo.isVideoAd'].fillna(True, inplace=True)
train['trafficSource.isTrueDirect'].fillna(False, inplace=True)

# remove columns with only one distinct value
cols_to_drop = [col for col in train.columns if train[col].nunique(dropna=False) == 1]
train.drop(cols_to_drop, axis=1, inplace=True)

#only one not null value
train.drop(['trafficSource.campaignCode'], axis=1, inplace=True)

train.head()

In [None]:
num_cols = ['visitNumber', 'totals.hits', 'totals.pageviews', 'totals.bounces', 'totals.newVisits', 'totals.transactionRevenue']

for col in num_cols:
    train[col] = train[col].fillna(0)
    train[col] = train[col].astype(float)
    train[col] = np.log1p(train[col])

In [None]:
train['trafficSource.adContent'] = train['trafficSource.adContent'].fillna(0)
train['trafficSource.keyword'] = train['trafficSource.keyword'].fillna(0)
train['trafficSource.adwordsClickInfo.adNetworkType'] = train['trafficSource.adwordsClickInfo.adNetworkType'].fillna(0)
train['trafficSource.adwordsClickInfo.gclId'] = train['trafficSource.adwordsClickInfo.gclId'].fillna(0)
train['trafficSource.adwordsClickInfo.page'] = train['trafficSource.adwordsClickInfo.page'].fillna(0)
train['trafficSource.adwordsClickInfo.slot'] = train['trafficSource.adwordsClickInfo.slot'].fillna(0)

In [None]:
train.head()

In [None]:
train['device.browser'].nunique(),train['device.deviceCategory'].nunique()

In [None]:
train['browser_category'] = train['device.browser'] + '_' + train['device.deviceCategory']
train['browser_operatingSystem'] = train['device.browser'] + '_' + train['device.operatingSystem']
train['source_country'] = train['trafficSource.source'] + '_' + train['geoNetwork.country']

In [None]:
no_use = ["date", "fullVisitorId", "sessionId", "visitId", "visitStartTime", 'totals.transactionRevenue', 'trafficSource.referralPath']
cat_cols = [col for col in train.columns if col not in num_cols and col not in no_use]

## Label Encoding to convert strings to labels

In [None]:
max_values = {}
for col in cat_cols:
    print(col)
    lbl = LabelEncoder()
    lbl.fit(list(train[col].values.astype('str')))
    train[col] = lbl.transform(list(train[col].values.astype('str')))
    max_values[col] = train[col].max() + 2  # 根据经验，比真实值大一点，效果较好

In [None]:
max_values

In [None]:
cat_col_labels1 = ["channelGrouping", "device.deviceCategory", "device.operatingSystem", "geoNetwork.continent",
                   "geoNetwork.subContinent", "trafficSource.adContent", "trafficSource.adwordsClickInfo.adNetworkType",
                   "trafficSource.adwordsClickInfo.isVideoAd", "trafficSource.adwordsClickInfo.page", "trafficSource.adwordsClickInfo.slot",
                   "trafficSource.campaign", "trafficSource.medium", "geoNetwork.region"]

cat_col_labels2 = ["browser_category", "browser_operatingSystem", "source_country", "device.browser", "geoNetwork.city",
                   "trafficSource.source", "trafficSource.keyword", "trafficSource.adwordsClickInfo.gclId", "geoNetwork.networkDomain",
                   "geoNetwork.country", "geoNetwork.metro", "geoNetwork.region"]

## 3.2 For Convenience, I write an integrated transformation function / data pipeline to clean all data once

In [4]:
def load_df(csv_path='kaggle/train.csv', nrows=None):
    JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']
    df = pd.read_csv(csv_path,converters={column: json.loads for column in JSON_COLUMNS},dtype={'fullVisitorId': 'str'},nrows=nrows)
    for column in JSON_COLUMNS:
        column_as_df = json_normalize(df[column])
        column_as_df.columns = [f"{column}.{subcolumn}" for subcolumn in column_as_df.columns]
        df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)
    print(f"Loaded {os.path.basename(csv_path)}. Shape: {df.shape}")
    return df

In [5]:
data_path = "../input/ga-customer-revenue-prediction/"
#load train set
train = load_df(csv_path = data_path+ "train.csv")
#load testset data
test = load_df(csv_path = data_path+ "test.csv")


pandas.io.json.json_normalize is deprecated, use pandas.json_normalize instead



Loaded train.csv. Shape: (903653, 55)
Loaded test.csv. Shape: (804684, 53)


In [None]:
test_v2 = load_df(csv_path = data_path+ "test_v2.csv")


In [None]:
len(test_v2.columns)

In [16]:

num_cols = ['visitNumber', 'totals.hits', 'totals.pageviews', 'totals.bounces', 'totals.newVisits', 'totals.transactionRevenue']

no_use = ["date", "fullVisitorId", "sessionId", "visitId", "visitStartTime", 'totals.transactionRevenue', 'trafficSource.referralPath']
    
# Lists of categorical data
cat_col_labels1 = ["channelGrouping", "device.deviceCategory", "device.operatingSystem", "geoNetwork.continent",
                   "geoNetwork.subContinent", "trafficSource.adContent", "trafficSource.adwordsClickInfo.adNetworkType",
                   "trafficSource.adwordsClickInfo.isVideoAd", "trafficSource.adwordsClickInfo.page", "trafficSource.adwordsClickInfo.slot",
                   "trafficSource.campaign", "trafficSource.medium", "geoNetwork.region"]

cat_col_labels2 = ["browser_category", "browser_operatingSystem", "source_country", "device.browser", "geoNetwork.city",
                   "trafficSource.source", "trafficSource.keyword", "trafficSource.adwordsClickInfo.gclId", "geoNetwork.networkDomain",
                   "geoNetwork.country", "geoNetwork.metro", "geoNetwork.region"]


def transform_data(trainset, num_columns =num_cols):
    """
    Fill Missing values and create new  features
    """
    # fill missing values
    trainset['trafficSource.adwordsClickInfo.isVideoAd'].fillna(True, inplace=True)
    trainset['trafficSource.isTrueDirect'].fillna(False, inplace=True)

    # remove columns with only one distinct value
    cols_to_drop = [col for col in trainset.columns if trainset[col].nunique(dropna=False) == 1]
    trainset.drop(cols_to_drop, axis=1, inplace=True)

    #Drop features
    if 'trafficSource.campaignCode' in trainset.columns:
        trainset.drop(['trafficSource.campaignCode'], axis=1, inplace=True)

    for col in num_columns:
        trainset[col] = trainset[col].fillna(0)
        trainset[col] = trainset[col].astype(float)
        trainset[col] = np.log1p(trainset[col])
    
    trainset['trafficSource.adContent'] = trainset['trafficSource.adContent'].fillna(0)
    trainset['trafficSource.keyword'] = trainset['trafficSource.keyword'].fillna(0)
    trainset['trafficSource.adwordsClickInfo.adNetworkType'] = trainset['trafficSource.adwordsClickInfo.adNetworkType'].fillna(0)
    trainset['trafficSource.adwordsClickInfo.gclId'] = trainset['trafficSource.adwordsClickInfo.gclId'].fillna(0)
    trainset['trafficSource.adwordsClickInfo.page'] = trainset['trafficSource.adwordsClickInfo.page'].fillna(0)
    trainset['trafficSource.adwordsClickInfo.slot'] = trainset['trafficSource.adwordsClickInfo.slot'].fillna(0)
    
    trainset['browser_category'] = trainset['device.browser'] + '_' + trainset['device.deviceCategory']
    trainset['browser_operatingSystem'] = trainset['device.browser'] + '_' + trainset['device.operatingSystem']
    trainset['source_country'] = trainset['trafficSource.source'] + '_' + trainset['geoNetwork.country']
    trainset['date'] = trainset['date'].apply(lambda x: datetime.date(int(str(x)[:4]), int(str(x)[4:6]), int(str(x)[6:])))
    
    return trainset
    
    


def label_encoding(train,test,cat_cols):
    """
    Use label encoder to convert category data into numerical data before training embedding
    """
    #Label Encoding
    max_values = {}
    encoders = {}
    for col in cat_cols:
        print(col)
        lbl = LabelEncoder()
        df = pd.concat([train[col],test[col]],ignore_index=True)
        lbl.fit(df.values.astype('str',copy=False))
        encoders[col] = lbl
        df = lbl.transform(list(df.values.astype('str',copy=False)))
        train[col] = lbl.transform(list(train[col].values.astype('str')))
        test[col] = lbl.transform(list(test[col].values.astype('str',copy=False)))
        max_values[col] = df.max() + 2  # 根据经验，比真实值大一点，效果较好
        
    return train ,test,max_values, encoders



In [9]:

train = transform_data(train,num_columns =num_cols)


In [10]:
test_num_cols = ['visitNumber', 'totals.hits', 'totals.pageviews', 'totals.bounces', 'totals.newVisits']
test = transform_data(test,num_columns =test_num_cols)

In [86]:
test.head()

Unnamed: 0,channelGrouping,date,fullVisitorId,sessionId,visitId,visitNumber,visitStartTime,device.browser,device.operatingSystem,device.isMobile,device.deviceCategory,geoNetwork.continent,geoNetwork.subContinent,geoNetwork.country,geoNetwork.region,geoNetwork.metro,geoNetwork.city,geoNetwork.networkDomain,totals.hits,totals.pageviews,totals.newVisits,totals.bounces,trafficSource.campaign,trafficSource.source,trafficSource.medium,trafficSource.keyword,trafficSource.isTrueDirect,trafficSource.adwordsClickInfo.page,trafficSource.adwordsClickInfo.slot,trafficSource.adwordsClickInfo.gclId,trafficSource.adwordsClickInfo.adNetworkType,trafficSource.adwordsClickInfo.isVideoAd,trafficSource.referralPath,trafficSource.adContent,browser_category,browser_operatingSystem,source_country
0,4,2017-10-16,6167871330617112363,6167871330617112363_1508151024,1508151024,1.098612,1508151024,58,21,0,0,3,16,176,0,0,1,24691,1.609438,1.609438,0.0,0.0,4,208,5,11,1,0,0,0,0,1,,0,42,42,1972
1,4,2017-10-16,643697640977915618,0643697640977915618_1508175522,1508175522,0.693147,1508175522,58,13,0,0,4,19,185,23,0,948,30734,1.791759,1.791759,0.693147,0.0,4,208,5,11,0,0,0,0,0,1,,0,42,45,1980
2,4,2017-10-16,6059383810968229466,6059383810968229466_1508143220,1508143220,0.693147,1508143220,58,21,0,0,4,22,69,482,122,955,32535,2.079442,2.079442,0.693147,0.0,4,208,5,11,0,0,0,0,0,1,,0,42,42,1867
3,4,2017-10-16,2376720078563423631,2376720078563423631_1508193530,1508193530,0.693147,1508193530,99,16,1,1,2,12,218,73,100,540,0,2.197225,1.609438,0.693147,0.0,4,208,5,11,0,0,0,0,0,1,,0,106,140,2012
4,4,2017-10-16,2314544520795440038,2314544520795440038_1508217442,1508217442,0.693147,1508217442,99,21,0,0,2,12,218,73,100,751,0,2.302585,1.609438,0.693147,0.0,4,208,5,11,0,0,0,0,0,1,,0,105,136,2012


In [83]:
test_v2.head()

Unnamed: 0,channelGrouping,customDimensions,date,fullVisitorId,hits,visitId,visitNumber,visitStartTime,device.browser,device.operatingSystem,device.isMobile,device.deviceCategory,geoNetwork.continent,geoNetwork.subContinent,geoNetwork.country,geoNetwork.region,geoNetwork.metro,geoNetwork.city,geoNetwork.networkDomain,totals.hits,totals.pageviews,totals.timeOnSite,totals.sessionQualityDim,totals.newVisits,totals.transactions,totals.transactionRevenue,totals.totalTransactionRevenue,totals.bounces,trafficSource.referralPath,trafficSource.campaign,trafficSource.source,trafficSource.medium,trafficSource.keyword,trafficSource.adContent,trafficSource.isTrueDirect,trafficSource.adwordsClickInfo.page,trafficSource.adwordsClickInfo.slot,trafficSource.adwordsClickInfo.gclId,trafficSource.adwordsClickInfo.adNetworkType,trafficSource.adwordsClickInfo.isVideoAd,browser_category,browser_operatingSystem,source_country
0,Organic Search,"[{'index': '4', 'value': 'APAC'}]",2018-05-11,7460955084541987166,"[{'hitNumber': '1', 'time': '0', 'hour': '21',...",1526099341,0.741276,1526099341,Chrome,Android,True,mobile,Asia,Southern Asia,India,Delhi,(not set),(not set),unknown.unknown,0.959135,0.869742,973,1,0.0,,,,0.0,(not set),(not set),google,organic,(not provided),(not set),True,0,0,0,0,True,Chrome_mobile,Chrome_Android,google_India
1,Direct,"[{'index': '4', 'value': 'North America'}]",2018-05-11,460252456180441002,"[{'hitNumber': '1', 'time': '0', 'hour': '11',...",1526064483,1.811234,1526064483,Chrome,Macintosh,False,desktop,Americas,Northern America,United States,California,San Francisco-Oakland-San Jose CA,San Francisco,(not set),0.959135,0.869742,49,1,0.0,,,,0.0,(not set),(not set),(direct),(none),(not set),(not set),True,0,0,0,0,True,Chrome_desktop,Chrome_Macintosh,(direct)_United States
2,Organic Search,"[{'index': '4', 'value': 'North America'}]",2018-05-11,3461808543879602873,"[{'hitNumber': '1', 'time': '0', 'hour': '12',...",1526067157,0.741276,1526067157,Chrome,Chrome OS,False,desktop,Americas,Northern America,United States,not available in demo dataset,not available in demo dataset,not available in demo dataset,onlinecomputerworks.com,0.959135,0.869742,24,1,0.0,,,,0.0,(not set),(not set),google,organic,(not provided),(not set),True,0,0,0,0,True,Chrome_desktop,Chrome_Chrome OS,google_United States
3,Direct,"[{'index': '4', 'value': 'North America'}]",2018-05-11,975129477712150630,"[{'hitNumber': '1', 'time': '0', 'hour': '23',...",1526107551,0.959135,1526107551,Chrome,iOS,True,mobile,Americas,Northern America,United States,Texas,Houston TX,Houston,(not set),1.026672,0.959135,25,1,0.0,,,,0.0,(not set),(not set),(direct),(none),(not set),(not set),True,0,0,0,0,True,Chrome_mobile,Chrome_iOS,(direct)_United States
4,Organic Search,"[{'index': '4', 'value': 'North America'}]",2018-05-11,8381672768065729990,"[{'hitNumber': '1', 'time': '0', 'hour': '10',...",1526060254,0.526589,1526060254,Internet Explorer,Windows,True,tablet,Americas,Northern America,United States,California,Los Angeles CA,Irvine,com,1.026672,0.959135,49,1,0.526589,,,,0.0,(not set),(not set),google,organic,(not provided),(not set),False,0,0,0,0,True,Internet Explorer_tablet,Internet Explorer_Windows,google_United States


In [80]:
test_num_cols_v2 = ['visitNumber', 'totals.hits', 'totals.pageviews', 'totals.bounces', 'totals.newVisits']
test_v2 = transform_data(test_v2,num_columns =test_num_cols_v2)

ValueError: invalid literal for int() with base 10: '5-11'

In [15]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 804684 entries, 0 to 804683
Data columns (total 37 columns):
 #   Column                                        Non-Null Count   Dtype  
---  ------                                        --------------   -----  
 0   channelGrouping                               804684 non-null  int64  
 1   date                                          804684 non-null  object 
 2   fullVisitorId                                 804684 non-null  object 
 3   sessionId                                     804684 non-null  object 
 4   visitId                                       804684 non-null  int64  
 5   visitNumber                                   804684 non-null  float64
 6   visitStartTime                                804684 non-null  int64  
 7   device.browser                                804684 non-null  int64  
 8   device.operatingSystem                        804684 non-null  int64  
 9   device.isMobile                               80

In [13]:
train.columns,len(train.columns), test.columns, len(test.columns)

(Index(['channelGrouping', 'date', 'fullVisitorId', 'sessionId', 'visitId',
        'visitNumber', 'visitStartTime', 'device.browser',
        'device.operatingSystem', 'device.isMobile', 'device.deviceCategory',
        'geoNetwork.continent', 'geoNetwork.subContinent', 'geoNetwork.country',
        'geoNetwork.region', 'geoNetwork.metro', 'geoNetwork.city',
        'geoNetwork.networkDomain', 'totals.hits', 'totals.pageviews',
        'totals.bounces', 'totals.newVisits', 'totals.transactionRevenue',
        'trafficSource.campaign', 'trafficSource.source',
        'trafficSource.medium', 'trafficSource.keyword',
        'trafficSource.isTrueDirect', 'trafficSource.referralPath',
        'trafficSource.adwordsClickInfo.page',
        'trafficSource.adwordsClickInfo.slot',
        'trafficSource.adwordsClickInfo.gclId',
        'trafficSource.adwordsClickInfo.adNetworkType',
        'trafficSource.adwordsClickInfo.isVideoAd', 'trafficSource.adContent',
        'browser_category', 'bro

Transform Categorical data for train set and test set

In [17]:
cat_cols = [col for col in train.columns if col not in num_cols and col not in no_use]
train,test ,max_values = label_encoding(train,test,cat_cols)

channelGrouping
device.browser
device.operatingSystem
device.isMobile
device.deviceCategory
geoNetwork.continent
geoNetwork.subContinent
geoNetwork.country
geoNetwork.region
geoNetwork.metro
geoNetwork.city
geoNetwork.networkDomain
trafficSource.campaign
trafficSource.source
trafficSource.medium
trafficSource.keyword
trafficSource.isTrueDirect
trafficSource.adwordsClickInfo.page
trafficSource.adwordsClickInfo.slot
trafficSource.adwordsClickInfo.gclId
trafficSource.adwordsClickInfo.adNetworkType
trafficSource.adwordsClickInfo.isVideoAd
trafficSource.adContent
browser_category
browser_operatingSystem
source_country


In [19]:
train.head()

Unnamed: 0,channelGrouping,date,fullVisitorId,sessionId,visitId,visitNumber,visitStartTime,device.browser,device.operatingSystem,device.isMobile,device.deviceCategory,geoNetwork.continent,geoNetwork.subContinent,geoNetwork.country,geoNetwork.region,geoNetwork.metro,geoNetwork.city,geoNetwork.networkDomain,totals.hits,totals.pageviews,totals.bounces,totals.newVisits,totals.transactionRevenue,trafficSource.campaign,trafficSource.source,trafficSource.medium,trafficSource.keyword,trafficSource.isTrueDirect,trafficSource.referralPath,trafficSource.adwordsClickInfo.page,trafficSource.adwordsClickInfo.slot,trafficSource.adwordsClickInfo.gclId,trafficSource.adwordsClickInfo.adNetworkType,trafficSource.adwordsClickInfo.isVideoAd,trafficSource.adContent,browser_category,browser_operatingSystem,source_country
0,4,2016-09-02,1131660440785968503,1131660440785968503_1472830385,1472830385,0.693147,1472830385,58,13,0,0,3,21,210,193,0,378,37454,0.693147,0.693147,0.693147,0.693147,0.0,4,208,5,11,0,,0,0,0,0,1,0,42,45,2004
1,4,2016-09-02,377306020877927890,377306020877927890_1472880147,1472880147,0.693147,1472880147,67,21,0,0,5,1,12,482,122,955,10098,0.693147,0.693147,0.693147,0.693147,0.0,4,208,5,11,0,,0,0,0,0,1,0,56,63,1814
2,4,2016-09-02,3895546263509774583,3895546263509774583_1472865386,1472865386,0.693147,1472865386,58,13,0,0,4,19,185,99,0,475,38725,0.693147,0.693147,0.693147,0.693147,0.0,4,208,5,11,0,,0,0,0,0,1,0,42,45,1980
3,4,2016-09-02,4763447161404445595,4763447161404445595_1472881213,1472881213,0.693147,1472881213,106,20,0,0,3,16,94,482,122,955,38725,0.693147,0.693147,0.693147,0.693147,0.0,4,208,5,1608,0,,0,0,0,0,1,0,117,155,1892
4,4,2016-09-02,27294437909732085,27294437909732085_1472822600,1472822600,1.098612,1472822600,58,1,1,1,4,13,217,482,122,955,38725,0.693147,0.693147,0.693147,0.0,0.0,4,208,5,11,1,,0,0,0,0,1,0,43,37,2011


In [20]:
test.head()

Unnamed: 0,channelGrouping,date,fullVisitorId,sessionId,visitId,visitNumber,visitStartTime,device.browser,device.operatingSystem,device.isMobile,device.deviceCategory,geoNetwork.continent,geoNetwork.subContinent,geoNetwork.country,geoNetwork.region,geoNetwork.metro,geoNetwork.city,geoNetwork.networkDomain,totals.hits,totals.pageviews,totals.newVisits,totals.bounces,trafficSource.campaign,trafficSource.source,trafficSource.medium,trafficSource.keyword,trafficSource.isTrueDirect,trafficSource.adwordsClickInfo.page,trafficSource.adwordsClickInfo.slot,trafficSource.adwordsClickInfo.gclId,trafficSource.adwordsClickInfo.adNetworkType,trafficSource.adwordsClickInfo.isVideoAd,trafficSource.referralPath,trafficSource.adContent,browser_category,browser_operatingSystem,source_country
0,4,2017-10-16,6167871330617112363,6167871330617112363_1508151024,1508151024,1.098612,1508151024,58,21,0,0,3,16,176,0,0,1,24691,1.609438,1.609438,0.0,0.0,4,208,5,11,1,0,0,0,0,1,,0,42,42,1972
1,4,2017-10-16,643697640977915618,0643697640977915618_1508175522,1508175522,0.693147,1508175522,58,13,0,0,4,19,185,23,0,948,30734,1.791759,1.791759,0.693147,0.0,4,208,5,11,0,0,0,0,0,1,,0,42,45,1980
2,4,2017-10-16,6059383810968229466,6059383810968229466_1508143220,1508143220,0.693147,1508143220,58,21,0,0,4,22,69,482,122,955,32535,2.079442,2.079442,0.693147,0.0,4,208,5,11,0,0,0,0,0,1,,0,42,42,1867
3,4,2017-10-16,2376720078563423631,2376720078563423631_1508193530,1508193530,0.693147,1508193530,99,16,1,1,2,12,218,73,100,540,0,2.197225,1.609438,0.693147,0.0,4,208,5,11,0,0,0,0,0,1,,0,106,140,2012
4,4,2017-10-16,2314544520795440038,2314544520795440038_1508217442,1508217442,0.693147,1508217442,99,21,0,0,2,12,218,73,100,751,0,2.302585,1.609438,0.693147,0.0,4,208,5,11,0,0,0,0,0,1,,0,105,136,2012


# 4. Split dataset into trainset and validation set

In [21]:
import datetime

train = train.sort_values('date')
# Split dataset into trainset and validation set according time
x_train = train[train["date"] <= pd.Timestamp(2017,5,31)]
x_val = train[train["date"] > pd.Timestamp(2017,5,31)]

y_train = x_train['totals.transactionRevenue']
y_val = x_val['totals.transactionRevenue']

x_train = x_train.drop(no_use, axis=1)
x_val = x_val.drop(no_use, axis=1)

In [22]:
num_cols.remove("totals.transactionRevenue")
num_cols

['visitNumber',
 'totals.hits',
 'totals.pageviews',
 'totals.bounces',
 'totals.newVisits']

In [23]:
emb_dims1 = []
emb_dims2 = []
for i in cat_col_labels1:
    emb_dims1.append((max_values[i], min((max_values[i]+1)//2, 50)))
for i in cat_col_labels2:
    emb_dims2.append((max_values[i], min((max_values[i]+1)//2, 50)))

# 5. Modeling

# 5.1  LGBM model

In [24]:
import lightgbm as lgb
train_set = lgb.Dataset(x_train, y_train)
valid_set = lgb.Dataset(x_val, y_val)


In [25]:
params = {
        'objective': 'regression',
        'metric': 'mse',
        'boosting': 'gbdt',
        'learning_rate': 0.01,
        'verbose': 0,
        'num_leaves': 120,
        'bagging_fraction': 0.95,
        'bagging_freq': 1,
        'bagging_seed': 1,
        'feature_fraction': 0.9,
        'feature_fraction_seed': 1,
        'max_bin': 256,
        'max_depth': 100,
        'early_stopping_rounds':100,
        'num_rounds': 1000,
    }

%time model_f1 = lgb.train(params, train_set=train_set,  valid_sets=valid_set, verbose_eval=20)

# lightgbm.LGBMRegressor(boosting_type='gbdt', num_leaves=31, max_depth=- 1, learning_rate=0.1, n_estimators=100, subsample_for_bin=200000, objective=None, class_weight=None, min_split_gain=0.0, min_child_weight=0.001, min_child_samples=20, subsample=1.0, subsample_freq=0, colsample_bytree=1.0, reg_alpha=0.0, reg_lambda=0.0, random_state=None)


Found `num_rounds` in params. Will use it instead of argument


Found `early_stopping_rounds` in params. Will use it instead of argument



Training until validation scores don't improve for 100 rounds
[20]	valid_0's l2: 4.12272
[40]	valid_0's l2: 3.81819
[60]	valid_0's l2: 3.60238
[80]	valid_0's l2: 3.445
[100]	valid_0's l2: 3.32732
[120]	valid_0's l2: 3.24142
[140]	valid_0's l2: 3.17627
[160]	valid_0's l2: 3.127
[180]	valid_0's l2: 3.08866
[200]	valid_0's l2: 3.06028
[220]	valid_0's l2: 3.03927
[240]	valid_0's l2: 3.02222
[260]	valid_0's l2: 3.00843
[280]	valid_0's l2: 2.99728
[300]	valid_0's l2: 2.98899
[320]	valid_0's l2: 2.98196
[340]	valid_0's l2: 2.97646
[360]	valid_0's l2: 2.97334
[380]	valid_0's l2: 2.97013
[400]	valid_0's l2: 2.96791
[420]	valid_0's l2: 2.96603
[440]	valid_0's l2: 2.96513
[460]	valid_0's l2: 2.96381
[480]	valid_0's l2: 2.96298
[500]	valid_0's l2: 2.9629
[520]	valid_0's l2: 2.96245
[540]	valid_0's l2: 2.96257
[560]	valid_0's l2: 2.96356
[580]	valid_0's l2: 2.9637
[600]	valid_0's l2: 2.96398
[620]	valid_0's l2: 2.96506
Early stopping, best iteration is:
[523]	valid_0's l2: 2.9623
CPU times: user 2m

## Transform testset and make predictions

In [30]:
test_no_use = ["date", "fullVisitorId", "sessionId", "visitId", "visitStartTime", 'trafficSource.referralPath']
test_df = test.drop(test_no_use, axis=1)
predictions = model_f1.predict(test_df)
predictions

array([ 0.00312927,  0.00312927,  0.00291488, ...,  0.20198197,
       -0.29401612, -0.03215272])

Example submissions

In [26]:
sample_df = pd.read_csv(data_path+ "sample_submission.csv")
sample_df.head()

Unnamed: 0,fullVisitorId,PredictedLogRevenue
0,259678714014,0.0
1,49363351866189,0.0
2,53049821714864,0.0
3,59488412965267,0.0
4,85840370633780,0.0


## Save Prediction Results

In [32]:
results = pd.DataFrame({"fullVisitorId":test["fullVisitorId"].astype('str'),"PredictedLogRevenue":predictions})
results.to_csv("lgbm_prediction.csv",compression = 'gzip', index=False, float_format = '%.5f')

# 5.2 Wide and Deep Model

In [33]:
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

Create Pytorch Dataset

In [34]:
class TabularDataset(Dataset):
    def __init__(self, x_data, y_data, cat_cols1, cat_cols2, num_cols):
        
        """
        data: pandas data frame;
        cat_cols: list of string, the names of the categorical columns in the data, will be passed through the embedding layers;
        num_cols: list of string
        y_data: the target
        """
        self.n = x_data.shape[0]
        self.y = y_data.astype(np.float32).values.reshape(-1, 1)
       
        self.cat_cols1 = cat_cols1
        self.cat_cols2 = cat_cols2
        self.num_cols = num_cols
        
        self.num_X = x_data[self.num_cols].astype(np.float32).values
        self.cat_X1 = x_data[self.cat_cols1].astype(np.int64).values
        self.cat_X2 = x_data[self.cat_cols2].astype(np.int64).values
        
    
    def print_data(self):
        return self.num_X, self.cat_X1, self.cat_X2, self.y
    
    def __len__(self):
        """
        total number of samples
        """
        return self.n
    
    def __getitem__(self, idx):
        """
        Generates one sample of data.
        """
        return [self.y[idx], self.num_X[idx], self.cat_X1[idx], self.cat_X2[idx]]

In [35]:
class FeedForwardNN(nn.Module):
    def __init__(self, emb_dims1, emb_dims2, no_of_num, lin_layer_sizes, output_size, emb_dropout, lin_layer_dropouts):
        """
        emb_dims:           List of two element tuples;
        no_of_num:          Integer, the number of continuous features in the data;
        lin_layer_sizes:    List of integers. The size of each linear layer;
        output_size:        Integer, the size of the final output;
        emb_dropout:        Float, the dropout to be used after the embedding layers.
        lin_layer_dropouts: List of floats, the dropouts to be used after each linear layer.
        """
        super().__init__()
        
        # embedding layers
        self.emb_layers1 = nn.ModuleList([nn.Embedding(x, y) for x, y in emb_dims1])
        self.emb_layers2 = nn.ModuleList([nn.Embedding(x, y) for x, y in emb_dims2])
        
        # 计算各个emb参数数量，为后续Linear layer的输入做准备
        self.no_of_embs1 = sum([y for x, y in emb_dims1])
        self.no_of_embs2 = sum([y for x, y in emb_dims2])
        self.no_of_num = no_of_num
        
        # 分支1
        self.branch1 = nn.Linear(self.no_of_embs1, lin_layer_sizes[0])
        self.branch1_2 = nn.Linear(lin_layer_sizes[0], lin_layer_sizes[1])
        nn.init.kaiming_normal_(self.branch1.weight.data)
        nn.init.kaiming_normal_(self.branch1_2.weight.data)
        
        # 分支2
        self.branch2 = nn.Linear(self.no_of_embs2, lin_layer_sizes[0] * 2)
        self.branch2_2 = nn.Linear(lin_layer_sizes[0] * 2, lin_layer_sizes[1] * 2)
        nn.init.kaiming_normal_(self.branch2.weight.data)
        nn.init.kaiming_normal_(self.branch2_2.weight.data)
        
        # 主分支
        self.main_layer1 = nn.Linear(lin_layer_sizes[1] * 3 + self.no_of_num, lin_layer_sizes[2])
        self.main_layer2 = nn.Linear(lin_layer_sizes[2], lin_layer_sizes[3])
        
        # batch normal
        self.branch_bn_layers1 = nn.BatchNorm1d(lin_layer_sizes[0])
        self.branch_bn_layers2 = nn.BatchNorm1d(lin_layer_sizes[0] * 2)
        self.main_bn_layer = nn.BatchNorm1d(lin_layer_sizes[2])
        
        # Dropout Layers
        self.emb_dropout_layer = nn.Dropout(emb_dropout)
        self.dropout_layers = nn.ModuleList([nn.Dropout(size) for size in lin_layer_dropouts])
        
        # Output layer
        self.output_layer = nn.Linear(lin_layer_sizes[-1], output_size)
        nn.init.kaiming_normal_(self.output_layer.weight.data)
        
    def forward(self, num_data, cat_data1, cat_data2):
        # embedding categorical feature and cat them together
        x1 = [emb_layer(torch.tensor(cat_data1[:, i])) for i, emb_layer in enumerate(self.emb_layers1)]
        x1 = torch.cat(x1, 1)
        
        x1 = self.emb_dropout_layer(F.relu(self.branch1(x1)))
        x1 = self.branch_bn_layers1(x1)
        x1 = self.dropout_layers[0](F.relu(self.branch1_2(x1)))

        x2 = [emb_layer(torch.tensor(cat_data2[:, i])) for i, emb_layer in enumerate(self.emb_layers2)]
        x2 = torch.cat(x2, 1)
        
        x2 = self.emb_dropout_layer(F.relu(self.branch2(x2)))
        x2 = self.branch_bn_layers2(x2)
        x2 = self.dropout_layers[0](F.relu(self.branch2_2(x2)))

        main = torch.cat([x1, x2, num_data], 1)

        main = self.dropout_layers[1](F.relu(self.main_layer1(main)))
        main = self.main_bn_layer(main)
        main = self.dropout_layers[2](F.relu(self.main_layer2(main)))

        out = self.output_layer(main)
        return out

In [36]:
train_dataset = TabularDataset(x_data=x_train, y_data=y_train, cat_cols1=cat_col_labels1, cat_cols2=cat_col_labels2, num_cols=num_cols)
val_dataset = TabularDataset(x_data=x_val, y_data=y_val, cat_cols1=cat_col_labels1, cat_cols2=cat_col_labels2, num_cols=num_cols)

In [37]:
batchsize = 64
train_dataloader = DataLoader(train_dataset, batchsize, shuffle=True, num_workers=0)
val_dataloder = DataLoader(val_dataset, 64, shuffle=True, num_workers=0)

In [52]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = FeedForwardNN(emb_dims1=emb_dims1, 
                      emb_dims2=emb_dims2, 
                      no_of_num=len(num_cols),
                      lin_layer_sizes=[128,64,32,16],
                      output_size=1,
                      lin_layer_dropouts=[0.2, 0.2, 0.05],
                      emb_dropout=0.05).to(device)

## Training Wide and Deep model

In [53]:
no_of_epochs = 3
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
lrscheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.01, patience=2, threshold=0.9 )
total_data = train_dataset.__len__()

print_every = 500
steps = 0
running_loss = 0
best_val_score = 0
best_model = None

for epoch in range(no_of_epochs):
    model.train()
    for index, datas in enumerate(train_dataloader):
        steps += 1
        y, num_x, cat_x1, cat_x2 = datas
        cat_x1 = cat_x1.to(device)
        cat_x2 = cat_x2.to(device)
        num_x = num_x.to(device)
        y  = y.to(device)
        
        # Forward Pass
        optimizer.zero_grad()
        preds = model.forward(num_x, cat_x1, cat_x2)
        loss = criterion(preds, y)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        
        if steps % print_every == 0:
            val_loss = 0
            model.eval()
            with torch.no_grad():
                for val_index, val_datas in enumerate(val_dataloder):
                    y, num_x, cat_x1, cat_x2 = val_datas
                    cat_x1 = cat_x1.to(device)
                    cat_x2 = cat_x2.to(device)
                    num_x = num_x.to(device)
                    y  = y.to(device)
                    
                    out = model.forward(num_x, cat_x1, cat_x2)
                    
                    batch_loss = criterion(out, y)
                    val_loss += batch_loss.item()
                
#             lrscheduler.step(val_loss)
            
            print(f"Epoch {epoch+1}/{no_of_epochs}.."
                     f"Train loss:{running_loss/print_every:.3f}.."
                     f"Validation loss:{val_loss/len(val_dataloder):.3f}..")
            running_loss = 0
            model.train()


To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).


To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).



Epoch 1/5..Train loss:3.882..Validation loss:4.335..
Epoch 1/5..Train loss:4.010..Validation loss:4.157..
Epoch 1/5..Train loss:3.940..Validation loss:3.957..
Epoch 1/5..Train loss:3.429..Validation loss:3.791..
Epoch 1/5..Train loss:3.384..Validation loss:3.688..
Epoch 1/5..Train loss:3.285..Validation loss:3.578..
Epoch 1/5..Train loss:3.387..Validation loss:3.497..
Epoch 1/5..Train loss:3.141..Validation loss:3.505..
Epoch 1/5..Train loss:3.067..Validation loss:3.542..
Epoch 1/5..Train loss:3.084..Validation loss:3.379..
Epoch 1/5..Train loss:3.121..Validation loss:3.400..
Epoch 1/5..Train loss:3.283..Validation loss:3.307..
Epoch 1/5..Train loss:3.385..Validation loss:3.275..
Epoch 1/5..Train loss:3.133..Validation loss:3.268..
Epoch 1/5..Train loss:3.157..Validation loss:3.291..
Epoch 1/5..Train loss:3.190..Validation loss:3.335..
Epoch 1/5..Train loss:3.180..Validation loss:3.328..
Epoch 1/5..Train loss:3.030..Validation loss:3.315..
Epoch 1/5..Train loss:3.276..Validation loss:3

KeyboardInterrupt: 

## make predictions on test set

In [54]:
test_no_use = ["date", "fullVisitorId", "sessionId", "visitId", "visitStartTime", 'trafficSource.referralPath']
test_num_cols = ['visitNumber', 'totals.hits', 'totals.pageviews', 'totals.bounces', 'totals.newVisits']
test_df = test.drop(test_no_use, axis=1)
test_dataset = TabularDataset(x_data=test_df, y_data=pd.DataFrame({"target":np.zeros(len(test_df))}), cat_cols1=cat_col_labels1, cat_cols2=cat_col_labels2, num_cols=test_num_cols)
test_dataloder = DataLoader(test_dataset, 128, shuffle=True, num_workers=2)

def make_predictions(model, test_dataloder):
        results = []
        model.eval()
        with torch.no_grad():
                for val_index, val_datas in enumerate(test_dataloder):
                    _, num_x, cat_x1, cat_x2 = val_datas
                    cat_x1 = cat_x1.to(device)
                    cat_x2 = cat_x2.to(device)
                    num_x = num_x.to(device)
                    
                    out = model.forward(num_x, cat_x1, cat_x2)
                    out = out.squeeze().to("cpu").numpy().tolist()
                    results.extend(out)
        return np.array(results)
#make predictions
predictions = make_predictions(model, test_dataloder)
predictions


To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).


To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).



array([ 0.03964843,  0.00381355, -0.0082958 , ...,  0.03341493,
        0.05822928,  3.40130639])

## Save submissions

In [72]:

results = pd.DataFrame({"fullVisitorId":test["fullVisitorId"].astype('str'),"PredictedLogRevenue":predictions})
results = results.groupby(['fullVisitorId']).sum(["PredictedLogRevenue"])
results['fullVisitorId'] = df.index
results["PredictedLogRevenue"] = np.log(results["PredictedLogRevenue"] +1)
results.to_csv("WideAndDeepModel_prediction.csv.gz",compression = 'gzip', index=False, float_format = '%.5f')

In [55]:
sample_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 617242 entries, 0 to 617241
Data columns (total 2 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   fullVisitorId        617242 non-null  object 
 1   PredictedLogRevenue  617242 non-null  float64
dtypes: float64(1), object(1)
memory usage: 9.4+ MB


In [75]:
results["fullVisitorId"].nunique()

617242

In [71]:
# df = results.groupby(['fullVisitorId']).sum(["PredictedLogRevenue"])
# df['fullVisitorId'] = df.index
# df.reset_index()
# df.reset_index()
df["PredictedLogRevenue"] = np.log(df["PredictedLogRevenue"] +1)
df

Unnamed: 0_level_0,PredictedLogRevenue,fullVisitorId
fullVisitorId,Unnamed: 1_level_1,Unnamed: 2_level_1
0000000259678714014,0.042483,0000000259678714014
0000049363351866189,-0.098420,0000049363351866189
0000053049821714864,-0.046366,0000053049821714864
0000059488412965267,0.058635,0000059488412965267
0000085840370633780,-0.022062,0000085840370633780
...,...,...
9999905960465191827,0.113338,9999905960465191827
9999941518946450908,0.001001,9999941518946450908
9999969142283897422,0.001708,9999969142283897422
9999985820452794361,0.098471,9999985820452794361


In [77]:
sample_df_v2 = pd.read_csv(data_path+ "sample_submission_v2.csv")
len(sample_df_v2)


Columns (0) have mixed types.Specify dtype option on import or set low_memory=False.



296530