# Import Libraries & Load Data

## Import Libraries

In [1]:
import numpy as np
import pandas as pd
import os 
from tqdm import tqdm_notebook
from sklearn.preprocessing import StandardScaler
from sklearn.svm import NuSVR, SVR
from sklearn.metrics import mean_absolute_error
import lightgbm as lgb
import xgboost as xgb
import time
import datetime 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
import gc

from scipy.signal import hilbert
from scipy.signal import hann
from scipy.signal import convolve
from scipy import stats
from sklearn.kernel_ridge import KernelRidge
 
from collections import Counter 
from statistics import mode 
    
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

from tqdm import tqdm
import json 
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from itertools import product
import ast 
# Enable tqdm's pandas integration (progress bars via `progress_apply`).
tqdm.pandas()
# Do not truncate wide DataFrames: show every column when displaying.
pd.options.display.max_columns = None

## Load Data

In [2]:
# `parse_dates` tells pandas which columns to parse as datetimes.
path = "./kdd2019/data/"

# Phase-1 training data: queries, plans and clicks.
train_queries1 = pd.read_csv(f"{path}train_queries_phase1.csv", parse_dates=["req_time"])
train_plans1 = pd.read_csv(f"{path}train_plans_phase1.csv", parse_dates=["plan_time"])
train_clicks1 = pd.read_csv(f"{path}train_clicks_phase1.csv")

# Phase-2 training data, same three tables.
train_queries2 = pd.read_csv(f"{path}train_queries_phase2.csv", parse_dates=["req_time"])
train_plans2 = pd.read_csv(f"{path}train_plans_phase2.csv", parse_dates=["plan_time"])
train_clicks2 = pd.read_csv(f"{path}train_clicks_phase2.csv")

# User profile table (one row per pid).
profiles = pd.read_csv(f"{path}profiles.csv")

# Test data: queries and plans only.
test_queries = pd.read_csv(f"{path}test_queries.csv", parse_dates=["req_time"])
test_plans = pd.read_csv(f"{path}test_plans.csv", parse_dates=["plan_time"])

In [3]:
# Stack the phase-1 and phase-2 frames row-wise and renumber the index from 0.
train_plans = pd.concat([train_plans1, train_plans2], ignore_index=True)
train_queries = pd.concat([train_queries1, train_queries2], ignore_index=True)
train_clicks = pd.concat([train_clicks1, train_clicks2], ignore_index=True)

# Basic data set information analysis

## train_plans data set

### Basic information
1. The train_plans data set uses 44.4+ MB of memory;
2. It includes 1 datetime64 feature (plan_time), 1 object feature (plans) and 1 int64 feature (sid);
3. The train_plans data set has 1938572 records.

In [4]:
# Preview the first rows of the combined (phase 1 + phase 2) plans table.
train_plans.head()

Unnamed: 0,sid,plan_time,plans
0,149233,2018-10-01 13:02:01,"[{""distance"": 2100, ""price"": 300, ""eta"": 1064,..."
1,337156,2018-11-23 10:48:25,"[{""distance"": 8887, ""price"": 400, ""eta"": 1631,..."
2,398930,2018-11-08 12:23:06,"[{""distance"": 8471, ""price"": 2300, ""eta"": 1284..."
3,196610,2018-11-27 12:11:59,"[{""distance"": 32405, ""price"": 2500, ""eta"": 477..."
4,302267,2018-10-18 17:22:19,"[{""distance"": 38511, ""price"": 700, ""eta"": 4715..."


In [5]:
# Print the index range, column dtypes and memory usage of train_plans.
train_plans.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1938572 entries, 0 to 1938571
Data columns (total 3 columns):
 #   Column     Dtype         
---  ------     -----         
 0   sid        int64         
 1   plan_time  datetime64[ns]
 2   plans      object        
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 44.4+ MB


In [7]:
# (number of rows, number of columns) of train_plans.
train_plans.shape

(1938572, 3)

### Missing data analysis
- There is no missing data in train_plans data set.

In [9]:
# Count missing values in each column of train_plans.
train_plans.isnull().sum()

sid          0
plan_time    0
plans        0
dtype: int64

### Nunique of each feature
1. sid:1938572 different values
2. plan_time: 1483606 different values
3. plans:1929875 different values 

In [10]:
# nunique() returns the number of distinct values in each column.
train_plans.nunique()

sid          1938572
plan_time    1483606
plans        1929875
dtype: int64

## train_queries data set

### Basic information

1. The train_queries data set uses 76.3+ MB of memory;
2. The train_queries data set has 1 datetime64 feature (req_time), 2 object features (o, d), 1 int64 feature (sid) and 1 float64 feature (pid);
3. The train_queries data set has 2000000 records.

In [11]:
# Preview the first rows of the combined (phase 1 + phase 2) queries table.
train_queries.head()

Unnamed: 0,sid,pid,req_time,o,d
0,419087,22813.0,2018-10-17 10:17:11,"116.34,39.99","116.37,39.86"
1,481938,,2018-10-13 15:50:15,"116.43,39.94","116.46,39.95"
2,441201,34393.0,2018-10-13 16:15:21,"116.28,40.01","116.39,39.99"
3,347266,,2018-10-22 13:26:04,"116.31,39.51","116.32,39.72"
4,206269,,2018-11-08 10:27:23,"116.33,39.89","116.59,39.76"


In [12]:
# Print the index range, column dtypes and memory usage of train_queries.
train_queries.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000000 entries, 0 to 1999999
Data columns (total 5 columns):
 #   Column    Dtype         
---  ------    -----         
 0   sid       int64         
 1   pid       float64       
 2   req_time  datetime64[ns]
 3   o         object        
 4   d         object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(2)
memory usage: 76.3+ MB


In [13]:
# (number of rows, number of columns) of train_queries.
train_queries.shape

(2000000, 5)

### Missing data analysis
- pid has 676803 missing values, which is 33.84% of all records.

In [14]:
# Count missing values in each column of train_queries.
train_queries.isnull().sum()

sid              0
pid         676803
req_time         0
o                0
d                0
dtype: int64

In [15]:
# Fraction of queries without a pid: the mean of the boolean "is missing" mask.
train_queries["pid"].isnull().mean()

0.3384015

### Nunique of each feature
1. sid:2000000 different values
2. pid: 97371 different values
3. req_time:1524599 different values 
4. o:14807 different values 
5. d:13445 different values 

In [16]:
# Number of distinct values in each column of train_queries.
train_queries.nunique()

sid         2000000
pid           97371
req_time    1524599
o             14807
d             13445
dtype: int64

## train_clicks data set

### Basic information
1. The train_clicks data set uses 38.3+ MB of memory;
2. The train_clicks data set has 1 object feature (click_time) and 2 int64 features (sid, click_mode);
3. The train_clicks data set has 1675183 records.

In [18]:
# Preview the first rows of the combined (phase 1 + phase 2) clicks table.
train_clicks.head()

Unnamed: 0,sid,click_time,click_mode
0,175043,2018-10-14 10:35:07,7
1,414568,2018-10-20 04:18:48,2
2,314669,2018-10-26 16:53:04,2
3,117837,2018-10-03 18:48:02,7
4,474691,2018-11-01 15:59:01,2


In [19]:
# Print the index range, non-null counts, column dtypes and memory usage of train_clicks.
train_clicks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1675183 entries, 0 to 1675182
Data columns (total 3 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   sid         1675183 non-null  int64 
 1   click_time  1675183 non-null  object
 2   click_mode  1675183 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 38.3+ MB


In [20]:
# (number of rows, number of columns) of train_clicks.
train_clicks.shape

(1675183, 3)

### Missing data analysis
- There is no missing data in the train_clicks data set.

In [21]:
# Count missing values in each column of train_clicks.
train_clicks.isnull().sum()

sid           0
click_time    0
click_mode    0
dtype: int64

### Nunique of each feature
1. sid:1675183 different values
2. click_time: 1316204 different values
3. click_mode:11 different values 

In [22]:
# Number of distinct values in each column of train_clicks.
train_clicks.nunique()

sid           1675183
click_time    1316204
click_mode         11
dtype: int64

## profiles data set

### Basic information

1. The profiles data set uses 61.3 MB of memory;
2. The profiles data set has 1 int64 feature (pid) and 66 float64 features (p0 - p65);
3. The profiles data set has 119856 records.

In [23]:
# Preview the first rows of the profiles table.
profiles.head()

Unnamed: 0,pid,p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,p10,p11,p12,p13,p14,p15,p16,p17,p18,p19,p20,p21,p22,p23,p24,p25,p26,p27,p28,p29,p30,p31,p32,p33,p34,p35,p36,p37,p38,p39,p40,p41,p42,p43,p44,p45,p46,p47,p48,p49,p50,p51,p52,p53,p54,p55,p56,p57,p58,p59,p60,p61,p62,p63,p64,p65
0,20555,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,42213,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,29526,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,117756,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,185606,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0


In [24]:
# Print the index range, non-null counts and column dtypes of profiles.
profiles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119856 entries, 0 to 119855
Data columns (total 67 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   pid     119856 non-null  int64  
 1   p0      119856 non-null  float64
 2   p1      119856 non-null  float64
 3   p2      119856 non-null  float64
 4   p3      119856 non-null  float64
 5   p4      119856 non-null  float64
 6   p5      119856 non-null  float64
 7   p6      119856 non-null  float64
 8   p7      119856 non-null  float64
 9   p8      119856 non-null  float64
 10  p9      119856 non-null  float64
 11  p10     119856 non-null  float64
 12  p11     119856 non-null  float64
 13  p12     119856 non-null  float64
 14  p13     119856 non-null  float64
 15  p14     119856 non-null  float64
 16  p15     119856 non-null  float64
 17  p16     119856 non-null  float64
 18  p17     119856 non-null  float64
 19  p18     119856 non-null  float64
 20  p19     119856 non-null  float64
 21  p20     11

In [25]:
# (number of rows, number of columns) of profiles.
profiles.shape

(119856, 67)

### Missing data analysis
- There is no missing data in profiles data set.

In [26]:
# Count missing values in each column of profiles.
profiles.isnull().sum()

pid    0
p0     0
p1     0
p2     0
p3     0
      ..
p61    0
p62    0
p63    0
p64    0
p65    0
Length: 67, dtype: int64

### Nunique of each feature
1. pid:119856 different values
2. p0-p65: 2 different values 

In [27]:
# Number of distinct values in each column of profiles.
profiles.nunique()

pid    119856
p0          2
p1          2
p2          2
p3          2
        ...  
p61         2
p62         2
p63         2
p64         2
p65         2
Length: 67, dtype: int64