In [1]:
import numpy as np
import pandas as pd
import pickle
from datetime import datetime, date
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import StratifiedShuffleSplit

import argparse



In [2]:
print('Loading raw data...')
train_users_path='input/train_users_2.csv'
test_users_path='input/test_users.csv'
sessions_path='input/sessions.csv'

#Note: age_gender_bkts.csv and countries.csv files are not used.

#########Loading data#############
#train_users
df_train = pd.read_csv(train_users_path)
target = df_train['country_destination']
#df_train = df_train.drop(['country_destination'], axis=1)

#test_users
df_test = pd.read_csv(test_users_path)    
id_test = df_test['id']

#sessions
df_sessions = pd.read_csv(sessions_path)
df_sessions['id'] = df_sessions['user_id']
df_sessions = df_sessions.drop(['user_id'],axis=1)

Loading raw data...


In [3]:
#df_train.date_account_created

In [4]:
df_train.head()

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination
0,gxn3p5htnn,2010-06-28,20090319043255,,-unknown-,,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,NDF
1,820tgsjxq7,2011-05-25,20090523174809,,MALE,38.0,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome,NDF
2,4ft3gnwmtx,2010-09-28,20090609231247,2010-08-02,FEMALE,56.0,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE,US
3,bjjt8pjhuk,2011-12-05,20091031060129,2012-09-08,FEMALE,42.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox,other
4,87mebub9p4,2010-09-14,20091208061105,2010-02-18,-unknown-,41.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,US


In [5]:
df_train.gender.replace("-unknown-", "NAN", inplace=True)
#df_train.head()
df_test.gender.replace("-unknown-", "NAN", inplace=True)


In [6]:
pd.unique(df_train.gender.values)

array(['NAN', 'MALE', 'FEMALE', 'OTHER'], dtype=object)

In [7]:
#sessions grouping by user

#Group by user_id, aggregate by number of counts (counting device_type as it is never NA), 
#and total sum of elapsed time in seconds
group_sessions = df_sessions.groupby("id")
#.agg({'device_type':'count', 'secs_elapsed':'sum'})
#rename columns
# // group_sessions.columns = ['sum_secs_elapsed', 'counts']
#group by variable turns into index, I'm reseting the index and putting user_id back as a column
# //group_sessions.reset_index(level=0, inplace=True)

In [8]:
#group_sessions.head()

In [9]:
group_sessions = group_sessions.agg({'device_type':'count', 'secs_elapsed':'sum'})

In [10]:
group_sessions.head()

Unnamed: 0_level_0,secs_elapsed,device_type
id,Unnamed: 1_level_1,Unnamed: 2_level_1
00023iyk9l,867896.0,40
0010k6l0om,586543.0,63
001wyh0pz8,282965.0,90
0028jgx1x1,297010.0,31
002qnbzfs5,6487080.0,789


In [11]:
group_sessions.columns = ['sum_secs_elapsed', 'counts']

In [12]:
group_sessions.head()

Unnamed: 0_level_0,sum_secs_elapsed,counts
id,Unnamed: 1_level_1,Unnamed: 2_level_1
00023iyk9l,867896.0,40
0010k6l0om,586543.0,63
001wyh0pz8,282965.0,90
0028jgx1x1,297010.0,31
002qnbzfs5,6487080.0,789


In [13]:
group_sessions.reset_index(level=0, inplace=True)
group_sessions.head()

Unnamed: 0,id,sum_secs_elapsed,counts
0,00023iyk9l,867896.0,40
1,0010k6l0om,586543.0,63
2,001wyh0pz8,282965.0,90
3,0028jgx1x1,297010.0,31
4,002qnbzfs5,6487080.0,789


In [14]:
df_train.age[0:20]

0      NaN
1     38.0
2     56.0
3     42.0
4     41.0
5      NaN
6     46.0
7     47.0
8     50.0
9     46.0
10    36.0
11    47.0
12     NaN
13    37.0
14    36.0
15    33.0
16     NaN
17    31.0
18     NaN
19    29.0
Name: age, dtype: float64

In [15]:
df_train.age.replace(np.nan, 0, inplace=True)
df_train.age.astype(str, inplace=True)
df_train.head()
#df_train["age_bucket"] = ""
#df_train.age_bucket

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination
0,gxn3p5htnn,2010-06-28,20090319043255,,NAN,0.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,NDF
1,820tgsjxq7,2011-05-25,20090523174809,,MALE,38.0,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome,NDF
2,4ft3gnwmtx,2010-09-28,20090609231247,2010-08-02,FEMALE,56.0,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE,US
3,bjjt8pjhuk,2011-12-05,20091031060129,2012-09-08,FEMALE,42.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox,other
4,87mebub9p4,2010-09-14,20091208061105,2010-02-18,NAN,41.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,US


In [16]:
#bucket all ages into format that user_demo is in for age
import math
def agebuckets(ages):
    ageless =  [i for i in range(5,101,5)] # 5, 10, 15, 20...95, 100
    buckets = ['%d-%d' %(i, i+4) for i in range(0,100,5)] # 0-4, 5-9, 10-14...90-94, 95-99
    newlist = []
    for i in range(len(ages)):
        if math.isnan(ages[i]):
            newlist.append('NA')
        elif ages[i] <ageless[0]:
            newlist.append(buckets[0])
        elif ages[i] < ageless[1]:
            newlist.append(buckets[1])
        elif ages[i] < ageless[2]:
            newlist.append(buckets[2])
        elif ages[i] < ageless[3]:
            newlist.append(buckets[3])
        elif ages[i] < ageless[4]:
            newlist.append(buckets[4])
        elif ages[i] < ageless[5]:
            newlist.append(buckets[5])
        elif ages[i] < ageless[6]:
            newlist.append(buckets[6])
        elif ages[i] < ageless[7]:
            newlist.append(buckets[7])
        elif ages[i] < ageless[8]:
            newlist.append(buckets[8])
        elif ages[i] < ageless[9]:
            newlist.append(buckets[9])
        elif ages[i] < ageless[10]:
            newlist.append(buckets[10])
        elif ages[i] < ageless[11]:
            newlist.append(buckets[11])
        elif ages[i] < ageless[12]:
            newlist.append(buckets[12]) 
        elif ages[i] < ageless[13]:
            newlist.append(buckets[13]) 
        elif ages[i] < ageless[14]:
            newlist.append(buckets[14])
        elif ages[i] < ageless[15]:
            newlist.append(buckets[15])
        elif ages[i] < ageless[16]:
            newlist.append(buckets[16])
        elif ages[i] < ageless[17]:
            newlist.append(buckets[17])
        elif ages[i] < ageless[18]:
            newlist.append(buckets[18])
        elif ages[i] < ageless[19]:
            newlist.append(buckets[19]) 
        else:
            newlist.append('100+')
    return newlist

In [17]:
df_train.age = agebuckets(df_train.age)
df_test.age = agebuckets(df_test.age)

In [18]:
df_train.date_account_created = pd.to_datetime(df_train.date_account_created)
df_train.timestamp_first_active = pd.to_datetime(df_train.timestamp_first_active, format = "%Y%m%d%H%M%S")
df_train.date_first_booking = pd.to_datetime(df_train.date_first_booking)
df_test.timestamp_first_active = pd.to_datetime(df_test.timestamp_first_active, format = "%Y%m%d%H%M%S")
df_test.date_account_created = pd.to_datetime(df_test.date_account_created)

In [19]:
def timedif(L1, L2):
    timediflist = []
    for i in range(len(L1)):
        try:
            if (L1[i]-L2[i]).days < 0:#datetime.timedelta(days=0):
                timediflist.append('before')
            elif (L1[i]-L2[i]).days ==0: #datetime.timedelta(days=1):
                timediflist.append('same day')
            else:
                timediflist.append('greater 1 day')
        except:
            timediflist.append('NB')
            
    return timediflist
    

In [20]:
#df_train.date_first_booking - df_train.date_account_created

In [21]:
#adding time lag columns
df_train['first_book_lag'] = timedif(df_train.date_first_booking, df_train.date_account_created)
df_train['account_active_lag'] = timedif(df_train.date_first_booking, df_train.timestamp_first_active)
df_train['account_created_lag'] = timedif(df_train.date_account_created, df_train.timestamp_first_active)

In [22]:
df_test['first_book_lag'] = "NAN"
df_test['account_created_lag'] = timedif(df_test.date_account_created, df_test.timestamp_first_active)
df_test['account_active_lag'] = "NAN"

In [23]:
#df_test['account_active_lag']

In [24]:
def bookings(L1, L2, L3, L4):
    timediflist = []
    for i in range(len(L1)):
        if L1[i] == 'same day' or L2[i] == 'same day':
            timediflist.append('early')
        elif L1[i] == 'before' and L2[i] == 'before' and L3[i] == 'same day':
            timediflist.append('early')
        elif L1[i] == 'greater 1 day' and L2[i] == 'greater 1 day':
            timediflist.append('waited')
        elif L1[i] == 'greater 1 day' and L2[i] == 'before':
            timediflist.append('waited')
        elif L1[i] == 'before' and L2[i] == 'greater 1 day':
            timediflist.append('waited')
        elif L1[i] == 'before' and L2[i] == 'before' and L3[i] == 'greater 1 day':
            timediflist.append('waited')
        elif (len(L4) > 0 ) and L4[i] == 'NDF':
            timediflist.append('NB')
        else:
            timediflist.append('NA')

            
    return timediflist

In [25]:
booking = bookings(df_train.first_book_lag, df_train.account_active_lag, df_train.account_created_lag, target)
test_booking = bookings(df_test.first_book_lag, df_test.account_active_lag, df_test.account_created_lag, [])

In [26]:
df_train['bookings'] = booking


In [27]:
df_test["bookings"] = test_booking

In [28]:
#df_test.drop(["account_created_lag", "first_book_lag", "account_active_lag", "bookings"], axis=1, inplace=True)

In [29]:
#given the train data gender, age, and country_desination produce the corresponding population in thousands
countries = pd.read_csv('input/countries.csv')
user_demo = pd.read_csv('input/age_gender_bkts.csv')
#df_train.gender = df_train.gender.astype(str)

In [30]:
population_in_thous = []
for i in range(df_train.shape[0]):
    #print(type(df_train.loc[i, "gender"]))
    if target[i] == 'NDF':
        population_in_thous.append(-1) # NB = -1    
    elif df_train.gender[i] == 'NA' or df_train.age[i] == 'NA' or df_train.loc[i, "gender"].lower() == 'nan': 
        population_in_thous.append(-2) # NA = -2
    elif df_train.gender[i] == 'OTHER':
        population_in_thous.append(0)  
    elif target[i] == 'other':
        gendersi = user_demo.loc[user_demo.gender == df_train.loc[i, "gender"].lower(),:] 
        ages = gendersi.loc[gendersi.age_bucket == df_train.age[i], :]
        ages = list(map(lambda x: float(x), ages.population_in_thousands))
        population_in_thous.append(np.mean(ages))
    else:
        #print(df_train.loc[i, "gender"].lower())
        genders = user_demo.loc[user_demo.gender == df_train.loc[i, "gender"].lower(),:] 
        dests = genders.loc[genders.country_destination == df_train.country_destination[i] ,:]    
        #print ((dests.loc[dests.age_bucket == df_train.age[i], 'population_in_thousands']))
        population_in_thous.append(float((dests.loc[dests.age_bucket == df_train.age[i], 'population_in_thousands'])))
    #break
population_in_thous[0:10]

[-1, -1, 11264.0, 2458.8000000000002, -2, -2, 10659.0, 10659.0, 11413.0, -2]

In [31]:
df_tt_non_ohe = population_in_thous

In [32]:
#merging gender age bucket with train data
df_train['population_in_thousands'] = population_in_thous
df_test["population_in_thousands"] = -2 # NA

In [33]:
train_m = pd.merge(df_train, group_sessions, left_on='id', right_on ='id', how='left')
train_m = train_m.drop('id', 1)

In [34]:
#merging with grouped sessions and countries, **note most of training data is not in sessions. see below 

test_m = pd.merge(df_test, group_sessions, left_on='id', right_on ='id', how='left')
test_m = test_m.drop('id', 1)

#print (train_m.iloc[0:5, 0:10]) #to be deleted?
#print (train_m.iloc[0:5, 10:])  # to be deleted?

In [35]:
train_m.head()

Unnamed: 0,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,...,first_device_type,first_browser,country_destination,first_book_lag,account_active_lag,account_created_lag,bookings,population_in_thousands,sum_secs_elapsed,counts
0,2010-06-28,2009-03-19 04:32:55,NaT,NAN,0-4,facebook,0,en,direct,direct,...,Mac Desktop,Chrome,NDF,NB,NB,greater 1 day,NB,-1.0,,
1,2011-05-25,2009-05-23 17:48:09,NaT,MALE,35-39,facebook,0,en,seo,google,...,Mac Desktop,Chrome,NDF,NB,NB,greater 1 day,NB,-1.0,,
2,2010-09-28,2009-06-09 23:12:47,2010-08-02,FEMALE,55-59,basic,3,en,direct,direct,...,Windows Desktop,IE,US,before,greater 1 day,greater 1 day,waited,11264.0,,
3,2011-12-05,2009-10-31 06:01:29,2012-09-08,FEMALE,40-44,facebook,0,en,direct,direct,...,Mac Desktop,Firefox,other,greater 1 day,greater 1 day,greater 1 day,waited,2458.8,,
4,2010-09-14,2009-12-08 06:11:05,2010-02-18,NAN,40-44,basic,0,en,direct,direct,...,Mac Desktop,Chrome,US,before,greater 1 day,greater 1 day,waited,-2.0,,


In [36]:
toremove = ['date_account_created', 'timestamp_first_active', 'date_first_booking', 
              'country_destination'] 
train_m.drop(toremove, axis=1, inplace=True)

In [37]:
toremove = ['date_account_created', 'timestamp_first_active', 'date_first_booking'] 
test_m.drop(toremove, axis=1, inplace=True)

In [38]:
train_m

Unnamed: 0,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,first_book_lag,account_active_lag,account_created_lag,bookings,population_in_thousands,sum_secs_elapsed,counts
0,NAN,0-4,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,NB,NB,greater 1 day,NB,-1.0,,
1,MALE,35-39,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome,NB,NB,greater 1 day,NB,-1.0,,
2,FEMALE,55-59,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE,before,greater 1 day,greater 1 day,waited,11264.0,,
3,FEMALE,40-44,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox,greater 1 day,greater 1 day,greater 1 day,waited,2458.8,,
4,NAN,40-44,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,before,greater 1 day,greater 1 day,waited,-2.0,,
5,NAN,0-4,basic,0,en,other,other,omg,Web,Mac Desktop,Chrome,greater 1 day,same day,before,early,-2.0,,
6,FEMALE,45-49,basic,0,en,other,craigslist,untracked,Web,Mac Desktop,Safari,greater 1 day,greater 1 day,before,waited,10659.0,,
7,FEMALE,45-49,basic,0,en,direct,direct,omg,Web,Mac Desktop,Safari,greater 1 day,greater 1 day,before,waited,10659.0,,
8,FEMALE,50-54,basic,0,en,other,craigslist,untracked,Web,Mac Desktop,Safari,greater 1 day,greater 1 day,before,waited,11413.0,,
9,NAN,45-49,basic,0,en,other,craigslist,omg,Web,Mac Desktop,Firefox,same day,before,before,early,-2.0,,


In [39]:
print(train_m.head())

   gender    age signup_method  signup_flow language affiliate_channel  \
0     NAN    0-4      facebook            0       en            direct   
1    MALE  35-39      facebook            0       en               seo   
2  FEMALE  55-59         basic            3       en            direct   
3  FEMALE  40-44      facebook            0       en            direct   
4     NAN  40-44         basic            0       en            direct   

  affiliate_provider first_affiliate_tracked signup_app first_device_type  \
0             direct               untracked        Web       Mac Desktop   
1             google               untracked        Web       Mac Desktop   
2             direct               untracked        Web   Windows Desktop   
3             direct               untracked        Web       Mac Desktop   
4             direct               untracked        Web       Mac Desktop   

  first_browser first_book_lag account_active_lag account_created_lag  \
0        Chrome    

In [40]:
print(test_m.head())

   gender    age signup_method  signup_flow language affiliate_channel  \
0  FEMALE  35-39      facebook            0       en            direct   
1     NAN     NA         basic            0       en            direct   
2     NAN     NA         basic            0       en            direct   
3     NAN     NA         basic            0       en            direct   
4     NAN     NA         basic            0       en            direct   

  affiliate_provider first_affiliate_tracked signup_app first_device_type  \
0             direct               untracked      Moweb            iPhone   
1             direct               untracked      Moweb            iPhone   
2             direct                  linked        Web   Windows Desktop   
3             direct                  linked        Web   Windows Desktop   
4             direct               untracked        Web       Mac Desktop   

   first_browser first_book_lag account_created_lag account_active_lag  \
0  Mobile Safari  

In [41]:
train_m.population_in_thousands.replace("NB", -1, inplace=True)
train_m.population_in_thousands.replace("NA", -2, inplace=True)
test_m.population_in_thousands.replace("NB", -1, inplace=True)
test_m.population_in_thousands.replace("NA", -2, inplace=True)

In [42]:
test_m["first_book_lag"] = "NA"
test_m["account_active_lag"] = "NA"

In [43]:
print(train_m.head())
train_m.first_affiliate_tracked = train_m.first_affiliate_tracked.astype(str)

   gender    age signup_method  signup_flow language affiliate_channel  \
0     NAN    0-4      facebook            0       en            direct   
1    MALE  35-39      facebook            0       en               seo   
2  FEMALE  55-59         basic            3       en            direct   
3  FEMALE  40-44      facebook            0       en            direct   
4     NAN  40-44         basic            0       en            direct   

  affiliate_provider first_affiliate_tracked signup_app first_device_type  \
0             direct               untracked        Web       Mac Desktop   
1             google               untracked        Web       Mac Desktop   
2             direct               untracked        Web   Windows Desktop   
3             direct               untracked        Web       Mac Desktop   
4             direct               untracked        Web       Mac Desktop   

  first_browser first_book_lag account_active_lag account_created_lag  \
0        Chrome    

In [44]:
train_m.sum_secs_elapsed.replace("nan", -2, inplace=True)
train_m.counts.replace("nan", -2, inplace=True)
test_m.sum_secs_elapsed.replace("nan", -2, inplace=True)
test_m.counts.replace("nan", -2, inplace=True)

In [45]:
test_m.first_affiliate_tracked = test_m.first_affiliate_tracked.astype(str)
test_m.gender = test_m.gender.astype(str)

In [46]:
le = LabelEncoder()
feature = le.fit_transform(test_m["gender"])
np.unique(feature)

array([0, 1, 2, 3])

In [47]:
# encode string input values as integers
features = []
for i in train_m:
    label_encoder = LabelEncoder()
    feature = label_encoder.fit_transform(train_m[i])
    features.append(feature)
encoded_x = np.array(features)

In [48]:
print(encoded_x.shape)

(18, 213451)


In [49]:
encoded_x = encoded_x.reshape(train_m.shape[0],train_m.shape[1])

In [50]:
#print(encoded_x[1:10, :])

In [51]:
# encode string input values as integers
test_features = []
for i in test_m:
    label_encoder = LabelEncoder()
    feature = label_encoder.fit_transform(test_m[i])
    test_features.append(feature)
encoded_test = np.array(test_features)

In [52]:
from sklearn.feature_extraction import DictVectorizer
#train_dict = train_m.T.to_dict()
#test_dict = test_m.T.to_dict()
encoded_test = encoded_test.reshape(test_m.shape[0], test_m.shape[1])
print(encoded_test.shape)

(62096, 18)


In [53]:
print(train_m.shape)
print(test_m.shape)

(213451, 18)
(62096, 18)


In [47]:
ohe_feats = ['gender', 'age', 'signup_method', 'signup_flow', 'language', 'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser', 'first_book_lag', 'account_active_lag', 'account_created_lag', 'bookings']
non_ohe_feats = ['population_in_thousands', 'sum_secs_elapsed', 'counts']
df_tt = train_m
df_tt_non_ohe = pd.DataFrame({'population_in_thousands' : [], 'sum_secs_elapsed' : [], 'counts': []})
df_tt_non_ohe['population_in_thousands'] = df_tt['population_in_thousands']
df_tt_non_ohe['sum_secs_elapsed'] = df_tt['sum_secs_elapsed']
df_tt_non_ohe['counts'] = df_tt['counts']
df_tt.drop(non_ohe_feats, axis=1, inplace=True)

Unnamed: 0,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,first_book_lag,account_active_lag,account_created_lag,bookings
0,NAN,0-4,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,NB,NB,greater 1 day,NB
1,MALE,35-39,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome,NB,NB,greater 1 day,NB
2,FEMALE,55-59,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE,before,greater 1 day,greater 1 day,waited
3,FEMALE,40-44,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox,greater 1 day,greater 1 day,greater 1 day,waited
4,NAN,40-44,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,before,greater 1 day,greater 1 day,waited
5,NAN,0-4,basic,0,en,other,other,omg,Web,Mac Desktop,Chrome,greater 1 day,same day,before,early
6,FEMALE,45-49,basic,0,en,other,craigslist,untracked,Web,Mac Desktop,Safari,greater 1 day,greater 1 day,before,waited
7,FEMALE,45-49,basic,0,en,direct,direct,omg,Web,Mac Desktop,Safari,greater 1 day,greater 1 day,before,waited
8,FEMALE,50-54,basic,0,en,other,craigslist,untracked,Web,Mac Desktop,Safari,greater 1 day,greater 1 day,before,waited
9,NAN,45-49,basic,0,en,other,craigslist,omg,Web,Mac Desktop,Firefox,same day,before,before,early


In [48]:
#One-hot-encoding features
for f in ohe_feats:
    df_tt_dummy = pd.get_dummies(df_tt[f], prefix=f)
    df_tt = df_tt.drop([f], axis=1)
    df_tt = pd.concat((df_tt, df_tt_dummy), axis=1)   
    

In [50]:
df_tt = pd.concat((df_tt, df_tt_non_ohe), axis=1)
#df_tt = pd.merge(df_tt, df_tt_non_ohe, how='left')


In [56]:
#from sklearn.preprocessing import OneHotEncoder
# encode string input values as integers
#encoded_ohe_x = None
#for i in train_m:
#    label_encoder = LabelEncoder()
#    feature = label_encoder.fit_transform(train_m[i])
#    feature = feature.reshape(train_m.shape[0], 1)
#    onehot_encoder = OneHotEncoder(sparse=False)
#    feature = onehot_encoder.fit_transform(feature)
#    if encoded_ohe_x is None:
#        encoded_ohe_x = feature
#    else:
#        encoded_ohe_x = np.concatenate((encoded_ohe_x, feature), axis=1)
#print("X shape: : ", encoded_ohe_x.shape)
df_tt.drop(non_ohe_feats, axis=1, inplace=True)
df_tt.head()
#df_tt._features

ValueError: labels ['population_in_thousands' 'sum_secs_elapsed' 'counts'] not contained in axis

In [55]:
#encoded_ohe_x_test = None
#for i in test_m:
#    label_encoder = LabelEncoder()
#    feature = label_encoder.fit_transform(test_m[i])
#    feature = feature.reshape(test_m.shape[0], 1)
#    onehot_encoder = OneHotEncoder(sparse=False)
#    feature = onehot_encoder.fit_transform(feature)
#    if encoded_ohe_x_test is None:
#        encoded_ohe_x_test = feature
#    else:
#        encoded_ohe_x_test = np.concatenate((encoded_ohe_x_test, feature), axis=1)
#print("X shape: : ", encoded_ohe_x_test.shape)

In [256]:
#train_v = train_m.T.to_dict().values()
#test_v = test_m.T.to_dict().values()


In [257]:
#type(train_v)
#l = list(train_v)
#type(l)
#print(l[10])

In [198]:
#vec = DictVectorizer()
#train_v = vec.fit_transform(train_v)
#test_v = vec.fit_transform(test_v)

In [258]:
#print(train_v.shape)
#print(test_v.shape)
#print(test_v[1:10, :])

In [52]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(target)
train_y = le.transform(target)
print ((train_y))

[ 7  7 10 ...,  7  7  7]


In [57]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
import numpy as np
params = {}
params["objective"] = "multi:softmax"
params["num_class"] = 12
params["eta"] = 0.005
params["min_child_weight"] = 6
params["subsample"] = 0.7
params["colsample_bytree"] = 0.7
params["scale_pos_weight"] = 1
params["silent"] = 1
params["max_depth"] = 6
params['eval_metric'] = "ndcg@5"
params['nthread'] = 4

clf = XGBClassifier(max_depth=6, learning_rate=0.1, n_estimators=25,
                    objective='multi:softprob', subsample=0.5, colsample_bytree=0.5, nthread=2, base_score=0.8)  

plst = list(params.items())

#dtrain = xgb.DMatrix(train_v, label=train_y)
#dtest = xgb.DMatrix(test_v)
#num_round = 5
#model = xgb.train(plst, dtrain, num_round)
clf.fit(df_tt, train_y)

XGBClassifier(base_score=0.8, colsample_bylevel=1, colsample_bytree=0.5,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=None, n_estimators=25, nthread=2,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.5)

In [58]:
print(clf.feature_importances_)

[ 0.02622348  0.0280608   0.02204777  0.00434274  0.01904126  0.00334057
  0.0081844   0.01620177  0.01653583  0.017538    0.01102388  0.0105228
  0.00567897  0.          0.01035577  0.0093536   0.00601303  0.00718223
  0.00567897  0.00334057  0.00016703  0.00133623  0.00150326  0.          0.023384
  0.01469851  0.00200434  0.0151996   0.00283949  0.01369634  0.01453148
  0.          0.          0.00501086  0.00901954  0.          0.00417571
  0.          0.          0.          0.00133623  0.00300651  0.00450977
  0.005846    0.          0.          0.00033406  0.00517789  0.
  0.00985469  0.00484383  0.          0.01002171  0.          0.          0.
  0.          0.00618006  0.          0.00300651  0.00300651  0.
  0.00016703  0.0023384   0.00100217  0.00066811  0.          0.00033406
  0.00267246  0.00334057  0.00851846  0.01653583  0.01703691  0.0081844
  0.01152497  0.01954234  0.00534491  0.          0.00517789  0.00851846
  0.          0.00835143  0.00100217  0.00851846  0.004

In [63]:
from sklearn.metrics import log_loss, accuracy_score
print(train_y)

[ 7  7 10 ...,  7  7  7]


In [64]:
train_y_pred = clf.predict(df_tt)
print(train_y_pred)
#score = accuracy_score(train_y, df_tt)

[ 7  7 10 ...,  7  7  7]


In [97]:
score

0.58353439431063803

In [104]:
#print(train_v.shape)

(213451, 171)


In [105]:
#print(test_v.shape)

(62096, 136)


In [65]:
#pred = model.predict(dtest)
pred = clf.predict(encoded_test)

NameError: name 'encoded_test' is not defined

In [63]:
#pred = map(int,pred)

In [70]:
pred = le.inverse_transform(pred)

In [71]:
print (set(pred))

{'US', 'NDF'}


In [74]:
pred

array(['NDF', 'NDF', 'NDF', ..., 'NDF', 'NDF', 'NDF'], dtype=object)

In [75]:
y_pred = clf.predict_proba(encoded_test)  

In [77]:
#Taking the 5 classes with highest probabilities
ids = []  #list of ids
cts = []  #list of countries
for i in range(len(id_test)):
    idx = id_test[i]
    ids += [idx] * 5
    cts += le.inverse_transform(np.argsort(y_pred[i])[::-1])[:5].tolist()

In [80]:
#Generate submission
sub = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])
sub.to_csv('sub.csv',index=False)

In [67]:
clf.get_fscore()

AttributeError: 'XGBClassifier' object has no attribute 'get_fscore'