Machine Learning Challenge #2 from HackerEarth

Data: Kickstarter Project Details

Target: Project will successfully get funded or not

Hardware Configuration: 8-core CPU, 16gb RAM

https://www.hackerearth.com/challenge/competitive/machine-learning-challenge-2/problems/

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot

## 1. Data Exploration

In [2]:
train = pd.read_csv('../../input/train.csv')
test = pd.read_csv('../../input/test.csv')

In [3]:
print(train.shape, test.shape)

(108129, 14) (63465, 12)


Sample Label: 0 - 73568 , 1 - 34561


In [5]:
train[:2]

Unnamed: 0,project_id,name,desc,goal,keywords,disable_communication,country,currency,deadline,state_changed_at,created_at,launched_at,backers_count,final_status
0,kkst1451568084,drawing for dollars,I like drawing pictures. and then i color them...,20.0,drawing-for-dollars,False,US,USD,1241333999,1241334017,1240600507,1240602723,3,1
1,kkst1474482071,Sponsor Dereck Blackburn (Lostwars) Artist in ...,"I, Dereck Blackburn will be taking upon an inc...",300.0,sponsor-dereck-blackburn-lostwars-artist-in-re...,False,US,USD,1242429000,1242432018,1240960224,1240975592,2,0


## 2. Data Preprocessing
LabelEncoder and OneHotEncoder

In [6]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [7]:
#merge train and test data for preprocessing
size_train = train.shape[0]
target = train.final_status.values
train.drop(['backers_count', 'final_status'], axis=1, inplace=True)
data = train.append(test, ignore_index=True)

In [8]:
#project_id - to int
data['project_id'] = data['project_id'].apply(lambda x: x[4:]).apply(int)

In [9]:
#disable_communication - label encode
le = {}
le['disable_communication'] = LabelEncoder()
data['disable_communication'] = le['disable_communication'].fit_transform(data['disable_communication'])

In [10]:
#country, currency - label & one hot encode
ohe = {}
for col in ['country', 'currency']:    
    #fit encoder
    le[col] = LabelEncoder()
    ohe[col] = OneHotEncoder(sparse=False)
    
    #process train data
    data[col] = le[col].fit_transform(data[col])
    features = ohe[col].fit_transform(data[col].values.reshape(-1,1))
    print(data.shape, ' + ', features.shape)
    data = pd.concat([data, pd.DataFrame(features, columns=[col+'_'+str(n) for n in range(features.shape[1])])], axis=1)
print(data.shape)

(171594, 12)  +  (171594, 21)
(171594, 33)  +  (171594, 13)
(171594, 46)


## 3. Feature Extraction

### 3.1 Date Features

In [11]:
from datetime import datetime
for col in ['deadline', 'state_changed_at', 'created_at', 'launched_at']:
    dt = data[col].apply(datetime.fromtimestamp)
    data[col+'_year'] = dt.apply(lambda x: x.year)
    data[col+'_month'] = dt.apply(lambda x: x.month)
    data[col+'_day'] = dt.apply(lambda x: x.day)
    data[col+'_hour'] = dt.apply(lambda x: x.hour)
    data[col+'_minute'] = dt.apply(lambda x: x.minute)
    data[col+'_second'] = dt.apply(lambda x: x.second)

In [12]:
data['ready_duration'] = data['launched_at'] - data['created_at']

In [13]:
data['run_duration'] = data['state_changed_at'] - data['launched_at']

In [14]:
data['goal_duration'] = data['deadline'] - data['launched_at']

In [15]:
data['goal_daily'] = np.round(data['goal'] / np.round((data['run_duration']/(60*60*24)), decimals=1), decimals=0).astype(int)

### 3.2 Text Features

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

In [17]:
period = data['launched_at_year'].apply(str) + data['launched_at_month'].apply(lambda x: str(x).zfill(2))
period_vectorizer = CountVectorizer()
period_result = period_vectorizer.fit_transform(period)

period_value = {}
for value in period.unique():
    period_value[value] = (period == value).sum()
    
data['launched_at_ym_same'] = period.apply(lambda x: period_value[x])

In [18]:
period = data['deadline_year'].apply(str) + data['deadline_month'].apply(lambda x: str(x).zfill(2))
period_vectorizer = CountVectorizer()
period_result = period_vectorizer.fit_transform(period)

period_value = {}
for value in period.unique():
    period_value[value] = (period == value).sum()
    
data['deadline_ym_same'] = period.apply(lambda x: period_value[x])

In [19]:
#text features
data['keywords'] = data['keywords'].apply(str).apply(lambda x: x.split('-'))

In [20]:
for col in ['name', 'desc', 'keywords']:
    data[col+"_len"] = data[col].apply(str).apply(len)
    data[col+"_count"] = data[col].apply(str).apply(lambda x: len(x.split(' ')))

### 3.3 Dimentionality Reduction Features

In [21]:
from sklearn.decomposition import PCA, FastICA, TruncatedSVD
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection

In [22]:
n_comp = 30

# tSVD
tsvd = TruncatedSVD(n_components=n_comp, random_state=420)
tsvd_results = tsvd.fit_transform(data.drop(['name', 'desc', 'keywords'], axis=1))

# PCA
pca = PCA(n_components=n_comp, random_state=420)
pca_results = pca.fit_transform(data.drop(['name', 'desc', 'keywords'], axis=1))

# ICA
ica = FastICA(n_components=n_comp, random_state=420)
ica_results = ica.fit_transform(data.drop(['name', 'desc', 'keywords'], axis=1))

# GRP
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
grp_results = grp.fit_transform(data.drop(['name', 'desc', 'keywords'], axis=1))

# SRP
srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420)
srp_results = srp.fit_transform(data.drop(["name", 'desc', 'keywords'], axis=1))

# Append decomposition components to datasets
for i in range(n_comp):
    data['pca_' + str(i)] = pca_results[:, i]
    data['ica_' + str(i)] = ica_results[:, i]
    data['tsvd_' + str(i)] = tsvd_results[:, i]
    data['grp_' + str(i)] = grp_results[:, i]
    data['srp_' + str(i)] = srp_results[:, i]



### 3.4 Bag of words

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation, SparsePCA
from sklearn.preprocessing import Normalizer
from sklearn.cluster import KMeans

#### 3.4.1 Term Vectorizer

In [24]:
vectorizer = {}

In [25]:
keywords = data['keywords'].apply(lambda x: ' '.join(x))
vectorizer['keywords'] = TfidfVectorizer(stop_words='english')#, ngram_range=(1,3))
keywords_result = vectorizer['keywords'].fit_transform(keywords)

In [26]:
vectorizer['desc'] = TfidfVectorizer(stop_words='english')#, ngram_range=(1,3))
desc_result = vectorizer['desc'].fit_transform(data['desc'].fillna(''))

In [27]:
print(keywords_result.shape, desc_result.shape)

(171594, 93428) (171594, 98509)


#### 3.4.2 Topic Extraction

In [28]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

##### 3.4.2.1 KMeans Clustering

In [29]:
# Fit KMeans Cluster model - keywords, desc
cluster_svd = {
    'keywords': TruncatedSVD(n_components=200),
    'desc': TruncatedSVD(n_components=200) #2gb ram
}
cluster_svd_result = {
    'keywords': cluster_svd['keywords'].fit_transform(keywords_result),
    'desc': cluster_svd['desc'].fit_transform(desc_result)
}

In [30]:
cluster_norm = {
    'keywords': Normalizer(copy=False),
    'desc': Normalizer(copy=False)
}
cluster_norm_result = {
    'keywords': cluster_norm['keywords'].fit_transform(cluster_svd_result['keywords']),
    'desc': cluster_norm['desc'].fit_transform(cluster_svd_result['desc'])
}
cluster = {
    'keywords': KMeans(n_clusters=40, init='k-means++', max_iter=300, n_init=10,
                        verbose=1, n_jobs=-1),
    'desc': KMeans(n_clusters=40, init='k-means++', max_iter=300, n_init=10,
                    verbose=1, n_jobs=-1)
}

In [31]:
cluster_result = {}
cluster_result['keywords'] = cluster['keywords'].fit_transform(cluster_norm_result['keywords'])

Initialization complete
Initialization complete
Initialization complete
Initialization complete
start iteration
done sorting
end inner loop
Initialization complete
start iteration
done sorting
end inner loop
Iteration 0, inertia 144895.291649
start iteration
done sorting
Initialization complete
Initialization complete
start iteration
done sorting
end inner loop
Iteration 0, inertia 140490.497083
start iteration
done sorting
Iteration 0, inertia 145930.341304
start iteration
done sorting
end inner loop
Initialization complete
start iteration
done sorting
Iteration 1, inertia 141020.140312
start iteration
done sorting
end inner loop
end inner loop
start iteration
done sorting
end inner loop
end inner loop
Iteration 0, inertia 139480.111322
start iteration
done sorting
Iteration 1, inertia 141264.554307
start iteration
done sorting
end inner loop
Iteration 1, inertia 136286.352311
start iteration
done sorting
Iteration 0, inertia 140402.993538
start iteration
done sorting
start iteration


In [32]:
cluster_result['desc'] = cluster['desc'].fit_transform(cluster_norm_result['desc'])

Initialization complete
Initialization complete
Initialization complete
Initialization complete
start iteration
done sorting
end inner loop
Initialization complete
Initialization complete
start iteration
done sorting
Iteration 0, inertia 143100.769886
start iteration
done sorting
end inner loop
start iteration
done sorting
end inner loop
Iteration 0, inertia 142222.841472
start iteration
done sorting
Iteration 0, inertia 153516.310235
start iteration
done sorting
Initialization complete
start iteration
done sorting
end inner loop
Initialization complete
Iteration 0, inertia 141191.757565
start iteration
done sorting
start iteration
done sorting
end inner loop
start iteration
done sorting
end inner loop
end inner loop
end inner loop
end inner loop
Iteration 0, inertia 141899.63999
start iteration
done sorting
Iteration 0, inertia 142520.989229
start iteration
done sorting
Iteration 1, inertia 138847.18686
start iteration
Iteration 1, inertia 139532.373779
start iteration
done sorting
do

In [None]:
print("Top terms per cluster:")

c = 'desc' #'keywords

original_space_centroids = cluster_svd[c].inverse_transform(cluster[c].cluster_centers_)
order_centroids = original_space_centroids.argsort()[:, ::-1]

terms = vectorizer[c].get_feature_names()
for i in range(40):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    print()

In [33]:
#append to data
data = pd.concat([data, pd.DataFrame(cluster_result['keywords'], 
                                     columns=['cluster_k_'+str(n) for n in range(cluster_result['keywords'].shape[1])])], axis=1)
data = pd.concat([data, pd.DataFrame(cluster_result['desc'], 
                                     columns=['cluster_d_'+str(n) for n in range(cluster_result['desc'].shape[1])])], axis=1)

##### 3.4.2.2 NMF Decomposition

In [34]:
# Fit the NMF model
nmf = {}
nmf_result = {}

nmf['keywords'] = NMF(n_components=40, random_state=420,
                      alpha=.1, l1_ratio=.5, verbose=1)
nmf_result['keywords'] = nmf['keywords'].fit_transform(keywords_result)

nmf['desc'] = NMF(n_components=40, random_state=420,
                  alpha=.1, l1_ratio=.5, verbose=1)
nmf_result['desc'] = nmf['desc'].fit_transform(desc_result)

violation: 1.0
violation: 0.07628744502010772
violation: 0.02796792595897201
violation: 0.013853457007658403
violation: 0.011206797012528377
violation: 0.010296703939492006
violation: 0.009626229365646187
violation: 0.00882863719805236
violation: 0.008108145586475822
violation: 0.007606511395876673
violation: 0.007196238257106202
violation: 0.0068236315498397294
violation: 0.006548920393628163
violation: 0.006301471038003135
violation: 0.006080140494728286
violation: 0.005897026036727793
violation: 0.005727583463702171
violation: 0.0055613770356515305
violation: 0.0053993777309359215
violation: 0.005242587206809459
violation: 0.005087930250434997
violation: 0.004881903718001168
violation: 0.00470313699300628
violation: 0.004549671410547331
violation: 0.004409055309736572
violation: 0.00427253378497044
violation: 0.004139099397355373
violation: 0.004008690916147793
violation: 0.0038810074242864928
violation: 0.0037560289700128845
violation: 0.003634038282808164
violation: 0.003514974762

In [35]:
#print_top_words(nmf['keywords'], vectorizer['keywords'].get_feature_names(), 100)

In [36]:
#append to data
data = pd.concat([data, pd.DataFrame(nmf_result['keywords'], 
                                     columns=['nmf_k_'+str(n) for n in range(nmf_result['keywords'].shape[1])])], axis=1)
data = pd.concat([data, pd.DataFrame(nmf_result['desc'], 
                                     columns=['nmf_d_'+str(n) for n in range(nmf_result['desc'].shape[1])])], axis=1)

##### 3.4.2.3 LDA Decomposition

In [32]:
#discarded because not well performing
# Fit the LDA model (batch_size affects speed, use more data can allocate cpu efficiently)
#lda = {}
#lda_result = {}

'''lda['keywords'] = LatentDirichletAllocation(n_topics=40, max_iter=10, max_doc_update_iter=100,
                                            learning_method='online', batch_size=keywords_result.shape[0],
                                            random_state=420, n_jobs=-1, verbose=1)
lda_result['keywords'] = lda['keywords'].fit_transform(keywords_result)'''

'''lda['desc'] = LatentDirichletAllocation(n_topics=40, max_iter=10, max_doc_update_iter=100,
                                        learning_method='online', batch_size=desc_result.shape[0],
                                        learning_offset=50.,
                                        random_state=420, n_jobs=-1, verbose=1)
lda_result['desc'] = lda['desc'].fit_transform(desc_result)'''

"lda['desc'] = LatentDirichletAllocation(n_topics=40, max_iter=10, max_doc_update_iter=100,\n                                        learning_method='online', batch_size=desc_result.shape[0],\n                                        learning_offset=50.,\n                                        random_state=420, n_jobs=-1, verbose=1)\nlda_result['desc'] = lda['desc'].fit_transform(desc_result)"

In [33]:
#print_top_words(lda['desc'], vectorizer['desc'].get_feature_names(), 100)

## 4. Data Preparation - Memory Enhanced Concat

In [37]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 171594 entries, 0 to 171593
Columns: 392 entries, project_id to nmf_d_39
dtypes: float64(345), int64(44), object(3)
memory usage: 513.2+ MB


In [38]:
#simple columns list
print(data.columns.values, data.columns.shape)

['project_id' 'name' 'desc' 'goal' 'keywords' 'disable_communication'
 'country' 'currency' 'deadline' 'state_changed_at' 'created_at'
 'launched_at' 'country_0' 'country_1' 'country_2' 'country_3' 'country_4'
 'country_5' 'country_6' 'country_7' 'country_8' 'country_9' 'country_10'
 'country_11' 'country_12' 'country_13' 'country_14' 'country_15'
 'country_16' 'country_17' 'country_18' 'country_19' 'country_20'
 'currency_0' 'currency_1' 'currency_2' 'currency_3' 'currency_4'
 'currency_5' 'currency_6' 'currency_7' 'currency_8' 'currency_9'
 'currency_10' 'currency_11' 'currency_12' 'deadline_year' 'deadline_month'
 'deadline_day' 'deadline_hour' 'deadline_minute' 'deadline_second'
 'state_changed_at_year' 'state_changed_at_month' 'state_changed_at_day'
 'state_changed_at_hour' 'state_changed_at_minute'
 'state_changed_at_second' 'created_at_year' 'created_at_month'
 'created_at_day' 'created_at_hour' 'created_at_minute' 'created_at_second'
 'launched_at_year' 'launched_at_month' 'lau

### 4.1 Without Bag-of-Words

In [39]:
#split train & testdata (if skip bag of words, for feature exploration)
data_train = data[:size_train].drop(['name', 'desc', 'keywords'], axis=1).values
data_test = data[size_train:].drop(['name', 'desc', 'keywords'], axis=1).values

### 4.2 With Bag-of-Words (Sparse)

In [40]:
#sparse matrix is GOD! use only 2gb vs > 30gb of dense array
from scipy import sparse

In [41]:
original = sparse.csr_matrix(data.drop(['name', 'desc', 'keywords'], axis=1).values)

In [42]:
concat = sparse.hstack([original, keywords_result, desc_result], format='csr')

In [43]:
#xgboost bug fix
data_final = sparse.hstack((concat, sparse.csr_matrix(np.ones((concat.shape[0], 1)))), format='csr')

In [44]:
#split train & testdata
data_train = data_final[:size_train]
data_test = data_final[size_train:]

In [45]:
print(data_train.shape, data_test.shape)

(108129, 192327) (63465, 192327)


### 4.3 Data Checkpoint

#### 4.3.1 To/From Without BoW

In [None]:
np.save('data_train_small', data_train)
np.save('data_test_small', data_test)

In [None]:
data_train = np.load('data_train_small')
data_test = np.load('data_test_small')

#### 4.3.2 To/From with Bow

In [None]:
#only in scipy 0.19.1
sparse.save_npz('data_train', data_train)
sparse.save_npz('data_test', data_test)

In [None]:
data_train = sparse.load_npz('data_train')
data_test = sparse.load_npz('data_test')

## 5. Model Building

### 5.1 XGBoost

In [45]:
import xgboost as xgb



#### 5.1.1 Model K-Fold Validation for initial exploration and performance checking

In [46]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

In [None]:
kf = StratifiedKFold(n_splits=2, shuffle=True, random_state=420)

models_xgb = {}
i = 0
for train_index, test_index in kf.split(data_train, target):
    X_train = data_train[train_index]
    X_val = data_train[test_index]
    Y_train = target[train_index]
    Y_val = target[test_index]
    
    models_xgb[i] = xgb.XGBClassifier(max_depth=10, learning_rate=0.03, n_estimators=300,
                                      subsample=0.8, colsample_bytree=0.8, 
                                      seed=420)
    models_xgb[i].fit(X_train, Y_train, eval_metric='auc',
                  eval_set=[(X_train, Y_train), (X_val, Y_val)])
    i += 1

In [49]:
sorted(zip(data.columns.values, models_xgb[0].feature_importances_), key=lambda x: x[1], reverse=True)

[('ica_10', 0.012222222),
 ('srp_25', 0.011336146),
 ('srp_23', 0.010042194),
 ('launched_at_hour', 0.0099156117),
 ('tsvd_26', 0.0095780594),
 ('srp_22', 0.0093811536),
 ('srp_19', 0.0093108295),
 ('name', 0.0089170178),
 ('ready_duration', 0.008804501),
 ('tsvd_23', 0.008720113),
 ('tsvd_25', 0.0086919833),
 ('tsvd_14', 0.0086779185),
 ('tsvd_18', 0.0086075952),
 ('srp_27', 0.0086075952),
 ('tsvd_15', 0.0085935304),
 ('tsvd_24', 0.0085935304),
 ('srp_17', 0.0085513359),
 ('tsvd_28', 0.0084950775),
 ('srp_28', 0.0084950775),
 ('tsvd_11', 0.0084810127),
 ('srp_26', 0.0082841069),
 ('srp_24', 0.008270042),
 ('srp_14', 0.0082559772),
 ('tsvd_21', 0.0082278484),
 ('tsvd_22', 0.0081856539),
 ('tsvd_20', 0.0081012659),
 ('tsvd_10', 0.008087201),
 ('srp_18', 0.0080168778),
 ('srp_20', 0.0079887481),
 ('tsvd_16', 0.0079043601),
 ('keywords_count', 0.0078621656),
 ('grp_6', 0.0078340368),
 ('srp_9', 0.0078059072),
 ('tsvd_12', 0.0078059072),
 ('tsvd_17', 0.0077637131),
 ('srp_11', 0.0077215191

#### 5.1.2 Find Stopping Round with more data

In [47]:
#2. find stopping round
split_index = int(data_train.shape[0]*0.8)
X_train = data_train[:split_index]
X_val = data_train[split_index:]
Y_train = target[:split_index]
Y_val = target[split_index:]

In [93]:
model_xgb1 = xgb.XGBClassifier(max_depth=6, learning_rate=0.05, n_estimators=20000,
                          subsample=0.8, colsample_bytree=0.9, reg_alpha=65,
                          seed=420)

In [94]:
model_xgb1.fit(X_train, Y_train, eval_metric='auc',
          eval_set=[(X_train, Y_train), (X_val, Y_val)],
          early_stopping_rounds=100)

[0]	validation_0-auc:0.679193	validation_1-auc:0.669174
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 100 rounds.
[1]	validation_0-auc:0.687744	validation_1-auc:0.681675
[2]	validation_0-auc:0.696008	validation_1-auc:0.710305
[3]	validation_0-auc:0.695712	validation_1-auc:0.709799
[4]	validation_0-auc:0.698807	validation_1-auc:0.714664
[5]	validation_0-auc:0.704029	validation_1-auc:0.728124
[6]	validation_0-auc:0.706403	validation_1-auc:0.730623
[7]	validation_0-auc:0.708739	validation_1-auc:0.734506
[8]	validation_0-auc:0.710669	validation_1-auc:0.736391
[9]	validation_0-auc:0.711384	validation_1-auc:0.737306
[10]	validation_0-auc:0.71321	validation_1-auc:0.742082
[11]	validation_0-auc:0.713863	validation_1-auc:0.741598
[12]	validation_0-auc:0.71569	validation_1-auc:0.744803
[13]	validation_0-auc:0.716992	validation_1-auc:0.746422
[14]	validation_0-auc:0.718315	validation_1-auc:0.749596

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.9,
       gamma=0, learning_rate=0.05, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=None, n_estimators=20000, nthread=-1,
       objective='binary:logistic', reg_alpha=65, reg_lambda=1,
       scale_pos_weight=1, seed=420, silent=True, subsample=0.8)

#### 5.1.3 Final Xgboost Model on All Data

In [48]:
model_xgb2 = xgb.XGBClassifier(max_depth=10, learning_rate=0.1, n_estimators=1150,
                          subsample=0.8, colsample_bytree=0.9,
                          seed=420)

In [49]:
model_xgb2.fit(data_train, target, eval_metric='auc',
          eval_set=[(data_train, target)])

[0]	validation_0-auc:0.761052
[1]	validation_0-auc:0.784596
[2]	validation_0-auc:0.797302
[3]	validation_0-auc:0.805312
[4]	validation_0-auc:0.81006
[5]	validation_0-auc:0.814009
[6]	validation_0-auc:0.817859
[7]	validation_0-auc:0.821996
[8]	validation_0-auc:0.82519
[9]	validation_0-auc:0.827748
[10]	validation_0-auc:0.82982
[11]	validation_0-auc:0.83268
[12]	validation_0-auc:0.835574
[13]	validation_0-auc:0.838171
[14]	validation_0-auc:0.840371
[15]	validation_0-auc:0.84295
[16]	validation_0-auc:0.844947
[17]	validation_0-auc:0.846641
[18]	validation_0-auc:0.848531
[19]	validation_0-auc:0.851262
[20]	validation_0-auc:0.852353
[21]	validation_0-auc:0.854488
[22]	validation_0-auc:0.856136
[23]	validation_0-auc:0.857878
[24]	validation_0-auc:0.859199
[25]	validation_0-auc:0.860519
[26]	validation_0-auc:0.861373
[27]	validation_0-auc:0.862569
[28]	validation_0-auc:0.864747
[29]	validation_0-auc:0.866955
[30]	validation_0-auc:0.868501
[31]	validation_0-auc:0.869277
[32]	validation_0-auc:0

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.9,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=1150, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=420, silent=True, subsample=0.8)

### 5.2 LightGBM Model

In [55]:
import lightgbm as lgb

In [61]:
from sklearn.grid_search import GridSearchCV

grid_model = lgb.LGBMClassifier(reg_alpha=65, max_depth=10, learning_rate=0.1,
                                num_leaves=60, colsample_bytree=0.9, min_child_weight=3,
                                boosting_type='dart', max_bin=255, n_estimators=600,
                                subsample_for_bin=50000, objective=None, min_split_gain=0, 
                                min_child_samples=10, subsample=0.8, 
                                subsample_freq=1, reg_lambda=0, 
                                seed=420)
grid_params = {
    'max_depth':[4,6,8,10],
    'learning_rate':[0.1,0.06,0.03,0.01,0.005,0.001],
}
grid_cv = list(StratifiedKFold(n_splits=2, shuffle=True, random_state=420).split(data_train, target))

grid = GridSearchCV(grid_model, grid_params, scoring='roc_auc',
                    cv=grid_cv, verbose=50)
grid.fit(data_train, target)

Fitting 2 folds for each of 24 candidates, totalling 48 fits
[CV] max_depth=4, learning_rate=0.1 ..................................
[CV] ......... max_depth=4, learning_rate=0.1, score=0.756491 -  32.3s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   32.3s remaining:    0.0s
[CV] max_depth=4, learning_rate=0.1 ..................................
[CV] ......... max_depth=4, learning_rate=0.1, score=0.761237 -  31.6s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.1min remaining:    0.0s
[CV] max_depth=6, learning_rate=0.1 ..................................
[CV] ......... max_depth=6, learning_rate=0.1, score=0.761346 -  44.6s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.8min remaining:    0.0s
[CV] max_depth=6, learning_rate=0.1 ..................................
[CV] ......... max_depth=6, learning_rate=0.1, score=0.765986 -  44.4s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  2.5min remaining:    0.0s
[CV] max_depth=8, learning_rate=0.1 ...............

GridSearchCV(cv=[(array([     0,      8, ..., 108125, 108128]), array([     1,      2, ..., 108126, 108127])), (array([     1,      2, ..., 108126, 108127]), array([     0,      8, ..., 108125, 108128]))],
       error_score='raise',
       estimator=LGBMClassifier(boosting_type='dart', colsample_bytree=0.9, learning_rate=0.1,
        max_bin=255, max_depth=-1, min_child_samples=10,
        min_child_weight=3, min_split_gain=0, n_estimators=600, nthread=-1,
        num_leaves=60, objective='binary', reg_alpha=65, reg_lambda=0,
        seed=420, silent=True, subsample=0.8, subsample_for_bin=50000,
        subsample_freq=1),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_depth': [4, 6, 8, 10], 'learning_rate': [0.1, 0.06, 0.03, 0.01, 0.005, 0.001]},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=50)

In [74]:
grid.best_params_

{'learning_rate': 0.1, 'max_depth': 10}

In [79]:
model_lgb = lgb.LGBMClassifier(reg_alpha=65, max_depth=10, learning_rate=0.1,
                                num_leaves=60, colsample_bytree=0.9, min_child_weight=3,
                                boosting_type='dart', max_bin=255, n_estimators=600,
                                subsample_for_bin=50000, objective=None, min_split_gain=0, 
                                min_child_samples=10, subsample=0.8, 
                                subsample_freq=1, reg_lambda=0, 
                                seed=420)

In [80]:
model_lgb.fit(data_train, target, eval_metric='auc',
              eval_set=[(data_train, target)],
              early_stopping_rounds=100)
#[(X_train, Y_train), (X_val, Y_val)], [(data_train, target)],

[1]	training's auc: 0.699149
Train until valid scores didn't improve in 100 rounds.
[2]	training's auc: 0.712931
[3]	training's auc: 0.722328
[4]	training's auc: 0.727372
[5]	training's auc: 0.733656
[6]	training's auc: 0.73656
[7]	training's auc: 0.737055
[8]	training's auc: 0.737388
[9]	training's auc: 0.740978
[10]	training's auc: 0.741549
[11]	training's auc: 0.742252
[12]	training's auc: 0.744078
[13]	training's auc: 0.745626
[14]	training's auc: 0.747047
[15]	training's auc: 0.747577
[16]	training's auc: 0.748059
[17]	training's auc: 0.749382
[18]	training's auc: 0.750266
[19]	training's auc: 0.751704
[20]	training's auc: 0.751358
[21]	training's auc: 0.75266
[22]	training's auc: 0.753064
[23]	training's auc: 0.753975
[24]	training's auc: 0.75527
[25]	training's auc: 0.755236
[26]	training's auc: 0.75529
[27]	training's auc: 0.755338
[28]	training's auc: 0.755899
[29]	training's auc: 0.755986
[30]	training's auc: 0.756413
[31]	training's auc: 0.757978
[32]	training's auc: 0.75794

LGBMClassifier(boosting_type='dart', colsample_bytree=0.9, learning_rate=0.1,
        max_bin=255, max_depth=10, min_child_samples=10,
        min_child_weight=3, min_split_gain=0, n_estimators=600, nthread=-1,
        num_leaves=60, objective='binary', reg_alpha=65, reg_lambda=0,
        seed=420, silent=True, subsample=0.8, subsample_for_bin=50000,
        subsample_freq=1)

## 6. Make Prediction

### 6.1 Single Model

In [71]:
Y_pred = model_lgb.predict(data_test)

### 6.2 Ensemble Models

In [50]:
Y_pred1 = model_xgb2.predict_proba(data_test)
#Y_pred1 = pd.read_csv('pred_xgb.csv').values

In [51]:
output1 = pd.DataFrame(Y_pred1)
output1.to_csv('pred_xgb.csv', index=False)

In [55]:
#Y_pred2 = model_lgb.predict_proba(data_test)
Y_pred2 = pd.read_csv('pred_lgb.csv').values

In [101]:
output2 = pd.DataFrame(Y_pred2)
output2.to_csv('pred_lgb.csv', index=False)

In [56]:
Y_pred = np.apply_along_axis(lambda x: 0 if x[0]>0.5 else 1, 1, 
                             ((Y_pred1 + Y_pred2)/2))

### 6.3 Save Predictions

In [57]:
output = pd.DataFrame({'project_id': test['project_id'], 'final_status': Y_pred})
output.to_csv('submission26.csv', index=False, columns=['project_id', 'final_status'])

## ANN

In [14]:
#a = data_train[10000:11000].todense()
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Embedding, LSTM
from keras.layers.advanced_activations import PReLU

In [17]:
data_ann = np.random.random((1, 10000))#data_train[:100].todense()
target_ann = np.random.randint(2, size=(1, 1))

In [21]:
model = Sequential()

In [22]:
model.add(Embedding(10000, output_dim=256))
model.add(LSTM(128))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

In [23]:
model.fit(data_ann, target_ann, epochs=1, verbose=2)

Epoch 1/1
21s - loss: 0.6918 - acc: 1.0000


<keras.callbacks.History at 0x7f35dd26acc0>

In [38]:
data_dim = 150000 #192327
timesteps = 8
num_classes = 2

# expected input data shape: (batch_size, timesteps, data_dim)
model = Sequential()
model.add(LSTM(32, return_sequences=True,
               input_shape=(timesteps, data_dim)))  # returns a sequence of vectors of dimension 32
model.add(LSTM(32, return_sequences=True))  # returns a sequence of vectors of dimension 32
model.add(LSTM(32))  # return a single vector of dimension 32
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

# Generate dummy training data
x_train = np.random.random((1000, timesteps, data_dim))
y_train = np.random.randint(num_classes, size=(1000, 1))

# Generate dummy validation data
x_val = np.random.random((100, timesteps, data_dim))
y_val = np.random.randint(num_classes, size=(100, 1))

model.fit(x_train, y_train,
          batch_size=64, epochs=5,
          validation_data=(x_val, y_val))

MemoryError: 