In [74]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import cross_val_score, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from hyperopt import hp
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [75]:
urls_train_df = pd.read_csv('url_domain_train', header=None, delimiter='\t')
urls_train_df.columns = ['id', 'url', 'count']
urls_train_df = urls_train_df[['id', 'url']]

In [76]:
urls_train_df.head()

Unnamed: 0,id,url
0,000000014B60815F65B38258011B6C01,login.rutracker.org
1,000000014B60815F65B38258011B6C01,rutracker.org
2,000000014C03DA2A47AC433A0C755201,admin.tour-spb.net
3,000000014C03DA2A47AC433A0C755201,czinfo.ru
4,000000014C03DA2A47AC433A0C755201,forumsostav.ru


In [77]:
urls_train_df = pd.DataFrame(urls_train_df.groupby('id')['url'].apply(lambda x: x.tolist()))
urls_train_df['id'] = urls_train_df.index
urls_train_df.index = range(len(urls_train_df))
urls_train_df.columns = ['urls', 'id']

In [78]:
urls_train_df.head()

Unnamed: 0,urls,id
0,"[id.rambler.ru, mail.rambler.ru, r0.ru]",000000013CB5719C0000A2C90002C101
1,"[1prime.ru, autorambler.ru, chellak.ru, docs.c...",00000001442BE24000001B7D00F50801
2,[bosch-korolev.ru],00000001448580F800003F1B31FB0901
3,"[aptekanizkihcen.ua, colady.ru, gorod.dp.ua, i...",0000000145BDB2FF000157971645E901
4,"[astrorok.ru, diets.ru, edaplus.info, eshzdoro...",000000014602771F0000DB9359714C01


In [79]:
age_train_df = pd.read_csv('age_profile_train', header=None, delimiter='\t')
age_train_df.columns = ['id', 'age']

In [80]:
age_train_df.head()

Unnamed: 0,id,age
0,000000013CB5719C0000A2C90002C101,53
1,00000001442BE24000001B7D00F50801,48
2,00000001448580F800003F1B31FB0901,28
3,0000000145BDB2FF000157971645E901,44
4,000000014602771F0000DB9359714C01,48


In [81]:
train_df = urls_train_df.merge(age_train_df, how='left', on='id')

In [82]:
train_df.head()

Unnamed: 0,urls,id,age
0,"[id.rambler.ru, mail.rambler.ru, r0.ru]",000000013CB5719C0000A2C90002C101,53
1,"[1prime.ru, autorambler.ru, chellak.ru, docs.c...",00000001442BE24000001B7D00F50801,48
2,[bosch-korolev.ru],00000001448580F800003F1B31FB0901,28
3,"[aptekanizkihcen.ua, colady.ru, gorod.dp.ua, i...",0000000145BDB2FF000157971645E901,44
4,"[astrorok.ru, diets.ru, edaplus.info, eshzdoro...",000000014602771F0000DB9359714C01,48


In [83]:
train_df.describe()

Unnamed: 0,age
count,118603.0
mean,36.041331
std,12.58278
min,0.0
25%,28.0
50%,34.0
75%,43.0
max,99.0


In [84]:
topk = 300000
X_train, y_train = train_df.urls.values[:topk], train_df.age.values[:topk]
print(X_train[:3])

[['id.rambler.ru', 'mail.rambler.ru', 'r0.ru']
 ['1prime.ru', 'autorambler.ru', 'chellak.ru', 'docs.cntd.ru', 'echo.msk.ru', 'expert.ru', 'finance.rambler.ru', 'forbes.ru', 'forum.ixbt.com', 'garant.ru', 'govoritmoskva.ru', 'kommersant.ru', 'kp.ru', 'lenta.ru', 'mait.ru', 'metronews.ru', 'mk.ru', 'news.rambler.ru', 'news.smi2.ru', 'norm-load.ru', 'pfr.kirov.ru', 'pfrf.ru', 'photography-on-the.ru', 'realty.rambler.ru', 'ren.tv', 'riafan.ru', 'rns.online', 'rossbanki.ru', 'secretmag.ru', 'tehnorma.ru', 'tiu.ru', 'top68.ru', 'tvc.ru', 'tvzvezda.ru', 'vesti.ru', 'video.rambler.ru', 'weekend.rambler.ru']
 ['bosch-korolev.ru']]


In [85]:
X_train = map(lambda x: ' '.join(x), X_train)

In [86]:
vectorizer = TfidfVectorizer(sublinear_tf=True)
X_train = vectorizer.fit_transform(X_train)

In [87]:
X_grid_train, X_grid_test, y_grid_train, y_grid_test = train_test_split(X_train, y_train,
                                                                        test_size=0.2, 
                                                                        random_state=0)

In [93]:
import xgboost as xgb

In [94]:
def rmse(x, y):
    return np.mean((x - y) ** 2) ** 0.5

In [95]:
def score(params):
    print("Training with params: ")
    print(params)
    num_round = int(params['n_estimators'])
    del params['n_estimators']
    dtrain = xgb.DMatrix(X_grid_train, label=y_grid_train)
    dvalid = xgb.DMatrix(X_grid_test, label=y_grid_test)
    model = xgb.train(params, dtrain, num_round)
    predictions = model.predict(dvalid).reshape((X_test.shape[0], 9))
    score = rmse(y_grid_test, predictions)
    print("\nScore {0}\n\n".format(score))
    return {'rmse': score, 'status': STATUS_OK}

In [98]:
def optimize(trials):
    space = {
             'n_estimators' : hp.quniform('n_estimators', 100, 1000, 1),
             'eta' : hp.quniform('eta', 0.025, 0.5, 0.025),
             'max_depth' : hp.quniform('max_depth', 1, 13, 1),
             'min_child_weight' : hp.quniform('min_child_weight', 1, 6, 1),
             'subsample' : hp.quniform('subsample', 0.5, 1, 0.05),
             'gamma' : hp.quniform('gamma', 0.5, 1, 0.05),
             'colsample_bytree' : hp.quniform('colsample_bytree', 0.5, 1, 0.05),
             'objective': 'multi:softprob',
             'nthread' : 6,
             'silent' : 1
             }
    best = fmin(score, space, algo=tpe.suggest, trials=trials, max_evals=250)
    print(best)

In [99]:
trials = Trials()
optimize(trials)

Training with params: 
{'subsample': 0.9, 'nthread': 6, 'colsample_bytree': 0.8500000000000001, 'silent': 1, 'min_child_weight': 3.0, 'gamma': 1.0, 'max_depth': 4.0, 'objective': 'multi:softprob', 'n_estimators': 103.0, 'eta': 0.30000000000000004}


XGBoostError: b'value 0for Parameter num_class should be greater equal to 1'

In [49]:
#xgb_reg = XGBRegressor(max_depth=10, learning_rate=0.05, n_estimators=120)
#xgb_reg.fit(X_train, y_train)
#xgb_score = cross_val_score(xgb_reg, X_train, y_train)
#print(xgb_score.mean())

In [50]:
#y_pred = xgb_reg.predict(X_train)
#print(rmse(y_train, y_pred))

In [51]:
#bag_reg = BaggingRegressor(DecisionTreeRegressor(max_depth=12), n_estimators=20)
#bag_reg.fit(X_train, y_train)
#bag_score = cross_val_score(bag_reg, X_train, y_train)

In [52]:
#print(bag_score.mean())

In [53]:
#y_pred = bag_reg.predict(X_train)
#print(rmse(y_train, y_pred))

### Test

In [55]:
urls_test_df = pd.read_csv('url_domain_test', header=None, delimiter='\t')
urls_test_df.columns = ['id', 'url', 'count']
urls_test_df = urls_test_df[['id', 'url']]

In [56]:
urls_test_df = pd.DataFrame(urls_test_df.groupby('id')['url'].apply(lambda x: x.tolist()))
urls_test_df['id'] = urls_test_df.index
urls_test_df.index = range(len(urls_test_df))
urls_test_df.columns = ['urls', 'id']

In [57]:
urls_test_df.head()

Unnamed: 0,urls,id
0,"[1000bankov.ru, 1tv.ru, 4put.ru, argumenti.ru,...",000000014A02348E701552980349FF01
1,"[autorambler.ru, bilettorg.ru, dsol-druzhba.ru...",000000014A10EA183BF8594A0B2AB201
2,"[photosight.ru, rambler.ru]",000000014A4FE5C33A929D4C26943601
3,"[base.consultant.ru, dogovor-obrazets.ru, fd.r...",000000014B7BB9957784A9BC0AC9F401
4,"[assessor.ru, audit-it.ru, base.garant.ru, com...",000000014C7749F896D82C2B01E8B801


In [58]:
X_test = urls_test_df.urls.values
X_test = map(lambda x: ' '.join(x), X_test)
X_test = vectorizer.transform(X_test)

In [59]:
X_test

<19974x101690 sparse matrix of type '<class 'numpy.float64'>'
	with 797522 stored elements in Compressed Sparse Row format>

In [60]:
y_pred = xgb_reg.predict(X_test)

In [61]:
urls_test_df['age'] = y_pred
urls_test_df = urls_test_df[['id', 'age']]
urls_test_df.columns = ['Id', 'age']

In [62]:
urls_test_df.head()

Unnamed: 0,Id,age
0,000000014A02348E701552980349FF01,45.244942
1,000000014A10EA183BF8594A0B2AB201,38.613697
2,000000014A4FE5C33A929D4C26943601,35.868603
3,000000014B7BB9957784A9BC0AC9F401,35.095245
4,000000014C7749F896D82C2B01E8B801,37.647022


In [63]:
random_sol = pd.read_csv('sample_submission.csv')
miss_idx = set(random_sol.Id.values) - set(urls_test_df.Id.values)
miss_df = pd.DataFrame(list(zip(list(miss_idx), np.ones(len(miss_idx)))))
miss_df.columns = ['Id', 'age']

In [64]:
urls_test_df = urls_test_df.append(miss_df, ignore_index=True)

In [65]:
urls_test_df.to_csv('solution.csv', index=False)

In [66]:
!wc -l solution.csv

19980 solution.csv
