In [1]:
import pandas
def read_data(path, columns, used=None):
    df = pandas.read_csv(path, header=None, delimiter='\t')
    df.columns = columns
    if used:
        df = df[used]
    return df
def groupby(df, column):
    df = pandas.DataFrame(df.groupby('id')[column].apply(lambda x: x.tolist()))
    df['id'] = df.index
    df.index = range(len(df))
    return df


In [2]:
urls_train = groupby(read_data('url_domain_train', ['id', 'url', '_'], ['id', 'url']), 'url')
titles_train = groupby(read_data('title_unify_train', ['id', 'title', '_'], ['id', 'title']), 'title')
age_train = read_data('age_profile_train', ['id', 'age'])

In [3]:
def merge_by_id(lst, how='left'):
    result = lst[0]
    for df in lst[1:]:
        result = result.merge(df, on='id', how=how)
    return result
titles_train = merge_by_id([titles_train, age_train])
urls_train = merge_by_id([urls_train, age_train])

In [4]:
titles_train.head()

Unnamed: 0,title,id,age
0,[бесплатный надёжный почта рамблер электронный...,000000013CB5719C0000A2C90002C101,53
1,[24-х 34-х до договор неделя новость предложит...,00000001442BE24000001B7D00F50801,48
2,"[авто бош контакт королёв сервис, авто бош кор...",00000001448580F800003F1B31FB0901,28
3,[ua втрать війни донбасі за на новини озвучить...,0000000145BDB2FF000157971645E901,44
4,"[black walnut грецкий орех чёрный, inmoment ru...",000000014602771F0000DB9359714C01,48


# word2vec

In [5]:
from gensim.models import word2vec
from tqdm import tqdm
WVModel = word2vec.Word2Vec.load_word2vec_format('ruscorpora_russe.model.bin', binary=True)

In [6]:
import numpy as np
def WVgetVector(s):
    try:
        s = s.decode('utf-8')
        return WVModel[s]
    except KeyError:
        return None

def WVgetMean(title):
    sum_vec = np.zeros(300)
    cnt = 0
    for word in title.split():
        vec = WVgetVector(word)
        if vec is not None:
            sum_vec += vec
            cnt += 1
    return sum_vec if cnt == 0 else sum_vec / cnt

In [7]:
mean = np.frompyfunc(lambda title: WVgetMean(title), 1, 1)
titles = titles_train.title.values
titles_vectors = []
for i in tqdm(xrange(titles.shape[0])):
    titles_vectors.append(np.mean(mean(titles[i]), axis=0))

100%|██████████| 114156/114156 [02:45<00:00, 690.59it/s]


In [8]:
titles_train['title'] = titles_vectors

In [9]:
titles_train.head(10)

Unnamed: 0,title,id,age
0,"[-0.0423593497835, -0.0231449574931, -0.012383...",000000013CB5719C0000A2C90002C101,53
1,"[-0.00724858875954, -0.0113910311609, -0.00123...",00000001442BE24000001B7D00F50801,48
2,"[-0.0715595599419, -0.0380741873135, -0.011201...",00000001448580F800003F1B31FB0901,28
3,"[-0.00796721526093, -0.0126409986358, -0.00674...",0000000145BDB2FF000157971645E901,44
4,"[-0.0021002106406, -0.0144115539994, 0.0070514...",000000014602771F0000DB9359714C01,48
5,"[-0.0263486382858, -0.022821068325, 0.00790173...",0000000147B2D6F311DB5C4201B7FB01,36
6,"[-0.0138793526888, -0.0185326298431, 0.0177503...",0000000147C68954150168D701A8B801,33
7,"[-0.042946446157, -0.017138306066, 0.004210277...",0000000147EB76D738CD80750C879701,41
8,"[-0.0369529514108, -0.00294468662543, 0.021808...",00000001482AAFB69FA5228008AC2A01,51
9,"[-0.0230195817195, 0.0161863335253, 0.01337973...",0000000148390BB56A6B22BB178D3901,32


In [10]:
X_train_titles = np.asarray(titles_train.title.values.tolist())
y_train_titles = titles_train.age.values

# RandomForestRegressor

In [11]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import cross_val_score
def tryRandomForest():
    randomForest = RandomForestRegressor(n_estimators=5, n_jobs=8)
    return cross_val_score(randomForest, X_train_titles, y_train_titles, scoring='mean_squared_error', verbose=True)
print(tryRandomForest())

[-177.98247481 -177.16638336 -164.74162027]


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.2min finished


In [57]:
randomForest = RandomForestRegressor(n_estimators=500, n_jobs=8, verbose=True)
randomForest.fit(X_train_titles, y_train_titles)

[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:  3.6min
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:  5.1min finished


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=50, n_jobs=8, oob_score=False, random_state=None,
           verbose=True, warm_start=False)

# TF-IDF

In [13]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

N_FEATURES = 900

to_string = lambda urls : ' '.join(map(lambda url: url.replace('.', ' '), urls))

urls_str = map(to_string, urls_train.url.values)
urls_hashed = HashingVectorizer(n_features=N_FEATURES).fit(urls_str).transform(urls_str).todense()

tfidf = TfidfTransformer()
X_train_urls, y_train_urls = tfidf.fit_transform(urls_hashed), urls_train.age.values

# LinearRegression

In [14]:
from sklearn.linear_model import LinearRegression
def tryLinearRegression():
    linearRegression = LinearRegression(n_jobs=8)
    return -cross_val_score(linearRegression, X_train_urls, y_train_urls, scoring='mean_squared_error', verbose=True)
tryLinearRegression()

[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.4s finished


array([ 147.56601435,  147.02812789,  131.19296535])

In [15]:
linearRegression = LinearRegression(n_jobs=8)
linearRegression.fit(X_train_urls, y_train_urls)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=8, normalize=False)

# Stacking

In [58]:
titles_test = titles_train[100000:]
urls_test = urls_train[100000:]

In [59]:
stacking = merge_by_id([titles_test, urls_test[['id', 'url']]], how='inner')

In [60]:
stacking.head()

Unnamed: 0,title,id,age,url
0,"[-0.014372407844, -0.00740332908178, 0.0212872...",061C5B62540224B100000751708F1301,30,"[airlines-inform.ru, avto-blogger.ru, champion..."
1,"[-0.043120008601, -0.0097937361085, 0.01620212...",061C5C4256F7CB9D0000055C06ACD801,41,"[aif.ru, happymodern.ru, kirillovka.3dn.ru, ma..."
2,"[0.00490492552159, -0.00767485422944, -0.00654...",061C5CD9567BB3CF0000054E46EBFE01,32,"[63.mchs.gov.ru, assessor.ru, base.garant.ru, ..."
3,"[-0.0228744970202, -0.000863650096662, 0.00609...",061C5DDC56E680DF0000054BA6199601,43,"[cheaton.ru, mail.rambler.ru, news.smi2.ru, ph..."
4,"[-0.0273688068458, -0.0176658553532, 0.0096148...",061C5E3F548EC0C300017BDA5247F801,27,"[33devici.ru, belgorod.hh.ru, belgorod.superjo..."


In [61]:
def getTitlesPrediction():
    X_train_titles = np.asarray(stacking.title.values.tolist())
    return randomForest.predict(X_train_titles)

In [62]:
def getUrlsPrediction():
    urls_str = map(to_string, stacking.url.values)
    hashingVectorizer = HashingVectorizer(n_features=N_FEATURES).fit(urls_str)
    urls_hashed = hashingVectorizer.transform(urls_str).todense()
    X_test_urls = tfidf.transform(urls_hashed)
    return linearRegression.predict(X_test_urls)

In [63]:
stacking['age_t'] = getTitlesPrediction()
stacking['age_u'] = getUrlsPrediction()
stacking = stacking[['id', 'age_t', 'age_u', 'age']]
stacking.columns = ['id', 'age_t', 'age_u', 'age']
stacking.head()

[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.1s finished


Unnamed: 0,id,age_t,age_u,age
0,061C5B62540224B100000751708F1301,31.8,32.904475,30
1,061C5C4256F7CB9D0000055C06ACD801,39.44,39.725408,41
2,061C5CD9567BB3CF0000054E46EBFE01,34.0,35.266306,32
3,061C5DDC56E680DF0000054BA6199601,37.94,42.309956,43
4,061C5E3F548EC0C300017BDA5247F801,28.34,29.724097,27


In [64]:
X_train_age, y_train_age = stacking[['age_t', 'age_u']].values, stacking['age'].values

In [65]:
stackingLR = LinearRegression(n_jobs=8)
stackingLR.fit(X_train_age, y_train_age)
print(-cross_val_score(stackingLR, X_train_age, y_train_age, scoring='mean_squared_error', verbose=True))



[ 14.09757125  12.02868     15.33608585]


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s finished


# Prediction on test data

In [149]:
def predictionOnTest():
    global test, urls_test, titles_test
    urls_test = groupby(read_data('url_domain_test', ['id', 'url', '_'], ['id', 'url']), 'url')
    titles_test = groupby(read_data('title_unify_test', ['id', 'title', '_'], ['id', 'title']), 'title')
    test = merge_by_id([titles_test, urls_test], how='inner')
    print(len(test))
    test_titles = test.title.values
    titles_vectors = []
    for i in tqdm(xrange(test_titles.shape[0])):
        titles_vectors.append(np.mean(mean(test_titles[i]), axis=0))
    y_pred_titles = randomForest.predict(titles_vectors)
    urls_str = map(to_string, test.url.values)
    urls_transformed = tfidf.transform(HashingVectorizer(n_features=N_FEATURES).fit(urls_str).transform(urls_str).todense())
    y_pred_urls = linearRegression.predict(urls_transformed)
    stacked_pred = np.hstack([y_pred_titles[:, None], y_pred_urls[:, None]])
    return stackingLR.predict(stacked_pred)


In [150]:
y_pred = predictionOnTest()

  0%|          | 30/19955 [00:00<01:15, 262.37it/s]

19955


100%|██████████| 19955/19955 [01:05<00:00, 306.99it/s]
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.1s finished


In [151]:
print(len(test), len(y_pred))

(19955, 19955)


In [152]:
test['age'] = y_pred
test = test[['id', 'age']]
test.columns = ['Id', 'age']


In [153]:
avg = np.mean(y_train_age)
print(avg)

34.0209142938


In [154]:
def getMissTitles():
    miss_title_ids = set(titles_test.id.values) - set(test.Id.values)
    miss_titles = titles_test[titles_test['id'].isin(miss_title_ids)].copy()
    miss_titles['age'] = avg
    miss_titles = miss_titles[['id', 'age']]
    miss_titles.columns = ['Id', 'age']
    return miss_titles

In [155]:
def getMissUrls():
    miss_url_ids = set(urls_test.id.values) - set(test.Id.values)
    miss_urls = urls_test[urls_test['id'].isin(miss_url_ids)].copy()
    miss_urls['age'] = avg
    miss_urls = miss_urls[['id', 'age']]
    miss_urls.columns = ['Id', 'age']
    return miss_urls

In [156]:
missTitles = getMissTitles()
missUrls = getMissUrls()

In [157]:
print(len(missTitles))

5


In [158]:
final_test = test.append(missTitles, ignore_index=True)
final_test = final_test.append(missUrls, ignore_index=True)
final_test.to_csv('answer.csv', index=False)