In [1]:
import numpy as np
import pandas as pd
import scipy
import scipy.sparse

from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

%matplotlib inline
from matplotlib import pyplot as plt

In [2]:
# Directory holding the competition CSVs; change this single constant to
# point the notebook at a different location.
DATA_DIR = '~/Downloads/habr_popularity_data'

# Labelled posts (features + targets) and the unlabelled posts to predict.
train_df = pd.read_csv(f'{DATA_DIR}/howpop_train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/howpop_test.csv')

In [3]:
train_df.head(1).T

Unnamed: 0,0
url,https://habrahabr.ru/post/18284/
domain,habrahabr.ru
post_id,18284
published,2008-01-01 18:19:00
author,@Tapac
flow,develop
polling,False
content_len,4305
title,Новогодний подарок блоггерам — WordPress 2.3.2
comments,0


In [4]:
train_df.describe(include='all')

Unnamed: 0,url,domain,post_id,published,author,flow,polling,content_len,title,comments,favs,views,votes_plus,votes_minus,views_lognorm,favs_lognorm,comments_lognorm
count,134137,134137,134137.0,134137,97657,97048,134137,134137.0,134137,134137.0,134137.0,134137.0,133566.0,133566.0,134137.0,134137.0,134137.0
unique,134137,2,,130291,22077,6,2,,133628,,,,,,,,
top,https://habrahabr.ru/company/symantec/blog/122...,habrahabr.ru,,2011-06-14 15:52:00,@alizar,develop,False,,"Интересные события, произошедшие в выходные",,,,,,,,
freq,1,97048,,39,5292,53318,129412,,56,,,,,,,,
mean,,,181307.054265,,,,,7516.929699,,39.625994,71.999866,16631.01,35.536888,8.050035,0.004849,0.003402,-0.003675
std,,,81766.350702,,,,,8724.77164,,62.398958,145.854135,31479.82,42.461073,11.398282,1.002158,1.002778,1.005259
min,,,18284.0,,,,,1.0,,0.0,0.0,3.0,0.0,0.0,-6.615254,-4.137662,-4.161967
25%,,,115565.0,,,,,2179.0,,7.0,8.0,2100.0,11.0,2.0,-0.706552,-0.69578,-0.633775
50%,,,182666.0,,,,,4949.0,,19.0,27.0,7600.0,22.0,5.0,-0.020934,0.007022,0.072082
75%,,,257401.0,,,,,9858.0,,48.0,78.0,18700.0,45.0,10.0,0.682689,0.708778,0.697072


In [26]:
train_df.head().T

Unnamed: 0,0,1,2,3,4
url,https://habrahabr.ru/post/18284/,https://habrahabr.ru/post/18285/,https://habrahabr.ru/post/18286/,https://habrahabr.ru/post/18291/,https://geektimes.ru/post/18294/
domain,habrahabr.ru,habrahabr.ru,habrahabr.ru,habrahabr.ru,geektimes.ru
post_id,18284,18285,18286,18291,18294
published,2008-01-01 18:19:00,2008-01-01 18:30:00,2008-01-01 18:34:00,2008-01-02 01:32:00,2008-01-02 14:34:00
author,@Tapac,@DezmASter,@DezmASter,@Taoorus,@dennydo
flow,develop,design,design,design,
polling,False,False,False,False,False
content_len,4305,7344,8431,5662,3706
title,Новогодний подарок блоггерам — WordPress 2.3.2,"Сумасшедшие яйца, или сервис для отслеживания ...","Сумасшедшие яйца, или сервис для отслеживания ...","Сглаживание шрифтов, и субпиксельная отрисовка",Почему мне не нравится iPhone
comments,0,1,47,102,230


In [6]:
# Cast the `flow` column to the 'string' dtype.
train_df['flow'] = train_df['flow'].astype('string')

In [7]:
# Cast the `domain` column to the 'string' dtype.
train_df['domain'] = train_df['domain'].astype('string')

In [8]:
# Cast the `url` column to the 'string' dtype.
train_df['url'] = train_df['url'].astype('string')

In [10]:
# Cast the `title` column to the 'string' dtype.
train_df['title'] = train_df['title'].astype('string')

In [11]:
# Cast the `author` column to the 'string' dtype.
train_df['author'] = train_df['author'].astype('string')

In [13]:
# Parse the publication timestamps so the `.dt` accessor can be used on them.
train_df['published'] = pd.to_datetime(train_df['published'])

In [14]:
train_df.dtypes

url                         string
domain                      string
post_id                      int64
published           datetime64[ns]
author                      string
flow                        string
polling                       bool
content_len                  int64
title                       string
comments                     int64
favs                         int64
views                        int64
votes_plus                 float64
votes_minus                float64
views_lognorm              float64
favs_lognorm               float64
comments_lognorm           float64
dtype: object

In [15]:
train_df.shape, test_df.shape

((134137, 17), (3990, 9))

In [23]:
# Calendar year of publication.
train_df['year'] = train_df['published'].dt.year

In [24]:
# Month of publication (1-12).
train_df['month'] = train_df['published'].dt.month

In [25]:
# Day of the month of publication.
train_df['day'] = train_df['published'].dt.day

In [27]:
# Hour of day of publication.
train_df['hour'] = train_df['published'].dt.hour

In [28]:
# Minute within the hour of publication.
train_df['minutes'] = train_df['published'].dt.minute

In [43]:
train_df.groupby(by='year').size().sort_values()

year
2008     7743
2009    10783
2010    13091
2011    15063
2013    15537
2012    15972
2014    16180
2016    16316
2015    23452
dtype: int64

In [44]:
# Raw columns used as model inputs.
features = ['author', 'flow', 'domain', 'title']

# Number of leading rows used for training: 70% of the frame, truncated to int.
train_size = int(len(train_df) * 0.7)

In [45]:
len(train_df), train_size

(134137, 93895)

In [46]:
# Input columns and the regression target (the `favs_lognorm` column).
X = train_df[features]
y = train_df['favs_lognorm']

In [47]:
# Same input columns, taken from the unlabelled frame.
X_test = test_df[features]

In [48]:
# Positional split without shuffling: the first `train_size` rows are used for
# training, the remaining rows for validation.
X_train = X.iloc[:train_size]
X_valid = X.iloc[train_size:]

y_train = y.iloc[:train_size]
y_valid = y.iloc[train_size:]

In [53]:
# Word-level TF-IDF over titles: 1- to 3-grams, keeping terms that appear in at
# least 3 documents and in at most 30% of them.
tfidf = TfidfVectorizer(
    ngram_range=(1, 3),
    min_df=3,
    max_df=0.3,
)

In [54]:
# Learn the vocabulary and IDF weights from the training titles only, and
# encode those titles as a sparse TF-IDF matrix.
X_train_title = tfidf.fit_transform(X_train['title'])

In [57]:
# Inspect the fitted term -> column-index mapping. The original cell had an
# empty subscript (`vocabulary_[]`), which is a SyntaxError. The mapping is
# large, so the notebook truncates the displayed output.
tfidf.vocabulary_

{'новогодний': 31497,
 'подарок': 36257,
 'блоггерам': 14241,
 'wordpress': 12172,
 'новогодний подарок': 31499,
 'сумасшедшие': 45476,
 'яйца': 50489,
 'или': 22661,
 'сервис': 42745,
 'для': 18811,
 'отслеживания': 34225,
 'посетителей': 37373,
 'или сервис': 22833,
 'сервис для': 42751,
 'для отслеживания': 19413,
 'или сервис для': 22834,
 'сервис для отслеживания': 42753,
 'сглаживание': 42460,
 'шрифтов': 49862,
 'отрисовка': 34214,
 'сглаживание шрифтов': 42461,
 'почему': 37632,
 'мне': 28231,
 'не': 30755,
 'нравится': 31925,
 'iphone': 5951,
 'почему мне': 37659,
 'мне не': 28233,
 'не нравится': 30851,
 'всеобщая': 16456,
 'доступность': 20285,
 'действии': 18272,
 'чем': 49165,
 'суть': 45504,
 'проекта': 39310,
 'ru': 9541,
 'search': 9842,
 'запускается': 21584,
 'понедельник': 37201,
 'mobile': 7315,
 'выбор': 16759,
 'способа': 44575,
 'связи': 42437,
 'обзор': 32095,
 'под': 36147,
 'windows': 11973,
 'под windows': 36185,
 'онлайн': 33063,
 'банкинг': 13661,
 'оформле

In [58]:
# Encode validation titles with the vocabulary fitted on the training titles
# (transform only, no refit).
X_valid_title = tfidf.transform(X_valid['title'])

In [59]:
# Encode test titles with the vocabulary fitted on the training titles
# (transform only, no refit).
X_test_title = tfidf.transform(X_test['title'])

In [104]:
# Character-level TF-IDF over titles: 1- to 3-character n-grams, with the same
# document-frequency cut-offs as the word-level vectorizer.
vectorizer_title_ch = TfidfVectorizer(
    analyzer='char',
    ngram_range=(1, 3),
    min_df=3,
    max_df=0.3,
)

In [105]:
# Fit the character n-gram vocabulary on the training titles, then reuse it
# unchanged for the validation and test titles.
X_train_title_ch = vectorizer_title_ch.fit_transform(X_train['title'])
X_valid_title_ch, X_test_title_ch = (
    vectorizer_title_ch.transform(part['title']) for part in (X_valid, X_test)
)

In [61]:
# Categorical columns to be one-hot encoded with DictVectorizer (the free-text
# `title` column is handled separately by the TF-IDF vectorizers).
feats = ['author', 'flow', 'domain']

In [62]:
X_train[feats][:5]

Unnamed: 0,author,flow,domain
0,@Tapac,develop,habrahabr.ru
1,@DezmASter,design,habrahabr.ru
2,@DezmASter,design,habrahabr.ru
3,@Taoorus,design,habrahabr.ru
4,@dennydo,,geektimes.ru


In [63]:
X_train[feats][:5].fillna('-')

Unnamed: 0,author,flow,domain
0,@Tapac,develop,habrahabr.ru
1,@DezmASter,design,habrahabr.ru
2,@DezmASter,design,habrahabr.ru
3,@Taoorus,design,habrahabr.ru
4,@dennydo,-,geektimes.ru


In [64]:
X_train[feats][:5].fillna('-').T.to_dict()

{0: {'author': '@Tapac', 'flow': 'develop', 'domain': 'habrahabr.ru'},
 1: {'author': '@DezmASter', 'flow': 'design', 'domain': 'habrahabr.ru'},
 2: {'author': '@DezmASter', 'flow': 'design', 'domain': 'habrahabr.ru'},
 3: {'author': '@Taoorus', 'flow': 'design', 'domain': 'habrahabr.ru'},
 4: {'author': '@dennydo', 'flow': '-', 'domain': 'geektimes.ru'}}

In [65]:
X_train[feats][:5].fillna('-').T.to_dict().values()

dict_values([{'author': '@Tapac', 'flow': 'develop', 'domain': 'habrahabr.ru'}, {'author': '@DezmASter', 'flow': 'design', 'domain': 'habrahabr.ru'}, {'author': '@DezmASter', 'flow': 'design', 'domain': 'habrahabr.ru'}, {'author': '@Taoorus', 'flow': 'design', 'domain': 'habrahabr.ru'}, {'author': '@dennydo', 'flow': '-', 'domain': 'geektimes.ru'}])

In [66]:
dict_vect = DictVectorizer()

In [67]:
# Demonstration on the first five rows only: each row becomes a
# {column: value} dict (missing values replaced by '-') and DictVectorizer
# one-hot encodes every distinct column=value pair.
dict_vect_matrix = dict_vect.fit_transform(X_train[feats][:5].fillna('-').T.to_dict().values())
dict_vect_matrix

<5x9 sparse matrix of type '<class 'numpy.float64'>'
	with 15 stored elements in Compressed Sparse Row format>

In [68]:
dict_vect_matrix.toarray()

array([[0., 0., 1., 0., 0., 1., 0., 0., 1.],
       [1., 0., 0., 0., 0., 1., 0., 1., 0.],
       [1., 0., 0., 0., 0., 1., 0., 1., 0.],
       [0., 1., 0., 0., 0., 1., 0., 1., 0.],
       [0., 0., 0., 1., 1., 0., 1., 0., 0.]])

In [69]:
dict_vect_matrix.shape

(5, 9)

In [70]:
# Sanity check for the demo: the per-column distinct-value counts (after the
# '-' fill) should add up to the number of one-hot columns produced above.
for col in feats:
    head_filled = X_train[col].head(5).fillna('-')
    print(col, head_filled.nunique())

author 4
flow 3
domain 2


In [71]:
dict_vect.feature_names_

['author=@DezmASter',
 'author=@Taoorus',
 'author=@Tapac',
 'author=@dennydo',
 'domain=geektimes.ru',
 'domain=habrahabr.ru',
 'flow=-',
 'flow=design',
 'flow=develop']

In [106]:
# Fresh vectorizer for the real run; this replaces the instance that was
# fitted on the five-row demonstration above.
dict_vect = DictVectorizer()

In [107]:
# One-hot encode the categorical columns of the training rows.
# Each row becomes a {column: value} dict, in row order; NaN is replaced by '-'
# so that a missing value gets its own indicator column.
# to_dict('records') yields the same dicts as `.T.to_dict().values()` but does
# not transpose the whole frame and, because it is not keyed by index label,
# cannot silently drop rows if index labels ever repeat.
vectorizer_feats = dict_vect.fit_transform(X_train[feats].fillna('-').to_dict('records'))

In [108]:
vectorizer_feats.shape

(93895, 17869)

In [110]:
# The fitted training matrix, under the naming scheme used for the other splits.
X_train_feats = vectorizer_feats
# Encode validation rows with the mapping learned on the training rows.
# to_dict('records') gives one dict per row in row order without transposing
# the frame or depending on unique index labels.
X_valid_feats = dict_vect.transform(X_valid[feats].fillna('-').to_dict('records'))

In [111]:
# Encode test rows with the mapping learned on the training rows.
# to_dict('records') gives one dict per row in row order without transposing
# the frame or depending on unique index labels.
X_test_feats = dict_vect.transform(X_test[feats].fillna('-').to_dict('records'))

In [112]:
X_test_title.shape, X_test_feats.shape, X_test_title_ch.shape

((3990, 50624), (3990, 17869), (3990, 32839))

In [113]:
X_train_title.shape, X_train_feats.shape, X_train_title_ch.shape

((93895, 50624), (93895, 17869), (93895, 32839))

In [114]:
X_valid_title.shape, X_valid_feats.shape, X_valid_title_ch.shape

((40242, 50624), (40242, 17869), (40242, 32839))

In [115]:
# Concatenate the three feature blocks column-wise. The block order
# (word TF-IDF, one-hot categoricals, char TF-IDF) is identical for every
# split so that column indices line up.
X_train_new = scipy.sparse.hstack((X_train_title, X_train_feats, X_train_title_ch))
X_valid_new = scipy.sparse.hstack((X_valid_title, X_valid_feats, X_valid_title_ch))
X_test_new = scipy.sparse.hstack((X_test_title, X_test_feats, X_test_title_ch))

In [116]:
X_train_new.shape, X_valid_new.shape, X_test_new.shape, 

((93895, 101332), (40242, 101332), (3990, 101332))

In [117]:
from sklearn.linear_model import Ridge

In [121]:
# L2-regularised linear regression with regularisation strength alpha=1.0.
model = Ridge(alpha=1.0, random_state=42)

In [122]:
# Fit on the training split only; %time reports how long the fit takes.
%time model.fit(X_train_new, y_train)

CPU times: user 35.2 s, sys: 1.13 s, total: 36.3 s
Wall time: 9.76 s


Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=42, solver='auto', tol=0.001)

In [123]:
# Score the Ridge model fitted above on both splits. The original cell called
# `model2`, a name that is never defined in this notebook, so it raised a
# NameError on a fresh kernel; the fitted estimator is `model`.
train_preds1 = model.predict(X_train_new)
valid_preds1 = model.predict(X_valid_new)

# The second printed label says "test", but the value is the MSE on the
# held-out validation split (X_valid_new / y_valid).
print('Ошибка на трейне',mean_squared_error(y_train, train_preds1))
print('Ошибка на тесте',mean_squared_error(y_valid, valid_preds1))

Ошибка на трейне 0.23647684453007084
Ошибка на тесте 0.6920209908459402


In [161]:
train_df.columns

Index(['url', 'domain', 'post_id', 'published', 'author', 'flow', 'polling',
       'content_len', 'title', 'comments', 'favs', 'views', 'votes_plus',
       'votes_minus', 'views_lognorm', 'favs_lognorm', 'comments_lognorm',
       'year', 'month', 'day', 'hour', 'minutes'],
      dtype='object')

In [176]:
# Stack the train and validation matrices back into one matrix, in the
# original row order (train rows first, then validation rows).
X_train_full = scipy.sparse.vstack((X_train_new, X_valid_new))

In [191]:
# Extra numeric columns appended to the sparse feature matrices below.
ext_columns = ['content_len']

In [192]:
# Append the extra numeric columns to the full training matrix.
# The DataFrame slice is converted to an explicit sparse block instead of being
# handed to hstack directly, so the result does not depend on how SciPy/NumPy
# coerce a DataFrame inside an object array.
# Rows are aligned by position: X_train_full must be in train_df row order.
# NOTE(review): content_len is on a raw scale (describe() shows values in the
# thousands) while the other columns are TF-IDF weights / 0-1 indicators;
# consider scaling it, identically for train and test.
X_train_ext = scipy.sparse.hstack(
    [X_train_full, scipy.sparse.csr_matrix(train_df[ext_columns].to_numpy())]
)

In [195]:
test_df.dtypes

url            object
domain         object
post_id         int64
published      object
author         object
flow           object
polling          bool
content_len     int64
title          object
dtype: object

In [196]:
# Append the extra numeric columns to the test matrix.
# The DataFrame slice is converted to an explicit sparse block instead of being
# handed to hstack directly, so the result does not depend on how SciPy/NumPy
# coerce a DataFrame inside an object array.
# Rows are aligned by position: X_test_new must be in test_df row order.
X_test_ext = scipy.sparse.hstack(
    [X_test_new, scipy.sparse.csr_matrix(test_df[ext_columns].to_numpy())]
)

In [198]:
X_train_full.shape

(134137, 101332)

In [199]:
X_train_ext.shape

(134137, 101333)

In [200]:
# Refit a fresh Ridge on every labelled row (train + validation) with the extra
# columns appended. Rebinding `model` replaces the estimator that was fitted on
# the training split earlier in the notebook.
model = Ridge(random_state=42)
%time model.fit(X_train_ext, y)

CPU times: user 1.67 s, sys: 194 ms, total: 1.86 s
Wall time: 1.23 s


Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=42, solver='auto', tol=0.001)

In [201]:
# In-sample error of the refitted model (no held-out rows remain at this point).
# NOTE(review): the MSE printed here (0.878) is far worse than the earlier
# model's training MSE (0.236) even though this model has one more feature;
# the unscaled content_len column is a likely cause - confirm.
train_preds = model.predict(X_train_ext)

print('Ошибка на трейне', mean_squared_error(y_true=y, y_pred=train_preds))

Ошибка на трейне 0.8777714717647073


In [203]:
# Predict the target for the unlabelled test rows with the refitted model.
test_preds = model.predict(X_test_ext)

In [206]:
# Start an empty frame; the submission columns are added in the next cells.
submission = pd.DataFrame()

In [207]:
# The post URL serves as the row identifier in the submission file.
submission['url'] = test_df['url']

In [208]:
# Predictions are assigned by position, so test_preds must be in test_df row order.
submission['favs_lognorm'] = test_preds

In [209]:
len(submission)

3990

In [210]:
# Write only the two data columns; index=False omits the DataFrame index.
submission.to_csv('/tmp/submission.csv', index=False)

In [211]:
! head '/tmp/submission.csv'

url,favs_lognorm
https://habrahabr.ru/post/314080/,0.5202672671131797
https://habrahabr.ru/company/plesk/blog/313732/,0.20302079116290517
https://habrahabr.ru/company/etagi/blog/314000/,2.6797803574344847
https://habrahabr.ru/company/knopka/blog/314030/,0.9070432843874726
https://geektimes.ru/company/audiomania/blog/282058/,0.09047060895642084
https://habrahabr.ru/company/pechkin/blog/314066/,0.2906599758284445
https://habrahabr.ru/company/first/blog/314106/,0.43123503106541095
https://habrahabr.ru/company/vasexperts/blog/313558/,0.11419341778868197
https://habrahabr.ru/post/314108/,0.20076733049369289


In [212]:
! kaggle competitions submit -c howpop-habrahabr-favs-lognorm -f /tmp/submission.csv -m next

100%|████████████████████████████████████████| 237k/237k [00:02<00:00, 91.0kB/s]
Successfully submitted to Прогноз популярности статьи на Хабре (old)