# Load

In [1]:
import pandas as pd
import numpy as np

In [2]:
import json

In [3]:
def read_tags(f):
    line = json.loads(f)
    tags = line["tags"]
    return (" ".join([tag.replace(" ", "_") for tag in tags]))

In [4]:
def read_id(f):
    line = json.loads(f)
    _id = line["_id"]
    return (_id)

In [5]:
%%time
with open("../data/train.json") as f:
    X_raw = np.array([read_tags(line) for line in f.readlines()])

CPU times: user 53.2 s, sys: 10.2 s, total: 1min 3s
Wall time: 1min 3s


In [6]:
Y = pd.read_csv("../data/train_target.csv").target.values

# CV

Хорошие результаты могут быть при min_df=6.309573444801939e-05

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=10000)

In [14]:
cv.fit(X_raw)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=10000, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [15]:
cv.vocabulary_

{'логин': 6797,
 'login': 2353,
 'сообщества': 8861,
 'интернет': 6135,
 'сообщество': 8862,
 'социальные': 8885,
 'сети': 8688,
 'бренд': 5163,
 'карма': 6264,
 'голосование': 5589,
 'iphone': 2012,
 'cisco': 708,
 'apple': 313,
 'судебное_разбирательство': 9064,
 'афера': 4955,
 'обман': 7380,
 'мошенничество': 7156,
 'nokia': 2841,
 'meego': 2495,
 'raspberry_pi': 3384,
 'умный_дом': 9376,
 'ios': 1997,
 'co': 758,
 'uk': 4191,
 'filesystem': 1386,
 'утилизация': 9435,
 'реклама': 8386,
 'hp': 1807,
 'картриджи': 6272,
 'hdr': 1749,
 'amazon_web_services': 241,
 'облачные_вычисления': 7372,
 'информационная_безопасность': 6157,
 'gps_мониторинг': 1659,
 'репозиторий': 8415,
 'linux': 2319,
 'rhel': 3486,
 'debian': 978,
 'сайт': 8551,
 'продажи': 8070,
 'маркетинг': 6876,
 'аккумулятор': 4793,
 'батарея': 5009,
 'суд': 9063,
 'иск': 6176,
 'total_commander': 4103,
 'файловый_менеджер': 9461,
 'проводник': 8043,
 'клиентский_сервис': 6379,
 'конфликт': 6555,
 'студий': 9046,
 'ruward

In [16]:
# для собственных рассчетов
X_cv = cv.transform(X_raw)

In [32]:
%who

CountVectorizer	 LinearRegression	 X_raw	 X_test	 X_test_raw	 X_train	 X_train_raw	 Y	 Y_test	 
Y_train	 cv	 f	 json	 np	 pd	 pickle	 read_id	 read_tags	 
reg	 train_test_split	 


In [35]:
del X_train_raw

# Add feature

In [17]:
import pickle

In [18]:
X_img_train = pickle.load(open("../features/imgtagcount_train.pickle", "rb"))

In [21]:
X = np.concatenate([X_cv.toarray(), X_img_train], axis=1)

In [30]:
del X_cv
del X_img_train

# Split

In [22]:
from sklearn.model_selection import train_test_split

In [23]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, random_state=0)

In [28]:
del X

# Modeling

In [24]:
from sklearn.linear_model import LinearRegression

In [25]:
reg = LinearRegression(fit_intercept=True)

In [None]:
reg.fit(X_train, Y_train)

In [None]:
Y_prediction = reg.predict(X_test)

# Evaluate

In [None]:
from sklearn.metrics import mean_absolute_error

In [None]:
mean_absolute_error(Y_test, Y_prediction)

# Validation

In [17]:
%%time
with open("../data/test.json") as f:
    V_raw = np.array([read_tags(line) for line in f.readlines()])

CPU times: user 25.2 s, sys: 4.56 s, total: 29.7 s
Wall time: 36.3 s


In [78]:
%%time
V_result = pd.read_json("../data/test.json", lines=True)[["_id"]].rename(columns={"_id": "url"})

CPU times: user 15.6 s, sys: 2.99 s, total: 18.6 s
Wall time: 18.6 s


In [18]:
V = cv.transform(V_raw)

In [None]:
# Для передачи коллегам
pickle.dump(V, open("../features/tags_cv10k_test.pickle", "wb"))

In [80]:
V_result["target"] = reg.predict(V)

In [85]:
output_name = "../results/tags_cv10k_linreg.csv"

In [86]:
V_result.to_csv(output_name, index=False)

In [88]:
!head $output_name

url,target
https://geektimes.ru/post/87455/,2.768101213216765
https://geektimes.ru/post/87452/,3.3519307618257663
https://geektimes.ru/post/87459/,3.4621333717370235
https://habrahabr.ru/post/87461/,2.646380031783634
https://habrahabr.ru/post/5754/,1.790165538470128
https://geektimes.ru/post/87460/,2.4988841297807527
https://geektimes.ru/post/87462/,2.5311992680679003
https://habrahabr.ru/post/87467/,2.4729260789303855
https://habrahabr.ru/post/87464/,2.896418513861229
