# Load

In [1]:
import pandas as pd
import numpy as np

In [2]:
import json

In [3]:
def read_tags(f):
    """Turn one JSON record into a space-separated string of its tags.

    `f` is the JSON text of a single record (it is passed to `json.loads`),
    not a file object. Spaces inside a tag are replaced by underscores so
    each tag stays one whitespace-delimited token in the returned string.
    """
    record = json.loads(f)
    tokens = []
    for tag in record["tags"]:
        tokens.append(tag.replace(" ", "_"))
    return " ".join(tokens)

In [4]:
def read_id(f):
    """Return the "_id" field of one JSON record given as text in `f`."""
    return json.loads(f)["_id"]

In [5]:
%%time
# Build one tag string per record of the training file.
# Iterating the file object streams the lines instead of materialising them
# all first with readlines(), and the explicit encoding keeps decoding
# independent of the platform default.
with open("../data/train.json", encoding="utf-8") as f:
    X_raw = np.array([read_tags(line) for line in f])

CPU times: user 52.3 s, sys: 9.58 s, total: 1min 1s
Wall time: 1min 1s


In [6]:
# Regression targets: the `target` column of the CSV, taken via `.values`.
Y = pd.read_csv("../data/train_target.csv")["target"].values

# Split

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
# Hold out 10% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train_raw, X_test_raw, Y_train, Y_test = train_test_split(
    X_raw,
    Y,
    test_size=0.1,
    random_state=0,
)

# CV

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
from sklearn.linear_model import LinearRegression

In [11]:
from sklearn.metrics import mean_absolute_error

In [None]:
def tag_cv(min_df):
    """Score a bag-of-tags linear regression for a given `min_df`.

    Fits a CountVectorizer with the given `min_df` on the notebook-level
    `X_train_raw`, fits an intercept-free LinearRegression on the resulting
    counts against `Y_train`, and returns the mean absolute error of its
    predictions for `X_test_raw` measured against `Y_test`.
    """
    cv = CountVectorizer(min_df=min_df)
    # fit_transform learns the vocabulary and encodes the training rows in one call.
    X_train = cv.fit_transform(X_train_raw)
    # Held-out rows are encoded with the vocabulary learned from the training rows.
    X_test = cv.transform(X_test_raw)
    reg = LinearRegression(fit_intercept=False)
    reg.fit(X_train, Y_train)
    Y_prediction = reg.predict(X_test)
    return mean_absolute_error(Y_test, Y_prediction)

In [20]:
# Learn the tag vocabulary from the training rows only, with default settings.
# The fit call stays as the last expression so the fitted vectorizer is still
# shown as the cell output.
cv = CountVectorizer()
cv.fit(raw_documents=X_train_raw)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [21]:
# Tag-count matrix for the training rows, using the vocabulary fitted above.
X_train_cv = cv.transform(raw_documents=X_train_raw)

In [22]:
from sklearn.decomposition import TruncatedSVD

In [40]:
# Reduce the tag counts to 100 components; random_state is fixed for repeatable results.
svd = TruncatedSVD(
    n_components=100,
    n_iter=20,
    random_state=0,
)

In [41]:
# Fit the SVD on the training counts, then push the held-out rows through the
# same fitted vectorizer and the same fitted SVD.
X_train = svd.fit_transform(X=X_train_cv)
X_test = svd.transform(X=cv.transform(raw_documents=X_test_raw))

In [42]:
# Linear baseline on the SVD features. fit() returns the estimator itself,
# so construction and fitting are chained into one statement.
reg = LinearRegression(fit_intercept=True).fit(X_train, Y_train)
Y_prediction = reg.predict(X_test)
# Last expression: display the (truncated) prediction array as the cell output.
Y_prediction

array([ 4.48159493,  2.68519513,  3.36714689, ...,  2.78388504,
        1.93646228,  2.79259573])

In [43]:
# Mean absolute error of the current predictions on the held-out split.
mean_absolute_error(y_true=Y_test, y_pred=Y_prediction)

1.2754604391319777

In [44]:
from sklearn.ensemble import RandomForestRegressor

In [45]:
# Random forest on the same SVD features. random_state is pinned (as it is for
# the split and the SVD) so the reported error is repeatable across runs.
# Note: this rebinds `reg` and `Y_prediction`, replacing the linear model's values.
reg = RandomForestRegressor(n_estimators=10, max_depth=40, n_jobs=-1, random_state=0)
reg.fit(X_train, Y_train)
Y_prediction = reg.predict(X_test)
mean_absolute_error(Y_test, Y_prediction)

1.1297588488938077

# Validation: predict on test.json and write the results CSV

In [None]:
%%time
# Build one tag string per record of the test file.
# Iterating the file object streams the lines instead of materialising them
# all first with readlines(), and the explicit encoding keeps decoding
# independent of the platform default.
with open("../data/test.json", encoding="utf-8") as f:
    V_raw = np.array([read_tags(line) for line in f])

In [None]:
%%time
# Keep only the record ids from the test file, exposed under the column name "url".
V_result = (
    pd.read_json("../data/test.json", lines=True)[["_id"]]
    .rename(columns={"_id": "url"})
)

In [None]:
# `reg` was fitted on SVD-reduced features, so the test rows must go through
# the same fitted vectorizer AND the same fitted SVD before predicting; raw
# tag counts alone do not match the feature space the model was trained on.
V = svd.transform(cv.transform(V_raw))

In [None]:
# Attach the model's predictions for the test rows as the "target" column.
V_result = V_result.assign(target=reg.predict(V))

In [None]:
# Destination path of the results CSV.
# NOTE(review): the name says "cv10k_linreg", but when the cells run top to
# bottom `reg` is the RandomForestRegressor fitted on SVD features — confirm
# the file name matches the model actually used.
output_name = "../results/tags_cv10k_linreg.csv"

In [None]:
# Write the results frame to disk without the DataFrame index column.
V_result.to_csv(path_or_buf=output_name, index=False)

In [None]:
!head $output_name