In [1]:
import pandas as pd
import numpy as np
import pylab as plt
import math
from IPython.display import display
from sklearn import preprocessing
import scipy.sparse as sp
from sklearn.feature_extraction.text import *
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error
%matplotlib inline

RAND = 228
MAX_N = 10**4
MAX_TEST = 1

In [2]:
test_cases = []
MEAN_SQRT = 0
for i in range(1, MAX_TEST + 1):
    line_to_floats = lambda x: \
        np.array(list(map(
            float, filter(
                lambda line: len(line) > 0, 
                x.split(' ')
            )
        )))
    
    inp = 'data/input00' + str(i) + '.txt'
    out = 'data/output00' + str(i) + '.txt'
    X = []
    y = []
    with open(inp, 'r') as file:
        for line in file:
            X.append(line_to_floats(line))
            
    with open(out, 'r') as file:
        for line in file:
            y.append(line_to_floats(line))
    y = y[1:]
    test_cases.append((np.array(X), np.array(y)))
    for p in y:
            MEAN_SQRT += p[0] * p[0] + p[1] * p[1]
print(MEAN_SQRT / MAX_TEST / MAX_N)

0.213046774231


In [3]:
def task_score(true, predict):
    serr = 0
    s = 0
    for t, p in zip(true, predict):
        er = t - p
        serr += er.dot(er)
        s += t.dot(t)
    return -10 * np.log10(serr / s) / 80 * 100

In [4]:
import random

def generate_seq(k, test=0):
    X_test, y_test = test_cases[test] #test_cases[random.randint(0, MAX_TEST - 1)]
    start = random.randint(0, MAX_N - 100)
    return X_test[start:start + k], y_test[start:start + k]
generate_seq(3)

(array([[ 3.88690436,  0.        ,  0.        ,  0.        ,  0.        ],
        [ 3.98679275,  0.        ,  0.        ,  0.        ,  0.        ],
        [ 3.70526474,  0.        , -0.        ,  0.        ,  0.        ]]),
 array([[-0.94828851,  0.        ],
        [-1.01183056,  0.        ],
        [-0.83274202,  0.        ]]))

In [5]:
def genearte_X_y(n, k):
    X, y = [], []
    for i in range(n):
        test, answer = generate_seq(k)
        X.append(test[:-1].ravel())
        y.append(answer[-1])
    return np.array(X), np.array(y)

In [6]:
X_data, y_data = genearte_X_y(1000, 20)
display(X_data.shape)

(1000, 95)

In [7]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn import preprocessing

X = preprocessing.StandardScaler().fit_transform(X_data) 
X = PolynomialFeatures(degree=2).fit_transform(X)
y = y_data
display(X.shape)
display(X)

(1000, 4656)

array([[  1.00000000e+00,  -1.34030621e-05,   0.00000000e+00, ...,
          0.00000000e+00,   0.00000000e+00,   6.07717042e-01],
       [  1.00000000e+00,   5.50761281e-01,   0.00000000e+00, ...,
          0.00000000e+00,  -0.00000000e+00,   1.64550265e+00],
       [  1.00000000e+00,   4.18815897e-01,   0.00000000e+00, ...,
          0.00000000e+00,   0.00000000e+00,   6.07717042e-01],
       ..., 
       [  1.00000000e+00,  -4.01396825e-01,   0.00000000e+00, ...,
          0.00000000e+00,  -0.00000000e+00,   1.64550265e+00],
       [  1.00000000e+00,   1.81805097e-01,   0.00000000e+00, ...,
          0.00000000e+00,  -0.00000000e+00,   1.64550265e+00],
       [  1.00000000e+00,   1.32729156e-02,   0.00000000e+00, ...,
          0.00000000e+00,   0.00000000e+00,   6.07717042e-01]])

In [8]:
from sklearn.ensemble import RandomForestRegressor

kfl = KFold(random_state=RAND, n_splits=3, shuffle=True)

regr = RandomForestRegressor(
    n_estimators=100, 
    max_depth=3, 
    random_state=RAND, 
    max_features='sqrt'
)

In [9]:
cvs_regr = cross_val_score(regr, X, y, cv=kfl, scoring='neg_mean_squared_error')
display(np.mean(cvs_regr))

-0.051310200065848056

In [10]:
regr.fit(X, y)
result = regr.predict(X)
display(task_score(y, result))
display(list(zip(y, result))[:5])

4.8760816096008828

[(array([-0.5222737,  0.       ]), array([-0.43507628,  0.        ])),
 (array([ 0.25083757,  0.        ]), array([ 0.19156976,  0.        ])),
 (array([-0.52096755,  0.        ]), array([-0.17251018,  0.        ])),
 (array([-0.30198171,  0.        ]), array([-0.21426432,  0.        ])),
 (array([-0.16317917,  0.        ]), array([-0.22633944,  0.        ]))]

In [11]:
from sklearn.svm import SVR

class multi_model:
    def __init__(self, model, *args, **param):
        self.X_m = model(*args, **param)
        self.Y_m = model(*args, **param)
    def fit(self, X, y):
        y_y, y_x = zip(*y)
        self.X_m.fit(X, y_x)
        self.Y_m.fit(X, y_y)
    def predict(self, X):
        return list(zip(self.X_m.predict(X), self.Y_m.predict(X)))
    
svr = multi_model(SVR, kernel='poly')

In [12]:
svr.fit(X, y)
result = svr.predict(X)
display(task_score(y, result))
display(list(zip(y, result))[:5])

-1.4980400162997467

[(array([-0.5222737,  0.       ]), (0.0, -0.16280916868240636)),
 (array([ 0.25083757,  0.        ]), (0.0, 0.091801212550093159)),
 (array([-0.52096755,  0.        ]), (0.0, -0.44093349813621158)),
 (array([-0.30198171,  0.        ]), (0.0, 0.033294650199177198)),
 (array([-0.16317917,  0.        ]), (0.0, 0.018253419783391813))]