In [3]:
import surprise
import numpy as np
import pandas as pd

from surprise import BaselineOnly
from surprise import Dataset
from surprise import Reader
from surprise import SVD
from surprise import SVDpp
from surprise import KNNWithMeans
from surprise import SlopeOne
from surprise.model_selection import cross_validate
from tqdm import tqdm_notebook, tqdm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from surprise import accuracy
from surprise.model_selection import KFold
from multiprocessing import Pool
tqdm = tqdm_notebook

In [4]:
# this is hack, but who cares, use script as default folder
# otherwise the pickle doens't work
import sys
sys.path.insert(0, 'script')

import dataset as d
import models as m
import submission as s

In [None]:
# - try implicit library

In [None]:
# I have to convert the 
def deal_line(line):
    pos, rating = line.split(',')
    row, col = pos.split("_")
    row = row.replace("r", "")
    col = col.replace("c", "")
    return int(row), int(col), float(rating)

def read_txt(path):
    """read text file from path."""
    with open(path, "r") as f:
        return f.read().splitlines()
    
file_path = 'data/data_train.csv'
lines = read_txt(file_path)[1:]
data = [deal_line(line) for line in lines]

#shuffle(data)
#data = data[:100_000]

with open('data/kiru.csv', 'w') as f:
    for item in data:
        f.write("{},{},{}\n".format(item[0], item[1], item[2]))

# path to dataset file
file_path = 'data/kiru.csv'
reader = Reader(line_format='user item rating', sep=',')
data = Dataset.load_from_file(file_path, reader=reader)

In [None]:
def call_algo(i):
    trainset, testset = i
    # train and test algorithm.
    algo = SlopeOne()
    algo.fit(trainset)
    predictions = algo.test(testset)
    # Compute and print Root Mean Squared Error
    rmse = accuracy.rmse(predictions, verbose=True)
    return (algo, rmse)
        
def predict():
    with Pool(10) as p:
        # define a cross-validation iterator
        kf = KFold(n_splits=3)
        return p.map(call_algo, kf.split(data))

In [None]:
%%time
predictions = predict()
best_model_slope_one = min(predictions, key=lambda b: b[1])

In [None]:
def create_predictions(predictor):
    lines = read_txt('data/sample_submission.csv')[1:]
    data = [deal_line(line) for line in lines]

    predictions = []
    for i, each in tqdm(enumerate(data)):
        pred1 = predictor.predict(str(each[0]), str(each[1])).est
        predictions.append((each[2], int(round(pred1))))

    return predictions

In [None]:
predictions = create_predictions(best_model_slope_one[0])

In [None]:
from random import shuffle
predictor = best_model_slope_one[0]

def do_predict(predictor):
    lines = read_txt('data/data_train.csv')[1:]
    data = [deal_line(line) for line in lines]
    shuffle(data)

    predictions = []
    for i, each in tqdm(enumerate(data), total=len(data)):
        pred = predictor.predict(str(each[0]), str(each[1])).est
        real = each[2]
        # [predictions, real]
        predictions.append([int(round(pred)), int(real)])
        #if i == 10: break

    predictions = np.array(predictions)
    return predictions

In [None]:
predictions = do_predict(predictor)

In [None]:
# correct category 3
#    predictions
#    rating = 5

def plot(predictions):
    data = np.zeros((5, 5), dtype=int)
    for rating in range(1, 6):
        correct = len(predictions[predictions[:, 1] == rating])

        #np.where(predictions[:, 1] == predictions[:, 0])
        #print(correct)
        p = []
        for i in range(1, 6):
            x = len(predictions[(predictions[:, 1] == rating) & (predictions[:, 0] == i)])
            data[(rating - 1, i - 1)] = x

    data = np.nan_to_num(data/data.sum(axis=1, keepdims=True))

    x = pd.DataFrame(data=data, index=range(1, 6), columns=range(1, 6))
    x.plot.bar(stacked=True)
    
plot(predictions)

In [None]:
def call_algo_svd(i):
    trainset, testset = i
    print("Run one split")
    # train and test algorithm.
    algo = SVDpp()
    algo.fit(trainset)
    predictions = algo.test(testset)
    # Compute and print Root Mean Squared Error
    rmse = accuracy.rmse(predictions, verbose=True)
    return (algo, rmse)
        
def predict_svd():
    with Pool(10) as p:
        # define a cross-validation iterator
        kf = KFold(n_splits=10)
        return p.map(call_algo_svd, kf.split(data))

In [None]:
%%time
predictions_svd = predict_svd()
best_model_svd = min(predictions_svd, key=lambda b: b[1])

In [None]:
x = do_predict(best_model_svd[0])

Lets train a linear regression to find the best coefficient

In [None]:
#plot(x)
#plot(predictions)
#plot(x)
error = np.sqrt(mean_squared_error(x[:, 0], x[:, 1]))
error

In [None]:
def combine(first, second):
    kf = KFold(n_splits=3)
    weights = []
    for trainset, testset in tqdm(kf.split(data), desc="split"):
        print("Do one split")
        input_for_linear = []
        for each in tqdm(trainset.all_ratings(), desc="Process rating of split"):
            pred1 = first.predict(str(each[0]), str(each[1])).est
            pred2 = second.predict(str(each[0]), str(each[1])).est
            output = [pred1, pred2]
            input_for_linear.append(output)
        
        y = list(map(lambda x: x[2], trainset.all_ratings()))
        eg = LinearRegression().fit(input_for_linear, y)
        
        predictions = []
        c = eg.coef_
        for i, b in enumerate(input_for_linear):
           # p = b[0] * c[0] + b[1] + c[1]
            p = b[0] * 0.5 + b[1] + 0.5
            predictions.append(p)
            
        error = np.sqrt(mean_squared_error(y, predictions))
        weights.append((error, c))
        
    return min(weights, key=lambda b: b[0])

In [None]:
best_weights = combine(best_model_slope_one[0], best_model_svd[0])

In [None]:
print(best_weights)

In [None]:
print(best_weights)

In [None]:
def create_submission_file(first, second, w1, w2):
    lines = read_txt('data/sample_submission.csv')[1:]
    data = [deal_line(line) for line in lines]

    predictions = []
    for each in tqdm(data):
        pred1 = first.predict(str(each[0]), str(each[1])).est
        pred2 = second.predict(str(each[0]), str(each[1])).est
        predictions.append((each[0], each[1], pred1 * w1 + pred2 * w2 ))

    with open('data/our_pred.csv', 'w') as f:
        f.write("Id,Prediction\n")
        for item in tqdm(predictions):
            f.write("r{}_c{},{}\n".format(item[0], item[1], int(round(item[2]))))

In [None]:
create_submission_file(avd, algo)

In [None]:
accuracy.rmse(algo, verbose=True)

### Blending

1. Get two separate models and try to combine them and see if the perform good

In [15]:
with Pool(12) as p:
    models = ["SurpriseSlopeOneModel", "SurpriseSlopeOneModel"]
    result = m.cross_validates_one_by_one(p, models, path="data/data_surprise.csv", splits=3)

CV:   0%|          | 0/3 [00:00<?, ?it/s]

running CV


CV: 100%|██████████| 3/3 [00:41<00:00, 19.89s/it]


Surprise SlopeOne 1.001538058747933
Surprise SlopeOne 1.0021896490287314
Surprise SlopeOne 1.001336472970429


In [6]:
from scipy.optimize import minimize

In [17]:
print(result[0][0].rmse)
print(result[1][0].rmse)

1.001538058747933
1.0021896490287314


In [90]:
data = d.to_surprise_read('data/data_surprise.csv')
data.split(10)

In [95]:
first = list(data.raw_folds())[0]
data_to_predict = first[1][:1000]

In [122]:
# Get all predictions
def get_predictions(models, data_to_predict): 
    result = []
    for each_data in tqdm(data_to_predict):
        predictions = []
        for each_model in models:
            p = each_model.algo.predict(each_data[0], each_data[1]).est
            predictions.append(p)
        result.append(predictions)
    return result

models = [result[0][0], result[1][0]]
model_predictions = get_predictions(models, data_to_predict)

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

In [114]:
real = list(map(lambda x: x[2], data_to_predict))

In [123]:
w0 = [1 / len(models)] * len(models)

In [171]:
def calcualte_mean_square_error(weights, model_predictions):
    preds = []
    for i, pred in enumerate(model_predictions):
        mix_prediction = 0
        for i, w in enumerate(weights):
            mix_prediction += weights[i] * pred[i]
        preds.append(mix_prediction)
    preds = np.array(preds)
    preds = preds.clip(1, 5)

    mse = mean_squared_error(preds, real)
    return np.sqrt(mse)

In [173]:
result = minimize(fun=calcualte_mean_square_error, x0=w0,
                  args = (model_predictions),
                  options={'maxiter': 1000, 'disp': True})

Optimization terminated successfully.
         Current function value: 0.988783
         Iterations: 12
         Function evaluations: 60
         Gradient evaluations: 15


In [183]:
result.fun

0.9887825093824633

In [152]:
def predict_one(weights, models, one_data):
    print("ja")

In [163]:
all_predictions = get_predictions(models, data.raw_ratings)

HBox(children=(IntProgress(value=0, max=1176952), HTML(value='')))

In [170]:
predictions = all_predictions @ result.x

In [179]:
kf = KFold(n_splits=10)
for train, test in kf.split(data):
    print(test.)
    adsfas

[('5903', '794', 5.0), ('9088', '605', 5.0), ('8301', '311', 5.0), ('8353', '322', 5.0), ('2924', '768', 3.0), ('2001', '662', 5.0), ('7409', '235', 2.0), ('4781', '579', 4.0), ('6760', '418', 5.0), ('2182', '783', 3.0), ('7174', '326', 3.0), ('7302', '738', 4.0), ('8218', '278', 4.0), ('3650', '650', 5.0), ('8590', '739', 3.0), ('1666', '227', 3.0), ('7314', '310', 5.0), ('9089', '178', 5.0), ('6955', '599', 4.0), ('1382', '952', 3.0), ('8198', '650', 5.0), ('9830', '89', 3.0), ('3710', '786', 4.0), ('1760', '647', 4.0), ('2955', '533', 2.0), ('9778', '616', 5.0), ('517', '414', 5.0), ('5501', '367', 4.0), ('9847', '91', 3.0), ('709', '487', 4.0), ('8337', '213', 4.0), ('1239', '611', 5.0), ('414', '640', 5.0), ('959', '628', 5.0), ('3323', '486', 3.0), ('9314', '744', 3.0), ('3325', '171', 4.0), ('7024', '412', 5.0), ('8561', '759', 3.0), ('8510', '396', 4.0), ('724', '402', 4.0), ('6081', '367', 5.0), ('6700', '658', 5.0), ('7656', '898', 4.0), ('4481', '789', 3.0), ('9963', '109', 

NameError: name 'adsfas' is not defined