# MoodStream: Эксперименты 

Основная задача заключается в подборе модели с удовлетворительным процентом ошибок и приемлемым временем выполнения.

---

## Импорты

In [1]:
import pandas as pd
import numpy as np
import requests, zipfile, io
from sklearn.metrics.pairwise import cosine_similarity
import dotenv

from clearml import Dataset, Task, Logger


## ENV

In [2]:
# A .env file in the project root must define CLEARML_API_ACCESS_KEY and CLEARML_API_SECRET_KEY.
# The ClearML endpoints are set explicitly for this session.
%env CLEARML_WEB_HOST=https://app.clear.ml
%env CLEARML_API_HOST=https://api.clear.ml
%env CLEARML_FILES_HOST=https://files.clear.ml
    
# Load the dotenv IPython extension and run its magic
# (presumably this is what reads the .env file mentioned above - confirm).
%load_ext dotenv
%dotenv

# Seed passed as random_state to the sample() / train_test_split() calls in this notebook.
RANDOM_STATE=43

env: CLEARML_WEB_HOST=https://app.clear.ml
env: CLEARML_API_HOST=https://api.clear.ml
env: CLEARML_FILES_HOST=https://files.clear.ml


## Данные

### Фильмы

In [3]:
# Fetch the movies dataset from ClearML by id (only_completed=True, auto_create=False)
# and obtain a local copy of its files.
movies_dataset = Dataset.get(dataset_id='d83616f806ab4067aa7d79acb9e16f21', only_completed=True, auto_create=False)
movies_dataset_local_path = movies_dataset.get_local_copy()

# The three tab-separated tables are read in the order: movies, ratings, genres.
movies_data_df, movies_ratings_df, movies_genres_df = (
    pd.read_csv(f'{movies_dataset_local_path}/{csv_name}', sep='\t')
    for csv_name in ('movies.csv', 'ratings.csv', 'genres.csv')
)

In [4]:
def get_formatted_movies_ratings(ratings_df=None):
    """Return movie ratings re-keyed with sequential user and item ids.

    Args:
        ratings_df: Optional DataFrame with ``user_id``, ``movie_id`` and
            ``like`` columns. When omitted, the module-level
            ``movies_ratings_df`` is used. The input is never modified.

    Returns:
        A new DataFrame with the columns ``user_id`` (0-based id, numbered in
        order of first appearance), ``movie_id`` (original id), ``item_id``
        (0-based id, numbered in order of first appearance) and ``liked``
        (1.0 for True, 0.0 for False, NaN for any other ``like`` value).
    """
    ratings = (movies_ratings_df if ratings_df is None else ratings_df).copy()

    # Sequential user ids: number the unique users explicitly instead of
    # relying on DataFrame.apply handing back a one-column frame.
    unique_users = ratings['user_id'].unique()
    user_lookup = pd.DataFrame({
        'user_id': unique_users,
        'new_user_id': np.arange(len(unique_users), dtype='int64'),
    })
    ratings = ratings.merge(user_lookup, on='user_id', how='left')

    # Sequential item ids for movies, built the same way.
    unique_movies = ratings['movie_id'].unique()
    movie_lookup = pd.DataFrame({
        'movie_id': unique_movies,
        'item_id': np.arange(len(unique_movies), dtype='int64'),
    })
    ratings = ratings.merge(movie_lookup, on='movie_id', how='left')

    # Express the boolean rating as float64.
    ratings['liked'] = ratings['like'].map({True: 1.0, False: 0.0})

    # Keep only the needed columns; the sequential user id takes over the
    # 'user_id' name while the original movie id stays available.
    ratings = ratings[['new_user_id', 'movie_id', 'item_id', 'liked']]
    return ratings.rename(columns={'new_user_id': 'user_id'})

### Книги

In [5]:
# Fetch the books dataset from ClearML by id (only_completed=True, auto_create=False)
# and obtain a local copy of its files.
books_dataset = Dataset.get(dataset_id='3f5f71caa9ae4709b827c5dd88e7253a', only_completed=True, auto_create=False)
books_dataset_local_path = books_dataset.get_local_copy()

# The three tab-separated tables are read in the order: books, ratings, genres.
books_data_df, books_ratings_df, books_genres_df = (
    pd.read_csv(f'{books_dataset_local_path}/{csv_name}', sep='\t')
    for csv_name in ('books.csv', 'ratings.csv', 'genres.csv')
)

In [6]:
def get_formatted_books_ratings(ratings_df=None):
    """Return book ratings re-keyed with sequential user and item ids.

    Args:
        ratings_df: Optional DataFrame with ``user_id``, ``isbn`` and ``like``
            columns. When omitted, the module-level ``books_ratings_df`` is
            used. The input is never modified.

    Returns:
        A new DataFrame with the columns ``user_id`` (0-based id, numbered in
        order of first appearance), ``isbn`` (original id), ``item_id``
        (0-based id, numbered in order of first appearance) and ``liked``
        (1.0 for True, 0.0 for False, NaN for any other ``like`` value).
    """
    ratings = (books_ratings_df if ratings_df is None else ratings_df).copy()

    # Sequential user ids: number the unique users explicitly instead of
    # relying on DataFrame.apply handing back a one-column frame.
    unique_users = ratings['user_id'].unique()
    user_lookup = pd.DataFrame({
        'user_id': unique_users,
        'new_user_id': np.arange(len(unique_users), dtype='int64'),
    })
    ratings = ratings.merge(user_lookup, on='user_id', how='left')

    # Sequential item ids for books, built the same way.
    unique_books = ratings['isbn'].unique()
    book_lookup = pd.DataFrame({
        'isbn': unique_books,
        'item_id': np.arange(len(unique_books), dtype='int64'),
    })
    ratings = ratings.merge(book_lookup, on='isbn', how='left')

    # Express the boolean rating as float64.
    ratings['liked'] = ratings['like'].map({True: 1.0, False: 0.0})

    # Keep only the needed columns; the sequential user id takes over the
    # 'user_id' name while the original ISBN stays available.
    ratings = ratings[['new_user_id', 'isbn', 'item_id', 'liked']]
    return ratings.rename(columns={'new_user_id': 'user_id'})

### Музыка

In [7]:
# Fetch the tracks dataset from ClearML by id (only_completed=True, auto_create=False)
# and obtain a local copy of its files.
tracks_dataset = Dataset.get(dataset_id='dc72aebbbee64810a590a4c5bd82c19b', only_completed=True, auto_create=False)
tracks_dataset_local_path = tracks_dataset.get_local_copy()

# The three tab-separated tables are read in the order: tracks, ratings, genres.
tracks_data_df, tracks_ratings_df, tracks_genres_df = (
    pd.read_csv(f'{tracks_dataset_local_path}/{csv_name}', sep='\t')
    for csv_name in ('tracks.csv', 'ratings.csv', 'genres.csv')
)

In [8]:
def get_formatted_tracks_ratings(ratings_df=None):
    """Return track ratings re-keyed with sequential user and item ids.

    Args:
        ratings_df: Optional DataFrame with ``user_id``, ``track_id`` and
            ``like`` columns. When omitted, the module-level
            ``tracks_ratings_df`` is used. The input is never modified.

    Returns:
        A new DataFrame with the columns ``user_id`` (0-based id, numbered in
        order of first appearance), ``track_id`` (original id), ``item_id``
        (0-based id, numbered in order of first appearance) and ``liked``
        (1.0 for True, 0.0 for False, NaN for any other ``like`` value).
    """
    ratings = (tracks_ratings_df if ratings_df is None else ratings_df).copy()

    # Sequential user ids: number the unique users explicitly instead of
    # relying on DataFrame.apply handing back a one-column frame.
    unique_users = ratings['user_id'].unique()
    user_lookup = pd.DataFrame({
        'user_id': unique_users,
        'new_user_id': np.arange(len(unique_users), dtype='int64'),
    })
    ratings = ratings.merge(user_lookup, on='user_id', how='left')

    # Sequential item ids for tracks, built the same way.
    unique_tracks = ratings['track_id'].unique()
    track_lookup = pd.DataFrame({
        'track_id': unique_tracks,
        'item_id': np.arange(len(unique_tracks), dtype='int64'),
    })
    ratings = ratings.merge(track_lookup, on='track_id', how='left')

    # Express the boolean rating as float64.
    ratings['liked'] = ratings['like'].map({True: 1.0, False: 0.0})

    # Keep only the needed columns; the sequential user id takes over the
    # 'user_id' name while the original track id stays available.
    ratings = ratings[['new_user_id', 'track_id', 'item_id', 'liked']]
    return ratings.rename(columns={'new_user_id': 'user_id'})

## Коллаборативная фильтрация на основе матричной факторизации

In [9]:
from scipy.sparse.linalg import svds
from scipy.sparse import csr_matrix
from sklearn.metrics import mean_squared_error

def predict_rating_svds(predicted_ratings, user_id, item_id):
    """Look up one predicted rating.

    ``predicted_ratings`` is indexed first by ``user_id`` and then by
    ``item_id``; the value found there is returned unchanged.
    """
    user_row = predicted_ratings[user_id]
    return user_row[item_id]

def test_algorithm_svds(predicted_ratings, test_data):
    """Compute the RMSE of predicted ratings against a held-out sample.

    Args:
        predicted_ratings: Container of predictions, looked up through
            ``predict_rating_svds(predicted_ratings, user_id, item_id)``.
        test_data: DataFrame with ``user_id``, ``item_id`` and ``liked``
            columns; the ids are cast to ``int`` before the lookup.

    Returns:
        The root mean squared error between the ``liked`` values and the
        corresponding predictions.
    """
    actual_ratings = []
    result_ratings = []

    # One pass over the sample collects the actual and the predicted rating
    # for every (user, item) pair.
    for user_id, item_id, liked in zip(
            test_data['user_id'], test_data['item_id'], test_data['liked']):
        actual_ratings.append(liked)
        result_ratings.append(
            predict_rating_svds(predicted_ratings, int(user_id), int(item_id)))

    # Only the ratings are compared. Passing (user_id, item_id, rating)
    # triples would make mean_squared_error average in two id columns whose
    # error is always zero, shrinking the reported RMSE by a factor of sqrt(3).
    rmse = np.sqrt(mean_squared_error(actual_ratings, result_ratings))
    return rmse

def make_user_item_matrix_svds(ratings_df):
    """Build the user x item rating matrix from a long ratings frame.

    Rows are ``user_id`` values, columns are ``item_id`` values and cells hold
    the ``liked`` value as aggregated by ``pivot_table``'s default aggregation.
    Cells without a rating are filled with 0.
    """
    pivoted = ratings_df.pivot_table(
        values='liked',
        index='user_id',
        columns='item_id',
    )
    filled = pivoted.fillna(0)
    return filled

### Фильмы

In [10]:
def run_movies_svds():
    """Run the truncated-SVD experiment on movie ratings and log its RMSE.

    The ratings are split 80/20 with ``RANDOM_STATE``. The user x item matrix
    is built from the training part only, factorized with ``svds(k=50)``, and
    the reconstructed ratings are scored on the held-out part. The RMSE is
    reported to the ClearML task 'SVDS_Movies' and printed.
    """
    movies_ratings_df_new = get_formatted_movies_ratings()

    task = Task.init(
        project_name='MoodStream',
        task_name='SVDS_Movies',
        tags=['SVDS', 'Movies'])

    try:
        log = Logger.current_logger()

        # Split the ratings into training and test samples.
        train_data = movies_ratings_df_new.sample(frac=0.8, random_state=RANDOM_STATE)
        test_data = movies_ratings_df_new.drop(train_data.index)

        # Build the matrix from the training sample only, so held-out ratings
        # do not leak into the factorization. Reindexing to the full id ranges
        # keeps row/column position equal to user_id/item_id even for users or
        # items that occur only in the test sample.
        n_users = int(movies_ratings_df_new['user_id'].max()) + 1
        n_items = int(movies_ratings_df_new['item_id'].max()) + 1
        user_movies_matrix = make_user_item_matrix_svds(train_data).reindex(
            index=range(n_users), columns=range(n_items), fill_value=0)

        # Predicted ratings reconstructed from the training matrix.
        matrix = csr_matrix(user_movies_matrix.values)
        U, sigma, Vt = svds(matrix, k=50)
        sigma = np.diag(sigma)
        # NOTE(review): np.abs turns a negative rounded prediction into a
        # positive one; kept as in the original - confirm this is intended.
        predicted_ratings = np.abs(np.round(np.dot(np.dot(U, sigma), Vt), 0))

        # Score the predictions on the test sample.
        rmse = test_algorithm_svds(predicted_ratings, test_data)
        log.report_single_value(name='RMSE', value=round(rmse, 2))
    finally:
        # Close the task even when the experiment fails, so it is not left open.
        task.close()

    print("Movies RMSE: %.2f" % rmse)

### Книги

In [11]:
def run_books_svds():
    """Run the truncated-SVD experiment on book ratings and log its RMSE.

    The ratings are split 80/20 with ``RANDOM_STATE``. The user x item matrix
    is built from the training part only, factorized with ``svds(k=50)``, and
    the reconstructed ratings are scored on the held-out part. The RMSE is
    reported to the ClearML task 'SVDS_Books' and printed.
    """
    books_ratings_df_new = get_formatted_books_ratings()

    task = Task.init(
        project_name='MoodStream',
        task_name='SVDS_Books',
        tags=['SVDS', 'Books'])

    try:
        log = Logger.current_logger()

        # Split the ratings into training and test samples.
        train_data = books_ratings_df_new.sample(frac=0.8, random_state=RANDOM_STATE)
        test_data = books_ratings_df_new.drop(train_data.index)

        # Build the matrix from the training sample only, so held-out ratings
        # do not leak into the factorization. Reindexing to the full id ranges
        # keeps row/column position equal to user_id/item_id even for users or
        # items that occur only in the test sample.
        n_users = int(books_ratings_df_new['user_id'].max()) + 1
        n_items = int(books_ratings_df_new['item_id'].max()) + 1
        user_books_matrix = make_user_item_matrix_svds(train_data).reindex(
            index=range(n_users), columns=range(n_items), fill_value=0)

        # Predicted ratings reconstructed from the training matrix.
        matrix = csr_matrix(user_books_matrix.values)
        U, sigma, Vt = svds(matrix, k=50)
        sigma = np.diag(sigma)
        # NOTE(review): np.abs turns a negative rounded prediction into a
        # positive one; kept as in the original - confirm this is intended.
        predicted_ratings = np.abs(np.round(np.dot(np.dot(U, sigma), Vt), 0))

        # Score the predictions on the test sample.
        rmse = test_algorithm_svds(predicted_ratings, test_data)
        log.report_single_value(name='RMSE', value=round(rmse, 2))
    finally:
        # Close the task even when the experiment fails, so it is not left open.
        task.close()

    print("Books RMSE: %.2f" % rmse)

### Треки

In [12]:
def run_tracks_svds():
    """Run the truncated-SVD experiment on track ratings and log its RMSE.

    The ratings are split 80/20 with ``RANDOM_STATE``. The user x item matrix
    is built from the training part only, factorized with ``svds(k=50)``, and
    the reconstructed ratings are scored on the held-out part. The RMSE is
    reported to the ClearML task 'SVDS_Tracks' and printed.
    """
    tracks_ratings_df_new = get_formatted_tracks_ratings()

    task = Task.init(
        project_name='MoodStream',
        task_name='SVDS_Tracks',
        tags=['SVDS', 'Tracks'])

    try:
        log = Logger.current_logger()

        # Split the ratings into training and test samples.
        train_data = tracks_ratings_df_new.sample(frac=0.8, random_state=RANDOM_STATE)
        test_data = tracks_ratings_df_new.drop(train_data.index)

        # Build the matrix from the training sample only, so held-out ratings
        # do not leak into the factorization. Reindexing to the full id ranges
        # keeps row/column position equal to user_id/item_id even for users or
        # items that occur only in the test sample.
        n_users = int(tracks_ratings_df_new['user_id'].max()) + 1
        n_items = int(tracks_ratings_df_new['item_id'].max()) + 1
        user_tracks_matrix = make_user_item_matrix_svds(train_data).reindex(
            index=range(n_users), columns=range(n_items), fill_value=0)

        # Predicted ratings reconstructed from the training matrix.
        matrix = csr_matrix(user_tracks_matrix.values)
        U, sigma, Vt = svds(matrix, k=50)
        sigma = np.diag(sigma)
        # NOTE(review): np.abs turns a negative rounded prediction into a
        # positive one; kept as in the original - confirm this is intended.
        predicted_ratings = np.abs(np.round(np.dot(np.dot(U, sigma), Vt), 0))

        # Score the predictions on the test sample.
        rmse = test_algorithm_svds(predicted_ratings, test_data)
        log.report_single_value(name='RMSE', value=round(rmse, 2))
    finally:
        # Close the task even when the experiment fails, so it is not left open.
        task.close()

    print("Tracks RMSE: %.2f" % rmse)

### Проверка

In [13]:
def run_svds():
    """Run the SVD experiments for movies, books and tracks, in that order."""
    for run_experiment in (run_movies_svds, run_books_svds, run_tracks_svds):
        run_experiment()

In [14]:
#run_svds()

## Метод k-ближайших соседей

In [15]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

def predict_rating_k_neighbors(data, log):
    """Fit a 5-nearest-neighbours regressor on ``data`` and report its RMSE.

    Args:
        data: DataFrame whose ``liked`` column is the target; every other
            column is used as a feature.
        log: Object exposing ``report_single_value(name=..., value=...)``;
            it receives the RMSE rounded to two decimals.
    """
    # 70/30 train/test split with the notebook-wide seed.
    train_part, test_part = train_test_split(data, test_size=0.3, random_state=RANDOM_STATE)

    # Features are all columns except the target.
    features_train = train_part.drop(columns=['liked'])
    features_test = test_part.drop(columns=['liked'])

    knn = KNeighborsRegressor(n_neighbors=5)
    knn.fit(features_train, train_part['liked'])

    # Score the held-out part.
    predictions = knn.predict(features_test)
    rmse = np.sqrt(mean_squared_error(test_part['liked'], predictions))
    log.report_single_value(name='RMSE', value=round(rmse, 2))

    print("RMSE: %.2f" % rmse)


### Фильмы

In [16]:
def run_movies_k_neighbors():
    """Run the k-nearest-neighbours experiment on movie ratings.

    Opens the ClearML task 'KNeighbors_Movies', passes the ``user_id``,
    ``item_id`` and ``liked`` columns to ``predict_rating_k_neighbors`` and
    closes the task afterwards, including when the experiment raises.
    """
    movies_ratings_df_new = get_formatted_movies_ratings()

    task = Task.init(
        project_name='MoodStream',
        task_name='KNeighbors_Movies',
        tags=['KNeighbors', 'Movies'])

    try:
        log = Logger.current_logger()

        print('MOVIES: ')
        predict_rating_k_neighbors(movies_ratings_df_new[['user_id', 'item_id', 'liked']], log)
    finally:
        # Do not leave the task open if training fails.
        task.close()

### Книги

In [17]:
def run_books_k_neighbors():
    """Run the k-nearest-neighbours experiment on book ratings.

    Opens the ClearML task 'KNeighbors_Books', passes the ``user_id``,
    ``item_id`` and ``liked`` columns to ``predict_rating_k_neighbors`` and
    closes the task afterwards, including when the experiment raises.
    """
    books_ratings_df_new = get_formatted_books_ratings()

    task = Task.init(
        project_name='MoodStream',
        task_name='KNeighbors_Books',
        tags=['KNeighbors', 'Books'])

    try:
        log = Logger.current_logger()

        print('BOOKS: ')
        predict_rating_k_neighbors(books_ratings_df_new[['user_id', 'item_id', 'liked']], log)
    finally:
        # Do not leave the task open if training fails.
        task.close()

### Треки

In [18]:
def run_tracks_k_neighbors():
    """Run the k-nearest-neighbours experiment on track ratings.

    Opens the ClearML task 'KNeighbors_Tracks', passes the ``user_id``,
    ``item_id`` and ``liked`` columns to ``predict_rating_k_neighbors`` and
    closes the task afterwards, including when the experiment raises.
    """
    tracks_ratings_df_new = get_formatted_tracks_ratings()

    task = Task.init(
        project_name='MoodStream',
        task_name='KNeighbors_Tracks',
        tags=['KNeighbors', 'Tracks'])

    try:
        log = Logger.current_logger()

        print('TRACKS: ')
        predict_rating_k_neighbors(tracks_ratings_df_new[['user_id', 'item_id', 'liked']], log)
    finally:
        # Do not leave the task open if training fails.
        task.close()

### Проверка

In [19]:
def run_k_neighbors():
    """Run the k-nearest-neighbours experiments for movies, books and tracks, in that order."""
    for run_experiment in (run_movies_k_neighbors, run_books_k_neighbors, run_tracks_k_neighbors):
        run_experiment()

In [20]:
# run_k_neighbors()

## Линейная регрессия

In [21]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

# Search-mode switches. Nothing in the visible part of this notebook reads them;
# predict_rating_linear_regression is driven by its search_type argument instead.
# NOTE(review): presumably leftovers from an earlier revision - confirm before removing.
with_grid_search = True
with_random_search = False

def predict_rating_linear_regression(data, log, search_type=None):
    """Fit a linear regression on ``data`` and report its RMSE.

    Args:
        data: DataFrame whose ``liked`` column is the target; every other
            column is used as a feature.
        log: Object exposing ``report_single_value(name=..., value=...)``;
            it receives the RMSE rounded to two decimals.
        search_type: When equal to 1, the model is wrapped in a 5-fold
            ``GridSearchCV`` and the best hyperparameters are printed.
            Any other value fits a plain ``LinearRegression``.
    """
    use_grid_search = search_type == 1

    # 70/30 train/test split with the notebook-wide seed.
    train_part, test_part = train_test_split(data, test_size=0.3, random_state=RANDOM_STATE)

    # Features are all columns except the target.
    features_train, target_train = train_part.drop(columns=['liked']), train_part['liked']
    features_test, target_test = test_part.drop(columns=['liked']), test_part['liked']

    estimator = LinearRegression()
    if use_grid_search:
        # Exhaustive search over the LinearRegression constructor options.
        estimator = GridSearchCV(
            estimator,
            param_grid={
                'fit_intercept': [True, False],
                'copy_X': [True, False],
                'n_jobs': [-1, None],
                'positive': [True, False],
            },
            cv=5,
        )

    estimator.fit(features_train, target_train)

    # Raw regression output is scored as-is; a 0.5 decision threshold was
    # sketched here earlier but is not applied.
    predictions = estimator.predict(features_test)

    rmse = np.sqrt(mean_squared_error(target_test, predictions))
    log.report_single_value(name='RMSE', value=round(rmse, 2))

    print("RMSE: %.2f" % rmse)

    if use_grid_search:
        print("Best hyperparameters: ", estimator.best_params_)


### Фильмы

In [22]:
def run_movies_linear_regression():
    """Run the linear-regression experiment on movie ratings.

    Opens the ClearML task 'LinearRegression_Movies', passes the ``user_id``,
    ``item_id`` and ``liked`` columns to ``predict_rating_linear_regression``
    and closes the task afterwards, including when the experiment raises.
    """
    movies_ratings_df_new = get_formatted_movies_ratings()

    task = Task.init(
        project_name='MoodStream',
        task_name='LinearRegression_Movies',
        tags=['LinearRegression', 'Movies'])

    try:
        log = Logger.current_logger()

        print('MOVIES: ')
        predict_rating_linear_regression(movies_ratings_df_new[['user_id', 'item_id', 'liked']], log)
    finally:
        # Do not leave the task open if training fails.
        task.close()


### Книги

In [23]:
def run_books_linear_regression():
    """Run the linear-regression experiment on book ratings.

    Opens the ClearML task 'LinearRegression_Books', passes the ``user_id``,
    ``item_id`` and ``liked`` columns to ``predict_rating_linear_regression``
    and closes the task afterwards, including when the experiment raises.
    """
    books_ratings_df_new = get_formatted_books_ratings()

    task = Task.init(
        project_name='MoodStream',
        task_name='LinearRegression_Books',
        tags=['LinearRegression', 'Books'])

    try:
        log = Logger.current_logger()

        print('BOOKS: ')
        predict_rating_linear_regression(books_ratings_df_new[['user_id', 'item_id', 'liked']], log)
    finally:
        # Do not leave the task open if training fails.
        task.close()

### Треки

In [24]:
def run_tracks_linear_regression():
    """Run the linear-regression experiment on track ratings.

    Opens the ClearML task 'LinearRegression_Tracks', passes the ``user_id``,
    ``item_id`` and ``liked`` columns to ``predict_rating_linear_regression``
    and closes the task afterwards, including when the experiment raises.
    """
    tracks_ratings_df_new = get_formatted_tracks_ratings()

    task = Task.init(
        project_name='MoodStream',
        task_name='LinearRegression_Tracks',
        tags=['LinearRegression', 'Tracks'])

    try:
        log = Logger.current_logger()

        print('TRACKS: ')
        predict_rating_linear_regression(tracks_ratings_df_new[['user_id', 'item_id', 'liked']], log)
    finally:
        # Do not leave the task open if training fails.
        task.close()

### Проверка

In [25]:
def run_linear_regression():
    """Run the linear-regression experiments for movies, books and tracks, in that order."""
    for run_experiment in (
            run_movies_linear_regression,
            run_books_linear_regression,
            run_tracks_linear_regression):
        run_experiment()

In [26]:
#run_linear_regression()

## XGBoost

In [27]:
import pandas as pd
import numpy as np
from numpy.random import uniform, randint
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

def predict_rating_xgboost(data, log, search_type=None):
    """Fit an XGBoost regressor on ``data`` and report its RMSE.

    Args:
        data: DataFrame whose ``liked`` column is the target; every other
            column is used as a feature.
        log: Object exposing ``report_single_value(name=..., value=...)``;
            it receives the RMSE rounded to two decimals.
        search_type: When equal to 1, the regressor is wrapped in a 5-fold
            ``GridSearchCV`` (mirroring ``predict_rating_linear_regression``)
            and the best hyperparameters are printed. Any other value fits
            the fixed configuration ``max_depth=3, learning_rate=0.1``.
    """
    use_grid_search = search_type == 1

    # 70/30 train/test split with the notebook-wide seed.
    train, test = train_test_split(data, test_size=0.3, random_state=RANDOM_STATE)

    # Features are all columns except the target.
    X_train = train.drop(['liked'], axis=1)
    y_train = train['liked']
    X_test = test.drop(['liked'], axis=1)
    y_test = test['liked']

    model = xgb.XGBRegressor(max_depth=3, learning_rate=0.1, objective='reg:squarederror')

    if use_grid_search:
        # Imported locally because the search is only needed on this path.
        from sklearn.model_selection import GridSearchCV

        # The grid contains the fixed configuration above, so the baseline is
        # always one of the candidates. Without this wrapper the final
        # best_params_ access would fail on a plain XGBRegressor.
        param_grid = {'max_depth': [3, 5, 7],
                      'learning_rate': [0.05, 0.1, 0.2]}
        model = GridSearchCV(model, param_grid=param_grid, cv=5)

    model.fit(X_train, y_train)

    # Score the held-out part.
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    log.report_single_value(name='RMSE', value=round(rmse, 2))

    print("RMSE: %.2f" % rmse)

    if use_grid_search:
        print("Best hyperparameters: ", model.best_params_)
        

### Фильмы

In [28]:
def run_movies_xgboost():
    """Run the XGBoost experiment on movie ratings.

    Opens the ClearML task 'XGB_Movies', passes the ``user_id``, ``item_id``
    and ``liked`` columns to ``predict_rating_xgboost`` and closes the task
    afterwards, including when the experiment raises.
    """
    movies_ratings_df_new = get_formatted_movies_ratings()

    task = Task.init(
        project_name='MoodStream',
        task_name='XGB_Movies',
        tags=['XGB', 'Movies'])

    try:
        log = Logger.current_logger()

        print('MOVIES: ')
        predict_rating_xgboost(movies_ratings_df_new[['user_id', 'item_id', 'liked']], log)
    finally:
        # Do not leave the task open if training fails.
        task.close()

### Книги

In [29]:
def run_books_xgboost():
    """Run the XGBoost experiment on book ratings.

    Opens the ClearML task 'XGB_Books', passes the ``user_id``, ``item_id``
    and ``liked`` columns to ``predict_rating_xgboost`` and closes the task
    afterwards, including when the experiment raises.
    """
    books_ratings_df_new = get_formatted_books_ratings()

    task = Task.init(
        project_name='MoodStream',
        task_name='XGB_Books',
        tags=['XGB', 'Books'])

    try:
        log = Logger.current_logger()

        print('BOOKS: ')
        predict_rating_xgboost(books_ratings_df_new[['user_id', 'item_id', 'liked']], log)
    finally:
        # Do not leave the task open if training fails.
        task.close()

### Треки

In [30]:
def run_tracks_xgboost():
    """Run the XGBoost experiment on track ratings.

    Opens the ClearML task 'XGB_Tracks', passes the ``user_id``, ``item_id``
    and ``liked`` columns to ``predict_rating_xgboost`` and closes the task
    afterwards, including when the experiment raises.
    """
    tracks_ratings_df_new = get_formatted_tracks_ratings()

    task = Task.init(
        project_name='MoodStream',
        task_name='XGB_Tracks',
        tags=['XGB', 'Tracks'])

    try:
        log = Logger.current_logger()

        print('TRACKS: ')
        predict_rating_xgboost(tracks_ratings_df_new[['user_id', 'item_id', 'liked']], log)
    finally:
        # Do not leave the task open if training fails.
        task.close()

### Проверка

In [31]:
def run_xgboost():
    """Run the XGBoost experiments for movies, books and tracks, in that order."""
    for run_experiment in (run_movies_xgboost, run_books_xgboost, run_tracks_xgboost):
        run_experiment()

In [32]:
# run_xgboost()

## CatBoost

In [33]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

def predict_rating_catboost(data, log):
    """Fit a CatBoost regressor on ``data`` and report its RMSE.

    Args:
        data: DataFrame whose ``liked`` column is the target; every other
            column is used as a feature.
        log: Object exposing ``report_single_value(name=..., value=...)``;
            it receives the RMSE rounded to two decimals.
    """
    # 70/30 train/test split with the notebook-wide seed.
    train, test = train_test_split(data, test_size=0.3, random_state=RANDOM_STATE)

    # Features are all columns except the target.
    X_train = train.drop(['liked'], axis=1)
    y_train = train['liked']
    X_test = test.drop(['liked'], axis=1)
    y_test = test['liked']

    # The test split is deliberately NOT passed as eval_set: the RMSE below is
    # measured on it, so it must not take part in choosing the final model.
    model = CatBoostRegressor(iterations=1000, learning_rate=0.1, loss_function='RMSE')
    model.fit(X_train, y_train, verbose=False)

    # Score the held-out part.
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    log.report_single_value(name='RMSE', value=round(rmse, 2))

    print("RMSE: %.2f" % rmse)


### Фильмы

In [34]:
def run_movies_catboost():
    """Run the CatBoost experiment on movie ratings.

    Opens the ClearML task 'CAT_Movies', passes the ``user_id``, ``item_id``
    and ``liked`` columns to ``predict_rating_catboost`` and closes the task
    afterwards, including when the experiment raises.
    """
    movies_ratings_df_new = get_formatted_movies_ratings()

    task = Task.init(
        project_name='MoodStream',
        task_name='CAT_Movies',
        tags=['CAT', 'Movies'])

    try:
        log = Logger.current_logger()

        print('MOVIES: ')
        predict_rating_catboost(movies_ratings_df_new[['user_id', 'item_id', 'liked']], log)
    finally:
        # Do not leave the task open if training fails.
        task.close()

### Книги

In [35]:
def run_books_catboost():
    """Run the CatBoost experiment on book ratings.

    Opens the ClearML task 'CAT_Books', passes the ``user_id``, ``item_id``
    and ``liked`` columns to ``predict_rating_catboost`` and closes the task
    afterwards, including when the experiment raises.
    """
    books_ratings_df_new = get_formatted_books_ratings()

    task = Task.init(
        project_name='MoodStream',
        task_name='CAT_Books',
        tags=['CAT', 'Books'])

    try:
        log = Logger.current_logger()

        print('BOOKS: ')
        predict_rating_catboost(books_ratings_df_new[['user_id', 'item_id', 'liked']], log)
    finally:
        # Do not leave the task open if training fails.
        task.close()

### Треки

In [36]:
def run_tracks_catboost():
    """Run the CatBoost experiment on track ratings.

    Opens the ClearML task 'CAT_Tracks', passes the ``user_id``, ``item_id``
    and ``liked`` columns to ``predict_rating_catboost`` and closes the task
    afterwards, including when the experiment raises.
    """
    tracks_ratings_df_new = get_formatted_tracks_ratings()

    task = Task.init(
        project_name='MoodStream',
        task_name='CAT_Tracks',
        tags=['CAT', 'Tracks'])

    try:
        log = Logger.current_logger()

        print('TRACKS: ')
        predict_rating_catboost(tracks_ratings_df_new[['user_id', 'item_id', 'liked']], log)
    finally:
        # Do not leave the task open if training fails.
        task.close()

### Проверка

In [37]:
def run_catboost():
    """Run the CatBoost experiments for movies, books and tracks, in that order."""
    for run_experiment in (run_movies_catboost, run_books_catboost, run_tracks_catboost):
        run_experiment()

In [38]:
#run_catboost()