# MoodStream: Финальная модель

Основная задача заключается в подборе модели с удовлетворительным процентом ошибок и приемлемым временем выполнения.

---

## Окружение

In [1]:
import pandas as pd
import numpy as np
import dotenv
from clearml import Dataset, Task, Logger
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split


In [2]:
%env CLEARML_WEB_HOST=https://app.clear.ml
%env CLEARML_API_HOST=https://api.clear.ml
%env CLEARML_FILES_HOST=https://files.clear.ml
    
%load_ext dotenv
%dotenv

# Seed handed to train_test_split so every run produces the same train/test split.
RANDOM_STATE=43
# Cut-off applied to model scores (`score > THRESHOLD`) to obtain a binary "liked" label.
THRESHOLD=0.5

env: CLEARML_WEB_HOST=https://app.clear.ml
env: CLEARML_API_HOST=https://api.clear.ml
env: CLEARML_FILES_HOST=https://files.clear.ml


## Данные

### Фильмы

In [3]:
# Look up the movies dataset in ClearML by its fixed id.
movies_dataset = Dataset.get(
    dataset_id='f93c573e641f460598ee7cd20ca70f11',  
    only_completed=True, 
    auto_create=False
)
# Local directory holding a copy of the dataset files.
movies_dataset_local_path = movies_dataset.get_local_copy()
# The three files are tab-separated: item catalogue, user ratings, genre list.
movies_data_df = pd.read_csv(f'{movies_dataset_local_path}/movies.csv', sep='\t')
movies_ratings_df = pd.read_csv(f'{movies_dataset_local_path}/ratings.csv', sep='\t')
movies_genres_df = pd.read_csv(f'{movies_dataset_local_path}/genres.csv', sep='\t')

### Книги

In [4]:
# Look up the books dataset in ClearML by its fixed id.
books_dataset = Dataset.get(
    dataset_id='43da819d3714444e93908b31fadb6243',  
    only_completed=True, 
    auto_create=False
)
# Local directory holding a copy of the dataset files.
books_dataset_local_path = books_dataset.get_local_copy()
# The three files are tab-separated: item catalogue, user ratings, genre list.
books_data_df = pd.read_csv(f'{books_dataset_local_path}/books.csv', sep='\t')
books_ratings_df = pd.read_csv(f'{books_dataset_local_path}/ratings.csv', sep='\t')
books_genres_df = pd.read_csv(f'{books_dataset_local_path}/genres.csv', sep='\t')


### Треки

In [5]:
# Look up the tracks dataset in ClearML by its fixed id.
tracks_dataset = Dataset.get(
    dataset_id='a770d29c9e504f0d9cbe143fea93ab6c',  
    only_completed=True, 
    auto_create=False
)
# Local directory holding a copy of the dataset files.
tracks_dataset_local_path = tracks_dataset.get_local_copy()
# The three files are tab-separated: item catalogue, user ratings, genre list.
tracks_data_df = pd.read_csv(f'{tracks_dataset_local_path}/tracks.csv', sep='\t')
tracks_ratings_df = pd.read_csv(f'{tracks_dataset_local_path}/ratings.csv', sep='\t')
tracks_genres_df = pd.read_csv(f'{tracks_dataset_local_path}/genres.csv', sep='\t')

## Подготовка данных

In [6]:
def add_genres(genres_df, items_df):
    """One-hot encode the comma-separated ``genre`` column of ``items_df``.

    Parameters
    ----------
    genres_df : pandas.DataFrame
        Must have a ``genre`` column listing the known genre names. One
        float column per listed genre is added to the result.
    items_df : pandas.DataFrame
        Must have a ``genre`` column holding comma-separated genre names.
        The frame is not modified; a copy is returned.

    Returns
    -------
    pandas.DataFrame
        Copy of ``items_df`` without the ``genre`` column and with one
        column per known genre: ``1.0`` when the row's genre string contains
        that genre as an exact comma-separated token, ``0.0`` otherwise.
        Tokens that are not listed in ``genres_df`` are ignored.

    Notes
    -----
    A missing genre (NaN/None, e.g. produced by a left merge with no match)
    yields all zeros instead of raising on ``.split``.
    """
    known_genres = genres_df['genre'].tolist()
    df = items_df.copy()
    genre_strings = df['genre']

    # Tokenise every *distinct* genre string once instead of once per row;
    # ratings frames repeat the same item (and genre string) many times.
    tokens_by_string = {
        text: set(text.split(','))
        for text in genre_strings.dropna().unique()
    }

    for genre in known_genres:
        flag_by_string = {
            text: float(genre in tokens)
            for text, tokens in tokens_by_string.items()
        }
        # Missing genre strings are not in the lookup, so they map to NaN;
        # treat them as "genre absent".
        df[genre] = genre_strings.map(flag_by_string).fillna(0.0).astype(float)

    return df.drop('genre', axis=1)

In [7]:
def add_user_genres(genres_df, items_df):
    """Build one row per user flagging the genres of the items they liked.

    Parameters
    ----------
    genres_df : pandas.DataFrame
        Must have a ``genre`` column; its values name the genre columns of
        ``items_df`` to aggregate.
    items_df : pandas.DataFrame
        Ratings with ``user_id``, ``item_id``, ``liked`` and one column per
        genre.

    Returns
    -------
    pandas.DataFrame
        One row per user that has at least one ``liked == 1`` row. Each genre
        column holds the maximum of that genre over the user's liked rows;
        ``user_id`` is stored as a regular column and the index is reset.
    """
    genre_names = genres_df['genre'].tolist()

    # Only liked items contribute; identical rows are collapsed before
    # aggregation since they cannot change a max.
    liked_rows = (
        items_df[items_df['liked'] == 1]
        .drop(['liked', 'item_id'], axis=1)
        .drop_duplicates()
    )

    profile = pd.pivot_table(liked_rows, values=genre_names, index=['user_id'], aggfunc='max')
    # Expose the user id as a column so callers can merge on it.
    profile['user_id'] = profile.index
    return profile.reset_index(drop=True)

### Фильмы

In [8]:
# Attach each rated movie's genre string; the left join keeps ratings whose item has no catalogue row.
movies_ratings_with_genres = movies_ratings_df.merge(movies_data_df[['item_id', 'genres']], on='item_id', how='left')
# add_genres reads a column called 'genre', the movies catalogue calls it 'genres'.
movies_ratings_with_genres = movies_ratings_with_genres.rename(columns={'genres': 'genre'})
# Item side: one 0/1 column per genre of the rated movie.
movies_ratings_with_genres = add_genres(movies_genres_df, movies_ratings_with_genres)
# User side: per-user genre profile built from liked movies only.
movies_user_genres = add_user_genres(movies_genres_df, movies_ratings_with_genres)
# Genre columns exist on both sides, so the merge suffixes them: *_x = item genres, *_y = user profile.
movies_ratings_with_user_genres = movies_ratings_with_genres.merge(movies_user_genres, on='user_id', how='left')
# Users without any liked movie have no profile row; their NaNs (and any other NaN in the frame) become 0.0.
movies_ratings_with_user_genres = movies_ratings_with_user_genres.fillna(0.0)
movies_ratings_with_user_genres

Unnamed: 0,user_id,liked,item_id,action_x,adventure_x,animation_x,comedy_x,crime_x,documentary_x,drama_x,...,history_y,horror_y,music_y,mystery_y,romance_y,science fiction_y,thriller_y,tv movie_y,war_y,western_y
0,58365,1,275,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0
1,58370,1,275,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0
2,58371,1,275,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0
3,2,0,275,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0
4,97176,1,275,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8606093,145558,1,42,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0
8606094,147162,1,42,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0
8606095,213259,0,42,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
8606096,230417,0,418,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0


### Книги

In [9]:
# Attach each rated book's genre string; the left join keeps ratings whose item has no catalogue row.
books_ratings_with_genres = books_ratings_df.merge(books_data_df[['item_id', 'genres']], on='item_id', how='left')
# add_genres reads a column called 'genre', the books catalogue calls it 'genres'.
books_ratings_with_genres = books_ratings_with_genres.rename(columns={'genres': 'genre'})
# Item side: one 0/1 column per genre of the rated book.
books_ratings_with_genres = add_genres(books_genres_df, books_ratings_with_genres)
# User side: per-user genre profile built from liked books only.
books_user_genres = add_user_genres(books_genres_df, books_ratings_with_genres)
# Genre columns exist on both sides, so the merge suffixes them: *_x = item genres, *_y = user profile.
books_ratings_with_user_genres = books_ratings_with_genres.merge(books_user_genres, on='user_id', how='left')
# Users without any liked book have no profile row; their NaNs (and any other NaN in the frame) become 0.0.
books_ratings_with_user_genres = books_ratings_with_user_genres.fillna(0.0)
books_ratings_with_user_genres

Unnamed: 0,user_id,liked,item_id,action_x,countries_x,fantasy_x,history_x,novel_x,science_x,world literature_x,action_y,countries_y,fantasy_y,history_y,novel_y,science_y,world literature_y
0,276747,1,270,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,278418,0,270,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1435,0,270,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,7346,0,270,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
4,8936,0,270,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40421,273813,0,3813,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40422,274004,1,3834,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
40423,274301,1,3856,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
40424,274308,0,2795,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


### Треки

In [10]:
# Attach each rated track's genre string (the tracks catalogue already names the column 'genre');
# the left join keeps ratings whose item has no catalogue row.
tracks_ratings_with_genres = tracks_ratings_df.merge(tracks_data_df[['item_id', 'genre']], on='item_id', how='left')
# Item side: one 0/1 column per genre of the rated track.
tracks_ratings_with_genres = add_genres(tracks_genres_df, tracks_ratings_with_genres)
# User side: per-user genre profile built from liked tracks only.
tracks_user_genres = add_user_genres(tracks_genres_df, tracks_ratings_with_genres)
# Genre columns exist on both sides, so the merge suffixes them: *_x = item genres, *_y = user profile.
tracks_ratings_with_user_genres = tracks_ratings_with_genres.merge(tracks_user_genres, on='user_id', how='left')
# Users without any liked track have no profile row; their NaNs (and any other NaN in the frame) become 0.0.
tracks_ratings_with_user_genres = tracks_ratings_with_user_genres.fillna(0.0)
tracks_ratings_with_user_genres

Unnamed: 0,user_id,liked,item_id,alternative_x,blues_x,classical_x,country_x,dance_x,electronic_x,folk_x,...,jazz_y,opera_y,pop_y,r&b_y,rap_y,reggae_y,reggaeton_y,rock_y,ska_y,soul_y
0,3720277,0,2759,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,3736777,1,2759,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,3738042,1,2759,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,3734766,1,2759,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,3732204,1,2759,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
964159,3739032,0,84378,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
964160,3739032,0,86976,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
964161,3739032,1,78626,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
964162,3739032,0,81108,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Обучение

In [11]:
def get_data(df):
    """Split ``df`` 80/20 into train and test parts and separate features from target.

    The split is seeded with ``RANDOM_STATE``, so repeated calls on the same
    frame return the same partition.

    Returns
    -------
    tuple
        ``(train_data, test_data, X_train, y_train, X_test, y_test)`` where
        the ``*_data`` frames keep every column, ``y_*`` is the ``liked``
        column and ``X_*`` is everything except ``liked`` and ``user_id``.
    """
    train_data, test_data = train_test_split(df, test_size=0.2, random_state=RANDOM_STATE)

    # The target and the user identifier are not model features.
    non_feature_columns = ['liked', 'user_id']
    X_train = train_data.drop(columns=non_feature_columns)
    X_test = test_data.drop(columns=non_feature_columns)

    y_train, y_test = train_data['liked'], test_data['liked']

    return train_data, test_data, X_train, y_train, X_test, y_test

In [12]:
import numpy as np
from sklearn.metrics import average_precision_score

def print_mapk(test_data):
    """Print MAP@3 and MAP@10 computed per user over ``test_data``.

    Parameters
    ----------
    test_data : pandas.DataFrame
        Must contain ``user_id``, the true label ``liked`` and the model
        score ``predicted_liked``.

    Notes
    -----
    Rows are split per user in a single ``groupby`` pass (first-appearance
    order, original row order inside each user) rather than re-filtering the
    whole frame once per user. Rows whose ``user_id`` is missing are skipped
    by ``groupby``.
    """
    # Collect the true labels and predicted scores of every user.
    y_true = []
    y_pred = []
    for _, user_rows in test_data.groupby('user_id', sort=False):
        y_true.append(user_rows['liked'].values)
        y_pred.append(user_rows['predicted_liked'].values)

    def mapk(y_true, y_pred, k):
        """
        Mean over users of the average precision of their top-k items.
        y_true - per-user arrays of true labels
        y_pred - per-user arrays of predicted scores
        k - number of recommendations considered per user
        """
        mapk_scores = []
        for true, pred in zip(y_true, y_pred):
            # Positions of the k highest-scored items, best first.
            idx = np.argsort(pred)[::-1][:k]
            if np.sum(true[idx]) == 0:
                # No relevant item among the top-k: the user contributes 0.
                mapk_scores.append(0)
            else:
                # Average precision restricted to this user's top-k items.
                ap = average_precision_score(true[idx], pred[idx])
                mapk_scores.append(ap)
        # Average over all users.
        return np.mean(mapk_scores)

    mapk_score3 = mapk(y_true, y_pred, k=3)
    mapk_score10 = mapk(y_true, y_pred, k=10)
    print(f"MAP@3: {mapk_score3:.4f}")
    print(f"MAP@10: {mapk_score10:.4f}")


In [13]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split


def get_trained_model_classifier(df):
    """Train an XGBoost classifier on ``df`` and print its hold-out metrics.

    ``df`` is split by ``get_data``; the positive class is re-weighted by the
    negative/positive ratio of the training part. Probabilities of the
    positive class are thresholded at ``THRESHOLD`` for the label metrics.

    Returns
    -------
    xgb.XGBClassifier
        The fitted model.
    """
    train_data, test_data, X_train, y_train, X_test, y_test = get_data(df)

    # Class balancing: weight positives by (#negatives / #positives).
    negatives = len(train_data[train_data['liked'] == 0])
    positives = len(train_data[train_data['liked'] == 1])
    xgb_model = xgb.XGBClassifier(scale_pos_weight=negatives / positives)
    xgb_model.fit(X_train, y_train)

    # Probability of the positive class for every test row.
    y_pred = xgb_model.predict_proba(X_test)[:, 1]
    y_label = y_pred > THRESHOLD

    # NOTE(review): with average='weighted' recall equals accuracy, and these
    # numbers are not directly comparable with the binary-averaged metrics
    # printed for the ranker - confirm this is intended.
    accuracy = accuracy_score(y_test, y_label)
    precision = precision_score(y_test, y_label, average='weighted')
    recall = recall_score(y_test, y_label, average='weighted')
    f1 = f1_score(y_test, y_label, average='weighted')
    roc_auc = roc_auc_score(y_test, y_pred, average='weighted')

    print("CLASSIFIER")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")
    print(f"ROC AUC: {roc_auc:.4f}")
    print()

    return xgb_model


In [14]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split


# NOTE(review): this cell redefines get_trained_model_classifier, which an
# earlier cell already defines; the later definition is the one in effect.
# Consider deleting one of the two cells.
def get_trained_model_classifier(df):
    """Train an XGBoost classifier on ``df`` and print its hold-out metrics.

    ``df`` is split by ``get_data``; the positive class is re-weighted by the
    negative/positive ratio of the training part. Probabilities of the
    positive class are thresholded at ``THRESHOLD`` for the label metrics.

    Returns
    -------
    xgb.XGBClassifier
        The fitted model.
    """
    train_data, test_data, X_train, y_train, X_test, y_test = get_data(df)

    # Class balancing: weight positives by (#negatives / #positives).
    negatives = len(train_data[train_data['liked'] == 0])
    positives = len(train_data[train_data['liked'] == 1])
    xgb_model = xgb.XGBClassifier(scale_pos_weight=negatives / positives)
    xgb_model.fit(X_train, y_train)

    # Probability of the positive class for every test row.
    y_pred = xgb_model.predict_proba(X_test)[:, 1]
    y_label = y_pred > THRESHOLD

    # Label metrics are averaged over both classes weighted by support.
    accuracy = accuracy_score(y_test, y_label)
    precision = precision_score(y_test, y_label, average='weighted')
    recall = recall_score(y_test, y_label, average='weighted')
    f1 = f1_score(y_test, y_label, average='weighted')
    roc_auc = roc_auc_score(y_test, y_pred, average='weighted')

    print("CLASSIFIER")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")
    print(f"ROC AUC: {roc_auc:.4f}")
    print()

    return xgb_model


In [15]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split


def get_trained_model_ranker(df):
    """Train an XGBoost ranker on ``df`` and print its hold-out metrics.

    ``df`` is split by ``get_data``. Ranker scores are thresholded at
    ``THRESHOLD`` for the label metrics and used as-is for ROC AUC and
    MAP@K. The "RANKER" header is printed before every metric, MAP@K
    included, so the whole report is labeled.

    Returns
    -------
    xgb.XGBRanker
        The fitted model.
    """
    train_data, test_data, X_train, y_train, X_test, y_test = get_data(df)

    xgb_model = xgb.XGBRanker()
    # NOTE(review): the whole training set is passed as one ranking group,
    # while MAP@K below is evaluated per user - confirm that per-user groups
    # are not what was intended.
    xgb_model.fit(X_train, y_train, group=[len(X_train)])

    # Ranking scores for the test rows.
    y_pred = xgb_model.predict(X_test)
    y_label = y_pred > THRESHOLD

    accuracy = accuracy_score(y_test, y_label)
    precision = precision_score(y_test, y_label)
    recall = recall_score(y_test, y_label)
    f1 = f1_score(y_test, y_label)
    roc_auc = roc_auc_score(y_test, y_pred)

    print("RANKER")
    print_mapk(test_data.assign(predicted_liked=y_pred))
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")
    print(f"ROC AUC: {roc_auc:.4f}")
    print()

    return xgb_model


In [16]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split

def get_trained_model_combine(df):
    """Train a ranker that takes the classifier's predicted label as an extra feature.

    A classifier is trained first via ``get_trained_model_classifier`` (which
    prints its own report); its ``predict`` output is appended to the
    features as ``classifier_result`` for both training and evaluation of an
    ``XGBRanker``.

    Returns
    -------
    xgb.XGBRanker
        The fitted ranker. It expects the ``classifier_result`` column at
        prediction time; the classifier itself is not returned.
    """
    classifier = get_trained_model_classifier(df)
    train_data, test_data, X_train, y_train, X_test, y_test = get_data(df)

    # NOTE(review): the classifier predicts on the same rows it was fitted
    # on, so the training-time feature is in-sample - confirm this is acceptable.
    ranker_train_features = X_train.assign(classifier_result=classifier.predict(X_train))
    xgb_model = xgb.XGBRanker()
    xgb_model.fit(ranker_train_features, y_train, group=[len(X_train)])

    # Same feature augmentation for the hold-out rows, then ranker scores.
    ranker_test_features = X_test.assign(classifier_result=classifier.predict(X_test))
    y_pred = xgb_model.predict(ranker_test_features)
    y_label = y_pred > THRESHOLD

    accuracy = accuracy_score(y_test, y_label)
    precision = precision_score(y_test, y_label)
    recall = recall_score(y_test, y_label)
    f1 = f1_score(y_test, y_label)
    roc_auc = roc_auc_score(y_test, y_pred)

    print("COMBINE")
    print_mapk(test_data.assign(predicted_liked=y_pred))
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")
    print(f"ROC AUC: {roc_auc:.4f}")
    print("")

    return xgb_model



### Фильмы

In [28]:
# movies_model_classifier = get_trained_model_classifier(movies_ratings_with_user_genres)

CLASSIFIER
Accuracy: 0.6741
Precision: 0.8748
Recall: 0.6741
F1-score: 0.7448
ROC AUC: 0.6843



In [18]:
# movies_model_ranker = get_trained_model_ranker(movies_ratings_with_user_genres)

MAP@3: 0.9530
MAP@10: 0.9470
RANKER
Accuracy: 0.6632
Precision: 0.9427
Recall: 0.6724
F1-score: 0.7849
ROC AUC: 0.6779



In [31]:
# result_model_movies = get_trained_model_combine(movies_ratings_with_user_genres)

CLASSIFIER
Accuracy: 0.6741
Precision: 0.8748
Recall: 0.6741
F1-score: 0.7448
ROC AUC: 0.6843

COMBINE
MAP@3: 0.9531
MAP@10: 0.9472
Accuracy: 0.6739
Precision: 0.9437
Recall: 0.6840
F1-score: 0.7932
ROC AUC: 0.6815



### Книги

In [29]:
# books_model_classifier = get_trained_model_classifier(books_ratings_with_user_genres)

CLASSIFIER
Accuracy: 0.8011
Precision: 0.8449
Recall: 0.8011
F1-score: 0.8069
ROC AUC: 0.9092



In [21]:
# books_model_ranker = get_trained_model_ranker(books_ratings_with_user_genres)

MAP@3: 0.4506
MAP@10: 0.4505
RANKER
Accuracy: 0.8042
Precision: 0.6401
Recall: 0.9331
F1-score: 0.7593
ROC AUC: 0.9110



In [32]:
# result_model_books = get_trained_model_combine(books_ratings_with_user_genres)

CLASSIFIER
Accuracy: 0.8011
Precision: 0.8449
Recall: 0.8011
F1-score: 0.8069
ROC AUC: 0.9092

COMBINE
MAP@3: 0.4502
MAP@10: 0.4505
Accuracy: 0.8009
Precision: 0.6398
Recall: 0.9114
F1-score: 0.7518
ROC AUC: 0.9061



### Треки

In [30]:
# tracks_model_classifier = get_trained_model_classifier(tracks_ratings_with_user_genres)

CLASSIFIER
Accuracy: 0.5846
Precision: 0.6280
Recall: 0.5846
F1-score: 0.5829
ROC AUC: 0.6695



In [24]:
# tracks_model_ranker = get_trained_model_ranker(tracks_ratings_with_user_genres)

MAP@3: 0.7265
MAP@10: 0.6966
RANKER
Accuracy: 0.5758
Precision: 0.4906
Recall: 0.7349
F1-score: 0.5884
ROC AUC: 0.6582



In [33]:
# result_model_tracks = get_trained_model_combine(tracks_ratings_with_user_genres)

CLASSIFIER
Accuracy: 0.5846
Precision: 0.6280
Recall: 0.5846
F1-score: 0.5829
ROC AUC: 0.6695

COMBINE
MAP@3: 0.7252
MAP@10: 0.6972
Accuracy: 0.5853
Precision: 0.4983
Recall: 0.7322
F1-score: 0.5930
ROC AUC: 0.6659



## Сохранение моделей

In [26]:
import pickle

def save_model(model, name):
    """Pickle ``model`` to ``./models/<name>.pkl``.

    The target directory is created when it does not exist yet, so the call
    also works on a fresh checkout. An existing file with the same name is
    overwritten.

    Parameters
    ----------
    model : object
        Any picklable object.
    name : str
        File name without the ``.pkl`` extension.
    """
    import os  # local import keeps this cell runnable on its own

    path = f'./models/{name}.pkl'
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(model, f)


In [27]:
# save_model(movies_model_classifier, 'movies_classifier')
# save_model(books_model_classifier, 'books_classifier')
# save_model(tracks_model_classifier, 'tracks_classifier')
# save_model(result_model_movies, 'movies')
# save_model(result_model_books, 'books')
# save_model(result_model_tracks, 'tracks')
