In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
import io
import itertools
import json
import os
import pickle
import tempfile

import boto3
import lightgbm as lgb
import mlflow
import numpy as np
import pandas as pd
import psycopg2
from sklearn.metrics import median_absolute_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import cross_val_score, train_test_split, StratifiedKFold, ParameterGrid
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

In [None]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

In [None]:
# MLflow reads both settings from the process environment: the tracking
# server to log to and the S3-compatible endpoint used for artifacts.
os.environ.update({
    "MLFLOW_S3_ENDPOINT_URL": "https://storage.yandexcloud.net",
    "MLFLOW_TRACKING_URI": "http://84.201.153.30:8000/",
})

In [None]:
# PostgreSQL credentials live outside the repository in a JSON file; the
# connection cell below reads the 'user' and 'password' keys from it.
creds_path = '/home/dmitry-ds/postgresql-credentials/creds.json'
with open(creds_path, 'r') as creds_file:
    creds = json.load(creds_file)

In [None]:
# S3 client pointed at a custom (non-AWS) endpoint; it is passed to the
# loader functions below.
session = boto3.session.Session()
s3 = session.client(
    service_name = 's3',
    region_name = 'ru-central1',
    endpoint_url = 'https://storage.yandexcloud.net',
)

In [None]:
# Pass the settings as keyword arguments rather than interpolating them into a
# DSN string: a user name or password containing whitespace, quotes or
# backslashes would otherwise corrupt the DSN (values in a libpq connection
# string have to be quoted/escaped, keyword arguments do not).
# NOTE(review): sslmode='disable' means the password and the data travel
# unencrypted - confirm this is intended for this host.
conn = psycopg2.connect(
    host = 'c-c9qm2f2d6d1lkiqst4qn.rw.mdb.yandexcloud.net',
    port = 6432,
    sslmode = 'disable',
    dbname = 'anime-rec-sys-db',
    user = creds['user'],
    password = creds['password'],
    target_session_attrs = 'read-write',
)

In [None]:
def test_user_ids(s3):
    
    get_object_response = s3.get_object(Bucket = 'anime-rec-sys-data', Key = 'user_id_test.npy')
    user_id_test = np.load(io.BytesIO(get_object_response['Body'].read()))
    
    return user_id_test

In [None]:
def get_item_data(s3):
    """Download the three item-level CSV tables from the data bucket.

    Parameters
    ----------
    s3 : client object exposing ``get_object(Bucket=..., Key=...)`` whose
        result has a readable ``'Body'`` entry (e.g. a boto3 S3 client).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(anime_selected, anime_synopsis_emb, lightfm_emb_df)``, read from
        ``anime_selected.csv``, ``anime_synopsis_emb_2.csv`` and
        ``lightfm_emb_df_50e.csv`` respectively, fetched in that order.
    """
    bucket = 'anime-rec-sys-data'
    object_keys = ('anime_selected.csv', 'anime_synopsis_emb_2.csv', 'lightfm_emb_df_50e.csv')

    frames = []
    for object_key in object_keys:
        body = s3.get_object(Bucket = bucket, Key = object_key)['Body']
        frames.append(pd.read_csv(io.BytesIO(body.read())))

    anime_selected, anime_synopsis_emb, lightfm_emb_df = frames
    return anime_selected, anime_synopsis_emb, lightfm_emb_df

In [None]:
def get_user_data(connection):
    """Read the ratings and user-preference tables, then close the connection.

    Parameters
    ----------
    connection : DB-API connection accepted by ``pandas.read_sql_query``.
        It is closed before this function returns, whether the reads succeed
        or raise, so the caller must not reuse it afterwards.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(rating_selected, users)``. ``rating_selected`` holds the columns
        ``user_id``, ``anime_id`` and ``rating`` of ``selectedratings``;
        ``users`` holds ``userpreferences`` with its columns relabelled
        (user id, one column per genre, mean rating).

    Raises
    ------
    ValueError
        If ``userpreferences`` does not have exactly as many columns as the
        label list below (raised by pandas on the relabel).
    """
    # The labels are applied by position.
    # NOTE(review): assumes the table's column order matches this list - confirm.
    user_columns = ['user_id', 'Action', 'Adventure', 'Cars', 'Comedy', 'Dementia',
                    'Demons', 'Drama', 'Ecchi', 'Fantasy', 'Game', 'Harem', 'Hentai',
                    'Historical', 'Horror', 'Josei', 'Kids', 'Magic', 'Martial Arts',
                    'Mecha', 'Military', 'Music', 'Mystery', 'Parody', 'Police',
                    'Psychological', 'Romance', 'Samurai', 'School', 'Sci-Fi', 'Seinen',
                    'Shoujo', 'Shoujo Ai', 'Shounen', 'Shounen Ai', 'Slice of Life',
                    'Space', 'Sports', 'Super Power', 'Supernatural', 'Thriller', 'Unknown',
                    'Vampire', 'Yaoi', 'Yuri', 'mean_rating']

    try:
        ratings_raw = pd.read_sql_query("SELECT * FROM selectedratings;", connection)
        rating_selected = pd.DataFrame(ratings_raw, columns = ['user_id', 'anime_id', 'rating'])

        users = pd.read_sql_query("SELECT * FROM userpreferences;", connection)
        users.columns = user_columns
    finally:
        # Previously the close was skipped when a query failed, leaking the
        # connection; release it on every path.
        connection.close()

    return rating_selected, users

In [None]:
def create_lightgbm_dataset(rating_selected, users, lightfm_emb_df, anime_synopsis_emb):
    """Attach user and item features to every rating row.

    Each step is a left join, so every row of ``rating_selected`` is kept, in
    its original order; a rating whose user or anime has no match gets NaN in
    the corresponding feature columns. The inputs are not modified.

    Parameters
    ----------
    rating_selected : pandas.DataFrame with ``user_id`` and ``anime_id`` columns.
    users : pandas.DataFrame keyed by ``user_id``.
    lightfm_emb_df : pandas.DataFrame keyed by ``anime_id``.
    anime_synopsis_emb : pandas.DataFrame keyed by ``anime_id``.

    Returns
    -------
    pandas.DataFrame
        The rating columns followed by the columns of ``users``,
        ``lightfm_emb_df`` and ``anime_synopsis_emb`` (join keys not repeated).
    """
    return (
        rating_selected
        .merge(users, on = 'user_id', how = 'left')
        .merge(lightfm_emb_df, on = 'anime_id', how = 'left')
        .merge(anime_synopsis_emb, on = 'anime_id', how = 'left')
    )

In [None]:
anime_selected, anime_synopsis_emb, lightfm_emb_df = get_item_data(s3)

In [None]:
rating_selected, users = get_user_data(conn)

In [None]:
lgb_dataset = create_lightgbm_dataset(rating_selected, users, lightfm_emb_df, anime_synopsis_emb)

In [None]:
user_id_test = test_user_ids(s3)

In [None]:
user_id_train = lgb_dataset['user_id'].unique()[~np.isin(lgb_dataset['user_id'].unique(), user_id_test)]

In [None]:
user_id_train, user_id_val = train_test_split(user_id_train, test_size = 0.15, random_state = 586) 

In [None]:
print(f'Num train {len(user_id_train)}')
print(f'Num val {len(user_id_val)}')
print(f'Num test {len(user_id_test)}')

In [None]:
train = lgb_dataset[lgb_dataset['user_id'].isin(user_id_train)]
val = lgb_dataset[lgb_dataset['user_id'].isin(user_id_val)]
test = lgb_dataset[lgb_dataset['user_id'].isin(user_id_test)]

In [None]:
lgb_train = lgb.Dataset(train.iloc[:, 3:], train['rating'])
lgb_val = lgb.Dataset(val.iloc[:, 3:], val['rating'])

In [None]:
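# Disabled: LightGBM hyper-parameters used by the (also disabled) training
# cell further down. Kept for reference; the model is loaded from a pickle instead.
# params = {'objective': 'regression',
#           'max_depth': 8,
#           'n_estimators': 2000,
#           'num_leaves': 2**8-1,
#           'learning_rate': 0.01,
#           'colsample_bytree': 0.8,
#           'subsample': 0.8,
#           'early_stopping_rounds': 20,
#           'random_state': 42,
#           'n_jobs': 8
#          }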

In [None]:
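# Disabled: saves the trained model to the pickle file that is loaded further down.
# NOTE: this cell sits above the (also disabled) training cell that defines
# `lgbm_regressor`; when re-enabling both, run the training cell first.
# with open('/home/dmitry-ds/rec-sys/Anime-recommender-engine/app/models/LightGBM-v5.pickle', 'wb') as file:
#     pickle.dump(lgbm_regressor, file)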

In [None]:
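# Disabled: trains the regressor on `lgb_train`, validating on `lgb_val`.
# NOTE(review): `verbose_eval` is no longer accepted by `lgb.train` in
# LightGBM >= 4.0 (use `callbacks=[lgb.log_evaluation(10)]`) - check the
# installed version before re-enabling.
# lgbm_regressor = lgb.train(params,
#                            lgb_train,
#                            valid_sets = lgb_val,
#                            verbose_eval = 10
#                           )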

In [None]:
with open('/home/dmitry-ds/rec-sys/Anime-recommender-engine/app/models/LightGBM-v5.pickle', 'rb') as file:
    lgbm_regressor = pickle.load(file)

In [None]:
test_preds = lgbm_regressor.predict(test.iloc[:, 3:])

In [None]:
median_absolute_error(test['rating'], test_preds)

In [None]:
mlflow.start_run()

In [None]:
mlflow.lightgbm.log_model(lgbm_regressor, 'LightGBM-regressor')

In [None]:
mae = median_absolute_error(test['rating'], test_preds)

In [None]:
mlflow.log_metric('test MAE', mae)

In [None]:
bootstrap_iterations = 100

In [None]:
prod_scores = pd.DataFrame(data = {'test MAE': 0.0}, index=range(bootstrap_iterations))

In [None]:
# Build the (actual, predicted) table without writing into `test`.
# `test` is a filtered slice of `lgb_dataset`: assigning a column to it raises
# pandas' SettingWithCopyWarning (hidden by the global warnings filter), and
# the extra column would be picked up as a feature by `test.iloc[:, 3:]` if
# the prediction cell were run again.
prod_model_results_df = test[['rating']].assign(predicted_rating = test_preds)

In [None]:
# Bootstrap the test metric: resample the (actual, predicted) pairs with
# replacement and recompute the median absolute error each time.
# Each resample is seeded with its iteration number so the logged bootstrap
# mean/std are reproducible from run to run.
for i in range(bootstrap_iterations):
    prod_sample = prod_model_results_df.sample(frac = 1.0, replace = True, random_state = i)
    prod_scores.loc[i, 'test MAE'] = median_absolute_error(prod_sample['rating'], prod_sample['predicted_rating'])

In [None]:
prod_score_mean = prod_scores['test MAE'].mean()
prod_score_std = prod_scores['test MAE'].std()

In [None]:
mlflow.log_metric('boostrap MAE', prod_score_mean)
mlflow.log_metric('boostrap std', prod_score_std)

In [None]:
# Attach the user tables to the run as CSV artifacts.
# The files are written to a temporary directory instead of the working
# directory: nothing in the working directory gets overwritten, and the copies
# are removed even when the upload raises (the previous separate os.remove
# cell was skipped in that case). The artifact names are unchanged because
# MLflow stores the file under its base name.
with tempfile.TemporaryDirectory() as artifact_dir:
    for frame, file_name in ((users, 'users.csv'), (rating_selected, 'rating_selected.csv')):
        artifact_path = os.path.join(artifact_dir, file_name)
        frame.to_csv(artifact_path, index = False)
        mlflow.log_artifact(artifact_path)

In [None]:
mlflow.end_run()

In [None]:
mlflow_run = mlflow.search_runs().iloc[0]

In [None]:
model_name = 'LightGBM-ratings_predictor'

In [None]:
new_model_version = mlflow.register_model(f'runs:/{mlflow_run.run_id}/LightGBM-regressor', model_name)

In [None]:
# Move the version that was just registered into the "Production" stage.
client = mlflow.tracking.MlflowClient()
transition_args = {
    'name': model_name,
    'version': new_model_version.version,
    'stage': "Production",
}
client.transition_model_version_stage(**transition_args)

In [None]:
def get_prod_model(model_name):
    prod_model = mlflow.lightgbm.load_model(f'models:/{model_name}/Production')
    return prod_model

In [None]:
prod_model = get_prod_model(model_name)