In [1]:
import duckdb
import os
import numpy as np
import gensim
import pickle

import mlflow
import mlflow.sklearn
import pandas as pd

from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error, accuracy_score
from sklearn.model_selection import GridSearchCV




class MovieRatingPredictor:
    """Rating regressor built on Word2Vec text embeddings plus multi-hot genre ids.

    Typical flow: ``train_word2vec`` -> ``train_and_evaluate`` -> ``predict`` /
    ``save_to_mlflow``. The training steps add helper columns to the dataframe
    handed to the constructor, i.e. that dataframe is modified in place.
    """

    def __init__(self, df, word2vec_params=None, model_type_s="", n_jobs=1):
        """
        Initializes the MovieRatingPredictor with the dataframe and optional word2vec parameters.

        Args:
            df: The dataframe containing the text column, the genre id column and the target column.
            word2vec_params: A dictionary of parameters for the word2vec model (optional).
            model_type_s: 'decision_tree', 'random_forest', 'svr' or 'mlp'; any other
                value (including the default '') selects linear regression.
            n_jobs: Number of parallel jobs, forwarded to the estimators that accept it
                (linear regression and random forest).
        """
        self.df = df
        self.word2vec_params = word2vec_params or {'vector_size': 100, 'window': 5, 'min_count': 1, 'sg': 0}
        self.word2vec_model = None
        # Not used by any method of this class; kept so the attribute stays available.
        self.encoder = OneHotEncoder(handle_unknown='ignore')
        self.mlb = None  # Fitted MultiLabelBinarizer, set by prepare_features()
        self.model_type = model_type_s

        print(f"Model Type: {self.model_type}")

        if self.model_type == 'decision_tree':
            self.model = DecisionTreeRegressor()
        elif self.model_type == 'random_forest':
            self.model = RandomForestRegressor(n_jobs=n_jobs)
        elif self.model_type == 'svr':
            self.model = SVR(kernel='sigmoid', C=1.0, epsilon=0.1)
        elif self.model_type == 'mlp':
            self.model = MLPRegressor(
                hidden_layer_sizes=(256, 128, 64),
                activation='relu',
                solver='adam',
                batch_size=512,
                early_stopping=True
            )
        else:  # Default to Linear Regression
            self.model = LinearRegression(n_jobs=n_jobs)

    def preprocess_text(self, text):
        """
        Lower-cases the text and splits it on whitespace.

        Args:
            text: The text to be preprocessed.

        Returns:
            A list of tokens; an empty list when ``text`` is not a string
            (e.g. None or NaN coming from a dataframe).
        """
        if not isinstance(text, str):
            return []
        return text.lower().split()

    def train_word2vec(self, field_name):
        """
        Trains the word2vec model on the preprocessed text of ``field_name``.

        Adds a ``<field_name>_processed`` column (token lists) to ``self.df`` and
        logs the word2vec parameters in a nested MLflow run.

        Args:
            field_name: Name of the dataframe column holding the raw text.
        """
        with mlflow.start_run(nested=True, run_name="word2vec_training"):
            print("Training Word2Vec")
            self.df[f'{field_name}_processed'] = self.df[field_name].apply(self.preprocess_text)
            sentences = self.df[f'{field_name}_processed'].tolist()
            self.word2vec_model = gensim.models.Word2Vec(sentences, **self.word2vec_params)

            # Log Word2Vec parameters
            mlflow.log_params(self.word2vec_params)

    def get_embedding(self, text):
        """
        Generates the word2vec embedding for the given tokens.

        Args:
            text: An iterable of tokens (the output of ``preprocess_text``).

        Returns:
            The mean of the vectors of the known tokens, or a zero vector if no token is known.

        Raises:
            RuntimeError: If the word2vec model has not been trained yet.
        """
        if self.word2vec_model is None:
            raise RuntimeError("Word2Vec model is not trained; call train_word2vec() first.")

        keyed_vectors = self.word2vec_model.wv
        word_vectors = [keyed_vectors[w] for w in text if w in keyed_vectors]
        if word_vectors:
            return np.mean(word_vectors, axis=0)
        # Take the size from the trained vectors instead of word2vec_params, which
        # need not contain 'vector_size' when the caller supplied its own dict.
        return np.zeros(keyed_vectors.vector_size)

    def prepare_features(self, text_field_name, classification_field, genre_field='genre_ids'):
        """
        Prepares the features (word2vec embeddings and multi-hot encoded genre ids) for the model.

        Adds a ``<text_field_name>_embedding`` column to ``self.df`` and (re)fits
        ``self.mlb`` on the genre column.

        Args:
            text_field_name: Text column previously passed to ``train_word2vec``.
            classification_field: Name of the target column.
            genre_field: Name of the column holding the list of genre ids per row.

        Returns:
            A tuple ``(X, y)``: the stacked feature matrix and the target column.

        Raises:
            KeyError: If the ``<text_field_name>_processed`` column is missing.
        """

        print("Prepare features")
        processed_column = f'{text_field_name}_processed'
        if processed_column not in self.df.columns:
            raise KeyError(
                f"Column '{processed_column}' not found; call train_word2vec('{text_field_name}') first."
            )
        self.df[f'{text_field_name}_embedding'] = self.df[processed_column].apply(self.get_embedding)

        # Multi-hot encoding for the genre ids
        self.mlb = MultiLabelBinarizer()  # Initialize and store the MultiLabelBinarizer
        genre_ids_encoded = self.mlb.fit_transform(self.df[genre_field])

        X = np.hstack((self.df[f'{text_field_name}_embedding'].tolist(), genre_ids_encoded))
        y = self.df[classification_field]
        return X, y

    def train_and_evaluate(self, text_field, classification_field):
        """
        Trains the regression model and evaluates its performance.

        Uses an 80/20 train/test split with a fixed random_state, prints the
        metrics and logs parameters, metrics and the fitted model to a nested MLflow run.

        Args:
            text_field: Text column previously passed to ``train_word2vec``.
            classification_field: Name of the target column.

        Returns:
            A dict with the test-set metrics: 'mse', 'r2_score', 'mae' and 'mape'.
        """
        with mlflow.start_run(nested=True, run_name=f"train_and_evaluate_{self.model.__class__.__name__}"):
            print("Train and evaluate")
            X, y = self.prepare_features(text_field, classification_field)
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
            self.model.fit(X_train, y_train)
            y_pred = self.model.predict(X_test)

            metrics = {
                "mse": mean_squared_error(y_test, y_pred),
                "r2_score": r2_score(y_test, y_pred),
                "mae": mean_absolute_error(y_test, y_pred),
                "mape": mean_absolute_percentage_error(y_test, y_pred),
            }

            print(f'Mean Squared Error: {metrics["mse"]}')
            print(f'R-squared: {metrics["r2_score"]}')
            print(f'Mean Absolute Error: {metrics["mae"]}')
            print(f'Mean Absolute Percentage Error (MAPE): {metrics["mape"] * 100:.2f}%')

            # Log model parameters and metrics
            mlflow.log_params(self.model.get_params())
            for metric_name, metric_value in metrics.items():
                mlflow.log_metric(metric_name, metric_value)

            # Save the trained model. The name uses a compact timestamp because
            # spaces and colons are not safe in artifact paths / model names.
            now = datetime.now()
            mlflow.sklearn.log_model(self.model, f"model_{self.model_type}_{now.strftime('%Y%m%d_%H%M%S')}")
            mlflow.set_tag("timestamp", now.strftime("%Y-%m-%d %H:%M:%S"))

            return metrics

    def predict(self, text, genre_ids):
        """
        Predicts the rating for a single movie.

        Args:
            text: Raw overview text.
            genre_ids: Iterable of genre ids for the movie.

        Returns:
            The model's prediction for that single sample.

        Raises:
            RuntimeError: If the genre binarizer has not been fitted yet.
        """
        if self.mlb is None:
            raise RuntimeError("Genre binarizer is not fitted; call train_and_evaluate() first.")

        # Same feature layout as prepare_features(): embedding followed by the genre columns
        processed_text = self.preprocess_text(text)
        text_embedding = self.get_embedding(processed_text)
        genre_ids_encoded = self.mlb.transform([genre_ids])
        X = np.hstack((text_embedding.reshape(1, -1), genre_ids_encoded))

        # Make prediction
        return self.model.predict(X)[0]

    def _predict_from_model_input(self, model_input):
        """Route a pyfunc ``model_input`` to ``predict``.

        Accepts a ``(text, genre_ids)`` pair, or a DataFrame laid out like the
        logged input example ('overview' and 'genre_ids' columns, falling back to
        the first two columns); a DataFrame yields one prediction per row.
        """
        if isinstance(model_input, pd.DataFrame):
            wanted = ["overview", "genre_ids"]
            if all(column in model_input.columns for column in wanted):
                frame = model_input[wanted]
            else:
                frame = model_input.iloc[:, :2]
            return np.array([
                self.predict(text, genre_ids)
                for text, genre_ids in frame.itertuples(index=False, name=None)
            ])

        text, genre_ids = model_input
        return self.predict(text, genre_ids)

    def save_to_mlflow(self, model_name="movie_rating_predictor"):
        """
        Logs the word2vec model and registers the predictor as an MLflow pyfunc model.

        The registered model is a function that closes over this instance. The
        intermediate files are written to a temporary directory that is removed
        afterwards.

        Args:
            model_name: Name under which the pyfunc model is registered.

        Raises:
            RuntimeError: If the predictor has not been trained yet.
        """
        import tempfile  # local import: not part of the file-level imports

        if self.word2vec_model is None or self.mlb is None:
            raise RuntimeError("Predictor is not trained; call train_word2vec() and train_and_evaluate() first.")

        def predict_using_instance(model_input):
            return self._predict_from_model_input(model_input)

        # Provide an input example using DataFrame
        input_example = pd.DataFrame({
            "overview": ["A thrilling action movie with breathtaking stunts."],
            "genre_ids": [[28, 12]]
        })

        with tempfile.TemporaryDirectory() as work_dir:
            # Save MultiLabelBinarizer to a file
            mlb_path = os.path.join(work_dir, "mlb.pkl")
            with open(mlb_path, 'wb') as f:
                pickle.dump(self.mlb, f)

            # Log the Word2Vec model as artifacts. The whole directory is logged
            # because gensim may store large arrays in extra files next to the main one.
            word2vec_dir = os.path.join(work_dir, "word2vec")
            os.makedirs(word2vec_dir)
            with mlflow.start_run(nested=True, run_name="word2vec_model"):
                self.word2vec_model.save(os.path.join(word2vec_dir, "word2vec_model.bin"))
                mlflow.log_artifacts(word2vec_dir)

            artifacts = {
                'mlb': mlb_path,
            }

            # Log word2vec_params directly
            mlflow.log_params(self.word2vec_params)

            mlflow.pyfunc.log_model(
                artifact_path="model",
                python_model=predict_using_instance,
                artifacts=artifacts,
                registered_model_name=model_name,
                input_example=input_example
            )

In [2]:

def test():
    """Smoke-test the latest registered predictor against a few known movies.

    Loads ``models:/movie_rating_predictor/latest`` inside an MLflow run named
    "test_predictions", predicts each sample and prints the prediction together
    with its distance from the reference rating.
    """
    # Each entry: (overview, genre ids, label printed in parentheses, reference rating)
    samples = [
        (
            "A small team of scientists must race against time to stop what seems to be a cascade of global disasters signaling the possible apocalypse and end of days.",
            [878, 27],
            "60",
            6.00,
        ),
        (
            "Heather bumps into Carla, having not spoken to her in years, and presents her with a very unexpected proposition that could change both of their lives forever.",
            [878, 27],
            "58",
            5.80,
        ),
        (
            "In a futuristic dystopia with enforced beauty standards, a teen awaiting mandatory cosmetic surgery embarks on a journey to find her missing friend.",
            [878, 12],
            "60",
            6.00,
        ),
        (
            "A talented martial artist who can't walk past a person in need unites with a probation officer to fight and prevent crime as a martial arts officer.",
            [28, 35, 80],
            "79",
            7.90,
        ),
        # Same sample as the previous one; it is evaluated twice.
        (
            "A talented martial artist who can't walk past a person in need unites with a probation officer to fight and prevent crime as a martial arts officer.",
            [28, 35, 80],
            "79",
            7.90,
        ),
        (
            "A detective begins to investigate a series of mysterious murders that are connected to a demonic book that brings dolls to life. As the body count begins to rise, the detective soon learns the curse of the demonic Friday and must find a way to stop it before any others disappear.",
            [27],
            "32",
            3.20,
        ),
        (
            "When a group of ex-military members is hired to retrieve a lost bag of stolen money, their mission becomes more difficult after a lone hunter finds the bag first.",
            [28, 53, 10770],
            "45",
            4.50,
        ),
        (
            "A young teenager named Mikey Walsh finds an old treasure map in his father's attic. Hoping to save their homes from demolition, Mikey and his friends Data Wang, Chunk Cohen, and Mouth Devereaux run off on a big quest to find the secret stash of Pirate One-Eyed Willie.",
            [12, 35, 10751],
            "75",
            7.50,
        ),
        (
            "Imprisoned in the 1940s for the double murder of his wife and her lover, upstanding banker Andy Dufresne begins a new life at the Shawshank prison, where he puts his accounting skills to work for an amoral warden. During his long stretch in prison, Dufresne comes to be admired by the other inmates -- including an older prisoner named Red -- for his integrity and unquenchable sense of hope.",
            [18, 80],
            "87",
            8.70,
        ),
    ]

    with mlflow.start_run(run_name="test_predictions"):
        loaded_model = mlflow.pyfunc.load_model("models:/movie_rating_predictor/latest")

        for new_movie_overview, new_movie_genre_ids, label, reference in samples:
            # The model is called with an (overview, genre ids) pair
            predicted_rating = loaded_model.predict((new_movie_overview, new_movie_genre_ids))
            print(f"Predicted rating for the new movie ({label}): {predicted_rating}. Distance: {predicted_rating - reference}")

In [3]:
def train(df, model_type):
    """Train, evaluate and register a rating predictor of the given type.

    Any MLflow run that is still active is ended first; everything then happens
    inside a run named ``train_<model_type>``.

    Args:
        df: Dataframe with the 'overview' and 'vote_average' columns used below.
        model_type: Model selector passed to MovieRatingPredictor as model_type_s.
    """
    text_column = "overview"
    target_column = "vote_average"
    embedding_settings = dict(
        vector_size=100,
        window=10,
        min_count=5,
        sg=0,
        workers=8,
    )

    # Make sure no earlier run is still open before starting this one
    mlflow.end_run()

    with mlflow.start_run(run_name=f"train_{model_type}"):
        predictor = MovieRatingPredictor(
            df,
            word2vec_params=embedding_settings,
            model_type_s=model_type,
            n_jobs=10,
        )

        # 1) embeddings, 2) regression model, 3) registration in MLflow
        predictor.train_word2vec(text_column)
        predictor.train_and_evaluate(text_column, target_column)
        predictor.save_to_mlflow()

In [4]:
# Configuration is read from the environment so that credentials are not stored
# in the notebook.
#
# TMDB_TOKEN: TMDB API token. There is deliberately no default; the token that
#   used to be hard-coded here has been exposed in source and should be revoked.
# OUTPUT_MOVIES_FOLDER: glob pattern of the JSON files to load; defaults to the
#   path previously hard-coded here.
tmdb_token = os.getenv("TMDB_TOKEN")
output_movies_folder = os.getenv(
    "OUTPUT_MOVIES_FOLDER",
    "/mnt/projects/fiap-proj-int-03/app/get_movies/output_files/*.json",
)


def get_data():
    """Load the movie JSON files into a dataframe.

    Reads every file matching ``output_movies_folder`` with DuckDB and keeps the
    rows whose overview is neither null nor blank.

    Returns:
        A dataframe with the columns id, genre_ids, title, vote_average and overview.
    """
    # Double any single quote so the path cannot break out of the SQL string literal
    json_glob = str(output_movies_folder).replace("'", "''")

    conn = duckdb.connect(config = {'threads': 5})
    try:
        conn.execute(f"CREATE TABLE movies AS (SELECT * FROM read_json('{json_glob}'));")

        df = conn.execute("""
        SELECT 
            --*
            id
            , genre_ids
            , title
            , vote_average
            , overview
        FROM movies
        WHERE overview is not null 
        AND trim(overview) <> ''

    """).fetchdf()
    finally:
        # The dataframe is already materialized; release the connection
        conn.close()

    # Row count (df.count() would print one non-null count per column instead)
    print(f"number of lines: {len(df)}")

    return df

In [5]:
# Tracking server URI: taken from MLFLOW_TRACK_SERVER when set, otherwise the
# previously hard-coded address is used.
mlflow_track_server = os.getenv("MLFLOW_TRACK_SERVER", "http://user:pswd@10.96.132.192")
mlflow.set_tracking_uri(mlflow_track_server)
# NOTE(review): no run is active at this point, so set_tag() implicitly starts a
# run; train() calls mlflow.end_run() first, so the tag presumably ends up on
# that implicit run rather than on the training run. Confirm this is intended.
mlflow.set_tag("model_version", "1.0")


df = get_data()
# Other model types accepted by train(): 'decision_tree', 'random_forest', 'mlp'
train(df, 'svr')

number of lines: id              6616
genre_ids       6616
title           6616
vote_average    6616
overview        6616
dtype: int64


2024/09/28 17:57:55 INFO mlflow.tracking._tracking_service.client: 🏃 View run debonair-worm-340 at: http://user:pswd@10.96.132.192/#/experiments/0/runs/25732d9f206b4366948e03e4f996a5bc.
2024/09/28 17:57:55 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://user:pswd@10.96.132.192/#/experiments/0.


Model Type: svr
Training Word2Vec


2024/09/28 17:57:57 INFO mlflow.tracking._tracking_service.client: 🏃 View run word2vec_training at: http://user:pswd@10.96.132.192/#/experiments/0/runs/fb085894f2e8404a9bf3548c6e4ff3c4.
2024/09/28 17:57:57 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://user:pswd@10.96.132.192/#/experiments/0.


Train and evaluate
Prepare features
Mean Squared Error: 3.274420912367699
R-squared: 0.1587319881387559
Mean Absolute Error: 1.4242996702040533
Mean Absolute Percentage Error (MAPE): 37.55%


2024/09/28 17:58:05 INFO mlflow.tracking._tracking_service.client: 🏃 View run train_and_evaluate_SVR at: http://user:pswd@10.96.132.192/#/experiments/0/runs/4099a41c5a7b4f43858e40ab8a9ae219.
2024/09/28 17:58:05 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://user:pswd@10.96.132.192/#/experiments/0.
2024/09/28 17:58:07 INFO mlflow.tracking._tracking_service.client: 🏃 View run word2vec_model at: http://user:pswd@10.96.132.192/#/experiments/0/runs/b05ee44e515b4316a01b630622ac9141.
2024/09/28 17:58:07 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://user:pswd@10.96.132.192/#/experiments/0.
Registered model 'movie_rating_predictor' already exists. Creating a new version of this model...
2024/09/28 17:58:13 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: movie_rating_predictor, version 14
Created version '14' of model 'movie_rating_predictor'.
2024/09/28 17:58:1

In [6]:
#test()

In [7]:
import os

import mlflow

# Tracking server URI: taken from MLFLOW_TRACK_SERVER when set, otherwise the
# previously hard-coded address is used.
mlflow_track_server = os.getenv("MLFLOW_TRACK_SERVER", "http://user:pswd@10.96.132.192")
mlflow.set_tracking_uri(mlflow_track_server)

# Create an MLflow client (uses the tracking URI configured above)
client = mlflow.tracking.MlflowClient()

# List all registered models
registered_models = client.search_registered_models()

# Source URI of the first entry in latest_versions for each model; models
# without any version are skipped instead of raising IndexError
model_uris = [
    model.latest_versions[0].source
    for model in registered_models
    if model.latest_versions
]

print(model_uris)

['mlflow-artifacts:/0/87b9cf6b95f748c8ad65f1e9ae7eaf53/artifacts/model', 'models:/teste_model/1']
