# Modelos de Machine Learning

- Carregamento dos dados de treino e teste
- Criação das matrizes de treino e teste
- Métodos de recomendação
    - Função de apoio
    - Most-Popular
    - Best-Rated
    - PureSVD

In [122]:
import operator
from collections import OrderedDict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
from scipy.sparse import csr_matrix, linalg
from sklearn.model_selection import train_test_split

## Carregamento dos dados de treino e teste

In [123]:
def read_train_and_test_df(file_name, test_size = 0.3):
    r"""Load a CSV file and split its rows into training and test data frames.

    The split is reproducible because ``train_test_split`` is always called
    with ``random_state=1``.

    Args:
        file_name (string): Path of the CSV file to load.
        test_size (float, optional): Value forwarded to ``train_test_split``
            as ``test_size``. Defaults to 0.3.

    Returns:
        tuple: ``(df_train, df_test)`` data frames.
    """
    full_frame = pd.read_csv(file_name)

    train_part, test_part = train_test_split(
        full_frame,
        test_size=test_size,
        random_state=1,
    )

    return (train_part, test_part)

In [124]:
# Load the terminal/application score log and split it into the train and test
# data frames (read_train_and_test_df defaults to test_size=0.3).
df_train, df_test = read_train_and_test_df('data/applications-by-terminals.csv')

## Criação das matrizes de treino e teste

In [125]:
def create_scores_matrix(df, shape=None):
    r"""Build a compressed sparse row (CSR) matrix of scores.

    Rows are indexed by the ``Terminal`` column, columns by the
    ``ApplicationId`` column and the stored values come from the ``Score``
    column. Both id columns are used directly as zero-based integer indices.

    Args:
        df (DataFrame): Data frame with ``Terminal``, ``ApplicationId`` and
            ``Score`` columns.
        shape (tuple, optional): Explicit ``(rows, columns)`` of the result,
            e.g. the shape of another matrix this one must be aligned with.
            Defaults to None, meaning
            ``(max Terminal + 1, max ApplicationId + 1)``.

    Returns:
        csr_matrix: Sparse matrix of scores.

    Raises:
        ValueError: If ``df`` has no rows and ``shape`` is not given, since
            the dimensions cannot be derived from the data.
    """
    # Plain arrays: the data frame index is irrelevant for the matrix.
    terminals = df['Terminal'].to_numpy()
    applications = df['ApplicationId'].to_numpy()
    scores = df['Score'].to_numpy()

    if shape is None:
        if len(terminals) == 0:
            raise ValueError(
                "cannot infer the matrix shape from an empty data frame; "
                "pass shape explicitly"
            )
        # Vectorised max instead of the builtin max(), which would iterate
        # over the column element by element in Python.
        shape = (int(terminals.max()) + 1, int(applications.max()) + 1)

    # NOTE: scipy sums the values of repeated (row, column) pairs.
    scores_matrix = csr_matrix((scores, (terminals, applications)), shape=shape)

    return scores_matrix

### Matriz de treino

In [126]:
# Build the sparse (terminals x applications) score matrix from the training split.
scores_matrix_train = create_scores_matrix(df_train)

### Matriz de teste

In [127]:
# Build the sparse (terminals x applications) score matrix from the test split.
# NOTE(review): the shape is derived from the largest ids present in df_test, so
# it is not guaranteed to match the shape of scores_matrix_train.
scores_matrix_test = create_scores_matrix(df_test)

## Métodos de recomendação

### Função de apoio

In [128]:
def save_recommendation(file_name, recommendation, terminals_targets):
    r"""Write the recommendations of the given terminals to a text file.

    One line is written per terminal in ``terminals_targets``, in the form
    ``<terminal>\t[<app>:0.0,<app>:0.0,...]``. Every application is written
    with the constant score ``0.0``. A terminal with no recommended
    applications is written as ``<terminal>\t[]``.

    Args:
        file_name (string): Recommendation file path.
        recommendation (dict): Dictionary mapping each terminal to its list of
            recommended applications.
        terminals_targets (array): Terminals whose recommendations are saved.

    Raises:
        KeyError: If a terminal in ``terminals_targets`` is not a key of
            ``recommendation``.
    """
    # The context manager closes the file even if a terminal is missing.
    with open(file_name, 'w') as file_out:
        for terminal in terminals_targets:
            # Joining avoids the trailing comma, so an empty list keeps its
            # opening bracket instead of having it sliced away.
            issued_applications = ",".join(
                str(application_id) + ":" + str(0.0)
                for application_id in recommendation[terminal]
            )
            file_out.write(str(terminal) + "\t[" + issued_applications + "]\n")

### _Most-Popular_

In [129]:
def predict_using_most_popular(scores_matrix):
    r"""Rank the applications by popularity.

    The popularity of an application is the number of non-zero scores in its
    column of ``scores_matrix``.

    Args:
        scores_matrix (array): Sparse matrix of score values
            (terminals x applications).

    Returns:
        list: Application indices sorted by descending popularity. Applications
        with the same popularity keep ascending index order.
    """
    # Column index of every non-zero score, counted in a single pass instead
    # of slicing the sparse matrix once per column.
    _, application_ids = scores_matrix.nonzero()
    items_popularity = np.bincount(application_ids, minlength=scores_matrix.shape[1])

    # A stable sort on the negated counts gives descending popularity while
    # leaving ties in ascending index order.
    most_popular = np.argsort(-items_popularity, kind='stable').tolist()

    return most_popular

def recommend_using_most_popular(scores_matrix, most_popular, top = 10):
    r"""Recommend the most popular applications each terminal has not scored.

    For every row (terminal) of ``scores_matrix`` the applications are taken
    in the order given by ``most_popular``. Those with a zero score in that
    row are kept until ``top`` applications have been collected.

    Args:
        scores_matrix (array): Matrix of score values (terminals x applications).
        most_popular (array): Application indices ordered from the most to the
            least popular.
        top (int, optional): Size of each recommendation. Defaults to 10.

    Returns:
        dict: Maps each row index of ``scores_matrix`` to its list of at most
        ``top`` recommended applications.
    """
    # Read the stored entries of each row directly rather than doing one
    # sparse element lookup per (terminal, application) pair.
    matrix = csr_matrix(scores_matrix)
    indptr, indices, data = matrix.indptr, matrix.indices, matrix.data

    recommendation = {}

    for u in range(matrix.shape[0]):
        start, end = indptr[u], indptr[u + 1]
        # Applications this terminal already has a non-zero score for.
        seen = set(indices[start:end][data[start:end] != 0].tolist())

        picks = []
        for i in most_popular:
            if len(picks) >= top:
                break
            if i not in seen:
                picks.append(i)
        recommendation[u] = picks

    return recommendation


### Gerando lista de recomendados

In [130]:
# Most-Popular recommendation: rank the applications by popularity on the
# training matrix, then build the per-terminal recommendation lists.
most_popular = predict_using_most_popular(scores_matrix_train)
recommendation = recommend_using_most_popular(scores_matrix_train, most_popular)

# Inspect the recommendations produced for terminal index 10.
recommendation[10]

[144, 160, 164, 150, 161, 146, 149, 154, 152, 162]

### Salvando recomendação

In [131]:
# Save the Most-Popular recommendations of every terminal in the test split.
# NOTE(review): save_recommendation indexes `recommendation` by terminal id, so a
# test terminal outside the training matrix's row range would raise KeyError.
terminals_targets = df_test['Terminal'].unique() 
save_recommendation('data/recList_MostPopular.txt', recommendation, terminals_targets)

## _Best-Rated_

In [132]:
def predict_using_best_rated(scores_matrix):
    r"""Rank the applications by the mean of their score column.

    The mean of each column is taken with ``np.mean`` over the whole column
    slice, so for a sparse matrix the zero (unscored) entries count toward
    the mean as well.

    Args:
        scores_matrix (array): Array of score values (terminals x applications).

    Returns:
        list: Application indices sorted by descending mean score. Applications
        with equal means keep ascending index order.
    """
    n_applications = scores_matrix.shape[1]

    # Mean score of every application column, keyed by the column index.
    mean_by_application = {
        application: np.mean(scores_matrix[:, application])
        for application in range(n_applications)
    }

    # sorted() is stable, so applications with equal means stay in index order.
    ranking = sorted(
        mean_by_application,
        key=mean_by_application.get,
        reverse=True,
    )

    return ranking

def recommend_using_best_rated(scores_matrix, best_rated, top = 10):
    r"""Recommend the best rated applications each terminal has not scored.

    For every row (terminal) of ``scores_matrix`` the applications are taken
    in the order given by ``best_rated``. Those with a zero score in that row
    are kept until ``top`` applications have been collected.

    Args:
        scores_matrix (array): Matrix of score values (terminals x applications).
        best_rated (array): Application indices ordered from the best to the
            worst rated.
        top (int, optional): Size of each recommendation. Defaults to 10.

    Returns:
        dict: Maps each row index of ``scores_matrix`` to its list of at most
        ``top`` recommended applications.
    """
    # Read the stored entries of each row directly rather than doing one
    # sparse element lookup per (terminal, application) pair.
    matrix = csr_matrix(scores_matrix)
    row_starts, column_ids, values = matrix.indptr, matrix.indices, matrix.data

    recommendation = {}

    for terminal in range(matrix.shape[0]):
        first, last = row_starts[terminal], row_starts[terminal + 1]
        # Applications this terminal already has a non-zero score for.
        already_scored = set(column_ids[first:last][values[first:last] != 0].tolist())

        selected = []
        for application in best_rated:
            if len(selected) >= top:
                break
            if application not in already_scored:
                selected.append(application)
        recommendation[terminal] = selected

    return recommendation

### Gerando lista de recomendados

In [133]:
# Best-Rated recommendation: rank the applications by mean score on the
# training matrix, then build the per-terminal recommendation lists.
best_rated = predict_using_best_rated(scores_matrix_train)
recommendation = recommend_using_best_rated(scores_matrix_train, best_rated)

# Inspect the recommendations produced for terminal index 10.
recommendation[10]

[144, 160, 146, 153, 164, 161, 154, 150, 163, 162]

### Salvando recomendação

In [134]:
# Save the Best-Rated recommendations of every terminal in the test split.
terminals_targets = df_test['Terminal'].unique()
save_recommendation('data/recList_BestRated.txt', recommendation, terminals_targets)

## _PureSVD_

Dada a matriz de scores R (m x n), aplicamos o SVD truncado com f fatores para extrair três matrizes:

- U representa os fatores do alvo (terminal) (m x f)
- S contém os valores singulares associados a cada fator latente (f x f)
- Q representa os fatores dos itens (n x f); o SVD devolve a sua transposta Q^T (f x n)

A predição segue a fórmula dada por: 

$
\begin{align}
\widehat{r}_{ui} = r_u \cdot Q \cdot q_i^T
\end{align}
$    

onde r_u é a linha do terminal u em R e q_i é a linha do item i em Q.

In [135]:
def predict_using_pure_svd(scores_matrix, num_factors = 3):
    r"""Predict scores with the PureSVD model.

    A truncated SVD of the scores matrix *R (m x n)* yields three factors:
        - *U* the terminal factors *(m x f)*
        - *S* the singular values, one per latent factor *(f x f)*
        - *Q^T* the transposed application factors *(f x n)*

    Only *Q* is used. Each row of the scores matrix is projected onto the
    latent factors and back:
    $
    \begin{align}
    \widehat{r}_{ui} = r_u \cdot Q \cdot q_i^T
    \end{align}
    $
    where *r_u* is row *u* of the scores matrix and *q_i* is row *i* of *Q*.

    Args:
        scores_matrix (array): Sparse matrix of score values
            (terminals x applications).
        num_factors (int, optional): Number of latent factors computed by the
            SVD. Defaults to 3.

    Returns:
        [array]: Matrix of predicted scores, with the same shape as
        ``scores_matrix``.
    """
    # svds only accepts floating point input, so integer scores are promoted.
    if not np.issubdtype(scores_matrix.dtype, np.inexact):
        scores_matrix = scores_matrix.astype(np.float64)

    # Only the right singular vectors (application factors) are needed.
    _, _, Q_t = scipy.sparse.linalg.svds(scores_matrix, num_factors, return_singular_vectors=True)
    Q = Q_t.transpose()

    # R . Q gives the terminals in latent space; . Q^T maps them back to
    # one predicted score per application.
    prediction_matrix = scores_matrix.dot(Q).dot(Q_t)

    return prediction_matrix

def recommend_using_pure_svd(scores_matrix, prediction_matrix, top = 10):
    r"""Recommend the applications with the highest predicted scores.

    For every row (terminal) the applications are ordered by descending
    predicted score in ``prediction_matrix``. Those with a zero score in the
    same row of ``scores_matrix`` are kept until ``top`` applications have
    been collected.

    Args:
        scores_matrix (array): Matrix of score values (terminals x applications).
        prediction_matrix (array): Dense array of predicted scores with the
            same shape as ``scores_matrix``.
        top (int, optional): Size of each recommendation. Defaults to 10.

    Returns:
        dict: Maps each row index of ``scores_matrix`` to its list of at most
        ``top`` recommended application indices (plain ints).
    """
    # Read the stored entries of each row directly rather than doing one
    # sparse element lookup per (terminal, application) pair.
    matrix = csr_matrix(scores_matrix)
    indptr, indices, data = matrix.indptr, matrix.indices, matrix.data

    recommendation = {}

    for terminal in range(matrix.shape[0]):
        start, end = indptr[terminal], indptr[terminal + 1]
        # Applications this terminal already has a non-zero score for.
        seen = set(indices[start:end][data[start:end] != 0].tolist())

        # Ascending argsort, reversed: highest predicted score first.
        ordered_apps = np.argsort(prediction_matrix[terminal, :])[::-1].tolist()

        picks = []
        for app in ordered_apps:
            if len(picks) >= top:
                break
            if app not in seen:
                picks.append(app)
        recommendation[terminal] = picks

    return recommendation

### Gerando lista de recomendados

In [136]:
# PureSVD recommendation: predict scores from the training matrix (default
# num_factors=3), then build the per-terminal recommendation lists.
prediction_matrix = predict_using_pure_svd(scores_matrix_train)
recommendation = recommend_using_pure_svd(scores_matrix_train, prediction_matrix)

# Inspect the recommendations produced for terminal index 10.
recommendation[10]

[221, 69, 80, 79, 78, 77, 76, 75, 74, 73]

### Salvando recomendação

In [137]:
# Save the PureSVD recommendations of every terminal in the test split.
terminals_targets = df_test['Terminal'].unique()
save_recommendation('data/recList_PureSVD.txt', recommendation, terminals_targets)