# Algorithms

Study and comparison of different techniques and clustering algorithms.

In [1]:
# Libraries
import pandas as pd
import numpy as np

# for KNN:
from scipy.sparse import csr_matrix 
from sklearn.neighbors import NearestNeighbors

# for K approximations:
from yellowbrick.cluster import KElbowVisualizer
from sklearn.metrics import davies_bouldin_score
from sklearn.cluster import KMeans

# visualization imports
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('ggplot')

## KNN

In [3]:
# Read datasets
df_movies = pd.read_json("Data/datasets/final/movies_reduced_50.json", orient="records")
df_ratings = pd.read_json("Data/datasets/final/ratings_reduced_50.json", orient="records")

### Rudimentary KNN

In [4]:
knn_movies_basic = NearestNeighbors(
    algorithm="brute",              # by default with many dimensions.
    metric='minkowski',             # default: minkowski.
    n_neighbors=15,                 # Random number for this basic model.
    n_jobs=-1                       # Uses all processors.
)

In [5]:
movie_to_idx = {
    movie: i for i, movie in 
    enumerate(list(df_movies.set_index('movieId').loc[df_movies.index].title))
}
# transform matrix to scipy sparse matrix
df_movies_sparse = csr_matrix(df_movies.values)

knn_movies_basic.fit(df_movies_sparse) # No va, no sé qué hay que cambiar.

KeyError: "Passing list-likes to .loc or [] with any missing labels is no longer supported. The following labels were missing: Int64Index([    0,     2,     4,     8,    10,\n            ...\n            20299, 20300, 20301, 20302, 20303],\n           dtype='int64', length=17693). See https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike"

### KNN with K approximation

Different methods that the literature suggest will be used.

#### Elbow Method 

A more rudimentary method, [explanation]...

In [None]:
model = KMeans()
# k is range of number of clusters.
visualizer = KElbowVisualizer(model, k=(2,30), timings= True)
visualizer.fit(df_ratings)        # Fit data to visualizer
visualizer.show()        # Finalize and render the figure

#### Silhouette Coefficient method

[explanation]

In [None]:
from yellowbrick.cluster import KElbowVisualizer
from sklearn.cluster import KMeans
model = KMeans()
# k is range of number of clusters.
visualizer = KElbowVisualizer(model, k=(2,30),metric='silhouette', timings= True)
visualizer.fit(df_ratings)        # Fit the data to the visualizer
visualizer.show()        # Finalize and render the figure

#### DB Index

[explanation]

In [None]:
def get_kmeans_score(data, center):
    '''
    returns the kmeans score regarding Davies Bouldin for points to centers
    INPUT:
        data - the dataset you want to fit kmeans to
        center - the number of centers you want (the k value)
    OUTPUT:
        score - the Davies Bouldin score for the kmeans model fit to the data
    '''
    #instantiate kmeans
    kmeans = KMeans(n_clusters=center)
    # Then fit the model to your data using the fit method
    model = kmeans.fit_predict(df_ratings)
    
    # Calculate Davies Bouldin score
    score = davies_bouldin_score(df_ratings, model)

    return score
    
scores = []
centers = list(range(2,30))
for center in centers:
    scores.append(get_kmeans_score(df_ratings, center))
    
plt.plot(centers, scores, linestyle='--', marker='o', color='b');
plt.xlabel('K');
plt.ylabel('Davies Bouldin score');
plt.title('Davies Bouldin score vs. K');