# Algorithms

Study and comparison of different techniques and clustering algorithms.

In [21]:
# Libraries
import pandas as pd
import numpy as np
import json

# Preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder


# for KNN:
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors



# for K approximations:
from yellowbrick.cluster import KElbowVisualizer
from sklearn.metrics import davies_bouldin_score
from sklearn.cluster import KMeans

# visualization imports
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('ggplot')

## Encoding
Some data processing has to be done since there are nominal attributes that have to be encoded for the algorithms to work. One-hot encoding is typically used for nominal attributes, but the computational power required to use it across tens of thousands of films is too big. Thus, we've opted for label encoding, which would otherwise be more suitable for ordinal string attributes.

In [103]:
# Load the reduced movie and rating datasets; most of this notebook works on them.
df_movies = pd.read_json(
    "Data/datasets/final/movies_reduced_25.json",
    orient="records",
)
df_ratings = pd.read_json(
    "Data/datasets/final/ratings_reduced_25.json",
    orient="records",
)

In [104]:
# Maps each encoded column name to the encoder fitted on it,
# so that encoded values can be decoded later on.
encoder_dict = dict()

### Title

In [105]:
# Label-encode the titles. The fitted encoder stays available as 'le'
# (and in encoder_dict) so the integer codes can be decoded later.
le = LabelEncoder()
title_codes = le.fit_transform(df_movies['title'])
df_movies['title'] = title_codes

# Register the fitted encoder under its column name.
encoder_dict['title'] = le

# Preview the first two rows after encoding.
df_movies.head(2)

Unnamed: 0,movieId,title,genres,year,titleType,director,writer
0,148420,16,"[action, adventure]",2014,movie,David Gidali,J. Greg Abbott
1,171609,8315,[comedy],2017,movie,Giacomo Gentilomo,Gaspare Cataldo


### Genres

As we can see below, there are 20 different genres. With such a small number of categories we can apply the one-hot encoding method.

In [106]:
df_movies['genres'].explode().unique()

array(['action', 'adventure', 'comedy', 'fantasy', 'horror', 'drama',
       'romance', '(no genres listed)', 'children', 'film-noir', 'crime',
       'thriller', 'western', 'mystery', 'documentary', 'war', 'musical',
       'sci-fi', 'animation', 'imax'], dtype=object)

In [107]:
# One-hot encoder for the genre labels; categories unseen during fitting are ignored.
enc = OneHotEncoder(handle_unknown='ignore')

# The encoder is fitted on a 2D column, so the 1D array of distinct genres
# is reshaped to (n_genres, 1).
arr = np.array(df_movies['genres'].explode().unique()).reshape(-1, 1)

# fit() returns the encoder itself, so it can be registered in the same step.
encoder_dict['genres'] = enc.fit(arr)

In [108]:
# Worked example: encode two genres (one per row) and show the dense result.
# The next cell decodes 'tr' back into labels.
sample_genres = [['action'], ['adventure']]
tr = enc.transform(sample_genres).toarray()
tr

array([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.]])

In [109]:
enc.inverse_transform(tr)

array([['action'],
       ['adventure']], dtype=object)

In [110]:
# Applying one-hot encoding to each row in genres.

# 1st, define function to apply:
def one_hot_encode(x, enc):
    """
    Apply an already-fitted one-hot encoder to a collection of values.

    Parameters
    ----------
    x : 1D list of values to encode.
    enc : already fitted one-hot encoder.

    Returns
    -------
    2D array holding one encoded row per value in x.
    """
    # The encoder works on a 2D column, so reshape x to (n_values, 1) first.
    column = np.array(x).reshape(-1, 1)
    encoded = enc.transform(column)

    # Turn the encoder output into a plain dense array.
    return encoded.toarray()

# Now that we're at it, a decoding function should be defined as well.
def one_hot_decode(x, enc):
    """
    Map one-hot encoded rows back to their original values.

    Parameters
    ----------
    x : 2D array of previously encoded rows.
    enc : already fitted one-hot encoder.

    Returns
    -------
    1D array containing the decoded values.
    """
    # inverse_transform yields one row per encoded row; flatten it to 1D.
    decoded = enc.inverse_transform(x)
    return np.array(decoded).flatten()


In [111]:
# Encode every row's genre list with the genres encoder.
# The encoder is looked up by column name rather than through the global `enc`,
# because `enc` is rebound to a different encoder further down in this notebook
# and cells may be executed out of order.
df_movies['genres'] = df_movies['genres'].apply(one_hot_encode, enc=encoder_dict['genres'])

### Movie type

Only 4 movie types exist, so one-hot encoding will be used.

In [115]:
# Number of distinct movie types (a missing value, if present, counts as one).
df_movies['titleType'].explode().nunique(dropna=False)

4

In [116]:
# One-hot encoder for the movie types; categories unseen during fitting are ignored.
enc = OneHotEncoder(handle_unknown='ignore')

# The encoder is fitted on a 2D column, so the 1D array of distinct types
# is reshaped to (n_types, 1).
arr = np.array(df_movies['titleType'].explode().unique()).reshape(-1, 1)

# fit() returns the encoder itself, so it can be registered in the same step.
encoder_dict['titleType'] = enc.fit(arr)

In [117]:
# Encode every row's movie type with the titleType encoder.
# The encoder is looked up by column name rather than through the shared global
# `enc`, which is also used for the genres encoder, so this cell no longer
# depends on which encoder cell happened to run last.
df_movies['titleType'] = df_movies['titleType'].apply(one_hot_encode, enc=encoder_dict['titleType'])

### Directors

There are ~7k directors, so label encoding seems more appropriate.

In [112]:
# Number of distinct directors (a missing value, if present, counts as one).
df_movies['director'].explode().nunique(dropna=False)

6881

In [113]:
# Label-encode the directors with a fresh encoder.
le = LabelEncoder()
director_codes = le.fit_transform(df_movies['director'])
df_movies['director'] = director_codes

# Register the fitted encoder under its column name.
encoder_dict['director'] = le

# Preview the first two rows after encoding.
df_movies.head(2)

Unnamed: 0,movieId,title,genres,year,titleType,director,writer
0,148420,16,"[[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",2014,movie,1360,J. Greg Abbott
1,171609,8315,"[[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0,...",2017,movie,2174,Gaspare Cataldo


### Writers

There are ~8k writers, so label encoding will be used.

In [None]:
# Number of distinct writers (a missing value, if present, counts as one).
df_movies['writer'].explode().nunique(dropna=False)

8130

In [114]:
# Label-encode the writers with a fresh encoder.
le = LabelEncoder()
writer_codes = le.fit_transform(df_movies['writer'])
df_movies['writer'] = writer_codes

# Register the fitted encoder under its column name.
encoder_dict['writer'] = le

# Preview the first two rows after encoding.
df_movies.head(2)

Unnamed: 0,movieId,title,genres,year,titleType,director,writer
0,148420,16,"[[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",2014,movie,1360,3192
1,171609,8315,"[[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0,...",2017,movie,2174,2533


In [124]:
# Checking that all encoders have been added to the dict
encoder_dict

{'title': LabelEncoder(),
 'genres': OneHotEncoder(handle_unknown='ignore'),
 'director': LabelEncoder(),
 'writer': LabelEncoder(),
 'titleType': OneHotEncoder(handle_unknown='ignore')}

Once all the data has been properly encoded, different algorithms will be tried.

## KNN

In [9]:
# Read datasets
# NOTE(review): these are the same files loaded at the start of the Encoding
# section, so from here on `df_movies` holds the raw, un-encoded data again.
df_movies = pd.read_json(
    "Data/datasets/final/movies_reduced_25.json",
    orient="records",
)
df_ratings = pd.read_json(
    "Data/datasets/final/ratings_reduced_25.json",
    orient="records",
)

### Rudimentary KNN

In [125]:
# Baseline nearest-neighbours model; it is only configured here, not yet fitted.
knn_movies_basic = NearestNeighbors(
    n_neighbors=15,                 # arbitrary value chosen for this basic model.
    metric='minkowski',             # the library default metric.
    algorithm="brute",              # brute-force neighbour search.
    n_jobs=-1                       # run on all available processors.
)

In [127]:
# transform matrix to scipy sparse matrix
#df_movies_sparse = csr_matrix(df_movies.values)

# Fit the basic neighbours model on the movies frame.
# FIXME: the recorded run of this cell ends in the ValueError shown below.
# NOTE(review): presumably the frame has to be reduced to a purely numeric 2D
# matrix first (no string columns, no array-valued cells) — confirm.
knn_movies_basic.fit(df_movies) # Does not work; not sure what needs to change.

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

### KNN with K approximation

Different methods suggested in the literature will be used.

#### Elbow Method 

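A more rudimentary method: K-Means is fitted for a range of K values and a distortion score (the sum of squared distances from each point to its assigned centre) is plotted against K. The "elbow" of the curve, where adding more clusters stops reducing the score sharply, suggests a suitable K.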

In [7]:
# Elbow analysis: fit KMeans over a range of cluster counts and plot the score.
# FIXME: the recorded run of this cell ends in the ValueError shown below
# ("could not convert string to float").
# NOTE(review): presumably `df_movies` still holds raw string columns here and
# a purely numeric feature matrix is needed instead — confirm.
model = KMeans()
visualizer = KElbowVisualizer(model, k=(2, 30), timings=True)  # k: range of cluster counts to try.

visualizer.fit(df_movies)   # fit the data to the visualizer.
visualizer.show()           # finalize and render the figure.

ValueError: could not convert string to float: '10.0 Earthquake'

#### Silhouette Coefficient method

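The silhouette coefficient compares, for each sample, its mean distance to the other points in its own cluster with its mean distance to the points of the nearest neighbouring cluster. It ranges from -1 to 1 and higher is better, so the K with the highest mean silhouette score is preferred.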

In [None]:
# KMeans and KElbowVisualizer are already imported in the imports cell at the
# top of the notebook, so they are not imported again here.
model = KMeans()
visualizer = KElbowVisualizer(
    model,
    k=(2,30),                       # k is range of number of clusters.
    metric='silhouette',            # score each k with the silhouette metric.
    timings= True
    )

visualizer.fit(df_ratings)          # Fit the data to the visualizer
visualizer.show()                   # Finalize and render the figure

#### DB Index

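The Davies-Bouldin index is the average similarity between each cluster and the cluster most similar to it, where similarity is the ratio of within-cluster distances to between-cluster distances. Lower values mean better-separated clusters, with 0 being the minimum, so the K with the lowest score is preferred.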

In [None]:
def get_kmeans_score(data, center, random_state=None):
    '''
    returns the kmeans score regarding Davies Bouldin for points to centers
    INPUT:
        data - the dataset you want to fit kmeans to
        center - the number of centers you want (the k value)
        random_state - optional seed forwarded to KMeans so that runs can be
                       reproduced (the default, None, leaves KMeans unseeded)
    OUTPUT:
        score - the Davies Bouldin score for the kmeans model fit to the data
    '''
    # Instantiate KMeans with the requested number of clusters.
    kmeans = KMeans(n_clusters=center, random_state=random_state)

    # Fit on the data that was passed in (not on a notebook-level global) and
    # obtain the cluster label assigned to every sample.
    labels = kmeans.fit_predict(data)

    # The score must be computed on the same data the labels refer to.
    score = davies_bouldin_score(data, labels)

    return score
    
# Davies-Bouldin score of KMeans on the ratings for every K from 2 to 29.
centers = list(range(2,30))
scores = [get_kmeans_score(df_ratings, k) for k in centers]

# Plot the score against K; the trailing semicolons suppress the text output.
plt.plot(centers, scores, color='b', marker='o', linestyle='--');
plt.title('Davies Bouldin score vs. K');
plt.xlabel('K');
plt.ylabel('Davies Bouldin score');