In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import utils

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import NMF, TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import euclidean_distances, linear_kernel

import seaborn as sns
import utils
import warnings
warnings.filterwarnings('ignore')

ModuleNotFoundError: No module named 'utils'

In [2]:
# Load the cleaned movie table and discard the index column that was
# written out as 'Unnamed: 0' when the CSV was saved.
df = pd.read_csv('final_cleaned.csv').drop(columns=['Unnamed: 0'])
df.head(2)

FileNotFoundError: [Errno 2] No such file or directory: 'final_cleaned.csv'

In [None]:
# Dimensions of the loaded DataFrame as (rows, columns).
df.shape

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Primary genre = first entry of the comma-separated genre string.
y = df.genre.apply(lambda x: x.split(',')[0])
# regex=True is required explicitly: pandas >= 2.0 treats the pattern as a
# literal string by default, which would leave the whitespace in place.
genre = y.str.replace(r'\s+', '', regex=True)
df_dummy = pd.DataFrame()
df_dummy['genre'] = genre

# Name the index and the counts explicitly so the resulting columns are
# ('genre', 'count') regardless of how the installed pandas version labels
# the output of value_counts().
genre_df = (
    df_dummy.genre.value_counts()
    .rename_axis('genre')
    .reset_index(name='count')
)

plt.figure(figsize=(10,6))
plt.barh(genre_df['genre'], genre_df['count'])
plt.ylabel('Genre', fontsize=14)
plt.xlabel('Counts', fontsize=14)
plt.title('Number of Movies per Genre', fontsize=16);


In [None]:
# Run every overview through the project's text-cleaning helper.
df['cleaned_overview'] = df['about'].apply(utils.process_doc)
# Replace the raw genre string with the lower-cased primary genre.
df['genre'] = df_dummy['genre'].str.lower()

In [None]:
def combined_features(row):
    """Join a row's cleaned overview and genre into one space-separated string."""
    parts = (row['cleaned_overview'], row['genre'])
    return " ".join(parts)

# Build the text that gets vectorised: cleaned overview + genre, computed row by row.
df["combined_features"] = df.apply(combined_features, axis =1)
# Preview the first two combined strings.
df["combined_features"].head(2)

In [None]:
# Apply the TFIDF to the data (terms must appear in at least 10 documents)
TFIDF = TfidfVectorizer(min_df=10,lowercase=True)
tfidf = TFIDF.fit_transform(df["combined_features"])

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back only on releases that predate get_feature_names_out().
if hasattr(TFIDF, "get_feature_names_out"):
    feature_names = TFIDF.get_feature_names_out()
else:
    feature_names = TFIDF.get_feature_names()

# Dense document-term frame: one row per movie, one column per vocabulary term.
df1 = pd.DataFrame(tfidf.toarray(), columns=feature_names)
df1.head(2)

In [3]:
# Selecting the Number of Topics

To identify the rank (the number of components), we ideally want the smallest rank that minimizes the reconstruction error. However, fitting every possible rank is too computationally expensive in this situation, so we choose the rank as follows. First, calculate the Frobenius norm of the TF-IDF dataframe and multiply it by 0.001. This is our benchmark value. Next, iterate through
rank = 1, 6, 11, .... For each iteration, run NMF using n_components=rank and reconstruct the matrix A as A_k = WH. Calculate the root mean square error (RMSE) between the original dataframe and the reconstructed matrix A_k, and linearly interpolate the RMSE for the ranks that were skipped. The chosen rank is the smallest one whose (interpolated) RMSE is less than the benchmark value.

In [4]:
from sklearn.metrics import mean_squared_error

In [None]:
from scipy import interpolate

# Candidate ranks: every 5th value from 1 up to (excluding) the vocabulary size.
num_topics = df1.shape[1]

# Convert once up front; the matrix is only read inside the loop, so a
# per-iteration copy is unnecessary.
A = df1.to_numpy()

rmse_all = []
for k in range(1, num_topics, 5):
    model = NMF(n_components=k, init='random', random_state=0)
    W = model.fit_transform(A)
    H = model.components_
    # get the reconstructed A with dimensions k
    A_k = W.dot(H)
    # Reconstruction error: RMSE of each column, averaged over columns. This is
    # what mean_squared_error(A, A_k, squared=False) computes for multi-output
    # input, written with NumPy because the `squared` keyword has been removed
    # from recent scikit-learn releases.
    rmse_frob = np.sqrt(np.mean((A - A_k) ** 2, axis=0)).mean()
    rmse_all.append(rmse_frob)

x = np.arange(1, num_topics, 5)
rmse = rmse_all

# interpolate the ranks that were skipped by the step of 5 (and extrapolate
# past the last sampled rank up to num_topics - 1)
xnew = np.arange(1, num_topics)
f = interpolate.interp1d(x, rmse, fill_value="extrapolate")
rmse_new = f(xnew)

# setting the threshold from the original matrix, not from a loop leftover
frob_norm_A = np.linalg.norm(A, 'fro')
threshold = frob_norm_A*0.001
print(threshold)

In [None]:
# Smallest rank whose interpolated RMSE falls below the threshold.
rmse_bool = rmse_new < threshold
ranks_below_threshold = np.flatnonzero(rmse_bool)
if ranks_below_threshold.size == 0:
    # Fail with an actionable message instead of an opaque IndexError.
    raise ValueError(
        "No rank has a reconstruction RMSE below the threshold "
        f"({threshold}); the smallest RMSE reached is {rmse_new.min()}."
    )
# rmse_new[i] belongs to rank i + 1 because the rank grid starts at 1.
best_k = int(ranks_below_threshold[0]) + 1

In [None]:
# Plot the sampled reconstruction error curve and mark the selected rank
plt.figure(figsize=(10,6))
plt.plot(np.arange(1,num_topics,5), rmse_all, 'o-')

plt.axvline(best_k, ls='--', color='g')
plt.title('Optimal n_components for topic modeling of Movie '
          'Overview ', fontsize=14)
plt.ylabel('Reconstruction Error', fontsize=12)
plt.xlabel('n_components\n', fontsize=12)
# Report the error at the selected rank separately from the overall minimum:
# best_k is the first rank under the threshold, not the rank of minimum error.
# rmse_new[best_k - 1] is the interpolated RMSE at rank best_k (grid starts at 1).
print(f"Selected k: {best_k} (interpolated RMSE: {rmse_new[best_k - 1]}); "
      f"min sampled RMSE: {min(rmse_all)}", "\n")
plt.show()

# Simple Movie Recommender System

Recommendation Systems work based on the similarity between either the content or the users who access the content.
There are several ways to measure the similarity between two items. The recommendation systems use this similarity matrix to recommend the next most similar product to the user.

In this lab, we will build a machine learning algorithm that recommends movies based on a movie's overview and genre. We will use NMF to find two non-negative matrices (W, H) whose product approximates the non-negative matrix A, and reconstruct the matrix using their dot product. This reconstructed matrix serves as a basis for the recommendation. We will then compute the cosine similarity, using scikit-learn, as the metric for the pairwise similarity scores between two movies.

In [None]:
# Fit the final NMF model at the selected rank.
n_topics = best_k
# random_state makes the factorisation reproducible across kernel restarts.
nmf_ = NMF(n_components=n_topics, max_iter=100, random_state=0)
# fit_transform returns the W matrix found during fitting, avoiding a second
# optimisation pass that a separate transform() call would run.
nmf  = nmf_.fit_transform(tfidf)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity, sigmoid_kernel

In [None]:
Cosine similarity is a metric used to measure how similar two items are. Mathematically, it measures the cosine of the angle between two vectors projected in a multi-dimensional space. For non-negative vectors such as TF-IDF weights, the output value ranges from 0 to 1.
0 means no similarity, whereas 1 means that both items are 100% similar.

In [None]:
# Pairwise cosine similarity between all rows of the TF-IDF matrix.
# NOTE(review): this is computed on the raw TF-IDF matrix, not on the NMF
# output `nmf` fitted earlier — confirm that is intended.
cosine_sim = cosine_similarity(tfidf)

In [None]:
def get_top_recommended_movies(df, title, k, cosine_sim):
    """Print the names of the k movies most similar to `title`.

    Parameters
    ----------
    df : DataFrame with a 'name' column; row i corresponds to row i of
        `cosine_sim`.
    title : movie title to look up (matched case-insensitively).
    k : number of recommendations to print.
    cosine_sim : square array of pairwise similarity scores.

    Returns
    -------
    None when the title is found (the recommendations are printed, one per
    line, most similar first); otherwise the string
    "Movie not found in the database.".
    """
    names_lower = df.name.str.lower()
    # Work with row positions rather than index labels so the lookup stays
    # aligned with cosine_sim even when df does not have a 0..n-1 index.
    matches = np.flatnonzero((names_lower == title.lower()).to_numpy())
    if matches.size == 0:
        return "Movie not found in the database."

    position = matches[0]
    scores = np.asarray(cosine_sim[position]).ravel()
    # Stable descending sort keeps the original row order among tied scores.
    ranked = np.argsort(-scores, kind="stable")
    # Drop the queried movie explicitly instead of assuming it ranks first;
    # a tie at the top would otherwise recommend the movie to itself.
    ranked = ranked[ranked != position][:k]

    for movie_pos in ranked:
        print(df.iloc[movie_pos]['name'])

In [None]:
# Titles to query. A single loop keeps every query going through the same call
# and rules out a mistyped variable name silently re-running a previous title.
movies_to_query = [
    "The Godfather",
    "Casino royale",
    "X-men",
    "Skyfall",
    "The Dark Knight",
    "The Shawshank Redemption",
    "Sailor Moon",
    "Ponyo",
    "3 Idiots",
    "Mononoke-hime",
    "Harry Potter and the Sorcerer's Stone",
    "Finding Nemo",
    "Jurassic Park",
    "The Wizard of Oz",
    "Gone Girl",
    "Rurôni Kenshin - Meiji kenkaku romantan",
    "Spider-Man",
    "Ed, Edd n Eddy",
]

for movie_user_likes in movies_to_query:
    print(f"\nTop 5 recommendations for {movie_user_likes!r}:")
    result = get_top_recommended_movies(df, movie_user_likes, 5, cosine_sim)
    # The helper prints its recommendations but *returns* a message when the
    # title is unknown; surface that message, since it is not auto-displayed
    # inside a loop.
    if result is not None:
        print(result)

# Clustering

In [None]:
# Top-5 recommendations for "The Blue Planet".
# NOTE(review): this recommender query sits below the "Clustering" heading —
# consider moving it up with the other recommendation queries.
movie_user_likes = "The Blue Planet"
get_top_recommended_movies(df, movie_user_likes, 5, cosine_sim)

In [None]:
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.metrics import calinski_harabasz_score, silhouette_score

from sklearn.base import clone
def cluster_range(X, clusterer, k_start, k_stop, actual=None):
    """Fit a clusterer for every k in [k_start, k_stop] and collect the results.

    For each k, a fresh clone of `clusterer` is configured with
    `n_clusters=k`, fitted on `X`, and used to predict labels for `X`.

    Returns a dict of lists, one entry per k, in increasing k order:
    'ys' (predicted labels), 'centers' (cluster centers), 'inertias',
    'chs' (Calinski-Harabasz scores) and 'scs' (silhouette scores).

    `actual` is accepted but not used.
    """
    results = {'ys': [], 'centers': [], 'inertias': [], 'chs': [], 'scs': []}

    for n_clusters in range(k_start, k_stop + 1):
        # Clone so the caller's estimator is never modified or fitted.
        model = clone(clusterer)
        model.set_params(n_clusters=n_clusters)
        model.fit(X)
        labels = model.predict(X)

        results['ys'].append(labels)
        results['centers'].append(model.cluster_centers_)
        results['inertias'].append(model.inertia_)
        results['chs'].append(calinski_harabasz_score(X, labels))
        results['scs'].append(silhouette_score(X, labels))

    return results

In [None]:
def plot_internal(inertias, chs, scs, k_start=2):
    """Plot internal validation values against the number of clusters.

    Parameters
    ----------
    inertias, chs, scs : equal-length sequences of SSE, Calinski-Harabasz and
        silhouette values, one per consecutive k.
    k_start : the k that the first value corresponds to. Defaults to 2, the
        start that was previously hard-coded.

    SSE and CH share the left y-axis; the silhouette coefficient is drawn on
    a secondary right y-axis.

    Returns the primary (left) axes.
    """
    fig, ax = plt.subplots(figsize=(8,5))
    ks = np.arange(k_start, k_start + len(inertias))
    ax.plot(ks, inertias, '-o', label='SSE')
    ax.plot(ks, chs, '-ro', label='CH')
    ax.set_xlabel('$k$')
    ax.set_ylabel('SSE/CH')
    lines, labels = ax.get_legend_handles_labels()

    ax2 = ax.twinx()
    ax2.plot(ks, scs, '-ko', label='Silhouette coefficient')
    lines2, labels2 = ax2.get_legend_handles_labels()

    # One combined legend for both axes, placed outside the plot area.
    ax2.legend(lines+lines2, labels+labels2, bbox_to_anchor=(1.46,1))
    return ax

In [None]:
# Plot SSE, Calinski-Harabasz and silhouette values across the tested k.
# NOTE(review): `cluster_dict` is not assigned in any visible cell; presumably
# it comes from a `cluster_range(...)` call that was deleted or run out of
# order — confirm and restore it so the notebook survives Restart & Run All.
plot_internal(cluster_dict['inertias'], cluster_dict['chs'], 
                cluster_dict['scs'])


In [None]:
# Reduce the TF-IDF frame to two components with truncated SVD.
lsa = TruncatedSVD(n_components=2, random_state=1337)
X_new = lsa.fit_transform(df1.to_numpy())

# Cluster the 2-D projection into 7 groups.
kmeans_ng = KMeans(n_clusters=7, random_state=1337)
y_predict = kmeans_ng.fit_predict(X_new)

# Scatter the two components, coloured by cluster label, and save the figure.
fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(X_new[:, 0], X_new[:, 1], c=y_predict)
ax.set_xlabel('SV1', fontsize=14)
ax.set_ylabel('SV2', fontsize=14)
fig.savefig('Kmeans_cluster', dpi=300)

In [None]:
# Attach the KMeans label to each movie. This assumes the rows of df are in
# the same order as the rows of the matrix that was clustered.
df['cluster_label'] = y_predict
# Slim view (name, cluster, votes) ordered by cluster label.
wiki = df[['name', 'cluster_label', 'votes']].sort_values(by='cluster_label')

In [None]:
from PIL import Image, ImageFont
from wordcloud import WordCloud

# mask2 = np.array(Image.open("face.png"))
# font_path = 'Charming-Regular.otf'
def similar_color_func(word=None, font_size=None,
                       position=None, orientation=None,
                       font_path=None, random_state=None):
    """Return an HSL colour string with fixed hue/saturation and random lightness.

    Hue is fixed at 40 and saturation at 100%; lightness is drawn with
    `random_state.randint(30, 70)`. All other parameters are accepted but
    ignored.
    """
    hue, saturation = 40, 100
    lightness = random_state.randint(30, 70)
    return "hsl({}, {}%, {}%)".format(hue, saturation, lightness)


# Draw and save one word cloud of movie titles per cluster.
# Iterate over the labels actually present instead of a hard-coded range, so
# the loop stays in sync with the clustering and never hits an empty cluster.
for k in sorted(wiki['cluster_label'].unique()):

    # Up to 120 most-voted movies of this cluster. Use a dedicated name here:
    # rebinding `df` would destroy the main DataFrame for every other cell.
    cluster_top = wiki[wiki['cluster_label']==k].sort_values('votes',
                                          ascending=False)[:120]

    titles = cluster_top['name'].values
    # Every title gets the same weight, so all words are sized alike.
    d = dict.fromkeys(titles, 1)

    wordcloud = WordCloud(max_font_size=30, max_words=100, min_font_size=10,
                          background_color="white",
                          color_func=similar_color_func,
                          ).generate_from_frequencies(d)
    #show
    plt.figure(figsize=(10,10))
    plt.title('Cluster '+str(k), fontsize=16)
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")

    plt.tight_layout()
    name_fig = 'cluster'+str(k)+'.png'
    plt.savefig(name_fig, dpi=300)
