In [1]:
import numpy as np
import pandas as pd
import ast
import re

from src.utils import clean_meta_data, clean_keywords, clean_movies
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.decomposition import LatentDirichletAllocation

from sklearn.metrics.pairwise import cosine_distances
from sklearn.metrics.pairwise import cosine_similarity

In [1]:
# Load every input table used in this notebook. The three `clean_*` helpers come
# from src.utils; their exact behaviour is not visible here.
metaData = clean_meta_data(pd.read_csv("../../data/movies_metadata.csv", low_memory=False))
keywords = clean_keywords(pd.read_csv("../../data/keywords.csv", low_memory=False))

# "::" is a multi-character separator, which only pandas' Python parser supports.
# Naming the engine explicitly avoids the ParserWarning raised when the default
# C engine has to fall back to it.
users = pd.read_csv(
    "../../data/users.dat",
    sep="::",
    engine="python",
    names=["UserId", "Gender", "AgeGroup", "Occupation", "Zip"],
)
movies = clean_movies(
    pd.read_csv(
        "../../data/movies.dat",
        sep="::",
        engine="python",
        names=["MovieId", "Title", "Genres"],
    )
)

# The rating timestamp is not used in this notebook, so drop it at load time.
userRatings = pd.read_csv("../../data/training.csv", low_memory=False).drop(columns=["timestamp"])

In [None]:
# Preview the first rows of the frame returned by clean_meta_data.
metaData.head()

In [None]:
# Preview the first rows of the frame returned by clean_keywords.
keywords.head()

In [None]:
# Preview the first rows of the users table (UserId, Gender, AgeGroup, Occupation, Zip).
users.head()

In [None]:
# Preview the first rows of the frame returned by clean_movies.
movies.head()

In [None]:
# Preview the first rows of the training ratings (timestamp column already dropped).
userRatings.head()

In [None]:
# Spot-check one movie across the three movie-level tables.
# Every lookup filters on `movieId`, so changing the value here keeps all three
# printed rows in sync.
movieId = 2
print(keywords[keywords["MovieId"] == movieId])
print(movies[movies["MovieId"] == movieId])
print(metaData[metaData["MovieId"] == movieId])

In [None]:
# Column/dtype summary of the metadata before it is joined.
metaData.info()

# Join metadata with keywords on the shared "id" column. MovieId is removed from
# the keywords side first so the merged frame carries a single MovieId column
# (the one from metaData) instead of suffixed duplicates.
keywordsNoMovieId = keywords.drop(columns=["MovieId"])
metaDataKeyWords = pd.merge(metaData, keywordsNoMovieId, on="id")
metaDataKeyWords.head()

In [None]:
# Re-display metaData; the merge in the previous cell assigns its result to a
# separate name (metaDataKeyWords) rather than back to metaData.
metaData.head()

In [None]:
# Bag-of-words representation of each movie's keyword text.
vectorizer = CountVectorizer(
    max_df=0.85,         # ignore terms that occur in more than 85% of documents
    min_df=2,            # ignore terms that occur in fewer than 2 documents
    max_features=1000,   # cap the vocabulary size
    ngram_range=(1, 3),  # unigrams, bigrams and trigrams
)

docTermMatrix = vectorizer.fit_transform(metaDataKeyWords["keywords"])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases.
# .tolist() keeps `corpus` a plain list of str, as the old method returned.
if hasattr(vectorizer, "get_feature_names_out"):
    corpus = vectorizer.get_feature_names_out().tolist()
else:
    corpus = vectorizer.get_feature_names()

In [None]:
# Topic model over the keyword document-term matrix.
lda = LatentDirichletAllocation(
    # Model size and priors.
    n_components=20,
    doc_topic_prior=0.9,
    topic_word_prior=0.9,
    # Online variational Bayes settings.
    learning_method="online",
    learning_offset=50.0,
    max_iter=10,
    # Reproducibility, parallelism and logging.
    random_state=0,
    n_jobs=-1,
    verbose=1,
)

# Kept as the cell's last expression so the fitted estimator is still displayed.
lda.fit(docTermMatrix)

In [None]:
def display_topics(model, feature_names, num_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic weight
            arrays (each must support ``argsort()``).
        feature_names: Sequence mapping a term index to its string.
        num_top_words: Number of terms to list per topic.

    Prints two lines per topic: a ``Topic <n>:`` header, then the terms joined
    by spaces in order of decreasing weight.
    """
    for topic_number, weights in enumerate(model.components_):
        print("Topic %d:" % topic_number)
        # argsort is ascending; flip it so the heaviest terms come first.
        ranked_terms = weights.argsort()[::-1]
        top_terms = [feature_names[term] for term in ranked_terms[:num_top_words]]
        print(" ".join(top_terms))

# Number of terms to list for each topic.
num_top_words = 10
display_topics(model=lda, feature_names=corpus, num_top_words=num_top_words)

In [None]:
# Per-document topic distribution from the fitted LDA model: one row per row of
# docTermMatrix, one column per topic.
topicProbabilities = lda.transform(docTermMatrix)
topicProbabilities.shape

In [None]:
# Pairwise cosine distance between every pair of movies in topic space.
# NOTE(review): this is a dense n x n matrix (and the argsort below makes a second
# one), so memory grows quadratically with the number of movies.
cosineDistances = cosine_distances(topicProbabilities)

# For each row, movie indices ordered from nearest to farthest. The first entry
# is presumably the movie itself (distance ~0), but ties may reorder it.
cosineDistancesIndices = np.argsort(cosineDistances, axis=1)

In [None]:
# Row positions of the 10 closest entries to the first row of metaDataKeyWords.
cosineDistancesIndices[0, :10]

In [None]:
# Genres recorded for MovieId 1.
list(metaDataKeyWords.loc[metaDataKeyWords["MovieId"] == 1, "genres"])

In [None]:
# Genres recorded for MovieId 30982, for comparison with MovieId 1.
# NOTE(review): presumably picked from the neighbour list above — confirm.
list(metaDataKeyWords.loc[metaDataKeyWords["MovieId"] == 30982, "genres"])

In [None]:
# Column/dtype summary of the frame returned by clean_movies.
movies.info()