In [1]:
# importing libraries
import numpy as np
import pandas as pd

In [2]:
# reading datasets
credits_df = pd.read_csv("credits.csv")
movies_df = pd.read_csv("movies.csv")

# combine both datasets together
movies_df = movies_df.merge(credits_df, on="title")

# dropping columns
movies_df = movies_df[["movie_id", "title", "overview", "genres", "keywords", "cast", "crew"]]

In [3]:
# remove na rows
movies_df.dropna(inplace = True)

In [4]:
# checking the genre column
movies_df.iloc[0].genres

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [5]:
# checking the keywords column
movies_df.iloc[0].keywords

'[{"id": 1463, "name": "culture clash"}, {"id": 2964, "name": "future"}, {"id": 3386, "name": "space war"}, {"id": 3388, "name": "space colony"}, {"id": 3679, "name": "society"}, {"id": 3801, "name": "space travel"}, {"id": 9685, "name": "futuristic"}, {"id": 9840, "name": "romance"}, {"id": 9882, "name": "space"}, {"id": 9951, "name": "alien"}, {"id": 10148, "name": "tribe"}, {"id": 10158, "name": "alien planet"}, {"id": 10987, "name": "cgi"}, {"id": 11399, "name": "marine"}, {"id": 13065, "name": "soldier"}, {"id": 14643, "name": "battle"}, {"id": 14720, "name": "love affair"}, {"id": 165431, "name": "anti war"}, {"id": 193554, "name": "power relations"}, {"id": 206690, "name": "mind and soul"}, {"id": 209714, "name": "3d"}]'

In [6]:
# abstract syntax tree
import ast

In [7]:
# define function to convert the genres and keyword columns to only contain the name, in a string/list form, not dict
def convert(obj):
    list = []
    for i in ast.literal_eval(obj):
        list.append(i["name"])
    return list

In [8]:
# converting the columns
movies_df["genres"] = movies_df["genres"].apply(convert)
movies_df["keywords"] = movies_df["keywords"].apply(convert)

In [9]:
# define function to convert the cast columns : only display the top 3 casts
def convert_cast(obj):
    list = []
    counter = 0
    for i in ast.literal_eval(obj):
        if counter != 3:
            list.append(i["name"])
        else:
            break
            
    return list

In [10]:
# convert the cast column
movies_df["cast"] = movies_df["cast"].apply(convert_cast)

In [11]:
# define function to convert crew column : we only want to keep the name of the crew who is a director
def convert_crew(obj):
    list = []
    for i in ast.literal_eval(obj):
        if i['job'] == "Director":
            list.append(i["name"])
            break
            
    return list

In [12]:
# convert the crew column
movies_df["crew"] = movies_df["crew"].apply(convert_crew)

In [13]:
# separate the overview sentences into elements in a list
movies_df['overview'] = movies_df['overview'].apply(lambda x : x.split())

In [14]:
# remove spacing between words in the same element
movies_df['genres'] = movies_df['genres'].apply(lambda x :[i.replace(" ", "") for i in x])
movies_df['keywords'] = movies_df['keywords'].apply(lambda x :[i.replace(" ", "") for i in x])
movies_df['cast'] = movies_df['cast'].apply(lambda x :[i.replace(" ", "") for i in x])
movies_df['crew'] = movies_df['crew'].apply(lambda x :[i.replace(" ", "") for i in x])

In [15]:
# create new column by adding the columns together
movies_df['tags'] = movies_df['overview']+movies_df['genres']+movies_df['keywords']+movies_df['cast']+movies_df['crew']
movies_df['tags'] = movies_df['tags'].apply(lambda x : " ".join(x))
movies_df['tags'] = movies_df['tags'].apply(lambda x : x.lower())
# movies_df['title'] = movies_df['title'].apply(lambda x : x.lower())

In [16]:
# create a df for genres to get list of all genres
genres_df = pd.DataFrame(movies_df['genres'])
genres_set = set()
for genres_list in genres_df['genres']:
    genres_set.update(genres_list)

genres = list(genres_set)

In [17]:
genres_dict = {
    'Crime': '🕵️‍♂️',
    'Action': '💥',
    'Western': '🤠',
    'Mystery': '🔍',
    'Foreign': '🌍',
    'War': '⚔️',
    'ScienceFiction': '🚀',
    'Documentary': '📹',
    'Romance': '💕',
    'Fantasy': '🧙‍♂️',
    'Music': '🎵',
    'History': '📜',
    'Thriller': '🎬',
    'Adventure': '🌄',
    'Horror': '👻',
    'Family': '👨‍👩‍👧‍👦',
    'Comedy': '😂',
    'TVMovie': '📺',
    'Drama': '🎭',
    'Animation': '🎞️'
}

In [18]:
# dropping all the duplicated columns
new_df = movies_df[['movie_id', 'title', 'tags']]

In [19]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 5000, stop_words = "english")
# transforming the tags column to an array recognisable by machine 
vectors = cv.fit_transform(new_df['tags']).toarray()

In [20]:
import nltk
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [21]:
# define the stem method to reduce all the words in the tag column into their root word, using the stem method from ps
def stem(text):
    y = []
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

In [22]:
new_df['tags'] = new_df['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(stem)


In [23]:
from sklearn.metrics.pairwise import cosine_similarity

In [24]:
similarity = cosine_similarity(vectors)

In [25]:
# create the recommendation system
def recommend(movie):
    try:
        # finding the location of the input movie
        movie_index = new_df[new_df['title'].str.lower() == movie.lower()].index[0]
        # extract the vector representing the input movie in the similarity matrix
        # this vector represent the similarity score of all the other movies as compared to the input movie
        distances = similarity[movie_index]
        # sort and get the top 6
        movies_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:6]
        
        recommendations = []
        for i in movies_list:
            recommendations.append(new_df.iloc[i[0]].title)
        
        return recommendations
    except IndexError:
        return []

In [26]:
# get genres
def get_genre(movie):
    genres_list = movies_df[movies_df['title'] == movie]['genres']
    if not genres_list.empty:
        genres = genres_list.iloc[0]
        emojis = ""
        for genre in genres:
            emoji = genres_dict.get(genre)
            if emoji:
                emojis += emoji
        return emojis
    else:
        return ""

In [27]:
from telegram import Update, ForceReply
from telegram.ext import Application, CommandHandler, MessageHandler, filters, CallbackContext
import logging
import nest_asyncio
import asyncio

In [28]:
async def start(update: Update, context: CallbackContext):
    user = update.effective_user
    await update.message.reply_html(
        rf"Hi {user.mention_html()}! I am your movie recommendation bot. Type the name of a movie to get recommendations.",
        reply_markup=ForceReply(selective=True),
    )


In [29]:
async def recommend_movie(update: Update, context: CallbackContext):
    user_input = update.message.text
    recommendations = recommend(user_input)
    if len(recommendations) > 1:
        emoji = get_genre(user_input)
        response = f"Here are some recommendations for <b>{user_input}</b> {emoji}:\n"
        for i, movie in enumerate(recommendations, start=1):
            emoji_indiv = get_genre(movie)[0]
            response += f"{i}. {movie} {emoji_indiv}\n"
    else:
        response = f"Sorry, your movie <b>{user_input}</b> was not found in our database.. Make sure that your capitalisations and punctuations are correct"
    await update.message.reply_text(response, parse_mode='HTML')

In [30]:
async def main():
    application = Application.builder().token("7171834170:AAHG4h8PpdXmJ1JWatUC1vZR-9c7K1zD26E").build()

    # Register command handlers
    application.add_handler(CommandHandler("start", start))
    application.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND, recommend_movie))

    # Start the bot
    await application.run_polling()

In [None]:
import nest_asyncio
import asyncio
if __name__ == "__main__":
    import nest_asyncio
    nest_asyncio.apply()
    asyncio.run(main())