In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the two CSV files from the local dataset/ directory into DataFrames.
movies = pd.read_csv('dataset/tmdb_5000_movies.csv')
credits = pd.read_csv('dataset/tmdb_5000_credits.csv')

In [3]:
# Join the two tables on their shared 'title' column (inner join by default).
# NOTE(review): rows that share a title are matched against each other, which
# can pair one film's details with another film's credits — confirm whether a
# unique id key is available to join on instead.
movies = pd.merge(movies, credits, on='title')

In [4]:
# Keep only the identifier, the title and the columns the later cells read.
kept_columns = ['movie_id','title','overview','genres','keywords','cast','crew']
movies = movies[kept_columns]

In [5]:
# Per-column count of missing values (not displayed: it is not the last
# expression of the cell).
movies.isnull().sum()
# Drop rows with any missing field and renumber the index 0..n-1. Later cells
# look a row up through .index and then use that value as a *position* into
# the similarity matrix, so index labels must equal row positions.
movies = movies.dropna().reset_index(drop=True)

In [6]:
import ast
def convert(obj):
    """Return the 'name' value of every entry in a stringified list of dicts.

    obj is parsed with ast.literal_eval and each resulting element is indexed
    with ['name']; the names are returned in their original order.
    """
    return [entry['name'] for entry in ast.literal_eval(obj)]

In [7]:
# Replace the stringified 'genres' and 'keywords' lists with lists of names.
for column in ('genres', 'keywords'):
    movies[column] = movies[column].apply(convert)

In [8]:
def convert3(obj, limit=3):
    """Return the 'name' of the first `limit` entries of a stringified list.

    Args:
        obj: string that ast.literal_eval parses into an iterable of dicts,
            each having a 'name' key.
        limit: maximum number of names to return. Defaults to 3, which is the
            behaviour callers that pass only `obj` have always had.

    Returns:
        A list with at most `limit` names, in their original order.
    """
    names = []
    for position, entry in enumerate(ast.literal_eval(obj)):
        # Stop before touching entries past the limit, so a malformed entry
        # beyond it can never raise.
        if position >= limit:
            break
        names.append(entry['name'])
    return names

In [9]:
# Replace each stringified cast list with the names convert3 extracts from it.
movies['cast'] = movies['cast'].apply(convert3)

In [10]:
def fetch_director(obj):
    """Return a one-element list holding the first 'Director' entry's name.

    obj is parsed with ast.literal_eval and scanned in order. An empty list is
    returned when no entry has job == 'Director'.
    """
    for member in ast.literal_eval(obj):
        if member['job'] == 'Director':
            return [member['name']]
    return []

In [11]:
# Replace each stringified crew list with the list fetch_director returns for it.
movies['crew'] = movies['crew'].apply(fetch_director)

In [12]:
# Split every overview string on whitespace into a list of words.
movies['overview'] = movies['overview'].apply(str.split)

In [13]:
# Remove the spaces inside every name of the four list columns, so that a
# multi-word name stays a single token once the lists are joined into text.
for column in ('genres', 'keywords', 'cast', 'crew'):
    movies[column] = movies[column].apply(
        lambda names: [name.replace(' ','') for name in names]
    )

In [14]:
# Concatenate the per-row word lists into one 'tags' list.
# NOTE(review): 'genres' is converted and cleaned in earlier cells but is not
# part of this concatenation — confirm whether leaving it out is intentional.
movies['tags'] = movies['overview']+movies['keywords']+movies['cast']+movies['crew']

In [15]:
# Take an independent copy of the three columns that are kept. The following
# cells assign to new_df['tags']; without .copy() that is a write to a slice of
# `movies`, which pandas reports as SettingWithCopyWarning and does not
# guarantee to behave consistently.
new_df = movies[['movie_id','title','tags']].copy()

In [16]:
# Collapse each list of tokens into one space-separated string.
new_df['tags'] = new_df['tags'].apply(' '.join)

In [17]:
# Lower-case the tag text.
new_df['tags'] = new_df['tags'].str.lower()

In [18]:
import nltk
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

def stem(text):
    """Stem each whitespace-separated token of text with ps.

    Returns the stemmed tokens joined by single spaces.
    """
    # NOTE(review): none of the cells visible here applies stem() to
    # new_df['tags'] — confirm whether the stemming step was meant to run.
    return " ".join(ps.stem(token) for token in text.split())

In [19]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer capped at 5000 features, using scikit-learn's
# built-in English stop-word list.
cv = CountVectorizer(max_features=5000, stop_words='english')

In [20]:
# Show the shape of the document-term matrix. The sparse result already
# exposes .shape, so there is no need to build the dense array just for this.
cv.fit_transform(new_df['tags']).shape

(4806, 5000)

In [21]:
# Fit the vectorizer on the tag text and keep the counts as a dense array.
vectors = cv.fit_transform(new_df['tags']).toarray()

In [22]:
# Display the count vector of the first row.
vectors[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [23]:
# cv.get_feature_names_out()  # inspect the learned vocabulary (get_feature_names() was removed in scikit-learn 1.2)

In [24]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between the rows of `vectors`; entry [i, j]
# compares row i with row j.
similarty = cosine_similarity(vectors)

In [25]:
def recommend(movie):
    """Print the titles of the 8 rows most similar to `movie`.

    The first row of new_df whose 'title' equals `movie` is used as the query.
    The 8 highest scores in its similarity row are printed in descending order,
    and the query row itself is among them.

    Args:
        movie: title to look up in new_df['title'] (exact match).

    Raises:
        IndexError: if no row of new_df has that title.
    """
    # Locate the row by *position*, not by index label: similarty is a plain
    # array addressed by row position, while the labels of new_df are only
    # equal to positions if the index happens to be 0..n-1.
    positions = (new_df['title'] == movie).to_numpy().nonzero()[0]
    if len(positions) == 0:
        raise IndexError("movie {!r} not found in new_df".format(movie))
    movie_index = positions[0]
    distances = similarty[movie_index]
    # sorted() is stable, so equal scores keep their original row order.
    movie_list = sorted(enumerate(distances), reverse=True, key=lambda x: x[1])[0:8]

    for i in movie_list:
        print(new_df.iloc[i[0]].title)
    

In [26]:
# Example query against the similarity matrix.
recommend('Avatar')

Avatar
Apollo 18
Aliens vs Predator: Requiem
Lifeforce
Battle: Los Angeles
Titan A.E.
Aliens
Independence Day


In [28]:
# Save the movie table with pickle
import pickle
import os

# Persist the movie table. The context manager closes (and therefore flushes)
# the file before its size is read.
with open('model/movies_list.pkl','wb') as movies_file:
    pickle.dump(new_df, movies_file)
print(os.path.getsize("model/movies_list.pkl"))

2296672


In [29]:
# Write the uncompressed similarity matrix and report its size. The context
# manager closes the file before the size is read.
with open("model/similarity.pkl",'wb') as similarity_file:
    pickle.dump(similarty, similarity_file)
print(os.path.getsize("model/similarity.pkl"))

184781251


In [30]:
# Compress the pickled similarity matrix with bz2
import bz2file as bz2 
# Overwrite model/similarity.pkl with a bz2-compressed pickle. Closing the
# BZ2File explicitly guarantees the compressed stream is finalised before the
# size is measured, instead of relying on garbage collection.
with bz2.BZ2File("model/similarity.pkl",'wb') as compressed_file:
    pickle.dump(similarty, compressed_file)

print(os.path.getsize("model/similarity.pkl"))

19049069


In [31]:
# pip install bz2file patsy

In [None]:
# save this as app.py
from flask import Flask, request, render_template, request
import pickle
import bz2file as bz2 
import requests
import pandas as pd
from patsy import dmatrices

# Load the artifacts written by the notebook. Files are opened in context
# managers so the handles are closed once loading is done.
# NOTE: pickle.load can execute arbitrary code; only load files produced by
# this project.
with open('model/movies_list.pkl', 'rb') as movies_file:
    movies = pickle.load(movies_file)

with bz2.BZ2File('model/similarity.pkl', 'rb') as similarity_file:
    similarity = pickle.load(similarity_file)


def fetch_poster(movie_id):
    """Return the poster image URL for a TMDB movie id.

    Requests the TMDB movie endpoint and builds the w500 image URL from the
    'poster_path' field of the JSON response.

    Args:
        movie_id: identifier substituted into the request URL.

    Returns:
        The full poster URL, or an empty string when the response has no
        usable 'poster_path' (missing key or null value).

    Raises:
        requests.RequestException: on connection failure or timeout.
    """
    import os  # local import: the app's import block does not provide os

    # SECURITY: an API key should not live in source control. The
    # TMDB_API_KEY environment variable takes precedence; the literal is kept
    # only as a fallback so existing setups keep working and should be
    # rotated and removed.
    api_key = os.environ.get('TMDB_API_KEY', 'b9093ccf3b2dedc32dd29d4b0b0bd00c')
    url = 'https://api.themoviedb.org/3/movie/{}?api_key={}&language=en-US'.format(movie_id, api_key)
    # Without a timeout, requests waits indefinitely on a stalled connection
    # and one slow lookup would hang the whole page.
    data = requests.get(url, timeout=10).json()
    poster_path = data.get('poster_path')
    if not poster_path:
        # Entries without a poster (or error payloads) must not abort the
        # whole recommendation with a TypeError/KeyError.
        return ""
    full_path = "https://image.tmdb.org/t/p/w500/" + poster_path
    return full_path

def recommend(movie):
    """Return the titles and poster URLs of the 8 rows most similar to `movie`.

    The first row of `movies` whose 'title' equals `movie` is the query. The 8
    highest scores in its similarity row are returned in descending order, and
    the query row itself is among them.

    Args:
        movie: title to look up in movies['title'] (exact match).

    Returns:
        (names, posters): two parallel lists of titles and of the values
        fetch_poster returns for the corresponding movie ids.

    Raises:
        IndexError: if no row of `movies` has that title.
    """
    # Locate the row by *position*, not by index label: `similarity` is
    # addressed by row position, and the labels of the unpickled frame are not
    # guaranteed to be 0..n-1.
    positions = (movies['title'] == movie).to_numpy().nonzero()[0]
    if len(positions) == 0:
        raise IndexError("movie {!r} not found in the movie list".format(movie))
    index = positions[0]
    # sorted() is stable, so equal scores keep their original row order.
    distances = sorted(enumerate(similarity[index]), reverse=True, key=lambda x: x[1])
    recommended_movies_name = []
    recommended_movies_poster = []
    for i in distances[0:8]:
        row = movies.iloc[i[0]]
        recommended_movies_poster.append(fetch_poster(row.movie_id))
        recommended_movies_name.append(row.title)

    return recommended_movies_name, recommended_movies_poster

# Flask application object; the route decorators below register views on it.
app = Flask(__name__)

@app.route('/')
def home():
    """Render index.html for the site root."""
    return render_template("index.html")

@app.route('/about')
def about():
    """Render about.html for the /about page."""
    return render_template("about.html")

@app.route('/contact')
def contact():
    """Render contact.html for the /contact page."""
    return render_template("contact.html")

@app.route('/recommendation', methods = ['GET', 'POST'])
def recommendation():
    """Render prediction.html, with recommendations when a title is posted.

    GET, or a POST without form data, renders the page with only the list of
    selectable titles. A POST with form data reads the 'movies' field, calls
    recommend() and passes the resulting names and posters to the template.
    Any exception raised while doing so is logged and rendered on the same
    page through the `error` template variable.
    """
    movie_list = movies['title'].values
    status = False
    if request.method == "POST" and request.form:
        try:
            movies_name = request.form['movies']
            app.logger.debug("recommendation requested for %r", movies_name)
            recommended_movies_name, recommended_movies_poster = recommend(movies_name)
            app.logger.debug("recommended %r", recommended_movies_name)
            status = True

            return render_template("prediction.html", movies_name = recommended_movies_name, poster = recommended_movies_poster, movie_list = movie_list, status = status)

        except Exception as e:
            # Keep the page usable, but record the traceback instead of
            # swallowing it silently.
            app.logger.exception("recommendation failed")
            error = {'error': e}
            return render_template("prediction.html",error = error, movie_list = movie_list, status = status)

    # Reached for GET and for a POST with an empty form. The latter previously
    # fell through and returned None, which Flask rejects as an invalid
    # response.
    return render_template("prediction.html", movie_list = movie_list, status = status)


# Start the Flask server with its default settings when run as a script.
if __name__ == '__main__':
    app.run()
    