In [None]:
import pandas as pd
import numpy as np

In [None]:
df1=pd.read_csv('/content/Top_10000_Movies_IMDb.csv')

In [None]:
df=df1.copy()

In [None]:
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)
pd.set_option('display.width',None)
pd.set_option('display.max_colwidth',None)

In [None]:
df.head()

## About Dataset
This dataset is a carefully assembled collection of 10,000 feature films from IMDb, one of the most popular and authoritative sources for movie information. The movies included in this dataset are sorted by their IMDb ratings in descending order. The dataset covers a wide range of genres, directors, and stars, providing a comprehensive overview of highly regarded films across various categories. The scraping process was performed on June 17, 2023.

**[Dataset_Link](https://www.kaggle.com/datasets/moazeldsokyx/imdb-top-10000-movies-dataset)**

**Dataset Columns:**

- ID: Unique identifier for each movie in the dataset.
- Movie Name: The title of the movie.
- Rating: The IMDb rating for the movie.
- Runtime: The duration of the movie in minutes.
- Genre: The genre(s) to which the movie belongs.
- Metascore: The Metascore rating for the movie (if available).
- Plot: A brief summary or description of the movie's plot.
- Directors: The director(s) of the movie.
- Stars: The main cast or actors featured in the movie.
- Votes: The number of votes/ratings received by the movie.
- Gross: The gross revenue generated by the movie (if available).
- Link: The IMDb link to access the full details and additional information about the movie.

In [None]:
df.head()

- After looking at the data and visiting the website, I realized that there are some more columns we can fetch that will be helpful for model building and analysis.
- We are going to scrape that extra data from the website.

# Data Scraping
- We are going to scrape "Year of release","Writers name" and "links of the poster"
- "year of release"- for analysis purpose
- "Writer Names"- for recommendation
- "link of poster"- for frontend of app

In [None]:
import requests
from bs4 import BeautifulSoup
#Install important libraries

In [None]:
#Make header
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}


In [None]:
#function for scraping the data
year_list=[]
writer_list=[]
img_link=[]
def extract_info(url):
  """Scrape one movie page and record its year, writers and poster link.

  Fetches ``url`` using the notebook's ``headers`` and appends exactly one
  entry to each of the module-level lists ``year_list``, ``writer_list`` and
  ``img_link``. Any field that cannot be found - or every field, if the
  request itself fails - is recorded as ``np.nan``, so the three lists always
  keep the same length and stay aligned row-for-row with the links scraped.

  Args:
    url: Address of the movie page to download.

  Returns:
    None. Results are delivered through the three module-level lists.
  """
  # Collect everything first and append once at the end; appending piecemeal
  # would leave the lists with different lengths if a later step failed.
  year_value=np.nan
  writer_value=np.nan
  img_value=np.nan
  try:
    # A timeout keeps one stalled connection from hanging a multi-hour scrape.
    response = requests.get(url,headers=headers,timeout=30)
    response.raise_for_status()
  except requests.RequestException as err:
    print(f"Could not scrape {url}: {err}")
  else:
    soup = BeautifulSoup(response.text, 'html.parser')

    # The sixth "a.ipc-link" anchor is taken as the release year. Presumably
    # this matched the page layout at scrape time; other text can land here.
    links=soup.select("a.ipc-link")
    if len(links)>5 and links[5].text:
      year_value=links[5].text

    # The second matching inline list is read as the writers' names.
    inline_lists=soup.select("ul .ipc-inline-list")
    if len(inline_lists)>1:
      names=[child.text for child in inline_lists[1]]
      if len(names)!=0:
        writer_value=names

    img=soup.find("img",class_="ipc-image")
    if img:
      img_value=img.get("src")

  year_list.append(year_value)
  writer_list.append(writer_value)
  img_link.append(img_value)


In [None]:
df["Link"].apply(extract_info)

## Run at your own risk
- There are 10,000 rows in the data, so it took me about 8 hours to scrape them.
- If you still want to scrape the data yourself, do it in batches.


In [None]:
temp_dict={
      "year":year_list,
      "writer":writer_list,
      "img_link":img_link
    }
temp_df=pd.DataFrame(temp_dict)
temp_df.to_csv("scraped_data.csv",index=False)

In [None]:
# Read back from the same relative path the scraped data was written to above,
# so this cell does not depend on the working directory being /content.
# Note: df1 is reused here and now holds the scraped columns, not the raw IMDb table.
df1=pd.read_csv("scraped_data.csv")

In [None]:
concated_data=pd.concat([df,df1],axis=1)

In [None]:
concated_data.to_csv("comp_df.csv",index=False)

# Data Preprocessing

In [None]:
%pwd

In [None]:
import pandas as pd

In [None]:
df=concated_data.copy()

In [None]:
df.head()

In [None]:
df[df["Votes"]==df["Gross"]].shape

In [None]:
df["year"].value_counts()

In [None]:
df["writer"].sample(100)
df["writer"].head()
df["writer"].tail()

In [None]:
df["img_link"].sample(100)
df["img_link"].head()
df["img_link"].tail()

In [None]:
df.info()

## Observations by seeing the data
- The "ID" column is of no use, so we can drop it.
- Rename the "Movie Name" column to "title".
- Change all the column names to lowercase (for easy typing).
- Remove the " min" suffix from the "Runtime" column.
- The "Genre" column contains multiple categories in a single row. Try to split it and make a new column for every category; this helps with visualization and model building.
- There are some NaN values in "Metascore".
- The "Directors" column contains the directors' names as well as the stars' names. We have to correct that and check whether this is true for all rows.
- 2831 rows have the same value in "Votes" and "Gross".
- Some values in the "year" column are not years.

In [None]:
# Drop "ID" column
df.drop("ID",axis=1,inplace=True)


In [None]:
#rename the "Movie Name" column to "title"
#change all the column names to lowercase (for easy typing)
new_names={
    "Movie Name":"title",
    "Rating":"rating",
    "Votes":"vote",
    "Directors":"director",
    "Stars":"star",
    "Metascore":"metascore",
    "Genre":"genre",
    "Plot":"plot",
    "Runtime":"runtime",
    "Gross":"gross",
    "Link":"link"

}

df.rename(columns=new_names,inplace=True)


In [None]:
#remove the min suffix from the "Runtime" column
df["runtime"]=df["runtime"].str.replace(" min","")

In [None]:
#Checking if all the values are correct
df["runtime"].value_counts()

In [None]:
#Genre column contains multiple categories in a single row try to split it and make new column for every category helps in visualization and model building
#check what is the max no of genre in a movie

df["genre"]=df["genre"].str.split(",")

In [None]:
l=[]
for i in df["genre"]:
  l.append(len(i))
df["no_of_genre"]=l

In [None]:
df["no_of_genre"].value_counts()

- So there are at most 3 genres per movie.
- Now let's collect all the distinct genres so that we can make a separate column for each of them.

In [None]:
set_of_genre=set()
for i in df["genre"]:
  for j in i:
    set_of_genre.add(j.strip().lower())

list_of_genre=list(set_of_genre)


In [None]:
print(list_of_genre)

In [None]:
print(len(list_of_genre))

- there are 21 types of genres , lets see what we can do with them

In [None]:
#some Nan values in metascore
#lets see how much nan values are there
df["metascore"].isnull().sum()

- 2007 values are null

In [None]:
#Directors column contain directors name as well as stars name . we have to correct that and check if it is true for all columns
df[["director","star"]]

- Our speculation is correct.
- Now let's keep only the items of the "director" column which are not in "star".

In [None]:
# the problem is both "director" and "star" column cantains string
#we have to convert it in list using literal eval
import ast
def str_lst(x):
  """Turn the string form of a Python list (e.g. "['a', 'b']") into a list.

  Args:
    x: A cell value. Normally a string holding a list literal; a value that
      is already a list, ``None`` or NaN is also accepted.

  Returns:
    The parsed list. A list input is returned unchanged, which makes
    re-running the calling cell harmless, and a missing value (``None`` or
    NaN) yields an empty list.

  Raises:
    ValueError, SyntaxError: If ``x`` is a string that is not a valid literal.
  """
  if isinstance(x,list):
    return x
  # NaN is the only float that is not equal to itself.
  if x is None or (isinstance(x,float) and x!=x):
    return []
  return ast.literal_eval(x)

In [None]:
df["director"]=df["director"].apply(str_lst)
df["star"]=df["star"].apply(str_lst)

In [None]:
# Snapshot the parsed "director" lists before anything is reduced. The cells
# below store this snapshot as "old_director", count the entries per row, and
# only then shrink "director" to its first entry. Reducing the column here as
# well would make that later step keep just the first letter of each name.
list_of_directors=df["director"].tolist()



In [None]:
df["old_director"]=list_of_directors

In [None]:
df["no_of_directors"]=df["director"].apply(len)

In [None]:
df["no_of_directors"].value_counts()

In [None]:
df["director"]=df["director"].apply(lambda x:x[0])

In [None]:
df["director"].value_counts()

- we extract the directors

In [None]:
df["star"][0]

In [None]:
l=[]
for i in df["star"]:
  l.append(len(i))
df["len_of_star"]=l

In [None]:
df["len_of_star"].value_counts()

In [None]:
df[df["len_of_star"]==0]

- There are three movies which don't have any stars.
- Two of them are animation films, and the third one stars [Sara Cushman, Don Hertzfeldt]. I fill in the stars for that movie manually and delete the other two.

In [None]:
# .at writes a single cell in place, so a list can be stored as one value.
# Chained indexing (df["star"][287]=...) may write to a temporary copy instead.
df.at[287,"star"]=["Sara Cushman", "Don Hertzfeldt"]

In [None]:
df["star"][287]

- didn't know that inserting a list to a dataframe is such a pain

In [None]:
# Drops the rows whose index labels are 13 and 27, in place.
# NOTE(review): presumably these are the two star-less animation films mentioned
# above - confirm against the output of df[df["len_of_star"]==0] before re-running.
df.drop([13,27],inplace=True)

In [None]:
df.reset_index(drop=True,inplace=True)

In [None]:
df.shape

In [None]:
df.drop(["len_of_star","no_of_genre","no_of_directors"],axis=1,inplace=True)

- 2831 rows have the same value in "vote" and "gross".
- We are not going to use these columns in our recommendations, so it's not a problem.
- I think these columns would be useful if we built a gross-prediction model; in that case we would have to drop those rows.


## some values in year column are not year

In [None]:
df["year"].value_counts()

In [None]:
df["year"]=df["year"].astype(str)

In [None]:
# Keep only rows whose "year" is purely digits. The explicit copy makes the
# result an independent frame, so the column assignment in the next cell
# does not act on a view of the unfiltered data.
df=df[df["year"].str.isdigit()].copy()

In [None]:
df["year"]=df["year"].astype(int)

In [None]:
df=df[df["year"]>1800]

In [None]:
df.shape

In [None]:
df.to_csv("preprocessed_data.csv",index=False)

# Model Building

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models import Word2Vec
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Read from the same relative path the preprocessing section saved to, so this
# cell does not depend on the working directory being /content.
temp_df=pd.read_csv("preprocessed_data.csv")
df=temp_df.copy()

In [None]:
# list of columns that we want to keep
#["title","genre","plot","director","star","year","writer"]

In [None]:
# Copy the selection: the following cells overwrite several of these columns,
# which should not happen on a view of the wider frame.
df=df[["title","genre","plot","director","star","year","writer"]].copy()

In [None]:
import ast
def str_lst(x):
  """Turn the string form of a Python list (e.g. "['a', 'b']") into a list.

  Args:
    x: A cell value. Normally a string holding a list literal; a value that
      is already a list, ``None`` or NaN is also accepted.

  Returns:
    The parsed list. A list input is returned unchanged, which makes
    re-running the calling cell harmless, and a missing value (``None`` or
    NaN, e.g. an empty CSV cell) yields an empty list.

  Raises:
    ValueError, SyntaxError: If ``x`` is a string that is not a valid literal.
  """
  if isinstance(x,list):
    return x
  # NaN is the only float that is not equal to itself.
  if x is None or (isinstance(x,float) and x!=x):
    return []
  return ast.literal_eval(x)

In [None]:
df["genre"]=df["genre"].apply(str_lst)
df["star"]=df["star"].apply(str_lst)
df["writer"]=df["writer"].apply(str_lst)

## Vectorization
- plot: word2vec
- ["genre","director","star","year","writer"]: Count Vectorizer

# Prepare the columns

In [None]:
def prepare_list(x):
  """Flatten a list of names into one string of lowercase, space-free tokens.

  Each item is lowercased and has its inner spaces removed, so a multi-word
  name becomes a single token; the tokens are then joined with single spaces.
  """
  return " ".join(name.lower().replace(" ","") for name in x)

In [None]:
for col in ["genre","star","writer"]:
  df[col]=df[col].apply(prepare_list)

In [None]:
def puntuation_remover(x):
  """Replace every punctuation character in ``x`` with a single space.

  Args:
    x: The text to clean.

  Returns:
    A string of the same length as ``x`` in which each character listed in
    ``punc`` has been swapped for a space; all other characters are unchanged.
  """
  # Raw string: the backslash is one of the characters to strip, and writing
  # "\," in a normal string literal is an invalid escape sequence.
  punc = r'''!()-[]{};:'"\,<>./?@#$%^&*_~'''
  # One translation pass instead of a str.replace call per character.
  return x.translate(str.maketrans(punc, " "*len(punc)))

In [None]:
df["plot"]=df["plot"].apply(puntuation_remover)

In [None]:
def prepare_words(x):
  """Lowercase every space-separated piece of ``x``, keeping the spacing as is."""
  return " ".join(map(str.lower, x.split(" ")))


In [None]:
df["plot"]=df["plot"].apply(prepare_words)

In [None]:
df["plot"][0]

In [None]:
df["director"]=df["director"].str.replace(" ","").str.lower()

In [None]:
# Manually process Word2Vec for the "plot" column
# Each plot becomes a list of whitespace-separated tokens.
plot_sentences = [plot.split() for plot in df['plot']]
word2vec_model = Word2Vec(sentences=plot_sentences, vector_size=100, window=5, min_count=1)

# A plot is represented by the mean of its word vectors. A plot with no tokens
# has nothing to average (np.mean of an empty list is NaN), so it gets an
# all-zero vector to keep the matrix rectangular and NaN-free.
empty_plot_vector = np.zeros(word2vec_model.wv.vector_size, dtype=np.float32)
plot_vectors = np.array([
    np.mean([word2vec_model.wv[word] for word in words], axis=0) if words else empty_plot_vector
    for words in plot_sentences
])


In [None]:
# 1. CountVectorizer on genre, director, star and writer
# Initialize separate CountVectorizer instances for each column
# (one instance per column, so each column is fitted independently).
genre_vectorizer = CountVectorizer()
director_vectorizer = CountVectorizer()
star_vectorizer = CountVectorizer()
writer_vectorizer = CountVectorizer()
scaler=MinMaxScaler()

# Vectorize each categorical column separately
# The earlier cells turned every value in these columns into a string of
# lowercase tokens with the spaces inside each name removed.
# NOTE(review): .toarray() presumably yields dense arrays; watch memory on large vocabularies.
genre_vectors = genre_vectorizer.fit_transform(df['genre']).toarray()
director_vectors = director_vectorizer.fit_transform(df['director']).toarray()
star_vectors = star_vectorizer.fit_transform(df['star']).toarray()
writer_vectors = writer_vectorizer.fit_transform(df['writer']).toarray()

# Scale the numerical 'year' column
# Double brackets pass a one-column DataFrame rather than a Series.
year_scaled = scaler.fit_transform(df[['year']])

In [None]:
combined_vectors = np.hstack((plot_vectors,genre_vectors, director_vectors, star_vectors, writer_vectors, year_scaled))


In [None]:
combined_vectors.shape

In [None]:
similarity = cosine_similarity(combined_vectors)

In [None]:
def recommend(movie, n=5):
  """Return the titles of the movies most similar to ``movie``.

  Relies on the module-level ``df`` and ``similarity``; row ``i`` of
  ``similarity`` is read as belonging to the ``i``-th row of ``df`` by
  position, so the lookup is positional and does not depend on index labels.

  Args:
    movie: Exact title to look up in ``df['title']``. If several rows share
      the title, the first one is used.
    n: How many recommendations to return (default 5).

  Returns:
    A list of up to ``n`` titles, most similar first. The queried row itself
    is never included.

  Raises:
    IndexError: If no row of ``df`` has the given title.
  """
  matches = np.flatnonzero((df['title'] == movie).to_numpy())
  if matches.size == 0:
    raise IndexError(f"No movie titled {movie!r} found in df['title']")
  position = matches[0]

  # Stable sort on the negated scores = descending order with ties kept in
  # their original row order.
  ranked = np.argsort(-np.asarray(similarity[position]), kind="stable")
  # Drop the queried row explicitly rather than assuming it sorts first.
  ranked = ranked[ranked != position][:n]
  return df['title'].iloc[ranked].tolist()

In [None]:
movie=recommend('Year One')

In [None]:
df[df["title"].isin(movie)]

In [None]:
import pickle

In [None]:
# The with-block closes the file as soon as the dump finishes, so the data is
# flushed to disk and the handle is not left open.
with open("similarity.pkl","wb") as f:
  pickle.dump(similarity,f)

In [None]:
from sklearn.neighbors import NearestNeighbors

# Initialize the model
knn = NearestNeighbors(metric='cosine', algorithm='brute')

# Fit the model on your combined vectors
knn.fit(combined_vectors)

In [None]:
# Number of neighbors to find
k = 5

movie="Batman Begins"
# Positional index of the target movie. combined_vectors is indexed by
# position, so the row position is used rather than the index label.
target_index = np.flatnonzero((df['title'] == movie).to_numpy())[0]

# Ask for one extra neighbor because the target movie is normally returned too
distances, indices = knn.kneighbors([combined_vectors[target_index]], n_neighbors=k+1)

# Remove the target movie by index instead of assuming it is the first result
neighbors = [
    (neighbor_index, distance)
    for neighbor_index, distance in zip(indices.flatten(), distances.flatten())
    if neighbor_index != target_index
][:k]

print(f"Top {k} recommended movies (by KNN):")
for neighbor_index, distance in neighbors:
    print(f"Movie: {df.iloc[neighbor_index]['title']}, Distance: {distance}")


In [None]:
import joblib

# Assuming 'knn' is your trained KNN model
joblib.dump(knn, 'knn_model.pkl')
