# Model for content based filtering
This notebook processes the movie dataset and builds an ML model for content-based filtering.

## Loading data from csv

In [2]:
import pandas as pd
# Read the movie dataset and use its "id" column as the row index
df = pd.read_csv("dataset_movies.csv", index_col="id")
# Compute the mean of each column, truncated to an integer.
# NOTE(review): the means are taken before the 0 entries are replaced below,
# so those zeros are included in the average — confirm this is intended.
runtime_mean = int(df["runtime"].mean())
budget_mean = int(df["budget"].mean())
revenue_mean = int(df["revenue"].mean())

# Substitute 0 entries (treated here as missing values) with the column mean.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on df[col]: that form is a chained in-place
# operation, which emits a FutureWarning in pandas 2.x and no longer updates
# df once Copy-on-Write is enabled.
df["runtime"] = df["runtime"].replace(0, runtime_mean)
df["budget"] = df["budget"].replace(0, budget_mean)
df["revenue"] = df["revenue"].replace(0, revenue_mean)
df

Unnamed: 0_level_0,budget,original_title,popularity,release_date,revenue,runtime,vote_average,vote_count,Action,Adventure,...,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,2787530,Ariel,8.165,10/21/1988,7141894,73,6.8,157,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2787530,Varjoja paratiisissa,8.509,10/17/1986,7141894,74,7.2,158,0,0,...,0,0,0,0,0,0,0,0,0,0
5,4000000,Four Rooms,14.441,12/9/1995,4257354,98,5.7,2127,0,0,...,0,0,0,0,0,0,0,0,0,0
6,21000000,Judgment Night,13.336,10/15/1993,12136938,110,6.5,230,1,0,...,0,0,0,0,0,0,0,1,0,0
8,42000,Life in Loops (A Megacities RMX),2.352,1/1/2006,7141894,80,7.5,18,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84359,2787530,Not Like Us,0.697,8/15/1995,7141894,87,4.8,2,0,0,...,0,1,0,0,0,1,0,0,0,0
84360,2787530,The Ghastly Ones,2.414,9/6/1968,7141894,81,4.2,9,0,0,...,0,1,0,0,0,0,0,0,0,0
84361,2787530,Partizani,3.124,7/17/1974,7141894,104,6.3,8,0,0,...,0,0,0,0,0,0,0,1,1,0
84362,2787530,El caníbal,7.851,12/5/1980,7141894,102,4.5,21,0,0,...,0,1,0,0,0,0,0,0,0,0


## Preprocessing data for the ML model

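### Drop dates before 1900
We drop movies released before 1900. In this dataset they are the rows whose `release_date` is written in `yyyy-mm-dd` format, so they are located by matching that format.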

In [3]:
import re
def find_1800(date):
    """Tell whether `date` contains a dash-separated "dddd-dd-dd" date.

    The pattern is four digits, a dash, two digits, a dash and two digits
    (for example "1898-01-01"); it may occur anywhere inside the string.

    Returns True when at least one occurrence is found, False otherwise.
    """
    dashed_date = re.search(r"[0-9]{4}-[0-9]{2}-[0-9]{2}", date)
    return dashed_date is not None

# Flag the rows whose release date is in "yyyy-mm-dd" format. The mask is
# computed once and reused for both the report and the drop.
is_dashed_date = df["release_date"].apply(find_1800)
drop_index = df.index[is_dashed_date]

print("Rows to drop.")
print(df["release_date"][is_dashed_date])
# Reassign instead of dropping in place so the step has no hidden mutation
df = df.drop(index=drop_index)

print("-----------------------------------------")
print("Rows in 'yyyy-mm-dd' format after drop")
# Re-evaluate on the reduced frame to confirm no row in that format remains
print(df["release_date"][df["release_date"].apply(find_1800)])

Rows to drop.
id
35975    1898-01-01
49295    1899-01-01
49296    1898-01-01
82120    1895-06-10
Name: release_date, dtype: object
-----------------------------------------
Rows in 'yyyy-mm-dd' format after drop
Series([], Name: release_date, dtype: object)


### Extracting title and release date column
We will use this dataframe to look up a movie's title and release date by its id.

In [4]:
# Lookup table, indexed like df, holding only the title and release date
titles = df.loc[:, ["original_title", "release_date"]]
titles

Unnamed: 0_level_0,original_title,release_date
id,Unnamed: 1_level_1,Unnamed: 2_level_1
2,Ariel,10/21/1988
3,Varjoja paratiisissa,10/17/1986
5,Four Rooms,12/9/1995
6,Judgment Night,10/15/1993
8,Life in Loops (A Megacities RMX),1/1/2006
...,...,...
84359,Not Like Us,8/15/1995
84360,The Ghastly Ones,9/6/1968
84361,Partizani,7/17/1974
84362,El caníbal,12/5/1980


### Eliminate title column from dataframe

In [5]:
# Remove the title column from the feature frame. errors="ignore" keeps the
# cell re-runnable: an in-place drop raises KeyError the second time it runs
# because the column is already gone.
df = df.drop(columns="original_title", errors="ignore")
df

Unnamed: 0_level_0,budget,popularity,release_date,revenue,runtime,vote_average,vote_count,Action,Adventure,Animation,...,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,2787530,8.165,10/21/1988,7141894,73,6.8,157,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2787530,8.509,10/17/1986,7141894,74,7.2,158,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,4000000,14.441,12/9/1995,4257354,98,5.7,2127,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,21000000,13.336,10/15/1993,12136938,110,6.5,230,1,0,0,...,0,0,0,0,0,0,0,1,0,0
8,42000,2.352,1/1/2006,7141894,80,7.5,18,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84359,2787530,0.697,8/15/1995,7141894,87,4.8,2,0,0,0,...,0,1,0,0,0,1,0,0,0,0
84360,2787530,2.414,9/6/1968,7141894,81,4.2,9,0,0,0,...,0,1,0,0,0,0,0,0,0,0
84361,2787530,3.124,7/17/1974,7141894,104,6.3,8,0,0,0,...,0,0,0,0,0,0,0,1,1,0
84362,2787530,7.851,12/5/1980,7141894,102,4.5,21,0,0,0,...,0,1,0,0,0,0,0,0,0,0


### Transform date to ordinal

In [6]:
from datetime import datetime

def date_to_ordinal(date):
    """Convert a date string in "%m/%d/%Y" format to its ordinal day number.

    Parameters
    ----------
    date : str or int
        A date such as "10/21/1988", the placeholder string "0", or an int
        that is already an ordinal.

    Returns
    -------
    int
        ``datetime.toordinal()`` of the parsed date; 0 for the placeholder
        "0"; the value itself when an int is passed.

    Raises
    ------
    ValueError
        If a string other than "0" does not match "%m/%d/%Y".
    """
    # Already-converted values pass through unchanged, so applying this
    # function to a column twice (e.g. re-running the cell) does not fail.
    if isinstance(date, int):
        return date
    # "0" is kept as the 0 sentinel instead of being parsed.
    if date == "0":
        return 0
    return datetime.strptime(date, "%m/%d/%Y").toordinal()

# Convert every release date element-wise; Series.map with a plain function
# visits each value just like Series.apply does
df["release_date"] = df["release_date"].map(date_to_ordinal)
df

Unnamed: 0_level_0,budget,popularity,release_date,revenue,runtime,vote_average,vote_count,Action,Adventure,Animation,...,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,2787530,8.165,726031,7141894,73,6.8,157,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2787530,8.509,725296,7141894,74,7.2,158,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,4000000,14.441,728636,4257354,98,5.7,2127,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,21000000,13.336,727851,12136938,110,6.5,230,1,0,0,...,0,0,0,0,0,0,0,1,0,0
8,42000,2.352,732312,7141894,80,7.5,18,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84359,2787530,0.697,728520,7141894,87,4.8,2,0,0,0,...,0,1,0,0,0,1,0,0,0,0
84360,2787530,2.414,718681,7141894,81,4.2,9,0,0,0,...,0,1,0,0,0,0,0,0,0,0
84361,2787530,3.124,720821,7141894,104,6.3,8,0,0,0,...,0,0,0,0,0,0,0,1,1,0
84362,2787530,7.851,723154,7141894,102,4.5,21,0,0,0,...,0,1,0,0,0,0,0,0,0,0


### Replace 0 release dates with the mean ordinal date

In [7]:
# Mean ordinal release date, truncated to an integer.
# NOTE(review): the mean is taken before the 0 entries are replaced, so
# those zeros are included in it — confirm this is intended.
release_date_mean = int(df["release_date"].mean())
# Assign the result back rather than using replace(..., inplace=True) on
# df["release_date"]: the chained in-place form emits a FutureWarning in
# pandas 2.x and no longer updates df once Copy-on-Write is enabled.
df["release_date"] = df["release_date"].replace(0, release_date_mean)
df

Unnamed: 0_level_0,budget,popularity,release_date,revenue,runtime,vote_average,vote_count,Action,Adventure,Animation,...,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,2787530,8.165,726031,7141894,73,6.8,157,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2787530,8.509,725296,7141894,74,7.2,158,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,4000000,14.441,728636,4257354,98,5.7,2127,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,21000000,13.336,727851,12136938,110,6.5,230,1,0,0,...,0,0,0,0,0,0,0,1,0,0
8,42000,2.352,732312,7141894,80,7.5,18,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84359,2787530,0.697,728520,7141894,87,4.8,2,0,0,0,...,0,1,0,0,0,1,0,0,0,0
84360,2787530,2.414,718681,7141894,81,4.2,9,0,0,0,...,0,1,0,0,0,0,0,0,0,0
84361,2787530,3.124,720821,7141894,104,6.3,8,0,0,0,...,0,0,0,0,0,0,0,1,1,0
84362,2787530,7.851,723154,7141894,102,4.5,21,0,0,0,...,0,1,0,0,0,0,0,0,0,0


### Normalize data using StandardScaler

In [8]:
from sklearn.preprocessing import StandardScaler

# Standardize only the first 7 columns with (x - mu) / sigma
scaler = StandardScaler()
numeric_part = df.iloc[:, :7]

# Fit and transform in one step, keeping the column names and the id index
scaled_part = pd.DataFrame(
    scaler.fit_transform(numeric_part),
    columns=numeric_part.columns,
    index=df.index.rename("id"),
)

# Put the standardized columns next to the remaining columns (position 7 on)
df_standard = pd.concat([scaled_part, df.iloc[:, 7:]], axis=1)
df_standard

Unnamed: 0_level_0,budget,popularity,release_date,revenue,runtime,vote_average,vote_count,Action,Adventure,Animation,...,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,-0.177502,0.223050,-0.104358,-0.131162,-0.581701,0.672688,-0.020399,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,-0.177502,0.252091,-0.196775,-0.131162,-0.553708,0.854488,-0.019327,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,-0.085400,0.752883,0.223191,-0.192338,0.118125,0.172738,2.092590,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,1.205960,0.659596,0.124486,-0.025226,0.454041,0.536338,0.057899,1,0,0,...,0,0,0,0,0,0,0,1,0,0
8,-0.386059,-0.267695,0.685405,-0.131162,-0.385749,0.990838,-0.169488,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84359,-0.177502,-0.407414,0.208605,-0.131162,-0.189798,-0.236312,-0.186650,0,0,0,...,0,1,0,0,0,1,0,0,0,0
84360,-0.177502,-0.262461,-1.028534,-0.131162,-0.357756,-0.509011,-0.179142,0,0,0,...,0,1,0,0,0,0,0,0,0,0
84361,-0.177502,-0.202521,-0.759454,-0.131162,0.286083,0.445438,-0.180214,0,0,0,...,0,0,0,0,0,0,0,1,1,0
84362,-0.177502,0.196542,-0.466107,-0.131162,0.230097,-0.372661,-0.166271,0,0,0,...,0,1,0,0,0,0,0,0,0,0


## Using a distance metric to find the top 10 movies

### Looking up the movie id

In [26]:
from jellyfish import levenshtein_distance

# Title to search for among the movie names
movie_name = "Batman: The Dark Knight"

def autocomplete_movies(string):
    """Return the Levenshtein distance between `movie_name` and `string`.

    `movie_name` is read from the module-level variable at call time;
    a smaller result means the two titles are closer.
    """
    distance = levenshtein_distance(movie_name, string)
    return distance

# Ids of the 10 titles with the smallest distance to the searched name
title_distances = titles["original_title"].apply(autocomplete_movies)
idx = title_distances.sort_values().iloc[:10].index
titles.loc[idx]

Unnamed: 0_level_0,original_title,release_date
id,Unnamed: 1_level_1,Unnamed: 2_level_1
13851,Batman: Gotham Knight,7/8/2008
50556,Batbabe: The Dark Nightie,2/24/2009
72003,The Dark Knight,7/11/2011
155,The Dark Knight,7/14/2008
69735,Batman: Year One,9/27/2011
35692,The Black Knight,8/26/1954
37935,One Dark Night,12/31/1982
60597,Red: The Dark Side,3/9/2007
78461,Beneath the Darkness,1/6/2012
13362,Taxi to the Dark Side,4/30/2007


### Using Euclidean distance to find similar movies

In [27]:
from sklearn.metrics.pairwise import euclidean_distances

movie_id = 13851 # Movie id to test
selected_movie = titles.loc[movie_id]
print("Selected movie:",
      selected_movie.values[0],
      selected_movie.values[1])

n = 10      # Number of similar movies

# Euclidean distance from the selected movie's feature row to every row of
# df_standard; the result has one row and one column per movie.
distances = euclidean_distances(
    df_standard.loc[movie_id].values.reshape(1, -1),
    df_standard)
# Label the columns with df_standard's own index, since the distances were
# computed over df_standard's rows.
distances = pd.DataFrame(distances,columns=df_standard.index)

# Remove the selected movie by id before ranking. Skipping "the first sorted
# entry" assumes the movie sorts first, which is not guaranteed when another
# movie has identical features (distance 0 as well).
recomendations = distances.iloc[0].drop(movie_id).sort_values().iloc[:n]

# Get names of movies
top = titles.loc[recomendations.index]
top.columns = ["Title","Release Date"] # Rename columns
top

Selected movie: Batman: Gotham Knight 7/8/2008


Unnamed: 0_level_0,Title,Release Date
id,Unnamed: 1_level_1,Unnamed: 2_level_1
30061,Justice League: Crisis on Two Earths,2/23/2010
14611,Ultimate Avengers 2,8/8/2006
56590,All Star Superman,2/22/2011
14609,Ultimate Avengers: The Movie,2/21/2006
13647,The Invincible Iron Man,1/23/2007
14092,攻殻機動隊 2.0,7/12/2008
17445,Green Lantern: First Flight,7/28/2009
23446,ワンピース ねじまき島の冒険,3/3/2001
14613,Next Avengers: Heroes of Tomorrow,9/2/2008
14011,Justice League: The New Frontier,2/26/2008
