## John Lehne
## CA06

In [1]:
# importing packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
#read data using URL
url = 'https://github.com/ArinB/CA05-kNN/raw/master/movies_recommendation_data.csv' 
data = pd.read_csv(url, encoding = "ISO-8859-1")

#making sure data was read correctly
data.head()

Unnamed: 0,Movie ID,Movie Name,IMDB Rating,Biography,Drama,Thriller,Comedy,Crime,Mystery,History,Label
0,58,The Imitation Game,8.0,1,1,1,0,0,0,0,0
1,8,Ex Machina,7.7,0,1,0,0,0,1,0,0
2,46,A Beautiful Mind,8.2,1,1,0,0,0,0,0,0
3,62,Good Will Hunting,8.3,0,1,0,0,0,0,0,0
4,97,Forrest Gump,8.8,0,1,0,0,0,0,0,0


## Data Quality Analysis

In [3]:
# Number of total rows and columns
data.shape

(30, 11)

In [4]:
# Range, column, number of non-null objects, data type, memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Movie ID     30 non-null     int64  
 1   Movie Name   30 non-null     object 
 2   IMDB Rating  30 non-null     float64
 3   Biography    30 non-null     int64  
 4   Drama        30 non-null     int64  
 5   Thriller     30 non-null     int64  
 6   Comedy       30 non-null     int64  
 7   Crime        30 non-null     int64  
 8   Mystery      30 non-null     int64  
 9   History      30 non-null     int64  
 10  Label        30 non-null     int64  
dtypes: float64(1), int64(9), object(1)
memory usage: 2.7+ KB


In [5]:
# Number of null values for each column
data.isnull().sum()

Movie ID       0
Movie Name     0
IMDB Rating    0
Biography      0
Drama          0
Thriller       0
Comedy         0
Crime          0
Mystery        0
History        0
Label          0
dtype: int64

In [6]:
# Descriptive statistics for the dataset
data.describe()

Unnamed: 0,Movie ID,IMDB Rating,Biography,Drama,Thriller,Comedy,Crime,Mystery,History,Label
count,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0
mean,48.133333,7.696667,0.233333,0.6,0.1,0.1,0.133333,0.1,0.1,0.0
std,29.288969,0.666169,0.430183,0.498273,0.305129,0.305129,0.345746,0.305129,0.305129,0.0
min,1.0,5.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,27.75,7.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,48.5,7.75,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,64.25,8.175,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
max,98.0,8.8,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0


## Prepping the Data for Modeling

In [24]:
# splitting the data into x and y values
# kNN does not take non-numeric values so movie names must be removed from data
x_data = data.loc[:, ['IMDB Rating', 'Biography', 'Drama', 'Thriller', 
                   'Comedy', 'Crime', 'Mystery', 'History']]

y_data = data.loc[:, ['Movie Name']]

In [26]:
# checking table to make sure it is correct
x_data.head()

Unnamed: 0,IMDB Rating,Biography,Drama,Thriller,Comedy,Crime,Mystery,History
0,8.0,1,1,1,0,0,0,0
1,7.7,0,1,0,0,0,1,0
2,8.2,1,1,0,0,0,0,0
3,8.3,0,1,0,0,0,0,0
4,8.8,0,1,0,0,0,0,0


In [27]:
# checking table to make sure it is correct
y_data.head()

Unnamed: 0,Movie Name
0,The Imitation Game
1,Ex Machina
2,A Beautiful Mind
3,Good Will Hunting
4,Forrest Gump


## kNN Model

In [9]:
# importing packages from sklearn
from sklearn.neighbors import NearestNeighbors

In [28]:
# building the model with the number of neighbors being 5
recommend = NearestNeighbors(n_neighbors = 5)

# fitting the model to the data
recommend.fit(x_data, y_data)


NearestNeighbors()

In [29]:
# Values in the dataset for "The Post"
movie_values = [7.2, 1, 1, 0, 0, 0, 0, 1]

In [43]:
# returning rows of the recommended movies that are similar to "The Post"
movie_loc = recommend.kneighbors([movie_values], return_distance=False)
movie_loc

  "X does not have valid feature names, but"


array([[28, 27, 29, 16,  2]])

In [46]:
# movie_loc returns as an array
# converting array to a list
nest_list = movie_loc.tolist()
print(nest_list)

# using tolist() makes the array a nested list
# changing the nested list into a standard list
movie_rows = nest_list[0]
print(movie_rows)

[[28, 27, 29, 16, 2]]
[28, 27, 29, 16, 2]


In [53]:
# Returning the movie names based on the row number of the recommender
for row_num in movie_rows:
  print(data.iloc[row_num]['Movie Name'])

12 Years a Slave
Hacksaw Ridge
Queen of Katwe
The Wind Rises
A Beautiful Mind


## Automating Recommender in a Function

In [85]:
# using the code above, a function was created to return values from a specifed movie in the dataset and return similar movies
def recommender(movie_name, neighbors):
  from sklearn.neighbors import NearestNeighbors
  recommend = NearestNeighbors(n_neighbors = neighbors+1) 
  # need to +1 to neighbors because the kNN model will pull the movie selected as movie_name as the nearest neighbor, which will be dropped later
  recommend.fit(x_data, y_data)
  movie_values = []
  for index, value in data[data['Movie Name'] == movie_name].iterrows():
    movie_values.append([value['IMDB Rating'], value['Biography'], value['Drama'], 
                      value['Thriller'], value['Comedy'], value['Crime'], 
                      value['Mystery'], value['History']])
  movie_criteria = movie_values[0]
  movie_loc = recommend.kneighbors([movie_criteria], return_distance=False)
  nest_list = movie_loc.tolist()
  movie_list = nest_list[0]
  movie_rows = movie_list[1:]
  # need to remove the first record because the kNN model will pull the movie selected as movie_name to be the nearest neighbor (first record)
  print('Here are some recommended movies: \n')
  for row_num in movie_rows:
    print(data.iloc[row_num]['Movie Name'])


In [86]:
# testing the recommender function
recommender('Ex Machina', 6)

Here are some recommended movies: 

Gifted
Finding Forrester
Stand and Deliver
A Brilliant Young Mind
The Karate Kid
Good Will Hunting


  "X does not have valid feature names, but"
