In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import operator
from scipy import spatial
from sklearn.neighbors import NearestNeighbors
from scipy.spatial import minkowski_distance, distance
!pip install imdbpy
from imdb import IMDb
ia = IMDb()

Collecting imdbpy
  Downloading IMDbPY-2021.4.18-py3-none-any.whl (298 kB)
[K     |████████████████████████████████| 298 kB 516 kB/s 
Installing collected packages: imdbpy
Successfully installed imdbpy-2021.4.18


2021-11-05 06:30:13,501 INFO [imdbpy.parser.http.piculet] /opt/conda/lib/python3.7/site-packages/imdb/parser/http/piculet.py:262: using lxml


In [2]:
# Load the three input tables. `actors` holds the per-actor feature columns
# and is indexed by actor_ID so rows can be fetched with actors.loc[<id>];
# `actorIDs` maps actor_ID <-> actor_name; `kdramas` carries the per-drama
# 'genres' strings.
kdramas = pd.read_csv("../input/checking-on-derived-dataset/KDrama.csv")
actorIDs = pd.read_csv("../input/checking-on-derived-dataset/k-actors.csv")
actors = pd.read_csv("../input/feature-creation/actor_features.csv").set_index('actor_ID')

In [3]:
# Collect every distinct genre label in first-seen order. Each drama's
# 'genres' cell is split on ',' and the pieces are used exactly as split
# (no whitespace trimming). dict.fromkeys de-duplicates while keeping
# insertion order.
genres = list(dict.fromkeys(
    label
    for genre_field in kdramas['genres']
    for label in genre_field.split(',')
))

# Per-genre feature column names that are looked up in `actors`.
cols_count = [f'{label}_count' for label in genres]
cols_avgrating = [f'{label}_avgRating' for label in genres]
cols_all = cols_count + cols_avgrating

# Columns of `actors` whose name contains "Count" (case-sensitive match),
# and the three role-total columns.
cols_lead = actors.columns[actors.columns.str.contains("Count")]
cols_posAcc = ['leadTotal', 'secTotal', 'thirdTotal']

In [4]:
# Turn raw counts into per-actor ratios by dividing by the actor's total
# number of dramas. The groups are processed in order: genre counts, the
# "Count" columns, then the lead/second/third totals.
# NOTE: this mutates `actors` in place, so running the cell a second time
# without reloading the data divides the same columns again.
total_dramas = actors['total_drama']
for column_group in (cols_count, cols_lead, cols_posAcc):
    for col in column_group:
        actors[col] = actors[col] / total_dramas

# Mean rating over all of an actor's dramas.
actors['avgRating'] = actors['total_rating'] / total_dramas

# Rescale recency with a min-max scaler (default output range 0..1); the
# scaler expects a 2-D array, hence the single-column reshape.
scaler = MinMaxScaler()
recency_2d = actors['recency'].values.reshape(-1, 1)
actors['recency'] = scaler.fit_transform(recency_2d)

In [5]:
def getNeighborsByCols(actorID, K, columns = cols_count):
    """Return the K actors nearest to `actorID` on a subset of feature columns.

    Distance is scipy's Minkowski distance with its default order (p=2,
    i.e. Euclidean) between the rows of the module-level `actors` frame,
    restricted to `columns`.

    Parameters
    ----------
    actorID : index label of the query actor in `actors`.
    K : int
        Maximum number of neighbours to return. If fewer than K other
        actors exist, all of them are returned (previously this raised
        IndexError). K <= 0 yields an empty list.
    columns : list-like of column names, default `cols_count`
        Feature columns to compare on.

    Returns
    -------
    list of (actor_id, distance) tuples, sorted by ascending distance.
    """
    vec_actor = actors.loc[actorID, columns]

    # Distance from the query actor to every other actor (self excluded).
    distances = [
        (cand_id, distance.minkowski(vec_actor, vec_cand))
        for cand_id, vec_cand in actors[columns].iterrows()
        if actorID != cand_id
    ]
    distances.sort(key=operator.itemgetter(1))

    # Slice instead of indexing one-by-one so an oversized K cannot run past
    # the end of the list; max() keeps negative K from slicing off the tail.
    return distances[:max(K, 0)]

def Similarity(actorID1, actorID2):
    """Combined distance between two actors in the module-level `actors` frame.

    Despite the name, the result is a distance: it is the sum of three
    Minkowski distances (scipy default order, p=2), one per feature group --
    genre counts (`cols_count`), role totals (`cols_posAcc`) and per-genre
    average ratings (`cols_avgrating`). Smaller values mean the two actors
    are more alike.
    """
    first = actors.loc[actorID1]
    second = actors.loc[actorID2]

    # Same group order as the individual terms: count, lead, rating.
    feature_groups = (cols_count, cols_posAcc, cols_avgrating)
    return sum(
        distance.minkowski(first[group], second[group])
        for group in feature_groups
    )

def getNeighbors(actorID, K):
    """Return the K actors nearest to `actorID` under the combined `Similarity` score.

    Parameters
    ----------
    actorID : index label of the query actor in the module-level `actors`.
    K : int
        Maximum number of neighbours to return. If fewer than K other
        actors exist, all of them are returned (previously this raised
        IndexError). K <= 0 yields an empty list.

    Returns
    -------
    list of (actor_id, actor_name, score) tuples sorted by ascending score,
    where score is the value returned by `Similarity` (lower = closer).
    """
    # Score every other actor against the query actor (self excluded).
    distances = [
        (index, actor_cand['actor_name'], Similarity(actorID, index))
        for index, actor_cand in actors.iterrows()
        if actorID != index
    ]
    distances.sort(key=operator.itemgetter(2))

    # Slice instead of indexing one-by-one so an oversized K cannot run past
    # the end of the list; max() keeps negative K from slicing off the tail.
    return distances[:max(K, 0)]

In [6]:
# Look up an actor by (partial) name in the actor ID table, then print the
# K nearest actors under each feature group and under the combined score.
selectActor = "Gong Yoo"
K = 5

# Literal substring match: regex=False keeps characters such as '.' or '('
# in a name from being interpreted as a pattern, and na=False makes rows
# with a missing name count as "no match" instead of breaking the mask.
matching_ids = actorIDs.loc[
    actorIDs['actor_name'].str.contains(selectActor, regex=False, na=False),
    'actor_ID',
]
if matching_ids.empty:
    raise ValueError(f"No actor found with a name containing {selectActor!r}")
# When several names match, the first one in the table is used.
id_ = matching_ids.values[0]

descs = ['Drama count per Genre', 'Lead/Not Accumulative', 'Avg Rating per Genre']
columns = [cols_count, cols_posAcc, cols_avgrating]

print("Actor Name:", selectActor)
for desc, feature_cols in zip(descs, columns):
    print("Closest actors by", desc)
    closest_actors = getNeighborsByCols(id_, K, feature_cols)
    # Each neighbour is an (actor_id, distance) tuple; the name is resolved
    # through the ID table.
    for cand_id, _dist in closest_actors:
        cand_name = actorIDs.loc[actorIDs['actor_ID'] == cand_id, 'actor_name'].values[0]
        print("Actor ID:", cand_id, ", Actor Name:", cand_name)
    print("")

print("Overall")
# Use the same K as the per-group listings rather than a separate literal.
overall = getNeighbors(id_, K)
for cand_id, cand_name, _dist in overall:
    print("Actor ID:", cand_id, ", Actor Name:", cand_name)

Actor Name: Gong Yoo
Closest actors by Drama count per Genre
Actor ID: 10652418 , Actor Name: Jae-Wook Lee
Actor ID: 5374082 , Actor Name: Seong-yeon Park
Actor ID: 2339975 , Actor Name: Park Shin-Hye
Actor ID: 4704907 , Actor Name: Nam Da-reum
Actor ID: 7094545 , Actor Name: Bak Yoon

Closest actors by Lead/Not Accumulative
Actor ID: 1593423 , Actor Name: Tae-woong Eom
Actor ID: 6124994 , Actor Name: Seo Jun Park
Actor ID: 496932 , Actor Name: Lee Byung-hun
Actor ID: 644885 , Actor Name: Ji-Ho Oh
Actor ID: 4798643 , Actor Name: Hee-chul Kim

Closest actors by Avg Rating per Genre
Actor ID: 1868836 , Actor Name: Seul-gi Park
Actor ID: 10893903 , Actor Name: Bae Hae-Sun
Actor ID: 10077115 , Actor Name: Eui-soo Jang
Actor ID: 3241804 , Actor Name: Joon Lee
Actor ID: 2165232 , Actor Name: Yeon-hee Lee

Overall
Actor ID: 1868836 , Actor Name: Seul-gi Park
Actor ID: 3241804 , Actor Name: Joon Lee
Actor ID: 2165232 , Actor Name: Yeon-hee Lee
Actor ID: 1118612 , Actor Name: Dong-wook Kim
Acto

In [7]:
# print("Actor Name:", selectActor)
# print("With K-Nearest Neighbors")

# for i in range(len(descs)):
#     print("\nClosest actors by", descs[i])
#     actor_vec = actors.loc[id_, columns[i]].values.reshape(1, -1)
#     recs = actors.drop(id_, axis = 0)[columns[i]]

#     knn = NearestNeighbors(n_neighbors=K, p=2)
#     knn.fit(recs)

#     idxs = knn.kneighbors(actor_vec, return_distance=False)

#     for i in idxs:
#         print(actors.iloc[i]['actor_name'])

In [8]:
# print("\nClosest actors by Lead/Not per Genre")
# closest_actors = getNeighbors(id_, K, cols_lead)
# for i in closest_actors:
#     cand_id = i[0]
#     cand_name = actorIDs.loc[actorIDs['actor_ID'] == i[0]]['actor_name'].values[0]
#     print("Actor ID:", cand_id, ", Actor Name:", cand_name)

# print("\nClosest actors by Avg Rating per Genre")
# closest_actors = getNeighbors(id_, K, cols_avgrating)
# for i in closest_actors:
#     cand_id = i[0]
#     cand_name = actorIDs.loc[actorIDs['actor_ID'] == i[0]]['actor_name'].values[0]
#     print("Actor ID:", cand_id, ", Actor Name:", cand_name)

# print("\nClosest actors by Averaged Drama Ratings")
# closest_actors = getNeighbors(id_, K,['avgRating'])
# for i in closest_actors:
#     cand_id = i[0]
#     cand_name = actorIDs.loc[actorIDs['actor_ID'] == i[0]]['actor_name'].values[0]
#     print("Actor ID:", cand_id, ", Actor Name:", cand_name)

In [9]:
# actors.loc[actors['actor_name'].str.contains("Kim Seon-Ho"), cols_avgrating] 
# actors1 = actors.set_index('actor_ID')
# actors1 = actors1.drop(['recency_avg', 'total_rating', 'total_drama', 'actor_name'], axis=1)
# actors1.head()

In [10]:
# def recommend_actors(actor_ID, k_recommendations):
#     raw_movies_data = []
#     with open('movies_recommendation_data.csv', 'r') as md:
#         # Discard the first line (headings)
#         next(md)

#         # Read the data into memory
#         for line in md.readlines():
#             data_row = line.strip().split(',')
#             raw_movies_data.append(data_row)

#     actor_vec = actors1.loc[actor_ID]
#     recs = actors1.drop(actor_ID, axis = 0)
    
#     # Use the KNN algorithm to get the 5 movies that are most
#     # similar to The Post.
    
#     >>> from sklearn.neighbors import NearestNeighbors
# >>> knn = NearestNeighbors(n_neighbors=5)
# >>> knn.fit(X)
# NearestNeighbors(algorithm='auto', leaf_size=30, n_neighbors=5, p=2,
#          radius=1.0, warn_on_equidistant=True)
#     recommendation_indices, _ = knn(
#         recs, actor_vec, k=k_recommendations,
#         distance_fn=euclidean_distance, choice_fn=lambda x: None
#     )

#     movie_recommendations = []
#     for _, index in recommendation_indices:
#         movie_recommendations.append(raw_movies_data[index])

#     return movie_recommendations

# if __name__ == '__main__':
#     the_post = [7.2, 1, 1, 0, 0, 0, 0, 1, 0] # feature vector for The Post
#     recommended_movies = recommend_movies(movie_query=the_post, k_recommendations=5)

#     # Print recommended movie titles
#     for recommendation in recommended_movies:
#         print(recommendation[1])