In [2]:
import pandas as pd

In [3]:
df1 = pd.read_csv('data/movie/tmdb_5000_credits.csv')
df2 = pd.read_csv('data/movie/tmdb_5000_movies.csv')

In [4]:
df1.rename(columns={'movie_id':'id'}, inplace=True)
df1.columns

Index(['id', 'title', 'cast', 'crew'], dtype='object')

In [5]:
df_temp = df1[['id', 'cast', 'crew']]
df_temp.columns

Index(['id', 'cast', 'crew'], dtype='object')

In [6]:
df = df2.merge(df_temp, on='id')
df.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count', 'cast', 'crew'],
      dtype='object')

In [7]:
df['overview'].head(5)

0    In the 22nd century, a paraplegic Marine is di...
1    Captain Barbossa, long believed to be dead, ha...
2    A cryptic message from Bond’s past sends him o...
3    Following the death of District Attorney Harve...
4    John Carter is a war-weary, former military ca...
Name: overview, dtype: object

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
Tfidf = TfidfVectorizer(stop_words='english')

In [9]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
ENGLISH_STOP_WORDS

frozenset({'a',
           'about',
           'above',
           'across',
           'after',
           'afterwards',
           'again',
           'against',
           'all',
           'almost',
           'alone',
           'along',
           'already',
           'also',
           'although',
           'always',
           'am',
           'among',
           'amongst',
           'amoungst',
           'amount',
           'an',
           'and',
           'another',
           'any',
           'anyhow',
           'anyone',
           'anything',
           'anyway',
           'anywhere',
           'are',
           'around',
           'as',
           'at',
           'back',
           'be',
           'became',
           'because',
           'become',
           'becomes',
           'becoming',
           'been',
           'before',
           'beforehand',
           'behind',
           'being',
           'below',
           'beside',
           'besides'

In [10]:
df['overview'].isnull().any()

np.True_

In [11]:
df['overview'].isnull().sum()

np.int64(3)

In [12]:
df['overview'].fillna('' , inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['overview'].fillna('' , inplace=True)


In [13]:
df['overview'].isnull().any()

np.False_

In [14]:
df['overview'].isnull().sum()

np.int64(0)

In [15]:
len(df['overview'])

4803

In [16]:
#BOW Bag of Words 생성
Tfidf_matrix = Tfidf.fit_transform(df['overview'])

In [17]:
Tfidf_matrix.shape

(4803, 20978)

In [18]:
#문장 유사도
from sklearn.metrics.pairwise import linear_kernel
cosine_sim = linear_kernel(Tfidf_matrix, Tfidf_matrix)

In [19]:
cosine_sim.shape

(4803, 4803)

In [20]:
cosine_sim

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.02160533, 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.01488159, 0.        ,
        0.        ],
       ...,
       [0.        , 0.02160533, 0.01488159, ..., 1.        , 0.01609091,
        0.00701914],
       [0.        , 0.        , 0.        , ..., 0.01609091, 1.        ,
        0.01171696],
       [0.        , 0.        , 0.        , ..., 0.00701914, 0.01171696,
        1.        ]], shape=(4803, 4803))

In [21]:
df['title'].head()

0                                      Avatar
1    Pirates of the Caribbean: At World's End
2                                     Spectre
3                       The Dark Knight Rises
4                                 John Carter
Name: title, dtype: object

In [22]:
title ='Avatar'
df[df['title']=='Avatar'].index

Index([0], dtype='int64')

In [23]:
title ='Avatar'
idx = df[df['title']==title].index[0]
print(idx)

0


In [24]:
title ='John Carter'
idx = df[df['title']==title].index[0]
print(idx)

4


In [25]:
title ='John Carter'
idx = df[df['title']==title].index[0]
df.iloc[[idx]]

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]",...,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [26]:
cosine_sim[4]

array([0.        , 0.03336868, 0.        , ..., 0.00612609, 0.        ,
       0.        ], shape=(4803,))

In [27]:
test_cosine_sim = list(enumerate(cosine_sim[4]))
test_cosine_sim

[(0, np.float64(0.0)),
 (1, np.float64(0.033368675996424305)),
 (2, np.float64(0.0)),
 (3, np.float64(0.010433403719159351)),
 (4, np.float64(0.9999999999999998)),
 (5, np.float64(0.0)),
 (6, np.float64(0.009339192776152496)),
 (7, np.float64(0.037407042075763064)),
 (8, np.float64(0.0)),
 (9, np.float64(0.01714819056424443)),
 (10, np.float64(0.02015372184154536)),
 (11, np.float64(0.0)),
 (12, np.float64(0.03178129553727139)),
 (13, np.float64(0.0)),
 (14, np.float64(0.010712515993695904)),
 (15, np.float64(0.0)),
 (16, np.float64(0.045257098111540495)),
 (17, np.float64(0.020358841382635908)),
 (18, np.float64(0.016907757517459005)),
 (19, np.float64(0.016254574057966483)),
 (20, np.float64(0.00914487577692374)),
 (21, np.float64(0.01727519014981104)),
 (22, np.float64(0.0)),
 (23, np.float64(0.0)),
 (24, np.float64(0.01651050909357743)),
 (25, np.float64(0.0)),
 (26, np.float64(0.0538746515929863)),
 (27, np.float64(0.05231039324590807)),
 (28, np.float64(0.025467489988822038)),
 (

In [28]:
test_cosine_sim = sorted(test_cosine_sim, key=lambda x:x[1], reverse=True)
test_cosine_sim[:11]

[(4, np.float64(0.9999999999999998)),
 (1254, np.float64(0.20497253140891997)),
 (4161, np.float64(0.16370347641323713)),
 (2932, np.float64(0.12239400129620456)),
 (3349, np.float64(0.11887151040572358)),
 (1307, np.float64(0.11468689553107403)),
 (3068, np.float64(0.11237609168095009)),
 (345, np.float64(0.09079218287485014)),
 (581, np.float64(0.0900594125581888)),
 (2998, np.float64(0.08877535757173034)),
 (4274, np.float64(0.08576462030578072))]

In [29]:
test_cosine_sim = test_cosine_sim[1:11]
index = [i[0] for i in test_cosine_sim]
index

[1254, 4161, 2932, 3349, 1307, 3068, 345, 581, 2998, 4274]

In [30]:
df.loc[index, 'title']

1254                          Get Carter
4161         The Marine 4: Moving Target
2932                        Raising Cain
3349                           Desperado
1307                       The Hurricane
3068                         Rescue Dawn
345                          Rush Hour 2
581              Star Trek: Insurrection
2998                               Devil
4274    Eddie: The Sleepwalking Cannibal
Name: title, dtype: object

In [31]:
def def_cosine_sim():
    #BOW Bag of words 생성
    from sklearn.feature_extraction.text import TfidfVectorizer
    Tfidf = TfidfVectorizer(stop_words='english')
    df = pd.read_csv('data/movie/tmdb_5000_movies.csv')

    df.fillna({'overview':''}, inplace=True)
    Tfidf_matrix = Tfidf.fit_transform(df['overview'])

    #문장 유사도
    from sklearn.metrics.pairwise import linear_kernel
    cosine_sim = linear_kernel(Tfidf_matrix, Tfidf_matrix)
    return cosine_sim

In [32]:
sim = def_cosine_sim()
sim.shape

(4803, 4803)

In [33]:
df[['title']].head()

Unnamed: 0,title
0,Avatar
1,Pirates of the Caribbean: At World's End
2,Spectre
3,The Dark Knight Rises
4,John Carter


In [34]:
title = 'Avatar'
idx = df[df['title']==title].index[0]
print(idx)

0


In [35]:
def recommend(title):
    import pickle
    df = pd.read_csv('data/movie/tmdb_5000_movies.csv')
    idx = df[df['title']==title].index[0]
    
    cosine_sim = pickle.load(open('data/movie/cosine_sim.pickle', 'rb'))
    sim = cosine_sim[idx]
    
    sim = list(enumerate(sim))
    sim = sorted(sim, key=lambda x:x[1], reverse=True)
    sim = sim[1:11]
    index = [x[0] for x in sim]
    return index

In [36]:
index = recommend('Avatar')
index

[3604, 2130, 634, 1341, 529, 1610, 311, 847, 775, 2628]

In [37]:
idx = recommend('Batman Forever')
df.loc[idx, 'title']

3                         The Dark Knight Rises
119                               Batman Begins
65                              The Dark Knight
428                              Batman Returns
210                              Batman & Robin
3854    Batman: The Dark Knight Returns, Part 2
1359                                     Batman
4343                                   Cry_Wolf
174                         The Incredible Hulk
9            Batman v Superman: Dawn of Justice
Name: title, dtype: object

In [38]:
cosine_sim = def_cosine_sim()
cosine_sim[:3]

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.02160533, 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.01488159, 0.        ,
        0.        ]], shape=(3, 4803))

In [39]:
import pickle
pickle.dump(cosine_sim, open('data/movie/cosine_sim.pickle', 'wb'))

In [40]:
merge = df.copy()
merge.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count', 'cast', 'crew'],
      dtype='object')

In [41]:
#장르, 키워드, 주연배우, 감독
merge.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count', 'cast', 'crew'],
      dtype='object')

In [42]:
from ast import literal_eval
merge['genres'] = merge['genres'].apply(literal_eval)

In [43]:
g = merge.loc[0, 'genres']
type(g)

list

In [44]:
cols = ['cast', 'crew', 'keywords']
for col in cols:
    merge[col] = merge[col].apply(literal_eval)

In [45]:
import numpy as np
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [46]:
get_director(merge.loc[0, 'crew'])

'James Cameron'

In [47]:
merge['director'] = merge['crew'].apply(get_director)

In [48]:
merge['director'].head()

0        James Cameron
1       Gore Verbinski
2           Sam Mendes
3    Christopher Nolan
4       Andrew Stanton
Name: director, dtype: object

In [49]:
merge['director'].isnull().sum()

np.int64(30)

In [50]:
merge.fillna({'director':''}, inplace=True)

In [51]:
merge['director'].isnull().sum()

np.int64(0)

In [52]:
merge.loc[0, 'cast']

[{'cast_id': 242,
  'character': 'Jake Sully',
  'credit_id': '5602a8a7c3a3685532001c9a',
  'gender': 2,
  'id': 65731,
  'name': 'Sam Worthington',
  'order': 0},
 {'cast_id': 3,
  'character': 'Neytiri',
  'credit_id': '52fe48009251416c750ac9cb',
  'gender': 1,
  'id': 8691,
  'name': 'Zoe Saldana',
  'order': 1},
 {'cast_id': 25,
  'character': 'Dr. Grace Augustine',
  'credit_id': '52fe48009251416c750aca39',
  'gender': 1,
  'id': 10205,
  'name': 'Sigourney Weaver',
  'order': 2},
 {'cast_id': 4,
  'character': 'Col. Quaritch',
  'credit_id': '52fe48009251416c750ac9cf',
  'gender': 2,
  'id': 32747,
  'name': 'Stephen Lang',
  'order': 3},
 {'cast_id': 5,
  'character': 'Trudy Chacon',
  'credit_id': '52fe48009251416c750ac9d3',
  'gender': 1,
  'id': 17647,
  'name': 'Michelle Rodriguez',
  'order': 4},
 {'cast_id': 8,
  'character': 'Selfridge',
  'credit_id': '52fe48009251416c750ac9e1',
  'gender': 2,
  'id': 1771,
  'name': 'Giovanni Ribisi',
  'order': 5},
 {'cast_id': 7,
  'c

In [53]:
def get_list(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]
        if len(names)>3:
            names = names[:3]
        return names
    return []

In [54]:
cols = ['cast', 'keywords', 'genres']
for col in cols:
    merge[col] = merge[col].apply(get_list)

In [55]:
merge[['title', 'director', 'cast', 'keywords', 'genres']].head()

Unnamed: 0,title,director,cast,keywords,genres
0,Avatar,James Cameron,"[Sam Worthington, Zoe Saldana, Sigourney Weaver]","[culture clash, future, space war]","[Action, Adventure, Fantasy]"
1,Pirates of the Caribbean: At World's End,Gore Verbinski,"[Johnny Depp, Orlando Bloom, Keira Knightley]","[ocean, drug abuse, exotic island]","[Adventure, Fantasy, Action]"
2,Spectre,Sam Mendes,"[Daniel Craig, Christoph Waltz, Léa Seydoux]","[spy, based on novel, secret agent]","[Action, Adventure, Crime]"
3,The Dark Knight Rises,Christopher Nolan,"[Christian Bale, Michael Caine, Gary Oldman]","[dc comics, crime fighter, terrorist]","[Action, Crime, Drama]"
4,John Carter,Andrew Stanton,"[Taylor Kitsch, Lynn Collins, Samantha Morton]","[based on novel, mars, medallion]","[Action, Adventure, Science Fiction]"


In [56]:
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i).replace(' ', '') for i in x]
    else:
        if isinstance(x, str):
            return str.lower(x).replace(' ', '')
        else: 
            return ''

In [57]:
cols = ['director', 'cast', 'keywords', 'genres']
for col in cols:
    merge[col] = merge[col].apply(clean_data)

In [58]:
merge[['title', 'director', 'cast', 'keywords', 'genres']].head()

Unnamed: 0,title,director,cast,keywords,genres
0,Avatar,jamescameron,"[samworthington, zoesaldana, sigourneyweaver]","[cultureclash, future, spacewar]","[action, adventure, fantasy]"
1,Pirates of the Caribbean: At World's End,goreverbinski,"[johnnydepp, orlandobloom, keiraknightley]","[ocean, drugabuse, exoticisland]","[adventure, fantasy, action]"
2,Spectre,sammendes,"[danielcraig, christophwaltz, léaseydoux]","[spy, basedonnovel, secretagent]","[action, adventure, crime]"
3,The Dark Knight Rises,christophernolan,"[christianbale, michaelcaine, garyoldman]","[dccomics, crimefighter, terrorist]","[action, crime, drama]"
4,John Carter,andrewstanton,"[taylorkitsch, lynncollins, samanthamorton]","[basedonnovel, mars, medallion]","[action, adventure, sciencefiction]"


In [59]:
def create_soup(x):
    str = x['director'] + ' '
    str += ' '.join(x['keywords']) + ' '
    str += ' '.join(x['cast']) + ' '
    str += ' '.join(x['genres']) + ' '
    return str

In [60]:
merge['soup'] = merge.apply(create_soup, axis=1)

In [61]:
merge['soup'].head()

0    jamescameron cultureclash future spacewar samw...
1    goreverbinski ocean drugabuse exoticisland joh...
2    sammendes spy basedonnovel secretagent danielc...
3    christophernolan dccomics crimefighter terrori...
4    andrewstanton basedonnovel mars medallion tayl...
Name: soup, dtype: object

In [62]:
merge.to_csv('data/movie/movies.csv', index=False)

In [63]:
merge['overview'].head()

0    In the 22nd century, a paraplegic Marine is di...
1    Captain Barbossa, long believed to be dead, ha...
2    A cryptic message from Bond’s past sends him o...
3    Following the death of District Attorney Harve...
4    John Carter is a war-weary, former military ca...
Name: overview, dtype: object

In [64]:
df_movie = pd.read_csv('data/movie/movies.csv')
df_movie['soup']

0       jamescameron cultureclash future spacewar samw...
1       goreverbinski ocean drugabuse exoticisland joh...
2       sammendes spy basedonnovel secretagent danielc...
3       christophernolan dccomics crimefighter terrori...
4       andrewstanton basedonnovel mars medallion tayl...
                              ...                        
4798    robertrodriguez unitedstates–mexicobarrier leg...
4799    edwardburns  edwardburns kerrybishé marshadiet...
4800    scottsmith date loveatfirstsight narration eri...
4801     danielhsia  danielhenney elizacoupe billpaxton  
4802    brianherzlinger obsession camcorder crush drew...
Name: soup, Length: 4803, dtype: object

In [65]:
def def_cosine_sim2():
    #BOW Bag of words 생성
    from sklearn.feature_extraction.text import CountVectorizer
    counter = CountVectorizer(stop_words='english')

    df = pd.read_csv('data/movie/movies.csv')
    df.fillna({'soup':''}, inplace=True)
    counter_matrix = counter.fit_transform(df['soup'])

    #문장 유사도
    from sklearn.metrics.pairwise import linear_kernel
    cosine_sim = linear_kernel(counter_matrix, counter_matrix)
    return cosine_sim

In [66]:
import pickle
pickle.dump(cosine_sim, open('data/movie/cosine_sim.pickle', 'wb'))

In [67]:
cosine_sim2 = def_cosine_sim2()
pickle.dump(cosine_sim2, open('data/movie/cosine_sim2.pickle', 'wb'))

In [68]:
def recommend2(title):
    import pickle
    df = pd.read_csv('data/movie/tmdb_5000_movies.csv')
    idx = df[df['title']==title].index[0]
    
    cosine_sim = pickle.load(open('data/movie/cosine_sim2.pickle', 'rb'))
    sim = cosine_sim[idx]
    
    sim = list(enumerate(sim))
    sim = sorted(sim, key=lambda x:x[1], reverse=True)
    sim = sim[1:11]
    index = [x[0] for x in sim]
    return index

In [69]:
index = recommend2('Superman Returns')
df.loc[index, 'title']

14                                  Man of Steel
813                                     Superman
870                                  Superman II
9             Batman v Superman: Dawn of Justice
46                    X-Men: Days of Future Past
1139                           The Warrior's Way
1296                                Superman III
2433            Superman IV: The Quest for Peace
0                                         Avatar
1       Pirates of the Caribbean: At World's End
Name: title, dtype: object

In [70]:
df = pd.read_csv('data/movie/movies.csv')
df.loc[0, ['id', 'title']]

id        19995
title    Avatar
Name: 0, dtype: object

In [77]:
from tmdbv3api import Movie, TMDb
tmdb = TMDb()
tmdb.api_key='c668cda4cf75bf267ef2aeffa2da0341'
tmdb.language='ko-KR'
movie = Movie()

details = movie.details('19995')
title = details['title']
poster = 'https://image.tmdb.org/t/p/w500' + details['poster_path']
overview = details['overview']
print(title)
print(poster)
print(overview)

아바타
https://image.tmdb.org/t/p/w500/m5lCha2XcbDowDoYHPc0DTNaCPU.jpg
가까운 미래, 지구는 에너지 고갈 문제를 해결하기 위해 머나먼 행성 판도라에서 대체 자원을 채굴하기 시작한다. 하지만 판도라의 독성을 지닌 대기로 인해 자원 획득에 어려움을 겪게 된 인류는 판도라의 토착민 나비의 외형에 인간의 의식을 주입, 원격 조종이 가능한 새로운 생명체를 탄생시키는 프로그램을 개발한다. 한편 하반신이 마비된 전직 해병대원 제이크 설리는 아바타 프로그램에 참가할 것을 제안받는다. 그 곳에서 자신의 아바타를 통해 자유롭게 걸을 수 있게 된 제이크는 자원 채굴을 막으려는 나비의 무리에 침투하라는 임무를 부여받는데...
