# Extracting features of 2020 movies from Wikipedia

In [17]:
import pandas as pd
import numpy as np
import requests
import bs4 as bs
import urllib.request

In [18]:
link="https://en.wikipedia.org/wiki/List_of_American_films_of_2020"

In [19]:
source = urllib.request.urlopen(link).read()
soup=bs.BeautifulSoup(source,'lxml')

In [20]:
table=soup.find_all('table',class_='wikitable sortable')

In [21]:
len(table)

4

In [22]:
type(table[0])

bs4.element.Tag

In [23]:
df1=pd.read_html(str(table[0]))[0]
df2=pd.read_html(str(table[1]))[0]
df3=pd.read_html(str(table[2]))[0]
df4=pd.read_html(str(table[3]).replace("'1\"\'",'"1"'))[0]
# avoided "ValueError: invalid literal for int() with base 10: '1"'

In [24]:
df=df1.append(df2.append(df3.append(df4,ignore_index=True),ignore_index=True),ignore_index=True)
df

Unnamed: 0,Opening,Opening.1,Title,Production company,Cast and crew,Ref.
0,JANUARY,3,The Grudge,Screen Gems / Stage 6 Films / Ghost House Pict...,Nicolas Pesce (director/screenplay); Andrea Ri...,[2]
1,JANUARY,10,Underwater,20th Century Fox / TSG Entertainment / Chernin...,"William Eubank (director); Brian Duffield, Ada...",[3]
2,JANUARY,10,Like a Boss,Paramount Pictures,"Miguel Arteta (director); Sam Pitman, Adam Col...",[4]
3,JANUARY,10,Inherit the Viper,Barry Films / Tycor International Film Company,Anthony Jerjen (director); Andrew Crabtree (sc...,[5]
4,JANUARY,10,The Sonata,Screen Media Films,Andrew Desmond (director/screenplay); Arthur M...,[6]
...,...,...,...,...,...,...
225,DECEMBER,25,One Night in Miami,Amazon Studios,Regina King (director); Kemp Powers (screenpla...,[218]
226,DECEMBER,25,Promising Young Woman,Focus Features / FilmNation Entertainment,Emerald Fennell (director/screenplay); Carey M...,[219]
227,DECEMBER,25,Sylvie's Love,Amazon Studios,Eugene Ashe (director/screenplay); Tessa Thomp...,[220]
228,DECEMBER,30,Monster Hunter,Screen Gems / Constantin Film / Tencent Pictur...,Paul W. S. Anderson (director/screenplay); Mil...,[221]


In [25]:
df_2020 = df[['Title','Cast and crew']]

In [26]:
from tmdbv3api import TMDb
import json
import requests
tmdb = TMDb()
tmdb.api_key = '113096a0c7058d06583090da3f850090'

In [27]:
from tmdbv3api import Movie
tmdb_movie = Movie()
def get_genre(x):
    genres = []
    result = tmdb_movie.search(x)
    try:
        movie_id = result[0].id
    except Exception as e:
        return np.NaN
    
    response = requests.get('https://api.themoviedb.org/3/movie/{}?api_key={}'.format(movie_id,tmdb.api_key))
    data_json = response.json()
    if data_json['genres']:
        genre_str = " " 
        for i in range(0,len(data_json['genres'])):
            genres.append(data_json['genres'][i]['name'])
        return genre_str.join(genres)
    else:
        return np.NaN

In [28]:
df_2020['genres'] = df_2020['Title'].map(lambda x: get_genre(str(x)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2020['genres'] = df_2020['Title'].map(lambda x: get_genre(str(x)))


In [29]:
df_2020

Unnamed: 0,Title,Cast and crew,genres
0,The Grudge,Nicolas Pesce (director/screenplay); Andrea Ri...,Horror Mystery Thriller
1,Underwater,"William Eubank (director); Brian Duffield, Ada...",Action Horror Science Fiction Thriller
2,Like a Boss,"Miguel Arteta (director); Sam Pitman, Adam Col...",Comedy
3,Inherit the Viper,Anthony Jerjen (director); Andrew Crabtree (sc...,Drama Thriller Crime
4,The Sonata,Andrew Desmond (director/screenplay); Arthur M...,Horror Thriller Mystery
...,...,...,...
225,One Night in Miami,Regina King (director); Kemp Powers (screenpla...,Drama
226,Promising Young Woman,Emerald Fennell (director/screenplay); Carey M...,Thriller Crime Drama
227,Sylvie's Love,Eugene Ashe (director/screenplay); Tessa Thomp...,Drama
228,Monster Hunter,Paul W. S. Anderson (director/screenplay); Mil...,Fantasy Action Adventure


In [30]:
def get_director(x):
    if " (director)" in x:
        return x.split(" (director)")[0]
    elif " (directors)" in x:
        return x.split(" (directors)")[0]
    else:
        return x.split(" (director/screenplay)")[0]

In [31]:
df_2020['director_name'] = df_2020['Cast and crew'].map(lambda x: get_director(str(x)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2020['director_name'] = df_2020['Cast and crew'].map(lambda x: get_director(str(x)))


In [32]:
def get_actor1(x):
    return ((x.split("screenplay); ")[-1]).split(", ")[0])

In [33]:
df_2020['actor_1_name'] = df_2020['Cast and crew'].map(lambda x: get_actor1(str(x)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2020['actor_1_name'] = df_2020['Cast and crew'].map(lambda x: get_actor1(str(x)))


In [34]:
def get_actor2(x):
    if len((x.split("screenplay); ")[-1]).split(", ")) < 2:
        return np.NaN
    else:
        return ((x.split("screenplay); ")[-1]).split(", ")[1])

In [35]:
df_2020['actor_2_name'] = df_2020['Cast and crew'].map(lambda x: get_actor2(str(x)))

In [36]:
def get_actor3(x):
    if len((x.split("screenplay); ")[-1]).split(", ")) < 3:
        return np.NaN
    else:
        return ((x.split("screenplay); ")[-1]).split(", ")[2])

In [37]:
df_2020['actor_3_name'] = df_2020['Cast and crew'].map(lambda x: get_actor3(str(x)))

In [38]:
df_2020

Unnamed: 0,Title,Cast and crew,genres,director_name,actor_1_name,actor_2_name,actor_3_name
0,The Grudge,Nicolas Pesce (director/screenplay); Andrea Ri...,Horror Mystery Thriller,Nicolas Pesce,Andrea Riseborough,Demián Bichir,John Cho
1,Underwater,"William Eubank (director); Brian Duffield, Ada...",Action Horror Science Fiction Thriller,William Eubank,Kristen Stewart,Vincent Cassel,Jessica Henwick
2,Like a Boss,"Miguel Arteta (director); Sam Pitman, Adam Col...",Comedy,Miguel Arteta,Tiffany Haddish,Rose Byrne,Salma Hayek
3,Inherit the Viper,Anthony Jerjen (director); Andrew Crabtree (sc...,Drama Thriller Crime,Anthony Jerjen,Josh Hartnett,Margarita Levieva,Chandler Riggs
4,The Sonata,Andrew Desmond (director/screenplay); Arthur M...,Horror Thriller Mystery,Andrew Desmond,Freya Tingley,Simon Abkarian,Rutger Hauer
...,...,...,...,...,...,...,...
225,One Night in Miami,Regina King (director); Kemp Powers (screenpla...,Drama,Regina King,Kingsley Ben-Adir,Eli Goree,Aldis Hodge
226,Promising Young Woman,Emerald Fennell (director/screenplay); Carey M...,Thriller Crime Drama,Emerald Fennell,Carey Mulligan,Bo Burnham,Alison Brie
227,Sylvie's Love,Eugene Ashe (director/screenplay); Tessa Thomp...,Drama,Eugene Ashe,Tessa Thompson,Nnamdi Asomugha,Ryan Michelle Bathe
228,Monster Hunter,Paul W. S. Anderson (director/screenplay); Mil...,Fantasy Action Adventure,Paul W. S. Anderson,Milla Jovovich,Tony Jaa,"Tip ""T.I."" Harris"


In [39]:
df_2020 = df_2020.rename(columns={'Title':'movie_title'})

In [40]:
new_df20 = df_2020.loc[:,['director_name','actor_1_name','actor_2_name','actor_3_name','genres','movie_title']]

In [41]:
new_df20['actor_2_name'] = new_df20['actor_2_name'].replace(np.nan, 'unknown')
new_df20['actor_3_name'] = new_df20['actor_3_name'].replace(np.nan, 'unknown')
new_df20['genres'] = new_df20['genres'].replace(np.nan, 'unknown')

In [42]:
new_df20

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title
0,Nicolas Pesce,Andrea Riseborough,Demián Bichir,John Cho,Horror Mystery Thriller,The Grudge
1,William Eubank,Kristen Stewart,Vincent Cassel,Jessica Henwick,Action Horror Science Fiction Thriller,Underwater
2,Miguel Arteta,Tiffany Haddish,Rose Byrne,Salma Hayek,Comedy,Like a Boss
3,Anthony Jerjen,Josh Hartnett,Margarita Levieva,Chandler Riggs,Drama Thriller Crime,Inherit the Viper
4,Andrew Desmond,Freya Tingley,Simon Abkarian,Rutger Hauer,Horror Thriller Mystery,The Sonata
...,...,...,...,...,...,...
225,Regina King,Kingsley Ben-Adir,Eli Goree,Aldis Hodge,Drama,One Night in Miami
226,Emerald Fennell,Carey Mulligan,Bo Burnham,Alison Brie,Thriller Crime Drama,Promising Young Woman
227,Eugene Ashe,Tessa Thompson,Nnamdi Asomugha,Ryan Michelle Bathe,Drama,Sylvie's Love
228,Paul W. S. Anderson,Milla Jovovich,Tony Jaa,"Tip ""T.I."" Harris",Fantasy Action Adventure,Monster Hunter


In [43]:
new_df20['comb'] = new_df20['actor_1_name'] + ' ' \
                    + new_df20['actor_2_name'] + ' '\
                    + new_df20['actor_3_name'] + ' '+ new_df20['director_name'] +' ' + new_df20['genres']

In [44]:
new_df20.isna().sum()

director_name    0
actor_1_name     0
actor_2_name     0
actor_3_name     0
genres           0
movie_title      0
comb             0
dtype: int64

In [45]:
old_df = pd.read_csv('../clean data/final_data.csv')

In [46]:
final_df = old_df.append(new_df20,ignore_index=True)

In [47]:
final_df.isna().sum()

director_name    0
actor_1_name     0
actor_2_name     0
actor_3_name     0
genres           0
movie_title      0
comb             0
dtype: int64

In [48]:
final_df.to_csv('../clean data/main_data.csv',index=False)