# 1. Introduction

# 2. Data Loading & Cleaning

# 3. Basic Description

# 4. EDA & 

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px

In [None]:
# Load the Indonesian movies dataset from the CSV file into `df`.
df = pd.read_csv('../input/database-film-indonesia/indonesian_movies.csv')
# Preview the first rows of the loaded data.
df.head()

In [None]:
# List the column labels available in the dataset.
df.columns

In [None]:
# Summarise the columns of df. DataFrame.info() prints its report itself and
# returns None, so it is called on its own instead of being wrapped in print()
# (which emitted a stray "None" after the report).
df.info()
print('\n\n')

# Number of missing values in each column.
print(df.isnull().sum())

In [None]:
# Normalise column types so the columns can be analysed numerically.
# Columns are assigned with df[...] rather than attribute access so the write
# always targets the column itself.

# 'votes': drop the comma separators, then cast to integer.
df['votes'] = df['votes'].apply(lambda x: x.replace(',', '')).astype('int')
# 'actors': strip the surrounding brackets and the quotes, then split the
# remaining comma-separated text into a list of names.
df['actors'] = df['actors'].apply(lambda x: x[1:-1].replace("'", "").split(', '))
# 'runtime': remove every non-digit character. regex=True is required: since
# pandas 2.0 the pattern is otherwise treated as a literal string, nothing is
# stripped and the float cast fails.
df['runtime'] = df['runtime'].str.replace(r'\D', '', regex=True).astype('float')
# 'movie_id' is an identifier, so store it as a categorical.
df['movie_id'] = df['movie_id'].astype('category')

In [None]:
# Label every movie with a decade bucket derived from 'year'.
# Each bucket excludes its lower bound and includes its upper bound, e.g.
# 2001..2010 -> '2000-2010'. np.select takes the first matching condition, so
# listing the lower bounds from newest to oldest needs only one "greater than"
# test per bucket. Anything not above 1950 (including a missing year) falls
# through to '<1950'.
# This is vectorised over the whole column, so it works for any number of rows
# and any index instead of assuming exactly 1272 rows labelled 0..1271.
decade_lower_bounds = [
    (2010, '2010-2020'),
    (2000, '2000-2010'),
    (1990, '1990-2000'),
    (1980, '1980-1990'),
    (1970, '1970-1980'),
    (1960, '1960-1970'),
    (1950, '1950-1960'),
]
years = df['year']
df['decade'] = np.select(
    [years > lower for lower, _ in decade_lower_bounds],
    [label for _, label in decade_lower_bounds],
    default='<1950',
)

- Converted the 'votes' and 'runtime' columns to numerical types to enable EDA.
- Converted the 'actors' column into lists by splitting the string values, for later analysis.
- Added a 'decade' variable to group the movies into a more meaningful timeline.

In [None]:
# Fill the missing values in 'directors' with names gathered manually.
# Keys are row labels of df, values are the director names to insert.
missing_directors = {
    137: 'Amar Mukhi',
    222: 'Tema Patrosza',
    426: 'Indra Gunawan',
    1022: 'Steady Rimba',
    1046: 'S.A. Karim',
    1255: 'AN Alcaff',
    1270: 'Joshua Wong',
}
for row_label, director in missing_directors.items():
    # .loc writes straight into df; chained indexing (df.directors[i] = ...)
    # may write to a temporary copy instead. Only genuinely missing entries
    # are filled, so an existing name is never altered.
    if pd.isna(df.loc[row_label, 'directors']):
        df.loc[row_label, 'directors'] = director

In [None]:
# Replace each missing 'runtime' with the mean runtime of the movies that
# share the same 'decade', then confirm how many missing values remain.
decade_mean_runtime = df.groupby('decade')['runtime'].transform('mean')
df['runtime'] = df['runtime'].fillna(decade_mean_runtime)
df['runtime'].isna().sum()

- Some columns of the data contain null values: description, genre, rating, directors, and runtime.
- The 'directors' column has the fewest null values, so we filled them in with information gathered from the internet.
- Because the 'runtime' column holds numerical values, we can fill it with a mean. Each null value is replaced by the mean runtime of all movies from the same decade, because we believe the average film runtime has changed over the decades.
- The remaining columns with null values are ignored for now, because they hold non-numerical values.

In [None]:
# Summary statistics of the dataset's columns.
df.describe()

From a quick description of the data, we learn that:
 - The data covers Indonesian movies made between 1926 and 2020.
 - The median of the 'year' variable is 2011, which means the data is concentrated in the more recent years.<br>

We can continue the Exploratory Data Analysis and look for:
 - The most common movie genres over the years
 - The best movies based on user ratings
 - The most active actors, i.e. those who played in the most movies

In [None]:
# Count the movies of each genre and express every count as a percentage of
# all counted movies. The total is computed once rather than being re-derived
# by a fresh groupby for every genre.
genre_counts = df.groupby(['genre']).size()
df_g = genre_counts.reset_index()
df_g.columns = ['genre', 'counts']
df_g['percentage'] = (100 * genre_counts / genre_counts.sum()).values

# Bar chart of the counts, each bar labelled with its percentage.
fig = px.bar(
    df_g,
    x='genre',
    y='counts',
    color='genre',
    text=df_g['percentage'].apply(lambda x: '{0:1.2f}%'.format(x)),
    title='Indonesian Movies by Genre',
)
fig.show()


# Count the movies of each decade and express every count as a percentage of
# all movies. The total is computed once rather than being re-derived by a
# fresh groupby for every decade.
decade_counts = df.groupby(['decade']).size()
df_g = decade_counts.reset_index()
df_g.columns = ['decade', 'counts']
df_g['percentage'] = (100 * decade_counts / decade_counts.sum()).values

# Bar chart of the counts, each bar labelled with its percentage.
fig = px.bar(
    df_g,
    x='decade',
    y='counts',
    color='decade',
    text=df_g['percentage'].apply(lambda x: '{0:1.2f}%'.format(x)),
    title='Indonesian Movies by Decade',
)
# Force chronological order on the x axis instead of the default ordering.
fig.update_xaxes(categoryorder='array', categoryarray= ['<1950','1950-1960','1960-1970','1970-1980','1980-1990','1990-2000','2000-2010','2010-2020'])
fig.show()

In [None]:
# Show the first rows of the data when ordered by ascending 'year'.
df.sort_values(by='year').head()

# Exploratory Data Analysis
From the EDA we found that:
- The first Indonesian movie ever made was a fantasy movie called 'Loetoeng Kasarung', directed by L. Heuveldorp in 1926.
- Indonesian movies are dominated by the 'Drama' genre.
- The most Indonesian movies were produced in the 2010s.

## Top 5 Indonesian movies
We rank the top 5 Indonesian movies using the formula that IMDb uses for its Top Rated 250 chart:
http://www.imdb.com/chart/top <br>

It is a true Bayesian estimate of the weighted rating (WR):
$$WR = \frac{v}{v+m} \times R + \frac{m}{v+m} \times C$$ where:

- R = average user rating for the movie = (user_rating)
- v = number of votes for the movie = (votes)
- m = minimum votes required to be listed in the Top 250
- C = the mean user rating across the whole report <br>

The minimum number of votes required (m) is the 0.75 quantile of 'votes' over all movies in the data (76). The mean user rating across the whole data (C) is the sum of 'users_rating' × 'votes' over all movies, divided by the total number of votes across the whole data.

In [None]:
print('EDA of votes participated in the data:')
display(df.votes.describe())

# C: mean user rating across the whole data, weighted by each movie's votes.
UR_x_votes = df.users_rating * df.votes
the_mean_user_rating = UR_x_votes.sum() / df.votes.sum()
C = the_mean_user_rating
# m: minimum number of votes required to be ranked (0.75 quantile of 'votes').
m = df.votes.quantile(0.75)

# Keep only the movies that reach the vote threshold. An explicit copy is
# taken so that adding 'w_rating' writes to df_75 itself and not to a slice
# of df (which raises SettingWithCopyWarning and may not persist).
df_75 = df[df.votes >= m].copy()
R = df_75.users_rating  # user rating of each retained movie
v = df_75.votes         # number of votes of each retained movie
# Weighted rating: the movie's own rating pulled towards the global mean C,
# more strongly the fewer votes the movie has.
df_75['w_rating'] = (v / (v + m)) * R + (m / (v + m)) * C
df_75 = df_75.sort_values(by='w_rating', ascending=False).reset_index(drop=True)
df_75.head()

### Top 5 Indonesian Movies are:<br>
#### - A Man Called Ahok (2018) by Putrama Tuta  [8.67]
#### - Jelita Sejuba: Mencintai Kesatria Negara (2018) by Ray Nayoan  [8.61]
#### - Yowis Ben (2018) by Fajar Nugros  [8.37]
#### - The Raid 2 (2014) by Gareth Evans  [8.00]
#### - 27 Steps of May (2019) by Ravi L. Bharwani  [7.96]

## Indonesian Actors and Directors who played and made the most movies
We rank the actors and directors in two ways:
- Actors and directors who participated in the most movies
- Actors and directors who participated in the most successful movies, according to the weighted rating calculated above.

In [None]:
def to_1D(series):
    """Flatten an iterable of iterables into a single flat pandas Series.

    The items of every inner iterable are concatenated in order; the result
    gets a fresh default index.
    """
    flattened = []
    for _list in series:
        flattened.extend(_list)
    return pd.Series(flattened)

In [None]:
def _cast_members(frame):
    """Return every actor credit in *frame* as one flat Series, without 'nan' placeholders."""
    names = to_1D(frame.actors)
    return names[names != 'nan']


def _top10_counts(values, label):
    """Return the 10 most frequent entries of *values* as a two-column frame.

    The columns are *label* (the entry) and 'sum' (how often it occurs),
    ordered from most to least frequent.
    """
    top = values.value_counts().head(10).reset_index()
    top.columns = [label, 'sum']
    return top


# Most prolific actors and directors across the whole dataset.
actors_1d = _cast_members(df)
most5_act = _top10_counts(actors_1d, 'actor')
most5_dir = _top10_counts(df['directors'], 'directors')

display(most5_act)
display(most5_dir)


# The same rankings restricted to the movies in df_75. These are the values
# that remain in actors_1d / most5_act / most5_dir after this cell
# (despite the "most5" names, each table holds up to 10 rows).
actors_1d = _cast_members(df_75)
most5_act = _top10_counts(actors_1d, 'actor')
most5_dir = _top10_counts(df_75['directors'], 'directors')

display(most5_act)
display(most5_dir)

In [None]:
# Average weighted rating of the df_75 movies directed by Hanung Bramantyo.
mask = df_75['directors'] == 'Hanung Bramantyo'
df_75.loc[mask, 'w_rating'].mean()

- The actor who has played in the most movies in Indonesian filmography is Lukman Sardi, with 57 movies in total.
- The director who has made the most movies in Indonesian filmography is Nayato Fio Naula, with 61 movies in total.

- Of the 5 most prolific actors in Indonesian filmography, the highest rated is Reza Rahadian, with an average rating of 6.99.
- Of the 5 most prolific directors in Indonesian filmography, the highest rated is Hanung Bramantyo, with an average movie rating of 6.85.

In [None]:
# For every actor listed in most5_act, average the weighted rating of the
# df_75 movies whose cast contains that actor.
# The actors are iterated directly (no positional indexing), and the
# per-actor mean is kept in its own name so that the vote threshold `m`
# used for the weighted-rating formula is not silently overwritten.
means = []
for actor in most5_act['actor']:
    in_cast = df_75['actors'].apply(lambda cast: actor in cast)
    actor_mean = df_75.loc[in_cast, 'w_rating'].mean()
    means.append(actor_mean)
most5_act['w_rating_average'] = means
most5_act

In [None]:
# Apply the same weighted-rating formula at actor level:
#   v = number of movies per actor, R = the actor's average weighted rating,
#   C = movie-count-weighted mean of R over all listed actors,
#   m = 14, the fixed movie-count threshold used for the shrinkage.
v = most5_act['sum']
R = most5_act['w_rating_average']
m = 14

UR_x_votes = R * v
the_mean_user_rating = UR_x_votes.sum() / v.sum()
C = the_mean_user_rating

own_share = v / (v + m)
prior_share = m / (v + m)
most5_act['w_rating_t'] = own_share * R + prior_share * C

In [None]:
# Show every movie in df whose cast list contains 'Yayu A.W. Unru'.
mask = df['actors'].apply(lambda cast: 'Yayu A.W. Unru' in cast)
df.loc[mask]

In [None]:
# Show the actor table twice: ordered by number of movies, then by the
# actor-level weighted rating (highest first in both cases).
for sort_key in ('sum', 'w_rating_t'):
    display(most5_act.sort_values(by=sort_key, ascending=False))

In [None]:
# For every actor in most5_act, collect the df_75 movies whose cast contains
# that actor and tag each row with the actor's name in 'actor_name'. A movie
# featuring several of these actors appears once per actor.
actor_frames = []
for actor in most5_act['actor']:
    in_cast = df_75['actors'].apply(lambda cast: actor in cast)
    # assign() returns a new frame, so no column is written onto a slice of df_75.
    actor_frames.append(df_75[in_cast].assign(actor_name=actor))

# DataFrame.append was removed in pandas 2.0; build the result with a single
# concat instead. pd.concat rejects an empty list, so fall back to an empty
# frame when there are no actors.
df_n = pd.concat(actor_frames, ignore_index=True) if actor_frames else pd.DataFrame()
df_n


In [None]:
# Number of movies per (actor, year) pair and each pair's share of all rows
# in df_n. The grouping is done once and reused instead of being recomputed
# for every row.
actor_year_counts = df_n.groupby(['actor_name', 'year']).size()
df_g = actor_year_counts.reset_index()
df_g.columns = ['actor_name', 'year', 'counts']
df_g['percentage'] = (100 * actor_year_counts / actor_year_counts.sum()).values
df_g

In [None]:
# Bar chart of the number of movies per year, coloured by actor and labelled
# with the count. The title wording is corrected from "best on" to
# "based on", matching the equivalent chart for directors.
fig = px.bar(
    df_g,
    x='year',
    y='counts',
    color='actor_name',
    text=df_g['counts'].apply(lambda x: '{0:9.0f}'.format(x)),
    title='Best 5 Indonesian Actors based on weighted rating of movies',
)
fig.show()

In [None]:
# Per (actor, year): number of movies and their mean weighted rating.
per_actor_year = df_n.groupby(['actor_name', 'year'])
df_g = per_actor_year.size().reset_index()
df_g['w_rating'] = per_actor_year['w_rating'].mean().reset_index()['w_rating']
df_g.columns = ['actor_name', 'year', 'counts', 'w_rating']

# Bar chart of movie counts per year, coloured by actor; each bar is labelled
# with the mean weighted rating instead of the count.
fig = px.bar(
    df_g,
    x='year',
    y='counts',
    color='actor_name',
    text=df_g['w_rating'].apply('{0:9.2f}'.format),
)
fig.show()

In [None]:
# Collect the df_75 movies of the first five directors listed in most5_dir,
# grouped director by director in that order.
director_frames = []
for director in most5_dir.head()['directors']:
    # The selected rows already carry this director's name, so nothing has to
    # be written back onto the slice.
    director_frames.append(df_75[df_75['directors'] == director])

# DataFrame.append was removed in pandas 2.0; build the result with a single
# concat instead. pd.concat rejects an empty list, so fall back to an empty
# frame when there are no directors.
df_n = pd.concat(director_frames, ignore_index=True) if director_frames else pd.DataFrame()
df_n

In [None]:
# Number of movies per (director, year) pair and each pair's share of all
# rows in df_n. The grouping is done once and reused instead of being
# recomputed for every row.
director_year_counts = df_n.groupby(['directors', 'year']).size()
df_g = director_year_counts.reset_index()
df_g.columns = ['directors', 'year', 'counts']
df_g['percentage'] = (100 * director_year_counts / director_year_counts.sum()).values
df_g.head()

In [None]:
# Bar chart of the number of movies per year, coloured by director and
# labelled with the count.
fig = px.bar(
    df_g,
    x='year',
    y='counts',
    color='directors',
    text=df_g['counts'].apply('{0:9.0f}'.format),
    title='Best 5 Indonesian Directors based on weighted rating of movies',
)
fig.show()

In [None]:
# Per (director, year): number of movies and their mean weighted rating.
per_director_year = df_n.groupby(['directors', 'year'])
df_g = per_director_year.size().reset_index()
df_g['w_rating'] = per_director_year['w_rating'].mean().reset_index()['w_rating']
df_g.columns = ['directors', 'year', 'counts', 'w_rating']

# Bar chart of movie counts per year, coloured by director; each bar is
# labelled with the mean weighted rating instead of the count.
fig = px.bar(
    df_g,
    x='year',
    y='counts',
    color='directors',
    text=df_g['w_rating'].apply('{0:9.2f}'.format),
)
fig.show()

In [None]:
# Movies directed by Angga Dwimas Sasongko that were released in 2020.
# Both conditions are combined into one mask over df; indexing the
# already-filtered frame with a full-length mask (df[a][b]) makes pandas
# reindex the mask and emit a warning.
mask = (df['directors'] == 'Angga Dwimas Sasongko') & (df['year'] == 2020)
df[mask]

- The movie with the highest w_rating in a given year shows which film people were most excited about that year. For example, in 2000 it was Riri Riza's Petualangan Sherina; in 2005 it was Riri Riza's Gie and Hanung Bramantyo's Catatan Akhir Sekolah.
- In 2016 people were most into movies, because that year had both many movies and a high average w_rating.