**Import Libraries:**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

**Upload the file**

In [None]:
# Load the movie metadata CSV (absolute path; point it at your own copy if the file lives elsewhere).
df_movies=pd.read_csv("/content/movie_metadata.csv")
# Keep an independent snapshot of the data as loaded. A plain `org_data=df_movies`
# only aliases the same object, so any in-place edit of df_movies would also change
# the baseline used later to compute the percentage of retained rows.
org_data=df_movies.copy()

In [None]:
df_movies.iloc[:2]   # Preview the first two records

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0


In [None]:
df_movies.shape   # (number of rows, number of columns) of the dataframe as loaded

(5043, 28)

In [None]:
df_movies.info()   # Print the per-column non-null counts and dtypes of the dataset

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5043 entries, 0 to 5042
Data columns (total 28 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   color                      5024 non-null   object 
 1   director_name              4939 non-null   object 
 2   num_critic_for_reviews     4993 non-null   float64
 3   duration                   5028 non-null   float64
 4   director_facebook_likes    4939 non-null   float64
 5   actor_3_facebook_likes     5020 non-null   float64
 6   actor_2_name               5030 non-null   object 
 7   actor_1_facebook_likes     5036 non-null   float64
 8   gross                      4159 non-null   float64
 9   genres                     5043 non-null   object 
 10  actor_1_name               5036 non-null   object 
 11  movie_title                5043 non-null   object 
 12  num_voted_users            5043 non-null   int64  
 13  cast_total_facebook_likes  5043 non-null   int64

**Cleaning Data**

In [None]:
# Number of missing values in each column, largest first
df_movies.isna().sum().sort_values(ascending=False)

gross                        884
budget                       492
aspect_ratio                 329
content_rating               303
plot_keywords                153
title_year                   108
director_name                104
director_facebook_likes      104
num_critic_for_reviews        50
actor_3_name                  23
actor_3_facebook_likes        23
num_user_for_reviews          21
color                         19
duration                      15
facenumber_in_poster          13
actor_2_name                  13
actor_2_facebook_likes        13
language                      12
actor_1_name                   7
actor_1_facebook_likes         7
country                        5
cast_total_facebook_likes      0
num_voted_users                0
movie_title                    0
movie_imdb_link                0
genres                         0
imdb_score                     0
movie_facebook_likes           0
dtype: int64

In [None]:
# Number of missing values in each row, largest first
df_movies.isna().sum(axis='columns').sort_values(ascending=False)

279     15
4       14
4945    11
2241    11
2342    10
        ..
1703     0
1702     0
1701     0
1700     0
5042     0
Length: 5043, dtype: int64

In [None]:
# Percentage of missing values in each column, largest first
df_movies.isna().sum().sort_values(ascending=False).div(len(df_movies)).mul(100)

gross                        17.529248
budget                        9.756098
aspect_ratio                  6.523895
content_rating                6.008328
plot_keywords                 3.033908
title_year                    2.141582
director_name                 2.062265
director_facebook_likes       2.062265
num_critic_for_reviews        0.991473
actor_3_name                  0.456078
actor_3_facebook_likes        0.456078
num_user_for_reviews          0.416419
color                         0.376760
duration                      0.297442
facenumber_in_poster          0.257783
actor_2_name                  0.257783
actor_2_facebook_likes        0.257783
language                      0.237954
actor_1_name                  0.138806
actor_1_facebook_likes        0.138806
country                       0.099147
cast_total_facebook_likes     0.000000
num_voted_users               0.000000
movie_title                   0.000000
movie_imdb_link               0.000000
genres                   

### **Drop the unnecessary columns**

In [None]:
# Remove the columns that are not used in the rest of this notebook.
df_movies = df_movies.drop(
    columns=[
        'color',
        'director_facebook_likes',
        'actor_3_facebook_likes',
        'actor_1_facebook_likes',
        'actor_2_facebook_likes',
        'cast_total_facebook_likes',
        'facenumber_in_poster',
        'content_rating',
        'plot_keywords',
        'actor_2_name',
        'actor_3_name',
        'duration',
        'aspect_ratio',
        'movie_imdb_link',
        'country',
    ]
)

In [None]:
df_movies.shape   # (rows, columns) after the column drop

(5043, 13)

### **Drop unnecessary rows**

In [None]:
# Percentage of missing values per remaining column (2 decimals, largest first);
# used to decide which rows to drop.
df_movies.isna().sum().sort_values(ascending=False).div(len(df_movies)).mul(100).round(2)

gross                     17.53
budget                     9.76
title_year                 2.14
director_name              2.06
num_critic_for_reviews     0.99
num_user_for_reviews       0.42
language                   0.24
actor_1_name               0.14
genres                     0.00
movie_title                0.00
num_voted_users            0.00
imdb_score                 0.00
movie_facebook_likes       0.00
dtype: float64

In [None]:
# Keep only the rows in which both gross and budget are present
df_movies = df_movies.loc[df_movies['gross'].notna() & df_movies['budget'].notna()]


In [None]:
# Re-check the percentage of missing values per column after removing rows
df_movies.isna().sum().sort_values(ascending=False).div(len(df_movies)).mul(100).round(2)

actor_1_name              0.08
language                  0.08
num_critic_for_reviews    0.03
director_name             0.00
gross                     0.00
genres                    0.00
movie_title               0.00
num_voted_users           0.00
num_user_for_reviews      0.00
budget                    0.00
title_year                0.00
imdb_score                0.00
movie_facebook_likes      0.00
dtype: float64

In [None]:
# Number of missing values left in each row, largest first
df_movies.isna().sum(axis='columns').sort_values(ascending=False)

4502    1
4720    1
4110    1
4837    1
4711    1
       ..
1375    0
1376    0
1377    0
1378    0
5042    0
Length: 3891, dtype: int64

In [None]:
# How many rows have more than 5 missing values
df_movies.isna().sum(axis=1).gt(5).sum()

0

In [None]:
# Keep only the rows that have at most 5 missing values.
# The mask is built without sort_values so that its index order matches df_movies;
# a re-ordered boolean mask forces pandas to reindex it and emit a UserWarning.
df_movies=df_movies[df_movies.isnull().sum(axis=1)<=5]

  df_movies=df_movies[df_movies.isnull().sum(axis=1).sort_values(ascending=False)<=5]


### **Fill NAN Values**

In [None]:
# Percentage of missing values per column before filling anything
df_movies.isna().sum().sort_values(ascending=False).div(len(df_movies)).mul(100).round(2)

actor_1_name              0.08
language                  0.08
num_critic_for_reviews    0.03
director_name             0.00
gross                     0.00
genres                    0.00
movie_title               0.00
num_voted_users           0.00
num_user_for_reviews      0.00
budget                    0.00
title_year                0.00
imdb_score                0.00
movie_facebook_likes      0.00
dtype: float64

In [None]:
# The five most frequent languages and their movie counts
df_movies.language.value_counts().nlargest(n=5, keep='first')

English     3707
French        37
Spanish       26
Mandarin      15
German        13
Name: language, dtype: int64

In [None]:
# Summary of the language column: count, number of unique values, most frequent value
df_movies['language'].describe()

count        3888
unique         38
top       English
freq         3707
Name: language, dtype: object

**Observations:** Of the 3888 movies with a recorded language, 3707 are in English; the remaining movies are spread across 37 other languages.

In [None]:
# Replace missing languages with 'English', the most frequent value in the column
df_movies['language'] = df_movies['language'].fillna('English')

In [None]:
# Percentage of missing values per column after filling the language column
df_movies.isna().sum().sort_values(ascending=False).div(len(df_movies)).mul(100).round(2)

actor_1_name              0.08
num_critic_for_reviews    0.03
director_name             0.00
gross                     0.00
genres                    0.00
movie_title               0.00
num_voted_users           0.00
num_user_for_reviews      0.00
language                  0.00
budget                    0.00
title_year                0.00
imdb_score                0.00
movie_facebook_likes      0.00
dtype: float64

### **Check the number of retained rows**

You might notice that two of the columns, num_critic_for_reviews and actor_1_name, still have a small percentage of NaN values left. You can leave these columns as they are for now. Check the number and percentage of the rows retained after completing all the tasks above.

In [None]:
df_movies.shape[0]   # Number of rows retained so far

3891

In [None]:
# Percentage of the originally loaded rows that are still present after cleaning
round(df_movies.shape[0] / org_data.shape[0] * 100, 2)

77.16

**Observations:** You might have noticed that we still have around 77% of the rows.

## **Data Analysis**

**Change the unit of columns**



In [None]:
# Preview only: budget divided by one million (nothing is stored here)
df_movies['budget'].div(1_000_000)

0       237.0000
1       300.0000
2       245.0000
3       250.0000
5       263.7000
          ...   
5033      0.0070
5034      0.0070
5035      0.0070
5037      0.0090
5042      0.0011
Name: budget, Length: 3891, dtype: float64

In [None]:
# Rescale budget and gross from $ to million $.
# Caution: each column is overwritten with itself divided by one million, so running
# this cell a second time would divide the values again.
df_movies['budget'] = df_movies['budget'].div(1_000_000)
df_movies['gross'] = df_movies['gross'].div(1_000_000)

**Find the movies with highest profit**

Create a new column called profit which contains the difference of the two columns gross and budget.

In [None]:
# profit = gross - budget (negative values are losses)
df_movies['profit'] = df_movies['gross'].sub(df_movies['budget'])
df_movies

Unnamed: 0,director_name,num_critic_for_reviews,gross,genres,actor_1_name,movie_title,num_voted_users,num_user_for_reviews,language,budget,title_year,imdb_score,movie_facebook_likes,profit
0,James Cameron,723.0,760.505847,Action|Adventure|Fantasy|Sci-Fi,CCH Pounder,Avatar,886204,3054.0,English,237.0000,2009.0,7.9,33000,523.505847
1,Gore Verbinski,302.0,309.404152,Action|Adventure|Fantasy,Johnny Depp,Pirates of the Caribbean: At World's End,471220,1238.0,English,300.0000,2007.0,7.1,0,9.404152
2,Sam Mendes,602.0,200.074175,Action|Adventure|Thriller,Christoph Waltz,Spectre,275868,994.0,English,245.0000,2015.0,6.8,85000,-44.925825
3,Christopher Nolan,813.0,448.130642,Action|Thriller,Tom Hardy,The Dark Knight Rises,1144337,2701.0,English,250.0000,2012.0,8.5,164000,198.130642
5,Andrew Stanton,462.0,73.058679,Action|Adventure|Sci-Fi,Daryl Sabara,John Carter,212204,738.0,English,263.7000,2012.0,6.6,24000,-190.641321
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5033,Shane Carruth,143.0,0.424760,Drama|Sci-Fi|Thriller,Shane Carruth,Primer,72639,371.0,English,0.0070,2004.0,7.0,19000,0.417760
5034,Neill Dela Llana,35.0,0.070071,Thriller,Ian Gamazon,Cavite,589,35.0,English,0.0070,2005.0,6.3,74,0.063071
5035,Robert Rodriguez,56.0,2.040920,Action|Crime|Drama|Romance|Thriller,Carlos Gallardo,El Mariachi,52055,130.0,Spanish,0.0070,1992.0,6.9,0,2.033920
5037,Edward Burns,14.0,0.004584,Comedy|Drama,Kerry Bishé,Newlyweds,1338,14.0,English,0.0090,2011.0,6.4,413,-0.004416


**Observations:** A new column, profit, has been added at the end of the data set. Negative profit values indicate movies that made a loss.

In [None]:
# Show the dataframe ordered by profit, highest first (display only; df_movies itself is not reordered)
df_movies.sort_values('profit', ascending=False)

Unnamed: 0,director_name,num_critic_for_reviews,gross,genres,actor_1_name,movie_title,num_voted_users,num_user_for_reviews,language,budget,title_year,imdb_score,movie_facebook_likes,profit
0,James Cameron,723.0,760.505847,Action|Adventure|Fantasy|Sci-Fi,CCH Pounder,Avatar,886204,3054.0,English,237.000000,2009.0,7.9,33000,523.505847
29,Colin Trevorrow,644.0,652.177271,Action|Adventure|Sci-Fi|Thriller,Bryce Dallas Howard,Jurassic World,418214,1290.0,English,150.000000,2015.0,7.0,150000,502.177271
26,James Cameron,315.0,658.672302,Drama|Romance,Leonardo DiCaprio,Titanic,793059,2528.0,English,200.000000,1997.0,7.7,26000,458.672302
3024,George Lucas,282.0,460.935665,Action|Adventure|Fantasy|Sci-Fi,Harrison Ford,Star Wars: Episode IV - A New Hope,911097,1470.0,English,11.000000,1977.0,8.7,33000,449.935665
3080,Steven Spielberg,215.0,434.949459,Family|Sci-Fi,Henry Thomas,E.T. the Extra-Terrestrial,281842,515.0,English,10.500000,1982.0,7.9,34000,424.449459
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2334,Katsuhiro Ôtomo,105.0,0.410388,Action|Adventure|Animation|Family|Sci-Fi|Thriller,William Hootkins,Steamboy,13727,79.0,Japanese,2127.519898,2004.0,6.9,973,-2127.109510
2323,Hayao Miyazaki,174.0,2.298191,Adventure|Animation|Fantasy,Minnie Driver,Princess Mononoke,221552,570.0,Japanese,2400.000000,1997.0,8.4,11000,-2397.701809
3005,Lajos Koltai,73.0,0.195888,Drama|Romance|War,Marcell Nagy,Fateless,5603,45.0,Hungarian,2500.000000,2005.0,7.1,607,-2499.804112
3859,Chan-wook Park,202.0,0.211667,Crime|Drama,Min-sik Choi,Lady Vengeance,53508,131.0,Korean,4200.000000,2005.0,7.7,4000,-4199.788333


In [None]:
# Store the ten most profitable movies, highest profit first, in a new dataframe named top10
top10 = df_movies.sort_values('profit', ascending=False).iloc[:10]
top10

Unnamed: 0,director_name,num_critic_for_reviews,gross,genres,actor_1_name,movie_title,num_voted_users,num_user_for_reviews,language,budget,title_year,imdb_score,movie_facebook_likes,profit
0,James Cameron,723.0,760.505847,Action|Adventure|Fantasy|Sci-Fi,CCH Pounder,Avatar,886204,3054.0,English,237.0,2009.0,7.9,33000,523.505847
29,Colin Trevorrow,644.0,652.177271,Action|Adventure|Sci-Fi|Thriller,Bryce Dallas Howard,Jurassic World,418214,1290.0,English,150.0,2015.0,7.0,150000,502.177271
26,James Cameron,315.0,658.672302,Drama|Romance,Leonardo DiCaprio,Titanic,793059,2528.0,English,200.0,1997.0,7.7,26000,458.672302
3024,George Lucas,282.0,460.935665,Action|Adventure|Fantasy|Sci-Fi,Harrison Ford,Star Wars: Episode IV - A New Hope,911097,1470.0,English,11.0,1977.0,8.7,33000,449.935665
3080,Steven Spielberg,215.0,434.949459,Family|Sci-Fi,Henry Thomas,E.T. the Extra-Terrestrial,281842,515.0,English,10.5,1982.0,7.9,34000,424.449459
794,Joss Whedon,703.0,623.279547,Action|Adventure|Sci-Fi,Chris Hemsworth,The Avengers,995415,1722.0,English,220.0,2012.0,8.1,123000,403.279547
17,Joss Whedon,703.0,623.279547,Action|Adventure|Sci-Fi,Chris Hemsworth,The Avengers,995415,1722.0,English,220.0,2012.0,8.1,123000,403.279547
509,Roger Allers,186.0,422.783777,Adventure|Animation|Drama|Family|Musical,Matthew Broderick,The Lion King,644348,656.0,English,45.0,1994.0,8.5,17000,377.783777
240,George Lucas,320.0,474.544677,Action|Adventure|Fantasy|Sci-Fi,Natalie Portman,Star Wars: Episode I - The Phantom Menace,534658,3597.0,English,115.0,1999.0,6.5,13000,359.544677
66,Christopher Nolan,645.0,533.316061,Action|Crime|Drama|Thriller,Christian Bale,The Dark Knight,1676169,4667.0,English,185.0,2008.0,9.0,37000,348.316061


**Observations:** In the top 10 movies you might have noticed duplicate entries at rows 17 and 794. It seems the dataframe contains duplicate rows as well. Drop the duplicate rows.

### Drop Duplicate Values

In [None]:
# Remove exact duplicate rows, keeping the first occurrence of each.
# The result is reassigned instead of using inplace=True, so the cell performs
# no hidden in-place mutation.
df_movies=df_movies.drop_duplicates(keep='first')
# top10 was extracted before de-duplication and still contains the repeated row,
# so rebuild it from the cleaned data.
top10=df_movies.sort_values(by='profit',ascending=False).head(10)

In [None]:
df_movies.shape   # (rows, columns) after removing duplicate rows

(3856, 14)

## Find IMDB Top 250

Create the new dataframe IMDB_Top_250 and store the top 250 movies with the highest IMDb rating (the imdb_score column). Also make sure that all of these movies have num_voted_users greater than 25000. Also add a Rank column containing the values 1 to 250, indicating the ranks of the corresponding films.

In [None]:
# Movies with more than 25000 votes, ordered by IMDb score (highest first); keep the first 250
IMDB_Top_250 = (
    df_movies.loc[df_movies['num_voted_users'].gt(25000)]
    .sort_values('imdb_score', ascending=False)
    .iloc[:250]
)
IMDB_Top_250

Unnamed: 0,director_name,num_critic_for_reviews,gross,genres,actor_1_name,movie_title,num_voted_users,num_user_for_reviews,language,budget,title_year,imdb_score,movie_facebook_likes,profit
1937,Frank Darabont,199.0,28.341469,Crime|Drama,Morgan Freeman,The Shawshank Redemption,1689764,4144.0,English,25.00,1994.0,9.3,108000,3.341469
3466,Francis Ford Coppola,208.0,134.821952,Crime|Drama,Al Pacino,The Godfather,1155770,2238.0,English,6.00,1972.0,9.2,43000,128.821952
2837,Francis Ford Coppola,149.0,57.300000,Crime|Drama,Robert De Niro,The Godfather: Part II,790926,650.0,English,13.00,1974.0,9.0,14000,44.300000
66,Christopher Nolan,645.0,533.316061,Action|Crime|Drama|Thriller,Christian Bale,The Dark Knight,1676169,4667.0,English,185.00,2008.0,9.0,37000,348.316061
4498,Sergio Leone,181.0,6.100000,Western,Clint Eastwood,"The Good, the Bad and the Ugly",503509,780.0,Italian,1.20,1966.0,8.9,20000,4.900000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4931,John Carney,232.0,9.437933,Drama|Music|Romance,Glen Hansard,Once,90827,329.0,English,0.18,2007.0,7.9,26000,9.257933
2605,Ang Lee,287.0,128.067808,Action|Drama|Romance,Chen Chang,"Crouching Tiger, Hidden Dragon",217740,1641.0,Mandarin,15.00,2000.0,7.9,0,113.067808
3029,David O. Russell,410.0,93.571803,Biography|Drama|Sport,Christian Bale,The Fighter,275869,389.0,English,25.00,2010.0,7.9,36000,68.571803
2177,Tim Burton,111.0,56.362352,Fantasy|Romance,Johnny Depp,Edward Scissorhands,357581,588.0,English,20.00,1990.0,7.9,16000,36.362352


In [None]:
# Add a new column called Rank (1 = highest imdb_score).
# method='first' breaks ties by row order, so every movie gets a distinct rank.
# rank() returns floats (1.0, 2.0, ...); cast to int so the column holds whole-number ranks.
IMDB_Top_250['Rank']=IMDB_Top_250['imdb_score'].rank(method='first',ascending=False).astype(int)
IMDB_Top_250

Unnamed: 0,director_name,num_critic_for_reviews,gross,genres,actor_1_name,movie_title,num_voted_users,num_user_for_reviews,language,budget,title_year,imdb_score,movie_facebook_likes,profit,Rank
1937,Frank Darabont,199.0,28.341469,Crime|Drama,Morgan Freeman,The Shawshank Redemption,1689764,4144.0,English,25.00,1994.0,9.3,108000,3.341469,1.0
3466,Francis Ford Coppola,208.0,134.821952,Crime|Drama,Al Pacino,The Godfather,1155770,2238.0,English,6.00,1972.0,9.2,43000,128.821952,2.0
2837,Francis Ford Coppola,149.0,57.300000,Crime|Drama,Robert De Niro,The Godfather: Part II,790926,650.0,English,13.00,1974.0,9.0,14000,44.300000,3.0
66,Christopher Nolan,645.0,533.316061,Action|Crime|Drama|Thriller,Christian Bale,The Dark Knight,1676169,4667.0,English,185.00,2008.0,9.0,37000,348.316061,4.0
4498,Sergio Leone,181.0,6.100000,Western,Clint Eastwood,"The Good, the Bad and the Ugly",503509,780.0,Italian,1.20,1966.0,8.9,20000,4.900000,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4931,John Carney,232.0,9.437933,Drama|Music|Romance,Glen Hansard,Once,90827,329.0,English,0.18,2007.0,7.9,26000,9.257933,246.0
2605,Ang Lee,287.0,128.067808,Action|Drama|Romance,Chen Chang,"Crouching Tiger, Hidden Dragon",217740,1641.0,Mandarin,15.00,2000.0,7.9,0,113.067808,247.0
3029,David O. Russell,410.0,93.571803,Biography|Drama|Sport,Christian Bale,The Fighter,275869,389.0,English,25.00,2010.0,7.9,36000,68.571803,248.0
2177,Tim Burton,111.0,56.362352,Fantasy|Romance,Johnny Depp,Edward Scissorhands,357581,588.0,English,20.00,1990.0,7.9,16000,36.362352,249.0


Extract all the movies in the IMDB_Top_250 dataframe which are not in the English language and store them in a new dataframe named **Top_Foreign_lang_film**

In [None]:
# Rows of IMDB_Top_250 whose language is anything other than English
Top_Foreign_lang_film = IMDB_Top_250.loc[IMDB_Top_250['language'].ne('English')]
Top_Foreign_lang_film

Unnamed: 0,director_name,num_critic_for_reviews,gross,genres,actor_1_name,movie_title,num_voted_users,num_user_for_reviews,language,budget,title_year,imdb_score,movie_facebook_likes,profit,Rank
4498,Sergio Leone,181.0,6.1,Western,Clint Eastwood,"The Good, the Bad and the Ugly",503509,780.0,Italian,1.2,1966.0,8.9,20000,4.9,5.0
4747,Akira Kurosawa,153.0,0.269061,Action|Adventure|Drama,Takashi Shimura,Seven Samurai,229012,596.0,Japanese,2.0,1954.0,8.7,11000,-1.730939,17.0
4029,Fernando Meirelles,214.0,7.563397,Crime|Drama,Alice Braga,City of God,533200,749.0,Portuguese,3.3,2002.0,8.7,28000,4.263397,20.0
2373,Hayao Miyazaki,246.0,10.049886,Adventure|Animation|Family|Fantasy,Bunta Sugawara,Spirited Away,417971,902.0,Japanese,19.0,2001.0,8.6,28000,-8.950114,23.0
4259,Florian Henckel von Donnersmarck,215.0,11.284657,Drama|Thriller,Sebastian Koch,The Lives of Others,259379,407.0,German,2.0,2006.0,8.5,39000,9.284657,35.0
4921,Majid Majidi,46.0,0.925402,Drama|Family,Bahare Seddiqi,Children of Heaven,27882,130.0,Persian,0.18,1997.0,8.5,0,0.745402,39.0
2323,Hayao Miyazaki,174.0,2.298191,Adventure|Animation|Fantasy,Minnie Driver,Princess Mononoke,221552,570.0,Japanese,2400.0,1997.0,8.4,11000,-2397.701809,47.0
2970,Wolfgang Petersen,96.0,11.433134,Adventure|Drama|Thriller|War,Jürgen Prochnow,Das Boot,168203,426.0,German,14.0,1981.0,8.4,11000,-2.566866,49.0
4105,Chan-wook Park,305.0,2.18129,Drama|Mystery|Thriller,Min-sik Choi,Oldboy,356181,809.0,Korean,3.0,2003.0,8.4,43000,-0.81871,57.0
4659,Asghar Farhadi,354.0,7.098492,Drama|Mystery,Shahab Hosseini,A Separation,151812,264.0,Persian,0.5,2011.0,8.4,48000,6.598492,58.0


### **Find the best directors**

In [None]:
# Mean imdb_score per director, highest first; keep the ten best averages
top10_directors = (
    df_movies.groupby('director_name')['imdb_score']
    .mean()
    .sort_values(ascending=False)
    .head(10)
)
top10_directors

director_name
Charles Chaplin          8.600000
Tony Kaye                8.600000
Alfred Hitchcock         8.500000
Ron Fricke               8.500000
Damien Chazelle          8.500000
Majid Majidi             8.500000
Sergio Leone             8.433333
Christopher Nolan        8.425000
S.S. Rajamouli           8.400000
Marius A. Markevicius    8.400000
Name: imdb_score, dtype: float64

### Find popular Genres

Extract first two genres from the genres column and store them in two new columns genre_1 and genre_2 . Some of the movies might have only one genre. In such cases, extract the single genre into both the columns.

In [None]:
# Split the '|'-separated genres string and keep the first two pieces as genre_1 / genre_2
Temp_genre = (
    df_movies['genres']
    .str.split('|', expand=True)
    .iloc[:, :2]
    .rename(columns={0: 'genre_1', 1: 'genre_2'})
)
Temp_genre

Unnamed: 0,genre_1,genre_2
0,Action,Adventure
1,Action,Adventure
2,Action,Adventure
3,Action,Thriller
5,Action,Adventure
...,...,...
5033,Drama,Sci-Fi
5034,Thriller,
5035,Action,Crime
5037,Comedy,Drama


**Observations:** The genre_2 column has some null values, so fill those with the corresponding genre_1 values.

In [None]:
# Movies with a single genre have no second genre; reuse genre_1 for them.
# The result is assigned back rather than calling fillna(inplace=True) on the selected
# column: the in-place form is chained assignment, which pandas warns about and which
# does not update Temp_genre under Copy-on-Write.
Temp_genre['genre_2']=Temp_genre['genre_2'].fillna(Temp_genre['genre_1'])
Temp_genre

Unnamed: 0,genre_1,genre_2
0,Action,Adventure
1,Action,Adventure
2,Action,Adventure
3,Action,Thriller
5,Action,Adventure
...,...,...
5033,Drama,Sci-Fi
5034,Thriller,Thriller
5035,Action,Crime
5037,Comedy,Drama


In [None]:
# Add the genre_1 / genre_2 columns to the movies dataframe.
# Any genre_1 / genre_2 columns left over from an earlier run are dropped first
# (errors='ignore' makes this a no-op on the first run), so re-running this cell
# does not append a duplicate pair of columns.
df_movies=pd.concat(
    [df_movies.drop(columns=['genre_1','genre_2'],errors='ignore'),Temp_genre],
    axis=1,
)
df_movies

Unnamed: 0,director_name,num_critic_for_reviews,gross,genres,actor_1_name,movie_title,num_voted_users,num_user_for_reviews,language,budget,title_year,imdb_score,movie_facebook_likes,profit,genre_1,genre_2
0,James Cameron,723.0,760.505847,Action|Adventure|Fantasy|Sci-Fi,CCH Pounder,Avatar,886204,3054.0,English,237.0000,2009.0,7.9,33000,523.505847,Action,Adventure
1,Gore Verbinski,302.0,309.404152,Action|Adventure|Fantasy,Johnny Depp,Pirates of the Caribbean: At World's End,471220,1238.0,English,300.0000,2007.0,7.1,0,9.404152,Action,Adventure
2,Sam Mendes,602.0,200.074175,Action|Adventure|Thriller,Christoph Waltz,Spectre,275868,994.0,English,245.0000,2015.0,6.8,85000,-44.925825,Action,Adventure
3,Christopher Nolan,813.0,448.130642,Action|Thriller,Tom Hardy,The Dark Knight Rises,1144337,2701.0,English,250.0000,2012.0,8.5,164000,198.130642,Action,Thriller
5,Andrew Stanton,462.0,73.058679,Action|Adventure|Sci-Fi,Daryl Sabara,John Carter,212204,738.0,English,263.7000,2012.0,6.6,24000,-190.641321,Action,Adventure
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5033,Shane Carruth,143.0,0.424760,Drama|Sci-Fi|Thriller,Shane Carruth,Primer,72639,371.0,English,0.0070,2004.0,7.0,19000,0.417760,Drama,Sci-Fi
5034,Neill Dela Llana,35.0,0.070071,Thriller,Ian Gamazon,Cavite,589,35.0,English,0.0070,2005.0,6.3,74,0.063071,Thriller,Thriller
5035,Robert Rodriguez,56.0,2.040920,Action|Crime|Drama|Romance|Thriller,Carlos Gallardo,El Mariachi,52055,130.0,Spanish,0.0070,1992.0,6.9,0,2.033920,Action,Crime
5037,Edward Burns,14.0,0.004584,Comedy|Drama,Kerry Bishé,Newlyweds,1338,14.0,English,0.0090,2011.0,6.4,413,-0.004416,Comedy,Drama


**Group the dataframe using genre_1 as the primary column and genre_2 as the secondary column**

In [None]:
# Mean gross for each (genre_1, genre_2) pair; show the five highest
(
    df_movies.groupby(['genre_1', 'genre_2'])['gross']
    .mean()
    .sort_values(ascending=False)
    .head(5)
)

genre_1    genre_2  
Family     Sci-Fi       434.949459
Adventure  Sci-Fi       228.627758
           Family       118.919540
           Animation    116.998550
Action     Adventure    109.595465
Name: gross, dtype: float64

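**Observations:** The Family + Sci-Fi combination has the highest mean gross (about 435 million $), i.e. it earns the most on average. Note that this mean is identical to the gross of E.T. the Extra-Terrestrial shown earlier, so the group may consist of that single film.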

### **Find the critic-favourite and audience-favourite actors**

In [None]:
# Number of movies per actor_1_name, most frequent first
df_movies.actor_1_name.value_counts()

Robert De Niro               42
Johnny Depp                  38
J.K. Simmons                 31
Denzel Washington            30
Nicolas Cage                 30
                             ..
Martin Dew                    1
Chriss Anglin                 1
Kevin Alejandro               1
Catherine Lough Haggquist     1
John August                   1
Name: actor_1_name, Length: 1509, dtype: int64

In [None]:
# One dataframe per actor, selected by exact match on actor_1_name
Meryl_Streep = df_movies.loc[df_movies['actor_1_name'].eq('Meryl Streep')]
Leonardo_DiCaprio = df_movies.loc[df_movies['actor_1_name'].eq('Leonardo DiCaprio')]
Brad_Pitt = df_movies.loc[df_movies['actor_1_name'].eq('Brad Pitt')]

In [None]:
# Stack the three per-actor dataframes into one (Brad Pitt, Meryl Streep, Leonardo DiCaprio).
# DataFrame.append is deprecated and was removed in pandas 2.0; pd.concat stacks
# the same frames in the same order.
combined=pd.concat([Brad_Pitt,Meryl_Streep,Leonardo_DiCaprio])
combined

  combined=Brad_Pitt.append([Meryl_Streep,Leonardo_DiCaprio])


Unnamed: 0,director_name,num_critic_for_reviews,gross,genres,actor_1_name,movie_title,num_voted_users,num_user_for_reviews,language,budget,title_year,imdb_score,movie_facebook_likes,profit,genre_1,genre_2
101,David Fincher,362.0,127.490802,Drama|Fantasy|Romance,Brad Pitt,The Curious Case of Benjamin Button,459346,822.0,English,150.0,2008.0,7.8,23000,-22.509198,Drama,Fantasy
147,Wolfgang Petersen,220.0,133.228348,Adventure,Brad Pitt,Troy,381672,1694.0,English,175.0,2004.0,7.2,0,-41.771652,Adventure,Adventure
254,Steven Soderbergh,198.0,125.531634,Crime|Thriller,Brad Pitt,Ocean's Twelve,284852,627.0,English,110.0,2004.0,6.4,0,15.531634,Crime,Thriller
255,Doug Liman,233.0,186.336103,Action|Comedy|Crime|Romance|Thriller,Brad Pitt,Mr. & Mrs. Smith,348861,798.0,English,120.0,2005.0,6.5,0,66.336103,Action,Comedy
382,Tony Scott,142.0,0.026871,Action|Crime|Thriller,Brad Pitt,Spy Game,121259,361.0,English,92.0,2001.0,7.0,0,-91.973129,Action,Crime
400,Steven Soderbergh,186.0,183.405771,Crime|Thriller,Brad Pitt,Ocean's Eleven,402645,845.0,English,85.0,2001.0,7.8,0,98.405771,Crime,Thriller
470,David Ayer,406.0,85.707116,Action|Drama|War,Brad Pitt,Fury,303185,701.0,English,68.0,2014.0,7.6,82000,17.707116,Action,Drama
611,Jean-Jacques Annaud,76.0,37.901509,Adventure|Biography|Drama|History|War,Brad Pitt,Seven Years in Tibet,96385,119.0,English,70.0,1997.0,7.0,0,-32.098491,Adventure,Biography
683,David Fincher,315.0,37.023395,Drama,Brad Pitt,Fight Club,1347461,2968.0,English,63.0,1999.0,8.8,48000,-25.976605,Drama,Drama
792,Patrick Gilmore,98.0,26.28832,Adventure|Animation|Comedy|Drama|Family|Fantas...,Brad Pitt,Sinbad: Legend of the Seven Seas,36144,91.0,English,60.0,2003.0,6.7,880,-33.71168,Adventure,Animation


In [None]:
# Average number of critic reviews per actor
combined.groupby('actor_1_name')['num_critic_for_reviews'].mean()

actor_1_name
Brad Pitt            245.000000
Leonardo DiCaprio    330.190476
Meryl Streep         181.454545
Name: num_critic_for_reviews, dtype: float64

In [None]:
# Average number of user reviews per actor
combined.groupby('actor_1_name')['num_user_for_reviews'].mean()

actor_1_name
Brad Pitt            742.352941
Leonardo DiCaprio    914.476190
Meryl Streep         297.181818
Name: num_user_for_reviews, dtype: float64

In [None]:
# Same two averages computed in one step, as a two-column table
review_columns = ['num_user_for_reviews', 'num_critic_for_reviews']
combined.groupby('actor_1_name')[review_columns].agg('mean')

Unnamed: 0_level_0,num_user_for_reviews,num_critic_for_reviews
actor_1_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Brad Pitt,742.352941,245.0
Leonardo DiCaprio,914.47619,330.190476
Meryl Streep,297.181818,181.454545


**Observations:** From the two lists above, Leonardo DiCaprio has the highest average number of both critic reviews and user reviews.

https://www.youtube.com/watch?v=lFGtBC2MG8Y  -Reference link