In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
import plotly.subplots as sp
import plotly.figure_factory as ff
from itertools import cycle
import re

pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_colwidth', None)

# Loading the Dataset

In [2]:
df = pd.read_csv("data/titles.csv")

# Quick Look into the dataset

In [3]:
df.head(1)

Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score
0,ts300399,Five Came Back: The Reference Films,SHOW,"This collection includes 12 World War II-era propaganda films — many of which are graphic and offensive — discussed in the docuseries ""Five Came Back.""",1945,TV-MA,48,['documentation'],['US'],1.0,,,,0.6,


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5806 entries, 0 to 5805
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    5806 non-null   object 
 1   title                 5805 non-null   object 
 2   type                  5806 non-null   object 
 3   description           5788 non-null   object 
 4   release_year          5806 non-null   int64  
 5   age_certification     3196 non-null   object 
 6   runtime               5806 non-null   int64  
 7   genres                5806 non-null   object 
 8   production_countries  5806 non-null   object 
 9   seasons               2047 non-null   float64
 10  imdb_id               5362 non-null   object 
 11  imdb_score            5283 non-null   float64
 12  imdb_votes            5267 non-null   float64
 13  tmdb_popularity       5712 non-null   float64
 14  tmdb_score            5488 non-null   float64
dtypes: float64(5), int64(

In [5]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
release_year,5806.0,2016.013434,7.324883,1945.0,2015.0,2018.0,2020.0,2022.0
runtime,5806.0,77.643989,39.47416,0.0,44.0,84.0,105.0,251.0
seasons,2047.0,2.165608,2.636207,1.0,1.0,1.0,2.0,42.0
imdb_score,5283.0,6.533447,1.160932,1.5,5.8,6.6,7.4,9.6
imdb_votes,5267.0,23407.194988,87134.315849,5.0,521.0,2279.0,10144.0,2268288.0
tmdb_popularity,5712.0,22.52566,68.849177,0.009442,3.15525,7.478,17.77575,1823.374
tmdb_score,5488.0,6.818039,1.17156,0.5,6.1,6.9,7.5,10.0


# Basic Information

### Distribution of All the Features

In [6]:
palette = cycle(px.colors.sequential.thermal)

fig = sp.make_subplots(
    rows=3, cols=3,
    subplot_titles=[
        "Yearwise Release Count",
        "Runtime",
        "IMDB Votes",
        "IMDB Rating",
        "TMDB Popularity",
        "TMDB Score",
        "Seasons",
        "Age Certification",
        "Movie Or Show"],
    specs=[[{"type": "histogram"}, {"type": "histogram"}, {"type": "histogram"}],
           [{"type": "histogram"}, {"type": "histogram"}, {"type": "histogram"}],
           [{"type": "histogram"}, {"type": "pie"}, {"type": "pie"}]]
)

release_year = go.Histogram(
    x=df.release_year,
    name="Release Year",
    marker_color=next(palette),
    legendgroup="Release Year",
    legendgrouptitle_text="Release Year",
)

runtime = go.Histogram(
    x=df.runtime,
    nbinsx=int(df.__len__()/50),
    name="Runtime",
    marker_color=next(palette),
    legendgroup="Runtime",
    legendgrouptitle_text="Runtime",
)

imdb_votes = go.Histogram(
    x=df.imdb_votes,
    nbinsx=int(df.__len__()/50),
    name="IMDB Votes",
    marker_color=next(palette),
    legendgroup="IMDB Votes",
    legendgrouptitle_text="IMDB Votes",
)

imdb_score = go.Histogram(
    x=df.imdb_score,
    nbinsx=10,
    name="IMDB Score",
    marker_color=next(palette),
    legendgroup="IMDB Score",
    legendgrouptitle_text="IMDB Score",
)

tmdb_popularity = go.Histogram(
    x=df.tmdb_popularity,
    name="TMDB Popularity",
    nbinsx=int(df.__len__()/50),
    marker_color=next(palette),
    legendgroup="TMDB Popularity",
    legendgrouptitle_text="TMDB Popularity",
)

tmdb_score = go.Histogram(
    x=df.tmdb_score,
    name="TMDB Score",
    nbinsx=10,
    marker_color=next(palette),
    legendgroup="TMDB Score",
    legendgrouptitle_text="TMDB Score",
)

seasons = go.Histogram(
    x=df.seasons,
    name="Seasons",
    marker_color=next(palette),
    legendgroup="Seasons",
    legendgrouptitle_text="Seasons",
)

age_certification_counts = df.age_certification.value_counts()
age_certification_counts["Not Available"] = df.age_certification.isna().sum()
age_certification_dict = age_certification_counts.to_dict()

age_certification = go.Pie(
    labels=list(age_certification_dict.keys()),
    values=list(age_certification_dict.values()),
    name="Age Certification",
    hoverinfo="label+value+percent",
    marker_colors=[next(palette) for i in range(len(age_certification_dict))],
    legendgroup="Age Certification",
    legendgrouptitle_text="Age Certification",
)

type_counts = df.type.value_counts().to_dict()

type_ = go.Pie(
    labels=list(type_counts.keys()),
    values=list(type_counts.values()),
    name="Type",
    hoverinfo="label+value+percent",
    marker_colors=[next(palette) for i in range(len(type_counts))],
    legendgroup="Type",
    legendgrouptitle_text="Type",
)

fig.add_trace(release_year, row=1, col=1)
fig.update_xaxes(title_text="Release Year", row=1, col=1)
fig.update_yaxes(title_text="Count", row=1, col=1)

fig.add_trace(runtime, row=1, col=2)
fig.update_xaxes(title_text="#Runtime", row=1, col=2)
fig.update_yaxes(title_text="Count", row=1, col=2)

fig.add_trace(imdb_votes, row=1, col=3)
fig.update_xaxes(title_text="No. of IMDB Votes", row=1, col=3)
fig.update_yaxes(title_text="Count", row=1, col=3)

fig.add_trace(imdb_score, row=2, col=1)
fig.update_xaxes(title_text="#IMDB Score", row=2, col=1)
fig.update_yaxes(title_text="Count", row=2, col=1)

fig.add_trace(tmdb_popularity, row=2, col=2)
fig.update_xaxes(title_text="#TMDB Popularity", row=2, col=2)
fig.update_yaxes(title_text="Count", row=2, col=2)

fig.add_trace(tmdb_score, row=2, col=3)
fig.update_xaxes(title_text="#TMDB Score", row=2, col=3)
fig.update_yaxes(title_text="Count", row=2, col=3)  

fig.add_trace(seasons, row=3, col=1)
fig.update_xaxes(title_text="No. of Seasons", row=3, col=1)
fig.update_yaxes(title_text="Count", row=3, col=1)

fig.add_trace(age_certification, row=3, col=2)

fig.add_trace(type_, row=3, col=3)

fig.update_annotations(font_size=23)

fig.update_layout(
    template="plotly",
    height=1400,
)

fig.update(
    layout_title_text="Distribution of Characteristics of Movies and Series",
    layout_title_font_size=30,
    layout_title_x=0.5,
    layout_paper_bgcolor='rgb(229, 237, 247)',
    layout_plot_bgcolor='rgb(229, 237, 247)',
)

fig.show()


### Box Plot of All the Features

In [7]:
palette = cycle(px.colors.qualitative.Dark2_r)

fig = sp.make_subplots(
    rows=2, cols=3,
    subplot_titles=["Runtime",
                    "Seasons",
                    "IMDB Score",
                    "IMDB Votes",
                    "TMDB Popularity",
                    "TMDB Score",],
    specs=[[{"type": "box"}, {"type": "box"}, {"type": "box"}],
           [{"type": "box"}, {"type": "box"}, {"type": "box"}]],
)

runtime_box = go.Box(
    y=df.runtime,
    name="Runtime",
    marker_color=next(palette),
)

seasons_box = go.Box(
    y=df.seasons,
    name="Seasons",
    marker_color=next(palette),
)

imdb_score_box = go.Box(
    y=df.imdb_score,
    name="IMDB Score",
    marker_color=next(palette),
)

imdb_votes_box = go.Box(
    y=df.imdb_votes,
    name="IMDB Votes",
    marker_color=next(palette),
)

tmdb_popularity_box = go.Box(
    y=df.tmdb_popularity,
    name="TMDB Popularity",
    marker_color=next(palette),
)

tmdb_score_box = go.Box(
    y=df.tmdb_score,
    name="TMDB Score",
    marker_color=next(palette),
)

fig.add_trace(runtime_box, row=1, col=1)
fig.update_xaxes(title_text="Runtime", row=1, col=1)

fig.add_trace(seasons_box, row=1, col=2)
fig.update_xaxes(title_text="No. of Seasons", row=1, col=2)

fig.add_trace(imdb_score_box, row=1, col=3)
fig.update_xaxes(title_text="IMDB Score", row=1, col=3)

fig.add_trace(imdb_votes_box, row=2, col=1)
fig.update_xaxes(title_text="No. of IMDB Votes", row=2, col=1)

fig.add_trace(tmdb_popularity_box, row=2, col=2)
fig.update_xaxes(title_text="TMDB Popularity", row=2, col=2)

fig.add_trace(tmdb_score_box, row=2, col=3)
fig.update_xaxes(title_text="TMDB Score", row=2, col=3)


fig.update_layout(template="plotly", height=1080,)
fig.update_annotations(font_size=23)

fig.update(
    layout_title_text="Box Plots of Characteristics of Movies and Series",
    layout_title_font_size=30,
    layout_title_x=0.5,
    layout_paper_bgcolor='rgb(229, 237, 247)',
    layout_plot_bgcolor='rgb(229, 237, 247)',
)

fig.show()

# Feature Generation

Generating Feature-Columns for every genre

In [8]:
df["genres"] = df["genres"].apply(lambda x: re.findall("\w+", x))

genres = list(df["genres"].values)
genres = list(set([item for sublist in genres for item in sublist]))

for i, genre in enumerate(genres):
    df[genre] = df.genres.apply(lambda x: 1 if genre in x else 0).astype(int)

print("Number of Genres: ", len(genres))
print("Genres:", genres)

Number of Genres:  19
Genres: ['family', 'thriller', 'european', 'music', 'fantasy', 'scifi', 'war', 'action', 'history', 'reality', 'animation', 'horror', 'sport', 'documentation', 'romance', 'western', 'comedy', 'crime', 'drama']


### Genre Distribution

#### Based on Count

In [9]:
genre_movie_dict = {}

for genre in genres:
    genre_movie_dict[genre] = df.query("type == 'MOVIE'")[genre].sum()

genre_movie_dict = dict(sorted(genre_movie_dict.items(), key=lambda x: x[0]))

genre_series_dict = {}

for genre in genres:
    genre_series_dict[genre] = df.query("type == 'SHOW'")[genre].sum()

genre_series_dict = dict(sorted(genre_series_dict.items(), key=lambda x: x[0]))

fig = sp.make_subplots(
    rows=2,
    cols=1,
    subplot_titles=["Movies", "Series"],
)

genre_movie_count = go.Bar(
    x=list(genre_movie_dict.keys()),
    y=list(genre_movie_dict.values()),
    marker=dict(color=list(genre_movie_dict.values()),
                colorscale=px.colors.qualitative.Dark2),
    name="Movies",
)

genre_series_count = go.Bar(
    x=list(genre_series_dict.keys()),
    y=list(genre_series_dict.values()),
    marker=dict(color=list(genre_series_dict.values()),
                colorscale=px.colors.qualitative.Dark2),
    name="Series",
)

fig.add_trace(genre_movie_count, row=1, col=1)
fig.update_xaxes(title_text="Genres", row=1, col=1)
fig.update_yaxes(title_text="Count", row=1, col=1)

fig.add_trace(genre_series_count, row=2, col=1)
fig.update_xaxes(title_text="Genres", row=2, col=1)
fig.update_yaxes(title_text="Count", row=2, col=1)

fig.update(
    layout_title_text="Genre Distribution based on No. of Movies and Shows",
    layout_title_font_size=30,
    layout_title_x=0.5,
    layout_template="plotly",
    layout_showlegend=False,
    layout_height=800,
    layout_paper_bgcolor='rgb(229, 237, 247)',
    layout_plot_bgcolor='rgb(229, 237, 247)',
)

fig.update_annotations(font_size=18)

fig.show()


#### Based on IMDB Votes

In [10]:
genre_movies_popularity_dict = {}

for i, genre in enumerate(genres):
    genre_movies_popularity_dict[genre] = df.query("type == 'MOVIE'").groupby(genre)["imdb_votes"].sum().sort_index().__getitem__(1)

genre_movies_popularity_dict = dict(sorted(genre_movies_popularity_dict.items(), key=lambda x: x[0]))

genre_series_popularity_dict = {}

for i, genre in enumerate(genres):
    genre_series_popularity_dict[genre] = df.query("type == 'SHOW'").groupby(genre)["imdb_votes"].sum().sort_index().__getitem__(1)

genre_series_popularity_dict = dict(sorted(genre_series_popularity_dict.items(), key=lambda x: x[0]))

fig = sp.make_subplots(
    rows=2, 
    cols=1,
    subplot_titles=["Movies", "Series"],
)

genre_movies_pop = go.Bar(
    x=list(genre_movies_popularity_dict.keys()),
    y=list(genre_movies_popularity_dict.values()),
    marker=dict(color=list(genre_movies_popularity_dict.values()),
                colorscale=px.colors.qualitative.Dark2),
    hoverinfo="x+y",
)

genre_series_pop = go.Bar(
    x=list(genre_series_popularity_dict.keys()),
    y=list(genre_series_popularity_dict.values()),
    marker=dict(color=list(genre_series_popularity_dict.values()),
                colorscale=px.colors.qualitative.Dark2),
    hoverinfo="x+y",
)

fig.add_trace(genre_movies_pop, row=1, col=1)
fig.update_xaxes(title_text="Genre", row=1, col=1)
fig.update_yaxes(title_text="IMDB Votes", row=1, col=1)
fig.update

fig.add_trace(genre_series_pop, row=2, col=1)
fig.update_xaxes(title_text="Genre", row=2, col=1)
fig.update_yaxes(title_text="IMDB Votes", row=2, col=1)

fig.update(
    layout_title_text="Genre Distribution based on IMDB Votes",
    layout_title_font_size=30,
    layout_title_x=0.5,
    layout_template="plotly",
    layout_showlegend=False,
    layout_height=800,
    layout_paper_bgcolor='rgb(229, 237, 247)',
    layout_plot_bgcolor='rgb(229, 237, 247)',
)

fig.update_annotations(font_size=18)

fig.show()

In [11]:
palette = cycle(px.colors.qualitative.Dark2)

fig = go.Figure()

for i, genre in enumerate(sorted(genres)):
    temp = df[df[genre] == 1]
    
    fig.add_trace(
        go.Box(
            y=temp['imdb_score'],
            name=genre,
            marker_color=next(palette),
            marker_size=5,
            line_width=1,
            hovertemplate="<b>%{y:.2f}</b>"+f"<br>{genre}<br>Count-{len(temp)}<extra></extra>",
        )
    )

fig.update_layout(
    title="IMDB Score Box Distribution by Genre",
    title_font_size=30,
    title_x=0.5,
    yaxis_title="IMDB Score",
    xaxis_title="Genre",
    template="plotly",
    margin=dict(
        l=40,
        r=30,
        b=80,
        t=100,
    ),
    showlegend=False,
    paper_bgcolor='rgb(229, 237, 247)',
    plot_bgcolor='rgb(229, 237, 247)',
)

fig.show()

In [12]:
df[(df['drama'] == 1) & (df['imdb_score'].notna())]['imdb_score']

1       8.3
7       7.8
8       5.8
9       7.7
11      7.5
       ... 
5792    4.9
5795    6.2
5798    2.2
5801    6.9
5803    6.5
Name: imdb_score, Length: 2773, dtype: float64

In [13]:
fig = ff.create_distplot(
    [df[(df[genre] == 1) & (df['imdb_score'].notna())]['imdb_score'] for genre in sorted(genres)],
    sorted(genres),
    show_hist=False,
    show_rug=False,
)

fig.update_layout(
    title="IMDB Score Distribution by Genre",
    title_font_size=30,
    title_x=0.5,
    xaxis_title="IMDB Score",
    template="plotly",
    paper_bgcolor='rgb(229, 237, 247)',
    plot_bgcolor='rgb(229, 237, 247)',
    legend_title="Genre",
)

fig.show()

## Production Countries

In [14]:
df["production_countries"] = df["production_countries"].apply(lambda x: re.findall("\w+", x))

df["production_countries"] = df["production_countries"].apply(lambda x: ["LB"] if str(x).__contains__("Lebanon") else x)

df["production_countries"] = df["production_countries"].apply(lambda x: [] if str(x).__contains__("XX") else x)

production_countries = list(df["production_countries"].values)
production_countries = list(set([item for sublist in production_countries for item in sublist]))

df["main_production_country_alpha_2"] = df.production_countries.apply(lambda x: x[0] if x else None)

country_alpha = pd.read_json("data/countries.json")[["name", "alpha_2", "alpha_3"]]
country_alpha = dict(zip(country_alpha.alpha_2, country_alpha.alpha_3))

df["main_production_country_alpha_3"] = df.main_production_country_alpha_2.apply(lambda x: country_alpha[x] if x else None)

fig = go.Figure()

map_plot = go.Choropleth(
    locations=df.main_production_country_alpha_3.value_counts().index, 
    z=df.main_production_country_alpha_3.value_counts().values,
    colorscale="Reds",
    autocolorscale=False,
    colorbar_title="No. of Movies",
)

fig.update_layout(
    title_text="Production Countries",
    title_font_size=30,
    height=680,
    title_x=0.5,
    geo=dict(
        showframe=False,
        showcoastlines=False,
        projection_type='equirectangular'
    ),
)

fig.add_trace(map_plot)

## IMDB Score Category

In [15]:
distrib = df['imdb_score'].map(lambda x: f"{int(np.nan_to_num(x)*10//10)}-{int((np.nan_to_num(x)*10//10)+1)}")
df.insert(12, 'imdb_score_range', distrib)

score_range_dict = {}

for i, genre in enumerate(sorted(genres)):
    score_range_dict[genre] = df.groupby(genre)['imdb_score_range'].value_counts().__getitem__(1).to_dict()


In [17]:
fig = sp.make_subplots(
    rows=4, 
    cols=5,
    subplot_titles=sorted(genres),
    specs=[[{'type': 'table'}]*5]*4,
    horizontal_spacing=0.01,
    vertical_spacing=0.05,
)

for i, (key, value) in enumerate(score_range_dict.items()):
    fig.append_trace(
        go.Table(
            header=dict(
                values=["IMDB Score Range", "Count"],
                align="center",
            ),
            cells=dict(
                values=[list(value.keys()), list(value.values())],
                align="center",  
            )
        ),
        row=i%4+1,
        col=i%5+1,   
    )

fig.update_layout(
    title_text="IMDB Score Distribution by Genre",
    title_font_size=30,
    title_x=0.5,
    height=1000,
    autosize=True,
)
fig.show()

## Questionnaire

#### 1. TOP 5 Movies with High IMDB Votes and Score

In [18]:
df.query("type == 'MOVIE'")[['release_year',
                             'title',
                             'type',
                             'runtime',
                             'imdb_score',
                             'imdb_votes',
                             'genres']]\
    .sort_values(by=["imdb_votes", "imdb_score"],
                 ascending=False)\
    .head(5)\
    .reset_index(drop=True)


Unnamed: 0,release_year,title,type,runtime,imdb_score,imdb_votes,genres
0,2010,Inception,MOVIE,148,8.8,2268288.0,"[scifi, music, thriller, action]"
1,1994,Forrest Gump,MOVIE,142,8.8,1994599.0,"[drama, romance, comedy]"
2,2012,Django Unchained,MOVIE,165,8.4,1472668.0,"[western, drama]"
3,1998,Saving Private Ryan,MOVIE,169,8.6,1346020.0,"[drama, war]"
4,1976,Taxi Driver,MOVIE,113,8.3,795222.0,"[crime, drama]"


#### 2. TOP 5 Series with High IMDB Votes and Score

In [19]:
df.query("type == 'SHOW'")[['release_year',
                            'title',
                            'type',
                            'runtime',
                            'imdb_score',
                            'imdb_votes',
                            'genres',
                            ]]\
    .sort_values(by=["imdb_votes", "imdb_score"],
                 ascending=False)\
    .head(5)\
    .reset_index(drop=True)


Unnamed: 0,release_year,title,type,runtime,imdb_score,imdb_votes,genres
0,2008,Breaking Bad,SHOW,48,9.5,1727694.0,"[drama, thriller, crime]"
1,2016,Stranger Things,SHOW,52,8.7,989090.0,"[scifi, drama, fantasy, horror, thriller]"
2,2010,The Walking Dead,SHOW,46,8.2,945125.0,"[action, drama, scifi, thriller, horror]"
3,2011,Black Mirror,SHOW,59,8.8,515577.0,"[scifi, thriller, drama, european]"
4,2013,House of Cards,SHOW,52,8.7,494092.0,[drama]


#### 3. Best PG TV Show in terms of both IMDB Votes and Score produced in US

In [20]:
df.query("type == 'SHOW' and main_production_country_alpha_3 == 'USA' and age_certification == 'TV-PG'")\
    .sort_values(by=["imdb_votes", "imdb_score"], ascending=False)\
    .reset_index(drop=True)\
    .head(1)\
    [["id", "title", "release_year", "genres", "seasons", "runtime", "imdb_score", "imdb_votes"]]

Unnamed: 0,id,title,release_year,genres,seasons,runtime,imdb_score,imdb_votes
0,ts20681,Seinfeld,1989,[comedy],9.0,24,8.9,302700.0


#### 4. Best Movie or TV Show for every Genre in terms of both IMDB Votes and Score

In [21]:
best_by_genre = pd.DataFrame(columns=df.columns.tolist() + ["selected_genre"])

for i, genre in enumerate(sorted(genres)):
    best_genre_data = df.query(f"{genre} == 1").sort_values(by=["imdb_votes", "imdb_score"], ascending=False).reset_index().head(1)
    best_genre_data["selected_genre"] = genre
    
    best_by_genre = pd.concat([best_by_genre, best_genre_data], ignore_index=True).reset_index(drop=True)

best_by_genre[['release_year', 'title', 'selected_genre', 'imdb_score']]

Unnamed: 0,release_year,title,selected_genre,imdb_score
0,2010,Inception,action,8.8
1,2010,How to Train Your Dragon,animation,8.1
2,1994,Forrest Gump,comedy,8.8
3,2008,Breaking Bad,crime,9.5
4,2002,Road to Perdition,documentation,7.7
5,1994,Forrest Gump,drama,8.8
6,2006,Casino Royale,european,8.0
7,2010,How to Train Your Dragon,family,8.1
8,2016,Stranger Things,fantasy,8.7
9,2017,Dunkirk,history,7.8


#### 5. List All the US Best TV Show yearwise with the highest IMDB Score

In [22]:
gb = df.query("type == 'SHOW'").sort_values(by=["release_year", "imdb_score"], ascending=[True, False]).groupby("release_year")
gb.first()[["title", "imdb_score"]]

Unnamed: 0_level_0,title,imdb_score
release_year,Unnamed: 1_level_1,Unnamed: 2_level_1
1945,Five Came Back: The Reference Films,
1969,Monty Python's Flying Circus,8.8
1972,Monty Python's Fliegender Zirkus,8.1
1981,Danger Mouse,7.4
1982,Knight Rider,6.9
1983,Wheel of Fortune,6.7
1984,Thomas & Friends,6.5
1987,Fireman Sam,6.1
1988,High Risk,3.8
1989,Seinfeld,8.9


#### 6. Top 5 Thriller and Drama Indian Movies to watch

In [23]:
df[(df['drama'] == 1) & 
   (df['thriller'] == 1) & 
   (df['main_production_country_alpha_3'] == 'IND')
   ].sort_values(by=["imdb_score"], ascending=False)\
    .reset_index(drop=True)\
    .head(5)\
    .loc[:, ["title", "release_year", "imdb_score"]]

Unnamed: 0,title,release_year,imdb_score
0,Sacred Games,2018,8.6
1,Super Deluxe,2019,8.4
2,Single Slipper Size - 7,2019,8.4
3,Article 15,2019,8.2
4,Andhadhun,2018,8.2


#### 7. Top 5 Action and Comedy British Series to watch

In [24]:
df[(df['action'] == 1) &
   (df['comedy'] == 1) &
   (df['main_production_country_alpha_3'] == 'GBR')
   ]\
   .sort_values(by=["imdb_score"], ascending=False)\
   .reset_index(drop=True)\
   .head(5)\
   .loc[:, ["title", "release_year", "imdb_score"]]

Unnamed: 0,title,release_year,imdb_score
0,Octonauts,2010,7.6
1,Danger Mouse,2015,7.2
2,Sugar Rush,2019,6.7
3,Thomas & Friends,1984,6.5
4,David Brent: Life on the Road,2016,6.3


#### 8. List the countries which have produced less than 5 movies or shows

In [25]:
total_produced_countries = df.groupby("main_production_country_alpha_3")['title'].count().to_dict()
country_alpha = pd.read_json("data/countries.json")[["name", "alpha_2", "alpha_3"]]
country_alpha3_name = country_alpha.set_index("alpha_3")["name"].to_dict()

count = 0

for (key, value) in total_produced_countries.items():
    if value < 5:
        count += 1
        if count % 4 == 0:
            print(f"[{count}] {country_alpha3_name[key]} ({key})", end="\n")
        else:
            print(f"[{count}] {country_alpha3_name[key]} ({key})", end="\t")

[1] Afghanistan (AFG)	[2] Angola (AGO)	[3] Bangladesh (BGD)	[4] Belarus (BLR)
[5] Switzerland (CHE)	[6] Cameroon (CMR)	[7] Congo (the Democratic Republic of the) (COD)	[8] Cuba (CUB)
[9] Algeria (DZA)	[10] Finland (FIN)	[11] Georgia (GEO)	[12] Ghana (GHA)
[13] Greenland (GRL)	[14] Guatemala (GTM)	[15] Croatia (HRV)	[16] Hungary (HUN)
[17] British Indian Ocean Territory (IOT)	[18] Iraq (IRQ)	[19] Jordan (JOR)	[20] Kenya (KEN)
[21] Kyrgyzstan (KGZ)	[22] Cambodia (KHM)	[23] Lithuania (LTU)	[24] Luxembourg (LUX)
[25] Morocco (MAR)	[26] Mozambique (MOZ)	[27] Mauritius (MUS)	[28] Malawi (MWI)
[29] Namibia (NAM)	[30] Pakistan (PAK)	[31] Puerto Rico (PRI)	[32] Portugal (PRT)
[33] Paraguay (PRY)	[34] Senegal (SEN)	[35] Serbia (SRB)	[36] Slovakia (SVK)
[37] Syria (SYR)	[38] Tanzania, the United Republic of (TZA)	[39] Ukraine (UKR)	[40] Uruguay (URY)
[41] Venezuela (Bolivarian Republic of) (VEN)	[42] Viet Nam (VNM)	[43] Zimbabwe (ZWE)	

#### 9. The longest movie ever made in terms of duration for every genre

In [42]:
longest_runtime = pd.DataFrame(columns=df.columns.tolist() + ["selected_genre"])

for i, genre in enumerate(sorted(genres)):
    temp = df[df[genre] == 1].sort_values(by=['runtime'], ascending=False).reset_index(drop=True)
    first = temp.groupby(genre).first()
    first["selected_genre"] = genre
    longest_runtime = pd.concat([longest_runtime, first], ignore_index=True).reset_index(drop=True)

longest_runtime[['title', 'release_year', 'runtime', 'selected_genre']]

Unnamed: 0,title,release_year,runtime,selected_genre
0,Jodhaa Akbar,2008,213,action
1,Mobile Suit Gundam III: Encounters in Space,1982,144,animation
2,The School of Mischief,1973,251,comedy
3,Bonnie & Clyde,2013,240,crime
4,A Lion in the House,2006,225,documentation
5,Bonnie & Clyde,2013,240,drama
6,Bonnie & Clyde,2013,240,european
7,4K Fireplace,2015,181,family
8,Hum Saath Saath Hain,1999,177,fantasy
9,Jodhaa Akbar,2008,213,history


#### 10. Longest Series ever made in terms of seasons

In [48]:
df.sort_values(by=['seasons'], ascending=False)\
    .reset_index(drop=True)\
    .head(5)\
    .loc[:, ["title", "release_year", "seasons", "genres"]]

Unnamed: 0,title,release_year,seasons,genres
0,Survivor,2000,42.0,[reality]
1,Wheel of Fortune,1983,39.0,[family]
2,The Challenge,1998,37.0,"[reality, comedy, drama, scifi]"
3,Power Rangers,1993,29.0,"[scifi, action, fantasy, family]"
4,Pokémon,1997,24.0,"[scifi, action, comedy, fantasy, animation, family]"
