# Movie Ratings

Ratings and reviews for 15,000+ movies reviewed by Rotten Tomatoes. Each record includes film details (title, description, rating, genre, directors, cast, release date, studio, runtime, etc.), as well as featured reviews, "Tomatometer" status, and audience ratings.

Logo: [Image Source](https://en.m.wikipedia.org/wiki/File:Rotten_Tomatoes_positive_audience.svg)

**Recommended Analysis**
1. What does the distribution of films look like by rating? By primary genre? (hint: use first genre listed)
2. What % of films received a Certified Fresh Tomatometer rating? What about Rotten?
3. Explore new film releases over time. How has the volume of releases by month trended over time? What year/month were the most new films released?
4. Compare average Tomatometer ratings by Studio. Which studios produce the highest rated films, on average? The lowest?
5. Compare the Tomatometer ratings against audience ratings. Which films showed the largest discrepancies between audiences and critics?
6. Explore the critics concensus rating: what language is used most often?

_Import packages and dataset, as well as constant variables_

In [104]:
# Import packages ------------------------------
import pandas as pd
import numpy as np
import string
import datetime as dt
from langdetect import detect, detect_langs, LangDetectException
# import plotly.io as pio
# pio.renderers.default = "png"
import plotly.express as px
import plotly.graph_objects as go

# Instantiate constant variables --------------------
dataset_filename = "Rotten Tomatoes Movies.csv";
tomatometer_ratings_by_studio_filename = "Tomatometer_Studio.csv";
tomatometer_vs_audience_filename = "Tomatometer_Audience.csv";

# Import dataset ---------------------------------------
df = pd.read_csv(dataset_filename);
print(df.info());

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16638 entries, 0 to 16637
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   movie_title         16638 non-null  object 
 1   movie_info          16614 non-null  object 
 2   critics_consensus   8309 non-null   object 
 3   rating              16638 non-null  object 
 4   genre               16621 non-null  object 
 5   directors           16524 non-null  object 
 6   writers             15289 non-null  object 
 7   cast                16354 non-null  object 
 8   in_theaters_date    15823 non-null  object 
 9   on_streaming_date   16636 non-null  object 
 10  runtime_in_minutes  16483 non-null  float64
 11  studio_name         16222 non-null  object 
 12  tomatometer_status  16638 non-null  object 
 13  tomatometer_rating  16638 non-null  int64  
 14  tomatometer_count   16638 non-null  int64  
 15  audience_rating     16386 non-null  float64
 16  audi

***1. What does the distribution of films look like by rating? By primary genre? (hint: use first genre listed)***

In [105]:
def distribution_of_films(data, col):
    dff = data.copy();
    
    if col == "rating":
        for rating in ["PG-13)","R)"]:
            index_list = dff[dff[col] == rating].index.tolist()
            for index in index_list:
                dff.loc[index, col] = str(dff.loc[index, col])[:-1];
        dff_agg = pd.DataFrame({
            col.capitalize():dff[col].value_counts().index.tolist(),
            "Count":dff[col].value_counts().values.tolist()
        });
        dff_agg = dff_agg.sort_values(by=["Count"], ascending=False);
        fig = px.bar(dff_agg, x=col.capitalize(), y="Count", text_auto=True, title=f"Distribution of Films by {col.capitalize()}");
        fig.show();
    
    elif col == "genre":
        new_col = f"first_{col}";
        dff[new_col] = dff[col];
        for i in dff.index:
            try:
                dff.loc[i, new_col] = dff.loc[i, col].split(",")[0].strip();
            except:
                dff.loc[i, new_col] = dff.loc[i, col];
        dff_agg = pd.DataFrame({
            new_col.capitalize():dff[new_col].value_counts().index.tolist(),
            "Count":dff[new_col].value_counts().values.tolist()
        });
        dff_agg = dff_agg.sort_values(by=["Count"], ascending=False);
        fig = px.bar(dff_agg, x=new_col.capitalize(), y="Count", text_auto=True, title=f"Distribution of Films by {new_col.split('_')[0].capitalize()} {new_col.split('_')[1].capitalize()}");
        fig.update_layout(xaxis=dict(title=f"{new_col.split('_')[0].capitalize()} {new_col.split('_')[1].capitalize()}"));
        fig.show();

In [106]:
distribution_of_films(df, "rating")

In [107]:
distribution_of_films(df, "genre")

***2. What % of films received a Certified Fresh Tomatometer rating? What about Rotten?***

In [108]:
def percentage_tomatometer_rating(data, status):
    dff = data.copy();

    dff_agg = pd.DataFrame({
        "Status":dff["tomatometer_status"].value_counts().index.tolist(),
        "Count":dff["tomatometer_status"].value_counts().values.tolist()
    });

    dff_agg2 = pd.DataFrame({
        "Status":[status, f'Not {status}'],
        "Count":[
            np.sum(dff_agg[dff_agg["Status"] == status]["Count"]),
            np.sum(dff_agg[dff_agg["Status"] != status]["Count"])
        ]
    });

    fig = px.pie(dff_agg2, names="Status", values="Count", title=f"Percentage of Films that received a {status} Rating");
    fig.show();

In [109]:
percentage_tomatometer_rating(df, "Certified Fresh")

In [110]:
percentage_tomatometer_rating(df, "Rotten")

***3. Explore new film releases over time. How has the volume of releases by month trended over time? What year/month were the most new films released?***

In [111]:
def explore_film_releases(data):
    dff = data.copy();
    
    dff["in_theaters_date"] = pd.to_datetime(dff["in_theaters_date"], format='%Y-%m-%d');
    dff["release_date"] = dff["in_theaters_date"].dt.to_period("M");
    dff_agg = pd.DataFrame({
        "Release Date":dff["release_date"].value_counts().index.tolist(),
        "Count":dff["release_date"].value_counts().values.tolist()
    });
    dff_agg = dff_agg.sort_values(by=["Release Date"], ascending=True);

    dff_agg["Release Date"] = dff_agg["Release Date"].astype(str);
    fig = px.line(dff_agg, x="Release Date", y="Count", title="Volume of Film Releases over time");
    print(f'The most films were release in the following month & year: {dff_agg.sort_values(by=["Count"], ascending=False).loc[0, "Release Date"]}');
    print(f'Number of films released: {dff_agg.sort_values(by=["Count"], ascending=False).loc[0, "Count"]}');
    fig.show();

explore_film_releases(df)

The most films were release in the following month & year: 2014-10
Number of films released: 85


***4. Compare average Tomatometer ratings by Studio. Which studios produce the highest rated films, on average? The lowest?***

In [122]:
def tomatometer_ratings_by_studio(data, rating_quality):
    dff = data.copy();
    dff_agg = pd.DataFrame({
        "Studio":dff.groupby("studio_name").mean("tomatometer_rating").index.tolist(),
        "Average Rating":dff.groupby("studio_name").mean("tomatometer_rating")["tomatometer_rating"].values.tolist()
    });
    dff_agg = dff_agg.sort_values(by=["Average Rating","Studio"], ascending=[False,True]);
    title = f"Average Tomatometer Rating of {rating_quality} Rated Films produced by Studios";
    if (rating_quality == None):
        return dff_agg;
    else:
        if rating_quality == "Highest":
            highest_rating = np.max(dff_agg["Average Rating"]);
            print("Highest Average Tomatometer Rating of Films produced by Studios");
            display(dff_agg[dff_agg["Average Rating"] == highest_rating]);
        elif rating_quality == "Lowest":
            lowest_rating = np.min(dff_agg["Average Rating"]);
            print("Lowest Average Tomatometer Rating of Films produced by Studios");
            display(dff_agg[dff_agg["Average Rating"] == lowest_rating]);

In [113]:
tomatometer_ratings_by_studio(df, "Highest")

Highest Average Tomatometer Rating of Films produced by Studios


Unnamed: 0,Studio,Average Rating
12,20th Century Fox Television,100.0
39,@radical.media,100.0
75,Acme DVD Works,100.0
95,Alice Films,100.0
98,Alive Mind,100.0
...,...,...
2864,Zenith International Films Inc,100.0
2867,Zero Point Zero Films,100.0
2873,Zorro and Me Films,100.0
2875,Zweites Deutsches Fernsehen,100.0


In [114]:
tomatometer_ratings_by_studio(df, "Lowest")

Lowest Average Tomatometer Rating of Films produced by Studios


Unnamed: 0,Studio,Average Rating
3,120 Degree Films,0.0
26,3:1 Cinema,0.0
86,AfterDark Films,0.0
94,Alerion Services,0.0
128,Ambi Pictures,0.0
144,American-International Tel.,0.0
158,Anchor Bay Films/Freestyle,0.0
174,Animus Films,0.0
213,Armando Montelongo Productions,0.0
250,Astra Films,0.0


***5. Compare the Tomatometer ratings against audience ratings. Which films showed the largest discrepancies between audiences and critics?***

In [115]:
def tomatometer_vs_audience(data, discrepency_proportion):
    dff = data.copy();
    dff["audience_rating"] = dff["audience_rating"].fillna(0);
    dff["tomatometer_rating"] = dff["tomatometer_rating"].astype(float);
    dff["rating_discrepency"] = np.absolute(dff["tomatometer_rating"] - dff["audience_rating"]);
    dff_filter = dff[["movie_title","tomatometer_rating","audience_rating","rating_discrepency"]];
    dff_filter = dff_filter.sort_values(by=["rating_discrepency","movie_title"], ascending=[False,True]);
    if discrepency_proportion == None:
        return dff_filter;
    else:
        if discrepency_proportion == "Highest":
            highest = np.max(dff_filter["rating_discrepency"]);
            print("Films with the largest discrepency between audiences and critics");
            return dff_filter[dff_filter["rating_discrepency"] == highest];
        elif discrepency_proportion == "Lowest":
            lowest = np.min(dff_filter["rating_discrepency"]);
            print("Films with the lowest discrepency between audiences and critics");
            return dff_filter[dff_filter["rating_discrepency"] == lowest];

In [116]:
tomatometer_vs_audience(df, "Highest")

Films with the largest discrepency between audiences and critics


Unnamed: 0,movie_title,tomatometer_rating,audience_rating,rating_discrepency
2019,"A Bread Factory, Part Two: Walk with Me a While",100.0,0.0,100.0
2283,Afghan Cycles,100.0,0.0,100.0
2663,Anima,100.0,0.0,100.0
2805,Artik,100.0,0.0,100.0
2919,Aziz Ansari: Buried Alive,100.0,0.0,100.0
2920,Aziz Ansari: Live At Madison Square Garden,100.0,0.0,100.0
4047,Canary (Kanarie),100.0,0.0,100.0
4269,Chelsea Peretti: One of the Greats,100.0,0.0,100.0
4338,"Chris Rock: Kill the Messenger - London, New Y...",100.0,0.0,100.0
4516,Cold November,100.0,0.0,100.0


In [117]:
tomatometer_vs_audience(df, "Lowest")

Films with the lowest discrepency between audiences and critics


Unnamed: 0,movie_title,tomatometer_rating,audience_rating,rating_discrepency
1835,1:54,57.0,57.0,0.0
1904,30 Years to Life,67.0,67.0,0.0
4574,A Complete History of My Sexual Failures,56.0,56.0,0.0
2081,A Little Chaos,48.0,48.0,0.0
2085,A Long Way From Home,33.0,33.0,0.0
...,...,...,...,...
16504,XX/XY,44.0,44.0,0.0
16520,Year of the Fish,58.0,58.0,0.0
16534,Yesterday Was a Lie,83.0,83.0,0.0
16544,You Won't Miss Me,64.0,64.0,0.0


***6. Explore the critics concensus rating: what language is used most often?***

In [118]:
def languages_critics_consensus(data):
    language_names = {
        'af': 'Afrikaans',
        'ar': 'Arabic',
        'bg': 'Bulgarian',
        'bn': 'Bengali',
        'ca': 'Catalan',
        'cs': 'Czech',
        'cy': 'Welsh',
        'da': 'Danish',
        'de': 'German',
        'el': 'Greek',
        'en': 'English',
        'es': 'Spanish',
        'et': 'Estonian',
        'fa': 'Persian',
        'fi': 'Finnish',
        'fr': 'French',
        'gu': 'Gujarati',
        'he': 'Hebrew',
        'hi': 'Hindi',
        'hr': 'Croatian',
        'hu': 'Hungarian',
        'id': 'Indonesian',
        'it': 'Italian',
        'ja': 'Japanese',
        'kn': 'Kannada',
        'ko': 'Korean',
        'lt': 'Lithuanian',
        'lv': 'Latvian',
        'mk': 'Macedonian',
        'ml': 'Malayalam',
        'mr': 'Marathi',
        'ne': 'Nepali',
        'nl': 'Dutch',
        'no': 'Norwegian',
        'pa': 'Punjabi',
        'pl': 'Polish',
        'pt': 'Portuguese',
        'ro': 'Romanian',
        'ru': 'Russian',
        'sk': 'Slovak',
        'sl': 'Slovenian',
        'so': 'Somali',
        'sq': 'Albanian',
        'sv': 'Swedish',
        'sw': 'Swahili',
        'ta': 'Tamil',
        'te': 'Telugu',
        'th': 'Thai',
        'tl': 'Tagalog',
        'tr': 'Turkish',
        'uk': 'Ukrainian',
        'ur': 'Urdu',
        'vi': 'Vietnamese',
        'zh-cn': 'Chinese (Simplified)',
        'zh-tw': 'Chinese (Traditional)',
        'na': 'No Language'
    };
    dff_agg = pd.DataFrame(language_names.items(), columns=['Language Code', 'Language Name']);
    dff_agg["Count"] = 0;

    dff = data.copy();

    no_lang_count = 0;

    for i in dff.index:
        if i in dff[~dff["critics_consensus"].isnull()].index.tolist():
            try:
                language_results = detect_langs(dff.loc[i, "critics_consensus"]);
                for result in language_results:
                    for j in dff_agg.index:
                        if result.lang == dff_agg.loc[j, "Language Code"]:
                            dff_agg.loc[j, "Count"] += 1;
            except LangDetectException as e:
                no_lang_count += 1;
    dff_agg.loc[len(dff_agg)-1, "Count"] = no_lang_count;
    dff_agg = dff_agg.sort_values(by=["Count","Language Name"], ascending=[False,True]);
    dff_agg = dff_agg[dff_agg["Count"] > 0];

    fig = px.bar(dff_agg, x="Language Name", y="Count", text_auto=True, title="Frequenctly used Languages by Critics");
    fig.show();

languages_critics_consensus(df)

_Save Analysis Results to Excel Workbook_
> This is due to the sheer size of the analysis results

In [124]:
tomatometer_ratings_by_studio(df, None).to_csv(f"Results/{tomatometer_ratings_by_studio_filename}", index=False);

In [125]:
tomatometer_vs_audience(df, None).to_csv(f"Results/{tomatometer_vs_audience_filename}", index=False);