## Analyzing a Movie Dataset

### Getting general information about the dataset

In [None]:
# Loading the dataset
import pandas as pd
import numpy as np

# Read imdb_top_1000.csv into a DataFrame (path is relative to the working directory).
movies_df = pd.read_csv("imdb_top_1000.csv")

# Preview the first rows of the dataset (head() shows five by default).
movies_df.head()


In [None]:
# Print a concise summary of the DataFrame: columns, non-null counts and dtypes.
movies_df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numerical columns.
movies_df.describe()
# Author's note: for this dataset the summary is not very informative.

In [None]:
# Use the movie title as the row index.
# The guard keeps the cell re-runnable: once 'Series_Title' has become the index
# it is no longer a column, and calling set_index again would raise KeyError.
if 'Series_Title' in movies_df.columns:
    movies_df.set_index('Series_Title', inplace=True)
movies_df.head(4)

In [None]:
# Show the last 10 rows of the DataFrame.
movies_df.tail(10)

In [None]:
# Number the rows 1..N in their current order and store the numbers as 'Rank'.
movies_df['Rank'] = range(1, movies_df.shape[0] + 1)
# Confirm the new column is present.
movies_df.info()

### Cleaning up the dataset

*Let's move the 'Rank' column to the front so that it is the first column.*

In [None]:
# Start from the current column labels as a plain Python list.
new_columns = movies_df.columns.tolist()
# Pull 'Rank' out of wherever it currently sits and put it back at position 0.
new_columns.insert(0, new_columns.pop(new_columns.index('Rank')))
# Select the columns in the new order (rebinds movies_df to the reordered frame).
movies_df = movies_df[new_columns]
# Verify the resulting column order.
movies_df.info()

In [None]:
# Order the movies from highest to lowest IMDB rating, turn the current index
# back into an ordinary column, and then use 'Rank' as the row index.
movies_df = (
    movies_df
    .sort_values(by='IMDB_Rating', ascending=False)
    .reset_index()
    .set_index('Rank')
)

movies_df

In [None]:
# Dimensions of the DataFrame as (number of rows, number of columns).
movies_df.shape

In [None]:
# Total number of cells in the DataFrame (rows * columns).
movies_df.size

In [None]:
# View the first rows ordered by index label; this returns a sorted copy and
# leaves movies_df itself unchanged.
movies_df.sort_index().head()

In [None]:
# Missing data check: number of missing values in each column.
print(movies_df.isna().sum())

In [None]:
# Build a cleaned frame by removing every row that has at least one missing value.
# (Alternative considered by the author: movies_df.fillna(0), which would keep
# the rows but replace the gaps with zeros.)
# The explicit .copy() makes movies_df_clean an independent DataFrame, so the
# column assignments made to it in later cells act on this frame directly and
# cannot be treated as writes to a slice of movies_df.
movies_df_clean = movies_df.dropna().copy()
movies_df_clean.info()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
int(movies_df.duplicated().sum())

In [None]:
# Convert 'Gross' to a numeric type.
# The raw values are text with thousands separators (this notebook strips ','
# from the same column of movies_df further down). pd.to_numeric cannot parse
# such strings, so with errors='coerce' every value would silently become NaN.
# Remove the commas first; anything that still is not a number becomes NaN.
# astype(str) keeps the cell re-runnable after the column is already numeric.
gross_text = movies_df_clean['Gross'].astype(str).str.replace(',', '', regex=False)
movies_df_clean['Gross'] = pd.to_numeric(gross_text, errors='coerce')
movies_df_clean.info()

In [None]:
# Strip the ' min' suffix from 'Runtime' and convert the column to a numeric type.
# astype(str) makes the cell safe to re-run: once the column is numeric the .str
# accessor would otherwise raise AttributeError. Values that still cannot be
# parsed become NaN (errors='coerce').
runtime_text = movies_df_clean['Runtime'].astype(str).str.replace(' min', '', regex=False)
movies_df_clean['Runtime'] = pd.to_numeric(runtime_text, errors='coerce')
movies_df_clean.head()

In [None]:
# Drop the rows of movies_df whose 'Gross' value is missing. This modifies
# movies_df itself (inplace=True); movies_df_clean, built earlier by dropping
# rows with any missing value, is a separate frame and is not affected.
movies_df.dropna(subset=['Gross'], inplace=True)
movies_df

In [None]:
# Demonstration: try to parse 'Released_Year' directly as datetimes.
# The author observed that this conversion does not work on this dataset, so the
# error is caught and shown instead of stopping a "Run All". When the conversion
# fails the column is left unchanged.
# NOTE(review): the author attributed the failure to the values holding only a
# year; a non-year entry in the column is another possible cause - inspect
# movies_df['Released_Year'].unique() to confirm.
try:
    movies_df['Released_Year'] = pd.to_datetime(movies_df['Released_Year'])
except (ValueError, TypeError) as parse_error:
    print(f"Direct datetime conversion failed: {parse_error}")
movies_df.head()

In [None]:
# Parse 'Released_Year' as a date on 1 January of that year.
# format='%Y' reads a bare four-digit year, which gives the same result as
# appending '-01-01' to the text. errors='coerce' turns any entry that is not a
# year into NaT instead of aborting the cell.
# Unlike string concatenation, this form can be re-run after the column has
# already been converted to datetimes.
movies_df['Released_Year'] = pd.to_datetime(
    movies_df['Released_Year'], format='%Y', errors='coerce'
)
movies_df.head()

In [None]:
# Reduce 'Released_Year' to the calendar year as a number.
# Guard against re-running the cell: once the column holds numeric years,
# pd.to_datetime would interpret them as nanoseconds since 1970-01-01 and every
# year would silently become 1970.
released = movies_df['Released_Year']
if not pd.api.types.is_numeric_dtype(released):
    movies_df['Released_Year'] = pd.to_datetime(released).dt.year

In [None]:
# Drop the columns that are not used in the analysis.
# errors='ignore' keeps the cell re-runnable: without it a second run raises
# KeyError because the columns are already gone.
movies_df.drop(columns=['Certificate', 'Poster_Link'], inplace=True, errors='ignore')
movies_df.shape

In [None]:
# Normalise the genre labels: trim surrounding whitespace, then lower-case.
genre_trimmed = movies_df['Genre'].str.strip()
movies_df['Genre'] = genre_trimmed.str.lower()
# Show the first few cleaned values as plain text.
print(movies_df['Genre'].head())

In [None]:
# Correct the data type of 'Gross': remove the thousands separators and store
# the values as floats.
# astype(str) keeps the cell re-runnable; once the column is float the .str
# accessor would otherwise raise AttributeError on a second run.
gross_text = movies_df['Gross'].astype(str).str.replace(',', '', regex=False)
movies_df['Gross'] = gross_text.astype(float)
movies_df

### Analyzing the data

In [None]:
# Number of movies per distinct 'Genre' value, most frequent first (top 40 shown).
movies_df['Genre'].value_counts().head(40)

In [None]:
# Show only the movies whose genre is exactly "drama".
# The 'Genre' column is lower-cased during cleaning, so comparing against
# 'Drama' would never match and the result would be empty. Normalising here as
# well makes the filter work regardless of the column's current casing.
# Movies that list several genres (for example "crime, drama") are not
# included by this exact match.
movies_df[movies_df['Genre'].str.strip().str.lower() == 'drama']

In [None]:
# Get the movies released after 2015.
# 'Released_Year' is reduced to a numeric year earlier in the notebook, so
# comparing it with the string '2015' raises TypeError. Compare numerically
# instead; to_numeric also copes with a column that still holds year strings,
# and entries that are not a year become NaN and are excluded.
release_year = pd.to_numeric(movies_df['Released_Year'], errors='coerce')
after_year = release_year > 2015
movies_df[after_year]

In [None]:
# Print the dtype of 'Runtime' in the cleaned frame to confirm the conversion.
print(movies_df_clean['Runtime'].dtype) # checking runtime datatype

In [None]:
# Which movies received the most votes?
# Order the cleaned frame from the most to the fewest votes ...
movies_sorted_by_votes = movies_df_clean.sort_values('No_of_Votes', ascending=False)

# ... and keep the first five rows of that ordering.
top_movies_by_votes = movies_sorted_by_votes.iloc[:5]

# Label the output, then render the table.
print("Top 5 movies with the most votes:")
display(top_movies_by_votes)



In [None]:
# Which movie has the longest runtime?
# nlargest returns the single row with the largest 'Runtime' value.
movie_longest_runtime = movies_df_clean.nlargest(n=1, columns='Runtime')

# Label the output; the row itself is shown as the cell's result.
print("Movie with the longest runtime:")
movie_longest_runtime


In [None]:
# Which movie has the shortest runtime?
# nsmallest returns the single row with the smallest 'Runtime' value.
movie_shortest_runtime = movies_df_clean.nsmallest(n=1, columns='Runtime')

# Label the output; the row itself is shown as the cell's result.
print("Movie with the shortest runtime:")
movie_shortest_runtime


In [None]:
# Count the movies per release year (non-null 'Series_Title' entries in each
# 'Released_Year' group); the year with the largest count had the most releases.
best_year = movies_df.groupby('Released_Year')['Series_Title'].count()
# Lift the display row limit so every year is shown. This is a global pandas
# option and stays in effect for the cells that follow.
pd.set_option('display.max_rows', None)
best_year
# To restore the default afterwards: pd.reset_option('display.max_rows')

In [None]:
# Which movie had the highest gross?
# idxmax() gives the index label of the row holding the largest 'Gross' value.
max_gross_index = movies_df['Gross'].idxmax()
# Look up that row's title by label.
movie_with_highest_gross = movies_df.loc[max_gross_index, 'Series_Title']
print(movie_with_highest_gross)

In [None]:
# Group the movies by genre and look at the mean IMDB rating and the total
# gross of each group.
genre_group = movies_df.groupby('Genre').agg(
    IMDB_Rating=('IMDB_Rating', 'mean'),
    Gross=('Gross', 'sum'),
)
print(genre_group)

### Merging files example

In [None]:
# Load the two TMDB CSV files (paths are relative to the working directory).
df1 = pd.read_csv("tmdb_5000_credits.csv")
df2 = pd.read_csv("tmdb_5000_movies.csv")
# Inspect the columns and dtypes of the first file.
df1.info()

In [None]:
# Inspect the columns and dtypes of the second file.
df2.info()

In [None]:
# Rename the credits columns (by position) so that the key is called 'id',
# matching the key column used for the merge.
df1.columns = ['id', 'title', 'cast', 'crew']
# Bring over only the credits columns that df2 does not already have.
# NOTE(review): if the movies file already contains a 'title' column (a later
# cell selects 'title' from the merged data), a plain merge would rename the
# duplicates to 'title_x'/'title_y' and that selection would raise KeyError.
# Skipping columns that are already present also means a re-run of this cell
# adds no duplicated columns.
new_credit_columns = [col for col in df1.columns if col != 'id' and col not in df2.columns]
df2 = df2.merge(df1[['id'] + new_credit_columns], on='id')
df2.info()

In [None]:
# Look at the first five rows of the merged data.
df2.head(5)

In [None]:
# Mean of 'vote_average' over all rows of df2; used further down as the
# default `c` of weighted_rating.
c = df2['vote_average'].mean()
c
# Author's observation: the average rating in this list is roughly 6 out of 10.

In [None]:
# Only keep movies with enough votes, so that titles with just a handful of
# votes are not included. The cut-off m is the 90th percentile of 'vote_count'.
m = df2['vote_count'].quantile(0.9)
has_enough_votes = df2['vote_count'] >= m
# Filter first, then copy, so qualify_movies is a new independent DataFrame.
qualify_movies = df2.loc[has_enough_votes].copy()
qualify_movies.shape  # author's note: 481 of the 5000 movies qualify

In [None]:
def weighted_rating(x, m=m, c=c):
    """Return the weighted rating based on the IMDB formula.

    Parameters
    ----------
    x : object indexable by 'vote_count' and 'vote_average'
        (for example one row of the movies DataFrame).
    m : weight given to ``c``; defaults to the value the global ``m`` had
        when this function was defined.
    c : rating the result is pulled toward; defaults to the value the global
        ``c`` had when this function was defined.

    Returns
    -------
    ``v/(v+m) * R + m/(v+m) * c`` with ``v = x['vote_count']`` and
    ``R = x['vote_average']``.
    """
    votes = x['vote_count']
    rating = x['vote_average']
    total_weight = votes + m
    # Blend the movie's own rating with c: more votes shift the weight to the rating.
    return (votes / total_weight) * rating + (m / total_weight) * c

# Compute the weighted score for every qualifying movie.
# weighted_rating only indexes 'vote_count' / 'vote_average' and does arithmetic
# on them, so it can be called on the whole DataFrame at once; this evaluates
# the formula column-wise instead of invoking the function once per row via
# apply(axis=1).
qualify_movies['score'] = weighted_rating(qualify_movies)
# Rank the movies from the highest to the lowest score.
qualify_movies = qualify_movies.sort_values('score', ascending=False)
# Show the ten best-scoring movies with the inputs that produced the score.
qualify_movies[['title', 'vote_count', 'vote_average', 'score']].head(10)