# Analyze Movies
This notebook loads data about movies from an 80 MB CSV file. It is a five-year subset (2016-2020) of the full 500 MB dataset.

- Lines: 170,695
- Columns: 29


---
# Load the data
Run the cell below once to load all movie data into a pandas DataFrame. Think of a DataFrame as a table on steroids.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats


# Remote source: a zip archive containing the movies CSV
movies_url = "https://github.com/himmelreich-it/jupyter-teaser-workshop/raw/refs/heads/main/recent_movies.zip"

# pandas fetches the URL and unpacks the zip archive while reading it
df = pd.read_csv(movies_url, compression='zip')

# Alternative source, kept disabled: a plain CSV read from a local path
# movies_url = "all_movies.csv"
# df = pd.read_csv(movies_url)

# Sanity check: number of loaded rows, printed with thousands separators
print(format(len(df), ","))


In [None]:
# Dimensions of the loaded table as a (rows, columns) tuple
print((len(df.index), len(df.columns)))
# Preview the first five rows; as the last expression of the cell it is rendered as a table
df.head(5)

---
# Number of movies per year
Aggregate the number of movies per year

In [None]:
# Number of movies per release year
# First, check how many movies have year information at all
movies_with_year = df[df['year'].notna()]
print(f"Total movies in dataset: {len(df)}")
print(f"Movies with year information: {len(movies_with_year)}")
print(f"Movies without year information: {len(df) - len(movies_with_year)}")

# Count movies per year; value_counts() orders the years by count, largest first.
# Naming the index and the counts explicitly keeps the resulting columns
# ('year', 'count') identical on every pandas version: a bare reset_index()
# produces 'index'/'year' on pandas 1.x but 'year'/'count' on pandas 2.x.
year_counts = (
    movies_with_year['year']
    .value_counts()
    .rename_axis('year')
    .reset_index(name='count')
)

# Show the (up to) 10 years with the most movies
year_counts.head(10)


---
# Average IMDB rating per director

- Extra: Add Standard Deviation and Movie Count
- Extra 2: Only directors with at least 5 movies are included

In [None]:
# Minimum number of rated movies a director needs to appear in the table
min_movie_count = 5

# Average IMDB rating per director
# Keep only rows that name a director and carry a positive, non-missing IMDB rating
has_director = df['director'].notna()
has_rating = df['imdb_rating'].notna() & (df['imdb_rating'] > 0)
movies_with_director = df[has_director & has_rating]

# Per-director mean, standard deviation and number of ratings, rounded to two decimals.
# Named aggregation produces the flat column names directly.
director_stats = (
    movies_with_director
    .groupby('director')['imdb_rating']
    .agg(avg_rating='mean', rating_std='std', movie_count='count')
    .round(2)
)

# Drop directors with fewer than min_movie_count movies for statistical relevance
enough_movies = director_stats['movie_count'] >= min_movie_count
director_stats = director_stats.loc[enough_movies]

# Best average rating first; reset_index() turns the director index into a regular column
director_table = director_stats.sort_values(by='avg_rating', ascending=False).reset_index()

# Display top 20 directors
director_table.head(20)

---
# Highest rated movies (20) below 100,000 USD
Let's exclude short movies (defined as shorter than 80 minutes) and make sure we only include movies with a large voting base.

In [None]:
# Define columns to display
display_columns = ['title', 'release_date', 'budget', 'imdb_rating', 'imdb_votes']

# Selection thresholds
max_budget = 100000   # budget must be strictly below 100,000
min_budget = 1000     # budget must be strictly above 1,000 (presumably drops placeholder values - confirm)
min_runtime = 80      # short movies are defined as < 80 min, so exactly 80 min still qualifies
min_votes = 1000      # strictly more than 1,000 IMDB votes counts as a large voting base

# Filter low-budget, feature-length movies with enough votes.
# Comparisons against NaN are False, so rows missing any of these values are excluded too.
low_budget = df[
    (df['budget'] < max_budget)
    & (df['budget'] > min_budget)
    & (df['runtime'] >= min_runtime)
    & (df['imdb_votes'] > min_votes)
]

# Take the 20 highest IMDB ratings; rows without a rating are dropped before ranking.
# reset_index() keeps the original row label in an 'index' column of best_rated.
best_rated = low_budget.dropna(subset=['imdb_rating']).nlargest(20, 'imdb_rating').reset_index()

# Reduce to the display columns
best_rated_short = best_rated[display_columns]
best_rated_short.head(20)

---
# Do longer movies cost more?

Is there a correlation between the length of a movie and the cost associated with it?
- Do we need to exclude outliers in budget?
- Do we only want to focus on "normal" movie length, 80-150min, or simply cater for outliers?
- Do we want to show some graphs?


In [None]:

# Movies usable for the runtime-vs-budget comparison: runtime strictly between
# 80 and 300 minutes and a budget above 500. The comparisons already evaluate to
# False for missing values; dropna() states that requirement explicitly.
rt_vs_budget = df[(df['runtime'] > 80) & (df['runtime'] < 300) & (df['budget'] > 500)].dropna(subset=['runtime', 'budget'])

# Size of the selection (absolute and as a share of the full dataset) and the value ranges it covers
print(f"Total movies with both Running Time and Budget: {len(rt_vs_budget)} ({(len(rt_vs_budget)/len(df))*100:.2f}%)")
print(f"Running Time range: {rt_vs_budget['runtime'].min():.0f} - {rt_vs_budget['runtime'].max():.0f} minutes")
print(f"Budget range: {rt_vs_budget['budget'].min():.1f} - {rt_vs_budget['budget'].max():.1f}")


In [None]:
# Calculate Pearson correlation coefficient between running time and budget
correlation, p_value = stats.pearsonr(rt_vs_budget['runtime'], rt_vs_budget['budget'])

print(f"\nPearson Correlation Coefficient: {correlation:.4f}")
# General format instead of four fixed decimals: a very small p-value would otherwise print as 0.0000
print(f"P-value: {p_value:.4g}")
# |r| > 0.7 is labelled Strong, |r| > 0.3 Moderate, anything else Weak
print(f"Relationship strength: {'Strong' if abs(correlation) > 0.7 else 'Moderate' if abs(correlation) > 0.3 else 'Weak'}")

# Create a figure with a 2x2 grid of subplots
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# Scatter plot (top left)
axes[0,0].scatter(rt_vs_budget['runtime'], rt_vs_budget['budget'], alpha=0.6)
axes[0,0].set_xlabel('Running Time (minutes)')
axes[0,0].set_ylabel('Budget')
axes[0,0].set_title('Budget vs Running Time')

# Add trend line: least-squares fit of a degree-1 polynomial
z = np.polyfit(rt_vs_budget['runtime'], rt_vs_budget['budget'], 1)
p = np.poly1d(z)
# A straight line is fully defined by its two end points. Drawing it through every
# (unsorted) data point retraces the line many times, so the dashes overlap.
trend_x = np.array([rt_vs_budget['runtime'].min(), rt_vs_budget['runtime'].max()])
axes[0,0].plot(trend_x, p(trend_x), "r--", alpha=0.8)

# Hexbin plot for density (top right)
axes[0,1].hexbin(rt_vs_budget['runtime'], rt_vs_budget['budget'], gridsize=30, cmap='Blues')
axes[0,1].set_xlabel('Running Time (minutes)')
axes[0,1].set_ylabel('Budget')
axes[0,1].set_title('Density Plot: Budget vs Running Time')

# Box plot by running time categories (bottom left)
# Work on a copy so the category column is not added to rt_vs_budget itself
clean_data_copy = rt_vs_budget.copy()
# pd.cut bins are right-inclusive by default: (0, 90], (90, 120], (120, 150], (150, 300]
clean_data_copy['Runtime_Category'] = pd.cut(clean_data_copy['runtime'],
                                           bins=[0, 90, 120, 150, 300],
                                           labels=['Short (<90min)', 'Medium (90-120min)',
                                                  'Long (120-150min)', 'Very Long (>150min)'])

clean_data_copy.boxplot(column='budget', by='Runtime_Category', ax=axes[1,0])
# boxplot(by=...) puts an automatic "Boxplot grouped by ..." super-title on the
# whole figure; clear it so it does not sit above the unrelated subplots
fig.suptitle('')
axes[1,0].set_title('Budget Distribution by Runtime Category')
axes[1,0].set_xlabel('Runtime Category')
axes[1,0].set_ylabel('Budget')

# Distribution of running times (bottom right)
axes[1,1].hist(rt_vs_budget['runtime'], bins=50, alpha=0.7, edgecolor='black')
axes[1,1].set_xlabel('Running Time (minutes)')
axes[1,1].set_ylabel('Frequency')
axes[1,1].set_title('Distribution of Running Times')

plt.tight_layout()
plt.show()

---
# Do longer movies get a better rating?

Is there a correlation between IMDB Rating and the Length of the Movie...
- Do we need to exclude outliers in rating?
- Do we only want to focus on "normal" movie length, 80-150min, or simply cater for outliers?
- Do we want to show some graphs?

In [None]:


# Clean the data: keep movies whose running time lies strictly between 10 and 300
# minutes and whose IMDB rating lies strictly between 0 and 10, then drop any row
# where either value is missing
runtime_in_range = df['runtime'].gt(10) & df['runtime'].lt(300)
rating_in_range = df['imdb_rating'].gt(0) & df['imdb_rating'].lt(10)
clean_data = df[runtime_in_range & rating_in_range].dropna(subset=['runtime', 'imdb_rating'])

# Size of the selection, absolute and as a percentage of the full dataset
share_of_all = (len(clean_data) / len(df)) * 100
print(f"Total movies with both Running Time and IMDB Rating: {len(clean_data)} ({share_of_all:.2f}%)")

# Value ranges covered by the selection
shortest, longest = clean_data['runtime'].min(), clean_data['runtime'].max()
print(f"Running Time range: {shortest:.0f} - {longest:.0f} minutes")
lowest, highest = clean_data['imdb_rating'].min(), clean_data['imdb_rating'].max()
print(f"IMDB Rating range: {lowest:.1f} - {highest:.1f}")

In [None]:
# Calculate Pearson correlation coefficient between running time and IMDB rating
correlation, p_value = stats.pearsonr(clean_data['runtime'], clean_data['imdb_rating'])

print(f"\nPearson Correlation Coefficient: {correlation:.4f}")
# General format instead of four fixed decimals: a very small p-value would otherwise print as 0.0000
print(f"P-value: {p_value:.4g}")
# |r| > 0.7 is labelled Strong, |r| > 0.3 Moderate, anything else Weak
print(f"Relationship strength: {'Strong' if abs(correlation) > 0.7 else 'Moderate' if abs(correlation) > 0.3 else 'Weak'}")

# Create a figure with a 2x2 grid of subplots
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# Scatter plot (top left)
axes[0,0].scatter(clean_data['runtime'], clean_data['imdb_rating'], alpha=0.6)
axes[0,0].set_xlabel('Running Time (minutes)')
axes[0,0].set_ylabel('IMDB Rating')
axes[0,0].set_title('IMDB Rating vs Running Time')

# Add trend line: least-squares fit of a degree-1 polynomial
z = np.polyfit(clean_data['runtime'], clean_data['imdb_rating'], 1)
p = np.poly1d(z)
# A straight line is fully defined by its two end points. Drawing it through every
# (unsorted) data point retraces the line many times, so the dashes overlap.
trend_x = np.array([clean_data['runtime'].min(), clean_data['runtime'].max()])
axes[0,0].plot(trend_x, p(trend_x), "r--", alpha=0.8)

# Hexbin plot for density (top right)
axes[0,1].hexbin(clean_data['runtime'], clean_data['imdb_rating'], gridsize=30, cmap='Blues')
axes[0,1].set_xlabel('Running Time (minutes)')
axes[0,1].set_ylabel('IMDB Rating')
axes[0,1].set_title('Density Plot: IMDB Rating vs Running Time')

# Box plot by running time categories (bottom left)
# Work on a copy so the category column is not added to clean_data itself
clean_data_copy = clean_data.copy()
# pd.cut bins are right-inclusive by default: (0, 90], (90, 120], (120, 150], (150, 300]
clean_data_copy['Runtime_Category'] = pd.cut(clean_data_copy['runtime'],
                                           bins=[0, 90, 120, 150, 300],
                                           labels=['Short (<90min)', 'Medium (90-120min)',
                                                  'Long (120-150min)', 'Very Long (>150min)'])

clean_data_copy.boxplot(column='imdb_rating', by='Runtime_Category', ax=axes[1,0])
# boxplot(by=...) puts an automatic "Boxplot grouped by ..." super-title on the
# whole figure; clear it so it does not sit above the unrelated subplots
fig.suptitle('')
axes[1,0].set_title('IMDB Rating Distribution by Runtime Category')
axes[1,0].set_xlabel('Runtime Category')
axes[1,0].set_ylabel('IMDB Rating')

# Distribution of running times (bottom right)
axes[1,1].hist(clean_data['runtime'], bins=50, alpha=0.7, edgecolor='black')
axes[1,1].set_xlabel('Running Time (minutes)')
axes[1,1].set_ylabel('Frequency')
axes[1,1].set_title('Distribution of Running Times')

plt.tight_layout()
plt.show()