## 1. Import Required Libraries

In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## 2. Loading Dataset

In [39]:
# Load the Netflix dataset.
# NOTE(review): this is an absolute, machine-specific path — anyone else running
# the notebook must point DATA_PATH at their own copy of the CSV.
DATA_PATH = r'c:\Users\PC\Projects\datasets\Netflix TV Shows and Movies.csv'
df = pd.read_csv(DATA_PATH)

# Display basic information about the dataset
print("Dataset Shape:", df.shape)
print("\nFirst few rows:")
print(df.head())
print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df.info()
print("\nBasic Statistics:")
print(df.describe())

Dataset Shape: (5283, 11)

First few rows:
   index        id                            title   type  \
0      0   tm84618                      Taxi Driver  MOVIE   
1      1  tm127384  Monty Python and the Holy Grail  MOVIE   
2      2   tm70993                    Life of Brian  MOVIE   
3      3  tm190788                     The Exorcist  MOVIE   
4      4   ts22164     Monty Python's Flying Circus   SHOW   

                                         description  release_year  \
0  A mentally unstable Vietnam War veteran works ...          1976   
1  King Arthur, accompanied by his squire, recrui...          1975   
2  Brian Cohen is an average young Jewish man, bu...          1979   
3  12-year-old Regan MacNeil begins to adapt an e...          1973   
4  A British sketch comedy series with the shows ...          1969   

  age_certification  runtime    imdb_id  imdb_score  imdb_votes  
0                 R      113  tt0075314         8.3    795222.0  
1                PG       91  t

In [40]:
import plotly.graph_objects as go

# Interactive overview of the full dataset: one marker per title,
# runtime on the x-axis and IMDB score on the y-axis.
overview_trace = go.Scatter(
    x=df['runtime'],
    y=df['imdb_score'],
    mode='markers',
    marker={'size': 8, 'opacity': 0.6},
    text=df['title'],  # title is substituted into the hover label below
    hovertemplate='<b>%{text}</b><br>Runtime: %{x} min<br>IMDB Score: %{y}<extra></extra>',
)

fig = go.Figure(data=overview_trace)
fig.update_layout(
    title='Netflix: IMDB Score vs Runtime',
    xaxis_title='Runtime (minutes)',
    yaxis_title='IMDB Score',
)
fig.show()

In [41]:
## 3. Filter Highly Rated Content (IMDB Score >= 6)

# Keep only the movies/shows whose IMDB score is 6 or higher
meets_score_floor = df['imdb_score'] >= 6
highly_rated = df[meets_score_floor].copy()

# Report how much of the catalogue survives the filter
n_total = len(df)
n_highly_rated = len(highly_rated)
print(f"Total content: {n_total}")
print(f"Highly rated content (6+): {n_highly_rated}")
print(f"Excluded content (below 6): {n_total - n_highly_rated}")
print(f"\nPercentage of highly rated: {(n_highly_rated/n_total*100):.2f}%")
print("\nHighly Rated Statistics:")
print(highly_rated['imdb_score'].describe())

Total content: 5283
Highly rated content (6+): 3802
Excluded content (below 6): 1481

Percentage of highly rated: 71.97%

Highly Rated Statistics:
count    3802.000000
mean        7.099106
std         0.695121
min         6.000000
25%         6.500000
50%         7.000000
75%         7.600000
max         9.600000
Name: imdb_score, dtype: float64


In [42]:
# Summary statistics of the vote counts among the highly rated titles
votes_summary = highly_rated['imdb_votes'].describe()
print(votes_summary)

count    3.790000e+03
mean     2.919340e+04
std      1.012420e+05
min      5.000000e+00
25%      6.165000e+02
50%      2.641500e+03
75%      1.283025e+04
max      2.268288e+06
Name: imdb_votes, dtype: float64


In [43]:
# Narrow the highly rated titles to those with at least 100,000 IMDB votes
has_enough_votes = highly_rated['imdb_votes'] >= 100000
highly_rated_voted = highly_rated[has_enough_votes].copy()

In [44]:
# Summarize how many highly rated titles also clear the 100,000-vote threshold.
# The labels name the vote threshold actually applied (100k) and make explicit
# that the percentage is taken over the whole dataset, not over highly_rated.
print(f"Total highly rated content: {len(highly_rated)}")
print(f"Highly rated & voted content (6+/100k+): {len(highly_rated_voted)}")
# Rows with a missing vote count fail the >= comparison and are excluded too.
print(f"Excluded content (below 100k votes or missing vote count): {len(highly_rated) - len(highly_rated_voted)}")
print(f"\nPercentage of all content that is highly rated & voted: {(len(highly_rated_voted)/len(df)*100):.2f}%")
print("\nHighly Rated & Voted Statistics:")
print(highly_rated_voted['imdb_score'].describe())

Total highly rated content: 3802
Highly rated & votedcontent (6+/10k+): 282
Excluded content (below 6): 3520

Percentage of highly rated: 5.34%

Highly Rated Statistics:
count    282.000000
mean       7.474113
std        0.785910
min        6.000000
25%        6.800000
50%        7.450000
75%        8.100000
max        9.500000
Name: imdb_score, dtype: float64


In [45]:
## 4. Scatter Plot: Highly Rated Content (IMDB Score >= 6, Votes >= 100k)

from scipy import stats

# Keep only rows usable for the regression: a positive runtime and a present score.
# (A comparison against NaN is False, so missing runtimes are dropped as well.)
usable_rows = (highly_rated_voted['runtime'] > 0) & (highly_rated_voted['imdb_score'].notna())
clean_highly_rated = highly_rated_voted[usable_rows][['runtime', 'imdb_score']].copy()

# Titles for exactly the rows that survived cleaning, so the hover text lines up
# with the plotted points.
clean_titles = highly_rated_voted.loc[clean_highly_rated.index, 'title']

print(f"Data for regression: {len(clean_highly_rated)} points")
print(f"Runtime range: {clean_highly_rated['runtime'].min()} - {clean_highly_rated['runtime'].max()}")
print(f"IMDB Score range: {clean_highly_rated['imdb_score'].min()} - {clean_highly_rated['imdb_score'].max()}")

if len(clean_highly_rated) > 2:
    # Ordinary least-squares fit of IMDB score on runtime
    slope, intercept, r_value, p_value, std_err = stats.linregress(
        clean_highly_rated['runtime'],
        clean_highly_rated['imdb_score'],
    )
    r_squared = r_value**2

    print(f"Raw slope value: {slope}")
    print(f"Slope (formatted): {slope:.10f}")

    # Two end points are enough to draw the fitted straight line
    x_line = np.array([clean_highly_rated['runtime'].min(), clean_highly_rated['runtime'].max()])
    y_line = slope * x_line + intercept

    fig2 = go.Figure()

    # Scatter plot — drawn from the same cleaned rows the regression was fitted
    # on, so the points and the trend line describe the same data.
    fig2.add_trace(go.Scatter(
        x=clean_highly_rated['runtime'],
        y=clean_highly_rated['imdb_score'],
        mode='markers',
        marker=dict(size=8, opacity=0.6, color='steelblue'),
        text=clean_titles,
        hovertemplate='<b>%{text}</b><br>Runtime: %{x} min<br>IMDB Score: %{y}<extra></extra>',
        name='Highly Rated Content'
    ))

    # Regression line
    fig2.add_trace(go.Scatter(
        x=x_line,
        y=y_line,
        mode='lines',
        line=dict(color='red', width=3),
        name='Trend Line',
        hovertemplate='Trend: %{y:.2f}<extra></extra>'
    ))

    fig2.update_layout(
        # Count the plotted (cleaned) rows, not the pre-cleaning frame
        title=f'Highly Rated Netflix Content: IMDB Score vs Runtime ({len(clean_highly_rated)} titles)',
        xaxis_title='Runtime (minutes)',
        yaxis_title='IMDB Score',
        hovermode='closest',
        showlegend=True,
        height=600
    )
    fig2.show()

    print("\nRegression Analysis for Highly Rated Content:")
    print(f"R² value: {r_squared:.4f}")
    print(f"Slope: {slope:.10f}")
    print(f"Intercept: {intercept:.2f}")
    print(f"P-value: {p_value}")
    # Same significance read-out as the score-vs-votes regression cell
    if p_value < 0.05:
        print("✓ Result is statistically significant (p < 0.05)")
    else:
        print("✗ Result is NOT statistically significant (p >= 0.05)")
else:
    # Say so explicitly instead of silently producing no figure
    print("Not enough valid data points for a regression (need more than 2).")

Data for regression: 282 points
Runtime range: 13 - 229
IMDB Score range: 6.0 - 9.5
Raw slope value: -0.0073573291114723445
Slope (formatted): -0.0073573291



Regression Analysis for Highly Rated Content:
R² value: 0.1347
Slope: -0.0073573291
Intercept: 8.20
P-value: 2.03025209082411e-10


In [46]:
## 5. Scatter Plot: IMDB Score vs Votes for Highly Rated Content

# Keep only rows usable for the regression: a positive vote count and a present score.
# (A comparison against NaN is False, so missing vote counts are dropped as well.)
usable_vote_rows = (highly_rated_voted['imdb_votes'] > 0) & (highly_rated_voted['imdb_score'].notna())
clean_votes = highly_rated_voted[usable_vote_rows][['imdb_votes', 'imdb_score']].copy()

# Titles for exactly the rows that survived cleaning, so the hover text lines up
# with the plotted points.
clean_votes_titles = highly_rated_voted.loc[clean_votes.index, 'title']

print(f"Data for regression: {len(clean_votes)} points")
print(f"Votes range: {clean_votes['imdb_votes'].min():,.0f} - {clean_votes['imdb_votes'].max():,.0f}")
print(f"IMDB Score range: {clean_votes['imdb_score'].min()} - {clean_votes['imdb_score'].max()}")

if len(clean_votes) > 2:
    # Ordinary least-squares fit of IMDB score on vote count
    slope_votes, intercept_votes, r_value_votes, p_value_votes, std_err_votes = stats.linregress(
        clean_votes['imdb_votes'],
        clean_votes['imdb_score'],
    )
    r_squared_votes = r_value_votes**2

    print(f"Raw slope value: {slope_votes}")
    print(f"Slope (formatted): {slope_votes:.10f}")

    # Two end points are enough to draw the fitted straight line
    x_line_votes = np.array([clean_votes['imdb_votes'].min(), clean_votes['imdb_votes'].max()])
    y_line_votes = slope_votes * x_line_votes + intercept_votes

    fig3 = go.Figure()

    # Scatter plot — drawn from the same cleaned rows the regression was fitted
    # on, so the points and the trend line describe the same data.
    fig3.add_trace(go.Scatter(
        x=clean_votes['imdb_votes'],
        y=clean_votes['imdb_score'],
        mode='markers',
        marker=dict(size=8, opacity=0.6, color='darkgreen'),
        text=clean_votes_titles,
        hovertemplate='<b>%{text}</b><br>Votes: %{x:,.0f}<br>IMDB Score: %{y:.1f}<extra></extra>',
        name='Highly Rated Content'
    ))

    # Regression line
    fig3.add_trace(go.Scatter(
        x=x_line_votes,
        y=y_line_votes,
        mode='lines',
        line=dict(color='red', width=3),
        name='Trend Line',
        hovertemplate='Trend: %{y:.2f}<extra></extra>'
    ))

    fig3.update_layout(
        # Count the plotted (cleaned) rows, not the pre-cleaning frame
        title=f'Highly Rated Netflix Content: IMDB Score vs Number of Votes ({len(clean_votes)} titles)',
        xaxis_title='Number of IMDB Votes',
        yaxis_title='IMDB Score',
        hovermode='closest',
        showlegend=True,
        height=600
    )
    fig3.show()

    print("\nRegression Analysis for IMDB Score vs Votes:")
    print(f"R² value: {r_squared_votes:.4f}")
    print(f"Slope: {slope_votes:.10f}")
    print(f"Intercept: {intercept_votes:.2f}")
    print(f"P-value: {p_value_votes}")
    if p_value_votes < 0.05:
        print("✓ Result is statistically significant (p < 0.05)")
    else:
        print("✗ Result is NOT statistically significant (p >= 0.05)")
else:
    # Say so explicitly instead of silently producing no figure
    print("Not enough valid data points for a regression (need more than 2).")

Data for regression: 282 points
Votes range: 100,575 - 2,268,288
IMDB Score range: 6.0 - 9.5
Raw slope value: 1.1216151811091788e-06
Slope (formatted): 0.0000011216



Regression Analysis for IMDB Score vs Votes:
R² value: 0.1365
Slope: 0.0000011216
Intercept: 7.16
P-value: 1.5030952428670326e-10
✓ Result is statistically significant (p < 0.05)


In [47]:
## 6. Top Content: High Scores (>8.5) or High Engagement (>700k votes)

# A title qualifies if EITHER condition holds (note the `|`), on top of the
# filters already baked into highly_rated_voted.
top_content = highly_rated_voted[(highly_rated_voted['imdb_score'] > 8.5) |
                                  (highly_rated_voted['imdb_votes'] > 700000)].copy()

# Select and sort columns (best score first)
display_cols = ['title', 'type', 'imdb_score', 'imdb_votes', 'runtime', 'release_year']
top_content_display = top_content[display_cols].sort_values('imdb_score', ascending=False).reset_index(drop=True)

# Add index column (starting from 1)
top_content_display.insert(0, 'Index', range(1, len(top_content_display) + 1))

# Format for better readability: thousands separators for votes, one decimal for scores.
# Done after sorting, because these columns become strings here.
top_content_display['imdb_votes'] = top_content_display['imdb_votes'].apply(lambda x: f'{int(x):,}')
top_content_display['imdb_score'] = top_content_display['imdb_score'].apply(lambda x: f'{x:.1f}')

print(f"Found {len(top_content_display)} titles with score > 8.5 or votes > 700k\n")

# One color per table row, alternating white / light grey
row_colors = ['white' if i % 2 == 0 else '#f0f0f0' for i in range(len(top_content_display))]

# Create Plotly table for aesthetic display
fig_table = go.Figure(data=[go.Table(
    header=dict(
        values=['#', 'Title', 'Type', 'IMDB Score', 'Votes', 'Runtime (min)', 'Year'],
        fill_color='steelblue',
        align='center',
        font=dict(color='white', size=12),
        height=30
    ),
    cells=dict(
        values=[
            top_content_display['Index'],
            top_content_display['title'],
            top_content_display['type'],
            top_content_display['imdb_score'],
            top_content_display['imdb_votes'],
            top_content_display['runtime'],
            top_content_display['release_year']
        ],
        # A flat list of colors is applied per COLUMN by go.Table; wrapping the
        # per-row list in an outer list makes it a 2-D spec so the colors
        # alternate by ROW, which is what the zebra striping intends.
        fill_color=[row_colors],
        align=['center', 'left', 'left', 'center', 'center', 'center', 'center'],
        font=dict(size=11),
        height=25
    )
)])

fig_table.update_layout(
    title_text='Top Netflix Content: Score > 8.5 or Votes > 700k',
    height=600,
    margin=dict(l=50, r=50, t=50, b=50)
)

fig_table.show()

Found 36 titles with score > 8.5 or votes > 700k



## Conclusion

Based on the comprehensive analysis of Netflix movies and TV shows, I have identified **36 exceptional titles** that serve as my personalized watchlist. Starting from titles with an IMDB score of at least 6 and at least 100,000 votes, a title was selected if it met at least one of two key criteria:

1. **High IMDB Scores (> 8.5)**: These titles represent critically acclaimed content with ratings among the highest on IMDB, indicating exceptional quality and viewer satisfaction.

2. **High Engagement (> 700,000 votes)**: These titles have received substantial voter participation, demonstrating their popularity and widespread appeal among IMDB voters.

### Key Findings:

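- The analysis revealed a **statistically significant positive relationship** (p-value: 1.50e-10) between the number of votes a title receives and its IMDB score among the 282 titles with IMDB ≥ 6.0 and ≥ 100,000 votes. The relationship is weak (R² ≈ 0.14), so vote count explains only a small share of the variation in scores, and the correlation alone does not show that higher engagement causes higher ratings.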

- Among the 282 highly-rated titles (IMDB ≥ 6.0 with ≥ 100,000 votes), only 36 met at least one of the premium criteria (an exceptional score or very high engagement), making them standout recommendations.

### My Personalized Watchlist:

The 36 titles displayed in the table above represent the **best of Netflix** based on community ratings and engagement. These are the titles I have decided to prioritize for watching, as they combine strong ratings with proven audience interest. Every title has an IMDB score of at least 6 and at least 100,000 votes, and additionally stands out through either an exceptional score (> 8.5) or very high engagement (> 700,000 votes), making them reliable choices for an excellent viewing experience.

This data-driven approach ensures that my Netflix viewing time is spent on content that is both critically appreciated and widely enjoyed by the community.