In [2]:
# import packages
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
from scipy import stats

In [3]:
# read and check data
data = pd.read_csv('movieReplicationSet.csv')
print(data.head())

   The Life of David Gale (2003)  Wing Commander (1999)  \
0                            NaN                    NaN   
1                            NaN                    NaN   
2                            NaN                    NaN   
3                            NaN                    NaN   
4                            NaN                    NaN   

   Django Unchained (2012)  Alien (1979)  \
0                      4.0           NaN   
1                      1.5           NaN   
2                      NaN           NaN   
3                      2.0           NaN   
4                      3.5           NaN   

   Indiana Jones and the Last Crusade (1989)  Snatch (2000)  \
0                                        3.0            NaN   
1                                        NaN            NaN   
2                                        NaN            NaN   
3                                        3.0            NaN   
4                                        0.5            NaN   

 

In [6]:
# only get the movie ratings
movie_ratings = data.iloc[:, :400]

#### Q1

Are movies that are more popular (operationalized as having more ratings) rated higher than movies that are less popular? [Hint: You can do a median-split of popularity to determinehigh vs. low popularitymovies]

In [7]:

movie_stats = pd.DataFrame({
        'n_ratings': movie_ratings.notna().sum(),  # Number of ratings (popularity)
        'median_rating': movie_ratings.median(),       # Median rating
})

In [9]:
movie_stats.head()

Unnamed: 0,n_ratings,median_rating
The Life of David Gale (2003),76,2.5
Wing Commander (1999),71,2.0
Django Unchained (2012),453,3.5
Alien (1979),289,3.0
Indiana Jones and the Last Crusade (1989),463,3.0


In [11]:
# seperate popular and unpopular movies with median
median_ratings = movie_stats['n_ratings'].median()
movie_stats['is_popular'] = movie_stats['n_ratings'] > median_ratings
high_popularity = movie_stats[movie_stats['is_popular']]['median_rating']
low_popularity = movie_stats[~movie_stats['is_popular']]['median_rating']    
    

In [13]:
statistic, p_value = stats.mannwhitneyu(
    high_popularity.dropna(),
    low_popularity.dropna(),
    alternative='greater'  # Using one-sided test
)
print(p_value)

9.929258851707232e-35


#### Q2

Are movies that are newer rated differently than movies that are older? [Hint: Do a median split of year of release to contrast movies in terms of whether they are old or new]

In [15]:
q2_data = pd.DataFrame({
        'year': movie_ratings.columns.str.extract(r'(\d{4})')[0].astype(int),  # Extract year from column names and flatten
        'median_rating': movie_ratings.median().values,       # Median rating
})

In [16]:
q2_data.head()

Unnamed: 0,year,median_rating
0,2003,2.5
1,1999,2.0
2,2012,3.5
3,1979,3.0
4,1989,3.0


In [17]:
# sepearte old and new movies with median
median_year = q2_data['year'].median()
q2_data['is_old'] = q2_data['year'] > median_year
old = q2_data[q2_data['is_old']]['median_rating']
not_old = q2_data[~q2_data['is_old']]['median_rating']    

In [18]:
statistic, p_value = stats.mannwhitneyu(
    old.dropna(),
    not_old.dropna(),
    alternative='two-sided'  # Using two-sided test unless there's a specific directional hypothesis
)
print(p_value)

0.19865156776112602


#### Q3

Is enjoyment of ‘Shrek(2001)’ gendered, i.e. do male and female viewers rate it differently

In [19]:
q3_data = pd.DataFrame({
        'rating': data['Shrek (2001)'],
        'gender': data.iloc[:, 474]  # Gender is in column 475 (0-based indexing)
    })

In [20]:
# get female and male ratings
Shrek_female = q3_data[q3_data['gender'] == 1]['rating'].dropna()
Shrek_male = q3_data[q3_data['gender'] == 2]['rating'].dropna()

In [21]:
statistic, p_value = stats.mannwhitneyu(
    Shrek_female.dropna(),
    Shrek_male.dropna(),
    alternative='two-sided'  # Using two-sided test unless there's a specific directional hypothesis
)
print(p_value)

0.050536625925559006


#### Q4

What proportion of movies are rated differently by male and female viewers

In [22]:
q4_p_values = []
for i in range(400):
    i_th_data = pd.DataFrame({
        'rating': data.iloc[:, i],
        'gender': data.iloc[:, 474]
    })
    female = i_th_data[i_th_data['gender'] == 1]['rating'].dropna()
    male = i_th_data[i_th_data['gender'] == 2]['rating'].dropna()
    #other = i_th_data[i_th_data['gender'] == 3]['rating'].dropna()
    q4_statistic, q4_p_value = stats.mannwhitneyu(female, male, alternative='two-sided')
    q4_p_values.append(q4_p_value)

In [23]:
# see how many is significant
sum(np.array(q4_p_values) < 0.005)/400

0.125

#### Q5
Do people who are only children enjoy ‘The Lion King(1994)’ more than people with siblings?

In [24]:
q_5_data = pd.DataFrame({
        'rating': data['The Lion King (1994)'],
        'only_child': data.iloc[:, 475]
    })

In [25]:
# check the ratings of only child and not only child
q_5_data['only_child'].value_counts()

only_child
 0    894
 1    177
-1     26
Name: count, dtype: int64

In [28]:
only_child_rating = q_5_data[q_5_data['only_child'] == 1]['rating'].dropna()
not_only_child_rating = q_5_data[q_5_data['only_child'] == 0]['rating'].dropna()
statistic, p_value = stats.mannwhitneyu(
    only_child_rating.dropna(),
    not_only_child_rating.dropna(),
    alternative='greater'  # Using one-sided test
)
print(p_value)

0.978419092554931


#### Q6
What proportion of movies exhibit an “only child effect”, i.e. are rated different by viewers with siblings vs. those without?

In [30]:
q6_p_values = []
for i in range(400):
    i_th_data = pd.DataFrame({
        'rating': data.iloc[:, i],
        'only_child': data.iloc[:, 475]
    })
    only_child_rating = i_th_data[i_th_data['only_child'] == 1]['rating'].dropna()
    not_only_child_rating = i_th_data[i_th_data['only_child'] == 0]['rating'].dropna()
    q6_statistic, q6_p_value = stats.mannwhitneyu(
    only_child_rating.dropna(),
    not_only_child_rating.dropna(),
    alternative='two-sided'  # Using two-sided test unless there's a specific directional hypothesis
)
    q6_p_values.append(q6_p_value)
# check the portion of significant p values
print(sum(np.array(q6_p_values) < 0.005)/400)

0.0175


#### Q7
Do people who like to watch movies socially enjoy ‘The Wolf of Wall Street (2013)’ more than those who preferto watch them alone?

In [31]:
q7_data = pd.DataFrame({
        'rating': data['The Wolf of Wall Street (2013)'],
        'social': data.iloc[:, 476]
    })

In [32]:
q7_alone_rating = q7_data[q7_data['social'] == 1]['rating'].dropna()
q7_not_alone_rating = q7_data[q7_data['social'] == 0]['rating'].dropna()
statistic, p_value = stats.mannwhitneyu(
    q7_alone_rating.dropna(),
    q7_not_alone_rating.dropna(),
    alternative='greater'  # Using one-sided test
)
print(p_value)

0.05638214666114455


#### Q8
What proportion of movies exhibit such a “social watching” effect

In [34]:
q8_p_values = []
for i in range(400):
    i_th_data = pd.DataFrame({
        'rating': data.iloc[:, i],
        'social': data.iloc[:, 476]
    })
    alone_rating = i_th_data[i_th_data['social'] == 1]['rating'].dropna()
    not_alone_rating = i_th_data[i_th_data['social'] == 0]['rating'].dropna()
    q8_statistic, q8_p_value = stats.mannwhitneyu(
    alone_rating.dropna(),
    not_alone_rating.dropna(),
    alternative='two-sided'  # Using two-sided test unless there's a specific directional hypothesis
)
    q8_p_values.append(q8_p_value)
# check the portion of significant p values
print(sum(np.array(q8_p_values) < 0.005)/400)

0.025


#### Q9
Is the ratings distribution of ‘Home Alone (1990)’ different than that of ‘Finding Nemo (2003)

In [35]:
home_alone_rating = data['Home Alone (1990)']
finding_nemo_rating = data['Finding Nemo (2003)']

In [36]:
# do a ks test for difference between two distribtuions in a non-parametric way
ks_statistic, ks_p_value = stats.ks_2samp(home_alone_rating.dropna(), finding_nemo_rating.dropna())
ks_p_value

6.379397182836346e-10

#### Q10
There are ratings on moviesfrom several franchises ([‘Star Wars’, ‘Harry Potter’, ‘The Matrix’, ‘Indiana Jones’, ‘Jurassic Park’, ‘Pirates of the Caribbean’, ‘Toy Story’, ‘Batman’]) inthis dataset. How many of these are of inconsistent quality, as experienced by viewers? [Hint: You can use the keywords in quotation marks featured in this question to identify the movies that are part of each franchise

In [41]:
# List of franchises
franchises = ['Star Wars', 'Harry Potter', 'The Matrix', 'Indiana Jones', 
              'Jurassic Park', 'Pirates of the Caribbean', 'Toy Story', 'Batman']

# Create a dictionary to store variance for each franchise
franchise_films = {}

# Loop through each franchise, calculate variance of ratings
for franchise in franchises:
    # Filter columns containing the franchise name
    franchise_movies = data.filter(like=franchise)
    franchise_films[franchise] = franchise_movies

In [43]:
for franchise in franchises:
    franchise_movies = data.filter(like=franchise)
    
    # Convert columns to list of arrays for Kruskal-Wallis test
    movie_ratings = [franchise_movies[col].dropna() for col in franchise_movies.columns]
    kw_statistic, p_value = stats.kruskal(*movie_ratings)
    print(f'{franchise}: {p_value}')

Star Wars: 8.01647736660335e-48
Harry Potter: 0.34331950837289205
The Matrix: 3.1236517880781424e-11
Indiana Jones: 6.27277563979608e-10
Jurassic Park: 7.636930084362221e-11
Pirates of the Caribbean: 3.2901287079094474e-05
Toy Story: 5.065805156537524e-06
Batman: 4.2252969509030006e-42


#### Extra

Do people who enjoy driving fast (>=4) tend to rate The Wolf of Wall Street (2013) differently?

In [44]:
enjoy_driving_fast = data[data['I enjoy driving fast'] >= 4]
do_not_enjoy_driving_fast = data[data['I enjoy driving fast'] < 4]

fast_rating = enjoy_driving_fast['The Wolf of Wall Street (2013)'].dropna()
not_fast_rating = do_not_enjoy_driving_fast['The Wolf of Wall Street (2013)'].dropna()

statistic, p_value = stats.mannwhitneyu(
    fast_rating,
    not_fast_rating,
    alternative='two-sided'  # Using two-sided test unless there's a specific directional hypothesis
)

print(p_value)

6.715186980213809e-05
