# Import libraries

In [403]:
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import pandas as pd

In [404]:
# Load the replication data set from disk.
# skipinitialspace=True makes the parser skip spaces that follow a delimiter.
df = pd.read_csv('movieReplicationSet.csv', skipinitialspace=True)

# Column labels of the loaded table
columns = df.columns

# Helper functions

In [405]:
def getMovieDataDf(df, numMovies=400):
    """Return the leading columns of `df`, selected by position.

    Parameters
    ----------
    df : pandas.DataFrame
        Full data set; columns are taken from the left.
    numMovies : int, default 400
        Number of leading columns to keep. The default reproduces the
        previously hard-coded value.

    Returns
    -------
    pandas.DataFrame
        All rows of `df`, restricted to its first `numMovies` columns.

    Raises
    ------
    IndexError
        If `df` has fewer than `numMovies` columns.
    """
    # An explicit list of positions (rather than a slice) keeps the
    # out-of-bounds IndexError instead of silently returning fewer columns.
    return df.iloc[:, list(range(numMovies))]

def getMovieDataByTitle(df, title):
    """Select `title` from `df` with plain bracket indexing and return the result."""
    selected = df[title]
    return selected

def rowWiseElim2(df1, df2):
    """Row-wise elimination across two pandas objects.

    `df1` and `df2` are placed side by side (outer join on the index) and
    every row that contains a missing value is dropped.

    Returns
    -------
    pandas.DataFrame
        The combined table with only complete rows; the columns coming from
        `df1` precede those coming from `df2`.
    """
    sideBySide = pd.concat([df1, df2], axis=1, join='outer')
    completeRows = sideBySide.dropna()
    return completeRows

In [406]:
def medianSplitGroups(sr, df):
    """Median-split the columns of `df` according to the values in `sr`.

    Parameters
    ----------
    sr : pandas.Series
        One value per column of interest; the index of `sr` holds the column
        labels of `df`.
    df : pandas.DataFrame
        Table whose columns are split into two groups.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        First the columns whose value in `sr` is strictly below the median
        of `sr`, then all remaining columns (values equal to the median and
        values that do not compare as smaller, e.g. NaN, land here).

    Side effects
    ------------
    Prints the median that was used for the split.
    """
    median = sr.median()
    print(f"median value: {median}")

    # Compare by position through a boolean mask. The previous `sr[i]` lookup
    # treated `i` as a label on integer-indexed Series and depended on a
    # deprecated positional fallback for every other index type.
    belowMedian = (sr < median).to_numpy()
    group1 = sr.index[belowMedian]
    group2 = sr.index[~belowMedian]

    group1Df = df.loc[:, group1]
    group2Df = df.loc[:, group2]

    return (group1Df, group2Df)

In [407]:
# Shared state reused by several of the tests below
movieDf = getMovieDataDf(df)
totalMovies = movieDf.shape[1]  # number of movie columns

# Labels of the non-movie columns that the questions split on
genderCol = "Gender identity (1 = female; 2 = male; 3 = self-described)"
onlyChildCol = "Are you an only child? (1: Yes; 0: No; -1: Did not respond)"
socialCol = "Movies are best enjoyed alone (1: Yes; 0: No; -1: Did not respond)"

# Questions

## 1
Are movies that are more popular (operationalized as having more ratings) rated higher than movies that
are less popular? 

In [408]:
# Popularity of a movie = number of non-missing ratings in its column
ratings_count = movieDf.count()

# Median split on that count: below-median movies first, the rest second
lessPopular, morePopular = medianSplitGroups(ratings_count, movieDf)

median value: 197.5


In [409]:
# Approach 1: represent every user by their median rating within each group.
# Row-wise elimination keeps only users that have a median in both groups.
popRes = rowWiseElim2(
    lessPopular.median(axis=1),
    morePopular.median(axis=1),
)
lessPopCleaned, morePopCleaned = popRes.iloc[:, 0], popRes.iloc[:, 1]

# One-sided test: are the more popular movies rated higher?
stats.mannwhitneyu(morePopCleaned, lessPopCleaned, alternative='greater')

MannwhitneyuResult(statistic=644831.5, pvalue=1.7871975218238738e-19)

However, Approach 1 compares per-user medians, which are not exactly ratings that are "representative" of each group of movies.

In [410]:
# Approach 2: represent every movie by its median rating, then run the
# one-sided test on the two sets of per-movie medians.
stats.mannwhitneyu(
    morePopular.median(),
    lessPopular.median(),
    alternative='greater',
)

MannwhitneyuResult(statistic=33427.5, pvalue=9.929258851707232e-35)

Conclusion: reject null hypothesis 

## 2
Are movies that are newer rated differently than movies that are older? [Hint: Do a median split of year of
release to contrast movies in terms of whether they are old or new]

In [411]:
# Map every movie title to its year of release, parsed from the title text
movieTitles = movieDf.columns
years = {}

# Titles without a parsable year are reported and left out
for title in movieTitles:
    try:
        # The year is read from between the first "(" and the first ")"
        year = int(title[title.find("(") + 1:title.find(")")])
    except ValueError:
        # Only a failed integer conversion means "no year"; any other error
        # is unexpected and is allowed to surface instead of being swallowed.
        print(f"title \'{title}\' does not have year of release")
    else:
        years[title] = year

# Construct a pd.Series (index: title, value: year) out of the dictionary
yearsRelease = pd.Series(years)

title 'Rambo: First Blood Part II' does not have year of release


In [412]:
# Median split on release year: below-median years form the "old" group,
# every other movie goes to the "new" group.
oldMovies, newMovies = medianSplitGroups(yearsRelease, movieDf)

# Report the size of each group
print(f"There are {oldMovies.shape[1]} movies in the older group")
print(f"There are {newMovies.shape[1]} movies in the newer group")

median value: 1999.0
There are 196 movies in the older group
There are 203 movies in the newer group


In [413]:
# Represent every movie by its median rating and compare the two groups.
# "Rated differently" is non-directional, so the two-sided alternative is
# requested explicitly rather than relying on the SciPy default, which is
# not the same in every SciPy version.
stats.mannwhitneyu(oldMovies.median(), newMovies.median(), alternative='two-sided')

MannwhitneyuResult(statistic=18075.0, pvalue=0.0962215340487528)

Conclusion: fail to reject the null hypothesis --> older and newer movies are not rated differently

## 3
Is enjoyment of ‘Shrek (2001)’ gendered, i.e. do male and female viewers rate it differently?

In [414]:
# Keep only the Shrek ratings and the gender answers
shrekTitleCol = "Shrek (2001)"
shrekAndGender = df.loc[:, [shrekTitleCol, genderCol]]

# Ratings of each gender group, selected in a single .loc call (no chained
# indexing) with missing ratings removed element-wise
female = shrekAndGender.loc[shrekAndGender[genderCol] == 1.0, shrekTitleCol].dropna()  # female
male = shrekAndGender.loc[shrekAndGender[genderCol] == 2.0, shrekTitleCol].dropna()  # male

# "Rate it differently" is non-directional: request the two-sided test
# explicitly instead of depending on the SciPy default.
stats.mannwhitneyu(female, male, alternative='two-sided')

MannwhitneyuResult(statistic=96830.5, pvalue=0.050536625925559006)

Conclusion: fail to reject null --> not gendered

## 4
What proportion of movies are rated differently by male and female viewers?

In [415]:
def getMovieForGender(movieAndGender, titleCol, genderInd):
    """Return the non-missing `titleCol` ratings of one gender group.

    Rows are kept when the module-level `genderCol` column of
    `movieAndGender` equals `genderInd` (1.0 = female, 2.0 = male).
    """
    inGroup = movieAndGender[genderCol] == genderInd
    groupRatings = movieAndGender.loc[inGroup, titleCol]
    return groupRatings.dropna()

In [416]:
def isGendered(movieAndGender, titleCol, alpha=0.005):
    """Test whether female and male viewers rate `titleCol` differently.

    Parameters
    ----------
    movieAndGender : pandas.DataFrame
        Table passed on to `getMovieForGender`, which extracts the ratings
        of each gender group from it.
    titleCol : str
        Column label of the movie to test.
    alpha : float, default 0.005
        Significance level; the default is the threshold that used to be
        hard-coded here.

    Returns
    -------
    tuple
        (`p < alpha`, `p`) where `p` is the p-value of a two-sided
        Mann-Whitney U test between the female and the male ratings.
    """
    female = getMovieForGender(movieAndGender, titleCol, 1.0)  # female
    male = getMovieForGender(movieAndGender, titleCol, 2.0)  # male

    # The alternative is spelled out so the result does not depend on the
    # SciPy default; the U statistic itself is not needed.
    _, p = stats.mannwhitneyu(female, male, alternative='two-sided')
    return (p < alpha, p)

In [417]:
# Run the gender test on every movie, list the significant ones and
# keep a running count of them
isGenderedCount = 0
for title in movieTitles:
    movieAndGender = df.loc[:, [title, genderCol]]
    isGendered_ = isGendered(movieAndGender, title)
    if not isGendered_[0]:
        continue
    isGenderedCount += 1
    print(f"{title}, p = {isGendered_[1]}")

# Share of all movies that came out as gendered
genderedProportion = isGenderedCount / totalMovies

print()
print(f"num of gendered movies: {isGenderedCount}")
print(f"proportion of gendered movies: {genderedProportion}")

Django Unchained (2012), p = 0.00015542426769271815
Alien (1979), p = 6.542170586924406e-05
Fargo (1996), p = 0.003309364169873757
Star Wars: Episode IV - A New Hope (1977), p = 0.0016660989681143768
Indiana Jones and the Raiders of the Lost Ark (1981), p = 0.0013879291097348139
The Lost World: Jurassic Park (1997), p = 0.004065865964755651
13 Going on 30 (2004), p = 6.916495846569324e-06
Inglorious Bastards (2009), p = 0.0016227471622817877
Clueless (1995), p = 0.0012542980225539132
The Exorcist (1973), p = 0.000554106476103341
Pirates of the Caribbean: Dead Man's Chest (2006), p = 0.0018422351196893682
Funny Girl (1968), p = 0.0026228924286384424
The Thing (1982), p = 0.0004649694790599184
Elf (2003), p = 0.0036194778869859507
Andaz Apna Apna (1994), p = 0.0012642992058234505
The Proposal (2009), p = 1.60221441833309e-07
Girl Interrupted (1999), p = 0.001648810216538892
Divine Secrets of the Ya-Ya Sisterhood (2002), p = 0.0005835228101848203
Ghostbusters (2016), p = 2.364461526475323

## 5
Do people who are only children enjoy ‘The Lion King (1994)’ more than people with siblings?

In [418]:
# Restrict the data to the Lion King ratings and the only-child answers
lionKingCol = "The Lion King (1994)"
lkAndChild = df.loc[:, [lionKingCol, onlyChildCol]]

# Ratings per group (1.0 = only child, 0.0 = has siblings), with missing
# ratings removed element-wise
onlyChild = lkAndChild.loc[lkAndChild[onlyChildCol] == 1.0, lionKingCol].dropna()
notOnlyChild = lkAndChild.loc[lkAndChild[onlyChildCol] == 0.0, lionKingCol].dropna()  # has siblings

# One-sided test: do only children rate the movie higher?
stats.mannwhitneyu(onlyChild, notOnlyChild, alternative="greater")

MannwhitneyuResult(statistic=52929.0, pvalue=0.978419092554931)

Conclusion: fail to reject null --> not greater

## 6
What proportion of movies exhibit an “only child effect”, i.e. are rated different by viewers with siblings
vs. those without?

In [419]:
def getMovieForGroup(movieDf, titleCol, groupCol, groupInd):
    """Return the non-missing `titleCol` values of one group.

    Parameters
    ----------
    movieDf : pandas.DataFrame
        Table holding both `titleCol` and `groupCol`.
    titleCol : label
        Column whose values are returned.
    groupCol : label
        Column the rows are split on.
    groupInd : scalar
        Rows are kept when `groupCol` equals this value.
    """
    inGroup = movieDf[groupCol] == groupInd
    groupRatings = movieDf.loc[inGroup, titleCol]
    return groupRatings.dropna()

In [420]:
def isDifferent(movieDf, titleCol, groupCol, groupVal1, groupVal2, alpha=0.005):
    """Test whether two groups of viewers rate `titleCol` differently.

    Parameters
    ----------
    movieDf : pandas.DataFrame
        Table holding the `titleCol` ratings and the `groupCol` answers.
    titleCol : str
        Column label of the movie to test.
    groupCol : str
        Column label the viewers are grouped by.
    groupVal1, groupVal2 : scalar
        Values of `groupCol` that identify the two groups to compare.
    alpha : float, default 0.005
        Significance level; the default is the threshold that used to be
        hard-coded here.

    Returns
    -------
    tuple
        (`p < alpha`, `p`) where `p` is the p-value of a two-sided
        Mann-Whitney U test between the two groups' ratings.
    """
    group1 = getMovieForGroup(movieDf, titleCol, groupCol, groupVal1)
    group2 = getMovieForGroup(movieDf, titleCol, groupCol, groupVal2)

    # The alternative is spelled out so the result does not depend on the
    # SciPy default; the U statistic itself is not needed.
    _, p = stats.mannwhitneyu(group1, group2, alternative='two-sided')
    return (p < alpha, p)

In [421]:
# Run the only-child test (1.0 = only child vs 0.0 = has siblings) on every
# movie, list the significant ones and keep a running count of them
isDifferentForOnlyChildCount = 0
for title in movieTitles:
    movieAndOnlyChild = df.loc[:, [title, onlyChildCol]]
    isDifferentOnlyChild_ = isDifferent(movieAndOnlyChild, title, onlyChildCol, 1.0, 0.0)
    if not isDifferentOnlyChild_[0]:
        continue
    isDifferentForOnlyChildCount += 1
    print(f"{title}, p = {isDifferentOnlyChild_[1]}")

# Share of all movies that show the effect
onlyChildProportion = isDifferentForOnlyChildCount / totalMovies

print()
print(f"num of only child effect movies: {isDifferentForOnlyChildCount}")
print(f"proportion of only child effect movies: {onlyChildProportion}")

Billy Madison (1995), p = 0.0005383987317092497
The Blue Lagoon (1980), p = 0.0021346953003846527
Happy Gilmore (1996), p = 0.0010748032713519967
American Pie (1999), p = 0.00416798443480679
Star Wars: Episode VI - The Return of the Jedi (1983), p = 0.0033458769641976993
FeardotCom (2002), p = 0.004603506277083965
Captain America: Civil War (2016), p = 0.0037359441415383094

num of only child effect movies: 7
proportion of only child effect movies: 0.0175


## 7
Do people who like to watch movies socially enjoy ‘The Wolf of Wall Street (2013)’ more than those who
prefer to watch them alone?

In [422]:
# Restrict the data to the Wolf of Wall Street ratings and the
# "best enjoyed alone" answers
wolfCol = "The Wolf of Wall Street (2013)"
wolfAndSocial = df.loc[:, [wolfCol, socialCol]]

# 0.0 = does not prefer watching alone (social), 1.0 = prefers watching alone;
# missing ratings are removed element-wise
social = wolfAndSocial.loc[wolfAndSocial[socialCol] == 0.0, wolfCol].dropna()
alone = wolfAndSocial.loc[wolfAndSocial[socialCol] == 1.0, wolfCol].dropna()

# One-sided test: do social viewers rate the movie higher?
stats.mannwhitneyu(social, alone, alternative="greater")

MannwhitneyuResult(statistic=49303.5, pvalue=0.9436657996253056)

Conclusion: fail to reject null

## 8
What proportion of movies exhibit such a “social watching” effect?

In [423]:
# Reuse the helper functions defined for the only-child question:
# run the social-watching test (0.0 = social vs 1.0 = alone) on every movie,
# list the significant ones and count them
isDifferentForSocial = 0
for title in movieTitles:
    movieAndSocial = df.loc[:, [title, socialCol]]
    isDifferentSocial_ = isDifferent(movieAndSocial, title, socialCol, 0.0, 1.0)
    if isDifferentSocial_[0]:
        isDifferentForSocial += 1
        print(f"{title}, p = {isDifferentSocial_[1]}")

# Share of all movies that show the effect
socialProportion = isDifferentForSocial / float(totalMovies)

# The summary labels previously said "only child effect" (copied from the
# earlier question); they now name the effect that is actually counted here.
print()
print(f"num of social watching effect movies: {isDifferentForSocial}")
print(f"proportion of social watching effect movies: {socialProportion}")

The Silence of the Lambs (1991), p = 0.0005543299564219372
North (1994), p = 0.0046956853561537105
Inglorious Bastards (2009), p = 0.004998203056515189
Shrek 2 (2004), p = 0.00027961675102454795
The Avengers (2012), p = 0.001997878186385244
Spider-Man (2002), p = 0.002359069763461437
Donnie Darko (2001), p = 0.0001984126007752669
Apocalypse Now (1979), p = 0.0037307199889004597
The Transporter (2002), p = 0.004666380458592908
Captain America: Civil War (2016), p = 0.0009502320574457831

num of only child effect movies: 10
proportion of only child effect movies: 0.025


## 9
Is the ratings distribution of ‘Home Alone (1990)’ different than that of ‘Finding Nemo (2003)’?

In [424]:
# Ratings of the two movies, each with its own missing values removed
# (element-wise elimination)
homeAlone = df['Home Alone (1990)'].dropna()
nemo = df['Finding Nemo (2003)'].dropna()

# Kolmogorov-Smirnov test between the two rating samples
stats.kstest(homeAlone, nemo)

KstestResult(statistic=0.15269080020897632, pvalue=6.379381467525036e-10)

Conclusion: reject the null --> is different

## 10
There are ratings on movies from several franchises ([‘Star Wars’, ‘Harry Potter’, ‘The Matrix’, ‘Indiana
Jones’, ‘Jurassic Park’, ‘Pirates of the Caribbean’, ‘Toy Story’, ‘Batman’]) in this dataset. How many of these
are of inconsistent quality, as experienced by viewers? [Hint: You can use the keywords in quotation marks
featured in this question to identify the movies that are part of each franchise]

In [425]:
# Keywords that identify the movies of each franchise
franchises = [
    "Star Wars",
    "Harry Potter",
    "The Matrix",
    "Indiana Jones",
    "Jurassic Park",
    "Pirates of the Caribbean",
    "Toy Story",
    "Batman",
]

countFranchise = 0
for franchise in franchises:
    # Columns whose label contains the keyword; row-wise elimination keeps
    # only the rows that have a value in every one of those columns
    franchiseMovies = df.loc[:, df.columns.str.contains(franchise)].dropna()

    # One rating sample per movie, in column order, as input for the test
    movieArr = [franchiseMovies[movieTitle] for movieTitle in franchiseMovies.columns]

    # Kruskal-Wallis test across all movies of the franchise
    k, p = stats.kruskal(*movieArr)
    if p < 0.005:
        countFranchise += 1
        print(f"{franchise}, p = {p}")

print(f"\nThere are {countFranchise} franchises of inconsistent quality")

Star Wars, p = 6.940162236984522e-40
The Matrix, p = 1.7537323830838066e-09
Indiana Jones, p = 1.020118354785894e-11
Jurassic Park, p = 1.8492328391686058e-11
Toy Story, p = 7.902234665149812e-06
Batman, p = 4.1380499020034183e-19

There are 6 franchises of inconsistent quality
