In [48]:
import requests
from lxml.html import parse, fromstring
import pandas as pd
import regex as re
from tqdm import tqdm
from pprint import pprint
from collections import Counter

from twitter_keys import tmdb_key
from params import annotated_submissions_file, submissions_tmdb_file

In [2]:
tmdb_file = "./data/tmdb.json"
tmdb_detailed_file = "./data/tmdb_detailed.json"

racism_keywords_file = "data/racism_keywords.txt"
sexism_keywords_file = "data/sexism_keywords.txt"

In [352]:
pd.set_option('display.max_colwidth', 50)

## Match my films with tmdb entries

In [295]:
submissions = pd.read_csv(annotated_submissions_file, sep=";")
submissions = submissions.loc[(submissions['num_comments'] >= 25) & (pd.isna(submissions['discussion_thread']))]

titles = submissions['film_title'].tolist()

In [296]:
submissions.loc[submissions.title.str.contains("Tower")]

Unnamed: 0,submission_id,title,score,num_comments,url,created,discussion_thread,date,film_title
202,6rhe2a,Official Discussion: The Dark Tower [SPOILERS],880,2460,https://www.reddit.com/r/movies/comments/6rhe2...,1501812002,,2017-08-04 02:00:02,The Dark Tower


### Find potential matches

In [309]:
query_results = dict()

for i in tqdm(titles):
    j = re.search("[\(|\[](\d{4})[\)|\]]", i)
    k = re.sub("[\(|\[]\d{4}[\)|\]]", "", i)
    
    if j == None:
        r = requests.get(f"https://api.themoviedb.org/3/search/movie?api_key={tmdb_key}&query={k}").json()
    else:
        r = requests.get(f"https://api.themoviedb.org/3/search/movie?api_key={tmdb_key}&query={k}&year={j}").json()
    
    query_results[i] = r

100%|██████████████████████████████████████████████████████████████████████████████| 1149/1149 [03:32<00:00,  5.41it/s]


In [310]:
with open(tmdb_file, 'w') as outfile:
    json.dump(query_results, outfile)

In [3]:
with open(tmdb_file, 'r') as infile:
    query_results = json.load(infile)

### Find single best match

In [4]:
good_results = dict()
for i, j in query_results.items():
    # Print if there are no potential matches
    if j['total_results'] == 0:
        print(f"No possible results for {i}")
        
    # If the query only found a single result, I assume it's correct (manual inspection of titles found that it was)
    elif j['total_results'] == 1:
        good_results[i] = j['results'][0]
        
    # If the query found multiple results...
    elif j['total_results'] > 1:
        l = []
        stripped_name = re.sub("[\(|\[]\d{4}[\)|\]]", "", i).replace("&amp;", "&").lower().strip()
        
        # See if there are results in the correct date range with at least a few votes
        for k in j['results']:
            try:
                if (pd.to_datetime(k['release_date']).year > 2008) & (k['vote_count'] > 10):
                    l.append(k)
            except:
                pass
            
        # if not, see if there are results which an exact matching name
        if len(l) == 0:
            for k in j['results']:
                if k['title'].lower().replace("'", "").rstrip(".") == stripped_name:
                    l.append(k)
                    
                # Manual exception for LOTR since it was outside of the date range
                elif (i == "Lord of the Rings: The Fellowship of the Ring") & (k['vote_count'] > 10):
                    l.append(k)
                    
        # if there are multiple results in the date range, see what the more popular films are
        # Note I'm using 'if' over 'elif' because the previous step could find multiple matches
        if len(l) > 1:
            m = []
            for k in l:
                if (k['vote_count'] >= 70):
                    m.append(k)
            
            # If there is only 1 more popular film, that's the one I'm assuming is correct
            if len(m) == 1:
                good_results[i] = m[0]
                
            # If there are multiple see if there's one with an exact name match
            elif len(m) > 1:
                o = []
                for n in m:
                    if (n['title'].lower().replace("'", "").rstrip(".") == stripped_name):
                        o.append(n)
                        
                if len(o) == 1:
                    good_results[i] = o[0]
                    
                # If there is more than 1, select the most popular film out of those
                elif len(o) > 1:
                    q = 0
                    most_likely = dict()
                    for p in o:
                        if p['vote_count'] > q:
                            most_likely = p
                            q = p['vote_count']
                    good_results[i] = most_likely
                    
                # Print if there are no potential matches
                else:
                    print(i)
                    for n in m:
                        print(n['original_title']+" "+str(n['vote_count']))
                    print()
                    
            # Print if there are no potential matches
            else:
                print(f"Found 0 matches for {i}")
                    
        elif len(l) == 1:
            good_results[i] = l[0]
        else:                           
            print(f"Found 0 matches for {i} within the date range")

In [8]:
x=0
for i, j in good_results.items():
    print(i)
    pprint(j)
    print()
    x+=1
    if x ==1:
        break

Star Wars: Episode VIII – The Last Jedi
{'adult': False,
 'backdrop_path': '/5Iw7zQTHVRBOYpA0V6z0yypOPZh.jpg',
 'genre_ids': [878, 28, 12],
 'id': 181808,
 'original_language': 'en',
 'original_title': 'Star Wars: The Last Jedi',
 'overview': 'Rey develops her newly discovered abilities with the guidance of '
             'Luke Skywalker, who is unsettled by the strength of her powers. '
             'Meanwhile, the Resistance prepares to do battle with the First '
             'Order.',
 'popularity': 110.128,
 'poster_path': '/kOVEVeg59E0wsnXmF9nrh6OmWII.jpg',
 'release_date': '2017-12-13',
 'title': 'Star Wars: The Last Jedi',
 'video': False,
 'vote_average': 6.9,
 'vote_count': 12822}



## Find detailed movie information

In [10]:
detailed_results = dict()
for i, j in tqdm(good_results.items()):
    r = requests.get(f"https://api.themoviedb.org/3/movie/{j['id']}?api_key={tmdb_key}&append_to_response=keywords,credits")
    detailed_results[i] = r.json()

100%|██████████████████████████████████████████████████████████████████████████████| 1109/1109 [03:25<00:00,  5.41it/s]


In [297]:
# First 3 should be equally long
print(len(good_results))
print(len(detailed_results))
print(len(set(titles)))
print(len(titles)) # longer, because some films have multiple threads

1109
1109
1109
1149


In [197]:
x=0
genres = Counter()
keywords = Counter()
for i, j in detailed_results.items():
    print(i)
    print("-"*21+"Genres"+"-"*21)
    for l in range(0, len(j['genres'])):
        print(j['genres'][l]['name'])
    print("-"*20+"Keywords"+"-"*20)
    for l in j['keywords']['keywords']:
        print(l['name'])
    print()
    x+=1
    if x ==3:
        break

Star Wars: Episode VIII – The Last Jedi
---------------------Genres---------------------
Science Fiction
Action
Adventure
--------------------Keywords--------------------
bunker
space battle
failure
defeat
sequel
space opera
stormtrooper
military operation

Avengers: Endgame
---------------------Genres---------------------
Adventure
Science Fiction
Action
--------------------Keywords--------------------
space travel
time travel
time machine
sequel
based on comic
alien invasion
superhero team
iron man
marvel cinematic universe (mcu)
alternate timeline
final battle
father daughter relationship
sister sister relationship

Avengers: Infinity War
---------------------Genres---------------------
Adventure
Action
Science Fiction
--------------------Keywords--------------------
magic
sacrifice
superhero
based on comic
space
battlefield
genocide
magical object
super power
aftercreditsstinger
marvel cinematic universe (mcu)
cosmic



In [169]:
genres = Counter()
keywords = Counter()
for i, j in detailed_results.items():
    for l in range(0, len(j['genres'])):
        genres[j['genres'][l]['name']] += 1
    for l in j['keywords']['keywords']:
        keywords[l['name']] += 1

In [189]:
genres.most_common()

[('Drama', 469),
 ('Comedy', 348),
 ('Action', 346),
 ('Thriller', 308),
 ('Adventure', 260),
 ('Science Fiction', 205),
 ('Horror', 167),
 ('Crime', 164),
 ('Fantasy', 151),
 ('Family', 121),
 ('Mystery', 107),
 ('Animation', 93),
 ('Romance', 93),
 ('History', 81),
 ('Music', 43),
 ('War', 28),
 ('Western', 17),
 ('Documentary', 16),
 ('TV Movie', 6)]

In [171]:
keywords.most_common(20)

[('based on novel or book', 165),
 ('sequel', 127),
 ('based on true story', 96),
 ('murder', 83),
 ('biography', 77),
 ('based on comic', 70),
 ('woman director', 60),
 ('superhero', 58),
 ('duringcreditsstinger', 53),
 ('aftercreditsstinger', 52),
 ('new york city', 50),
 ('dystopia', 49),
 ('revenge', 46),
 ('remake', 37),
 ('family', 36),
 ('1970s', 30),
 ('magic', 28),
 ('1960s', 28),
 ('lgbt', 28),
 ('father daughter relationship', 27)]

### Find racism- and sexism-related keywords

In [172]:
src = "rac[e|i]|discri|black|afr|police|stereo|slav|protest|civil|malcolm|luther"
for i, j in keywords.items():
    if re.search(src, i):
        print(i, j)
        break
        

race_words = [
    'racist','interracial relationship','racism','race-conscious','race relations','mixed race','racial prejudice',
    'racial slur','biracial','racist cop','racial profiling','racial segregation','interracial marriage',
    'interracial friendship','post-racial america','interracial couple',
    'discrimination','africa','african american','african american history','african american comedy',
    'black panther party','black activist',
    'police brutality', 'police harassment',
    'stereotype',
    'slavery','slave','slave labor',
    'protest','protest march',
    'civil rights','civil rights movement',
    'malcolm x','martin luther king'
]

# 'race' is for Furious 7, a car racing film
        
# no typos
for k in race_words:
    assert k in keywords, k

black hole 2
black opal 1
black humor 2
black and white 6
black panther party 3
black magic 1
black soldier 1
black ops 4
black lgbt 2
black activist 2
blackmail 3
black belt 1
blackout 1
blacklist 1


In [173]:
src = "sex|lgbt|gay|lesbian|gender|fem|wom|miso|masc|coming|closet"    
for i, j in keywords.items():
    if re.search(src, i):
        print(i, j)
        break
        
    
sexism_words = [
    'male homosexuality','sexuality','repressed sexuality','repressed homosexuality','homosexual subtext','homosexuality',
    'sexual abuse','sexual identity','teenage sexuality','sexual harassment','sexism','battle of the sexes','transsexual',
    'childhood sexual abuse','bisexuality',
    'lgbt','black lgbt','lgbt teen','lgbt interest','jewish lgbt',
    'gay','gay theme','gay slur','gay interest','gay friends',
    'lesbian','lesbian relationship',
    'gender roles',
    'feminism','female empowerment','femininity','femme fatale',
    "women's rights",
    "misogynist",'misogyny',
    'masculinity','toxic masculinity',
    'coming out','in the closet'
]

# no typos
for k in sexism_words:
    assert k in keywords, k

female protagonist 19


### List how many racism and sexism keywords there are for each film

In [184]:
race_related_keywords = Counter()
sexism_related_keywords = Counter()
for i, j in detailed_results.items():
    x=0
    y=0
    for l in j['keywords']['keywords']:
        if l['name'] in race_words:
            x+=1
        if l['name'] in sexism_words:
            y+=1
    race_related_keywords[i] += x
    sexism_related_keywords[i] += y

In [185]:
race_related_keywords.most_common(10)

[('Get Out', 5),
 ('Straight Outta Compton', 5),
 ('Selma', 5),
 ('Belle', 4),
 ('Hidden Figures', 4),
 ('Loving', 4),
 ('Zootopia', 3),
 ('The Hate U Give', 3),
 ('One Night in Miami', 3),
 ('BlacKkKlansman', 2)]

In [186]:
for i, j in detailed_results.items():
    for k, l in race_related_keywords.items():
        if i == k:
            j['racism_keywords'] = l
    for k, l in sexism_related_keywords.items():
        if i == k:
            j['sexism_keywords'] = l

In [190]:
detailed_results['Moonlight']['sexism_keywords']

5

In [176]:
more_race = Counter()
for i, j in detailed_results.items():
    for l in j['keywords']['keywords']:
        if l['name'] in sexism_words:
            for m in j['keywords']['keywords']:
                if m['name'] not in sexism_words:
                    more_race[m['name']] += 1
            break

In [177]:
# Print cooccuring keywords:

more_race = Counter()
for i, j in detailed_results.items():
    for l in j['keywords']['keywords']:
        if l['name'] in sexism_words:
            for m in j['keywords']['keywords']:
                if m['name'] not in sexism_words:
                    more_race[m['name']] += 1
            break

more_race.most_common(10)

[('woman director', 12),
 ('based on true story', 9),
 ('based on novel or book', 9),
 ('coming of age', 9),
 ('biography', 7),
 ('parent child relationship', 6),
 ('period drama', 4),
 ('sequel', 4),
 ('1970s', 4),
 ('singer', 4)]

In [178]:
# Print for which films keywords appear

for i, j in detailed_results.items():
    for l in j['keywords']['keywords']:
        if l['name'] in ['ghetto']:
            print(i)
            break

Moonlight
Straight Outta Compton
Triple 9


In [196]:
# save the detailed file
with open(tmdb_detailed_file, 'w') as outfile:
    json.dump(detailed_results, outfile)

In [226]:
print(detailed_results['Moonlight'].keys())
print()

selected_keys = "release_date, vote_average, vote_count, sexism_keywords, racism_keywords".split(", ")

print(selected_keys)

dict_keys(['adult', 'backdrop_path', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id', 'imdb_id', 'original_language', 'original_title', 'overview', 'popularity', 'poster_path', 'production_companies', 'production_countries', 'release_date', 'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title', 'video', 'vote_average', 'vote_count', 'keywords', 'credits', 'sexism_keywords', 'racism_keywords'])

['release_date', 'vote_average', 'vote_count', 'sexism_keywords', 'racism_keywords']


In [230]:
select_results = dict()

for i, j in detailed_results.items():
    x = {key: j[key] for key in selected_keys}
    
    for l in range(0, len(j['genres'])):
        x[j['genres'][l]['name']] = 1
    
    
    select_results[i] = x

In [246]:
results_df = pd.DataFrame.from_dict(select_results, orient="index")

results_df.fillna(0, inplace=True)

results_df = results_df.convert_dtypes()

results_df.reset_index(inplace=True)

results_df.rename(columns={"index":"film_title"}, inplace=True)

In [247]:
results_df.head()

Unnamed: 0,film_title,release_date,vote_average,vote_count,sexism_keywords,racism_keywords,Science Fiction,Action,Adventure,Fantasy,...,Mystery,War,Animation,Family,History,Western,Romance,Music,TV Movie,Documentary
0,Star Wars: Episode VIII – The Last Jedi,2017-12-13,6.9,12826,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,Avengers: Endgame,2019-04-24,8.3,20566,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,Avengers: Infinity War,2018-04-25,8.3,24120,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
3,Star Wars: Episode IX - The Rise of Skywalker,2019-12-18,6.4,7605,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,Star Wars: Episode VII - The Force Awakens,2015-12-15,7.3,16789,0,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [201]:
submissions.head()

Unnamed: 0,submission_id,title,score,num_comments,url,created,discussion_thread,date,film_title
0,7jwxnd,Official Discussion - Star Wars: Episode VIII ...,15908,100558,https://www.reddit.com/r/movies/comments/7jwxn...,1513306809,,2017-12-15 03:00:09,Star Wars: Episode VIII – The Last Jedi
1,bh8iei,Official Discussion - Avengers: Endgame [SPOIL...,20046,89336,https://www.reddit.com/r/movies/comments/bh8ie...,1556247619,,2019-04-26 03:00:19,Avengers: Endgame
2,8f84h0,Official Discussion - Avengers: Infinity War [...,24045,72761,https://www.reddit.com/r/movies/comments/8f84h...,1524794408,,2018-04-27 02:00:08,Avengers: Infinity War
3,ed3a6g,Official Discussion - Star Wars: Episode IX - ...,17351,52017,https://www.reddit.com/r/movies/comments/ed3a6...,1576810828,,2019-12-20 03:00:28,Star Wars: Episode IX - The Rise of Skywalker
4,3xf9gd,Official Discussion - Star Wars: Episode VII -...,8394,40877,https://www.reddit.com/r/movies/comments/3xf9g...,1450493995,,2015-12-19 02:59:55,Star Wars: Episode VII - The Force Awakens


In [248]:
submissions_x = submissions.merge(results_df, on="film_title")

In [249]:
submissions_x.head()

Unnamed: 0,submission_id,title,score,num_comments,url,created,discussion_thread,date,film_title,release_date,...,Mystery,War,Animation,Family,History,Western,Romance,Music,TV Movie,Documentary
0,7jwxnd,Official Discussion - Star Wars: Episode VIII ...,15908,100558,https://www.reddit.com/r/movies/comments/7jwxn...,1513306809,,2017-12-15 03:00:09,Star Wars: Episode VIII – The Last Jedi,2017-12-13,...,0,0,0,0,0,0,0,0,0,0
1,bh8iei,Official Discussion - Avengers: Endgame [SPOIL...,20046,89336,https://www.reddit.com/r/movies/comments/bh8ie...,1556247619,,2019-04-26 03:00:19,Avengers: Endgame,2019-04-24,...,0,0,0,0,0,0,0,0,0,0
2,bk33kl,Official Discussion - Avengers: Endgame (2nd T...,694,4919,https://www.reddit.com/r/movies/comments/bk33k...,1556848803,,2019-05-03 02:00:03,Avengers: Endgame,2019-04-24,...,0,0,0,0,0,0,0,0,0,0
3,8f84h0,Official Discussion - Avengers: Infinity War [...,24045,72761,https://www.reddit.com/r/movies/comments/8f84h...,1524794408,,2018-04-27 02:00:08,Avengers: Infinity War,2018-04-25,...,0,0,0,0,0,0,0,0,0,0
4,8gvr6n,Official Discussion - Avengers: Infinity War [...,2484,11784,https://www.reddit.com/r/movies/comments/8gvr6...,1525399266,,2018-05-04 02:01:06,Avengers: Infinity War,2018-04-25,...,0,0,0,0,0,0,0,0,0,0


In [252]:
submissions_x.to_csv(submissions_tmdb_file, sep=";", index=False)

In [5]:
with open(racism_keywords_file, 'w') as f:
    for item in race_words:
        f.write(f"{item}\n")

In [None]:
with open(sexism_keywords_file, 'w') as f:
    for item in sexism_words:
        f.write(f"{item}\n")

In [7]:
", ".join(sexism_words)

"male homosexuality, sexuality, repressed sexuality, repressed homosexuality, homosexual subtext, homosexuality, sexual abuse, sexual identity, teenage sexuality, sexual harassment, sexism, battle of the sexes, transsexual, childhood sexual abuse, bisexuality, lgbt, black lgbt, lgbt teen, lgbt interest, jewish lgbt, gay, gay theme, gay slur, gay interest, gay friends, lesbian, lesbian relationship, gender roles, feminism, female empowerment, femininity, femme fatale, women's rights, misogynist, misogyny, masculinity, toxic masculinity, coming out, in the closet"