In [None]:
import json
from collections import Counter
from pprint import pprint

import pandas as pd
import regex as re
import requests
from lxml.html import parse, fromstring
from tqdm import tqdm

from twitter_keys import tmdb_key
from params import annotated_submissions_file, submissions_tmdb_file

In [None]:
# JSON caches: raw TMDB search responses and the per-film detailed responses.
tmdb_file = "./data/tmdb.json"
tmdb_detailed_file = "./data/tmdb_detailed.json"

# Plain-text keyword lists, written one keyword per line at the end of the notebook.
sexism_keywords_file = "data/sexism_keywords.txt"
racism_keywords_file = "data/racism_keywords.txt"

In [None]:
# Truncate long text cells to 50 characters when frames are displayed.
pd.options.display.max_colwidth = 50

## Match my films with TMDB entries

In [None]:
# Load the annotated submissions (semicolon-separated).
submissions = pd.read_csv(annotated_submissions_file, sep=";")

# Keep threads with at least 25 comments whose `discussion_thread` value is missing.
has_enough_comments = submissions['num_comments'] >= 25
lacks_discussion_flag = submissions['discussion_thread'].isna()
submissions = submissions.loc[has_enough_comments & lacks_discussion_flag]

# One entry per remaining submission; a film can therefore appear more than once.
titles = submissions['film_title'].tolist()

In [None]:
# Spot check: submissions whose `title` column contains "Tower".
submissions.loc[submissions["title"].str.contains("Tower")]

### Find potential matches

In [None]:
# Query the TMDB movie-search endpoint once per submission title and store the
# raw JSON response under the original (unmodified) title.
query_results = dict()

# A four-digit year wrapped in () or []. The `|` inside the character classes
# is matched literally, so e.g. "|2010|" is also treated as a year tag.
year_tag = r"[\(|\[](\d{4})[\)|\]]"

for title in tqdm(titles):
    year_match = re.search(year_tag, title)

    # Search term: the title without its year tag, with the HTML-escaped
    # ampersand decoded so the literal entity is not sent to the API.
    search_term = re.sub(year_tag, "", title).replace("&amp;", "&").strip()

    # Passing the values via `params` lets requests URL-encode them; pasting the
    # title into the URL broke the query for titles containing `&` or `#`.
    params = {"api_key": tmdb_key, "query": search_term}
    if year_match is not None:
        # Send the captured year itself, not the Match object.
        params["year"] = year_match.group(1)

    response = requests.get("https://api.themoviedb.org/3/search/movie", params=params)
    query_results[title] = response.json()

In [None]:
# Cache the raw search responses so the API does not have to be queried again.
with open(tmdb_file, mode='w') as cache_handle:
    json.dump(query_results, cache_handle)

In [None]:
# Reload the cached search responses from disk.
with open(tmdb_file, mode='r') as cache_handle:
    query_results = json.load(cache_handle)

### Find single best match

In [None]:
# Thresholds used when a search returned several candidates.
MIN_RELEASE_YEAR = 2008   # a candidate must be released strictly after this year
MIN_VOTES = 10            # ... and have strictly more votes than this
POPULAR_VOTES = 70        # vote count from which a candidate counts as "popular"

# Year tag such as "(2014)" or "[2014]"; the `|` inside the classes is literal.
YEAR_TAG = r"[\(|\[]\d{4}[\)|\]]"

# Manual exception: this film falls outside of the date range.
LOTR_TITLE = "Lord of the Rings: The Fellowship of the Ring"


def _normalise_title(tmdb_title):
    """Lower-case a TMDB title, drop apostrophes and strip trailing periods."""
    return tmdb_title.lower().replace("'", "").rstrip(".")


def _in_date_range(candidate):
    """Return True if the candidate is recent enough and has a few votes.

    Any problem reading or parsing the release date or vote count means the
    candidate is simply not selected. `Exception` is caught instead of using a
    bare `except` so that KeyboardInterrupt / SystemExit still propagate.
    """
    try:
        release_year = pd.to_datetime(candidate['release_date']).year
        return release_year > MIN_RELEASE_YEAR and candidate['vote_count'] > MIN_VOTES
    except Exception:
        return False


def find_best_match(title, response):
    """Select the single most plausible TMDB search result for a title.

    Parameters
    ----------
    title : str
        Submission film title (key of `query_results`).
    response : dict
        Search response; `total_results` and `results` are read from it.

    Returns
    -------
    dict or None
        The chosen result, or None when nothing could be selected (in which
        case a diagnostic line is printed).
    """
    total = response['total_results']

    # No potential matches at all.
    if total == 0:
        print(f"No possible results for {title}")
        return None

    # A single result is assumed to be correct (manual inspection of the
    # titles found that it was).
    if total == 1:
        return response['results'][0]

    if not total > 1:
        return None

    candidates = response['results']
    # NOTE(review): apostrophes are removed from the candidate titles but not
    # from this name, so submission titles containing an apostrophe cannot
    # match exactly - confirm this is intended.
    stripped_name = re.sub(YEAR_TAG, "", title).replace("&amp;", "&").lower().strip()

    # First choice: results in the correct date range with at least a few votes.
    shortlist = [c for c in candidates if _in_date_range(c)]

    # Otherwise fall back to results with an exactly matching name.
    if not shortlist:
        for c in candidates:
            if _normalise_title(c['title']) == stripped_name:
                shortlist.append(c)
            elif title == LOTR_TITLE and c['vote_count'] > MIN_VOTES:
                shortlist.append(c)

    if len(shortlist) == 1:
        return shortlist[0]
    if not shortlist:
        print(f"Found 0 matches for {title} within the date range")
        return None

    # Several candidates left: keep only the more popular films.
    popular = [c for c in shortlist if c['vote_count'] >= POPULAR_VOTES]
    if not popular:
        print(f"Found 0 matches for {title}")
        return None
    if len(popular) == 1:
        return popular[0]

    # Several popular films: prefer an exact name match.
    exact = [c for c in popular if _normalise_title(c['title']) == stripped_name]
    if len(exact) == 1:
        return exact[0]
    if exact:
        # Several exact matches: take the one with the most votes
        # (the first one wins on ties).
        return max(exact, key=lambda c: c['vote_count'])

    # No exact name among the popular films: list them for manual inspection.
    print(title)
    for c in popular:
        print(c['original_title']+" "+str(c['vote_count']))
    print()
    return None


good_results = dict()
for title, response in query_results.items():
    best = find_best_match(title, response)
    if best is not None:
        good_results[title] = best

In [None]:
# Show the first selected match only, as a quick look at the result structure.
for shown, (title, match) in enumerate(good_results.items(), start=1):
    print(title)
    pprint(match)
    print()
    if shown == 1:
        break

## Find detailed movie information

In [None]:
# Fetch the full TMDB record (with keywords and credits appended) for every
# matched film, keyed by the submission film title.
detailed_results = dict()
for title, match in tqdm(good_results.items()):
    response = requests.get(f"https://api.themoviedb.org/3/movie/{match['id']}?api_key={tmdb_key}&append_to_response=keywords,credits")
    detailed_results[title] = response.json()

In [None]:
# The first three printed sizes are expected to be identical.
for size in (len(good_results), len(detailed_results), len(set(titles))):
    print(size)

# Longer than the others, because some films have multiple threads.
print(len(titles))

In [None]:
# Preview: print the genre and keyword names of the first three films.
# This cell only prints; it no longer re-creates the `genres` / `keywords`
# counters, so re-running it cannot wipe counts computed elsewhere.
for shown, (title, details) in enumerate(detailed_results.items(), start=1):
    print(title)
    print("-"*21+"Genres"+"-"*21)
    for genre in details['genres']:
        print(genre['name'])
    print("-"*20+"Keywords"+"-"*20)
    for keyword in details['keywords']['keywords']:
        print(keyword['name'])
    print()
    if shown == 3:
        break

In [None]:
# Count in how many film records each genre name and each keyword name appears.
genres = Counter()
keywords = Counter()
for details in detailed_results.values():
    genres.update(genre['name'] for genre in details['genres'])
    keywords.update(keyword['name'] for keyword in details['keywords']['keywords'])

In [None]:
# All genres, ordered from most to least frequent.
genres.most_common()

In [None]:
# The 20 most frequent keywords.
keywords.most_common(20)

### Find racism- and sexism-related keywords

In [None]:
# Stems used to look for candidate race-related keywords among the TMDB keywords.
src = "rac[e|i]|discri|black|afr|police|stereo|slav|protest|civil|malcolm|luther"
for keyword_name, occurrences in keywords.items():
    if not re.search(src, keyword_name):
        continue
    # Only the first hit is printed; the loop stops right after it.
    print(keyword_name, occurrences)
    break

# Hand-picked race-related keywords. The order is kept as is because the list
# is later written to disk line by line.
# 'race' itself is left out: it is for Furious 7, a car racing film.
race_words = [
    "racist", "interracial relationship", "racism", "race-conscious",
    "race relations", "mixed race", "racial prejudice",
    "racial slur", "biracial", "racist cop", "racial profiling",
    "racial segregation", "interracial marriage",
    "interracial friendship", "post-racial america", "interracial couple",
    "discrimination", "africa", "african american",
    "african american history", "african american comedy",
    "black panther party", "black activist",
    "police brutality", "police harassment",
    "stereotype",
    "slavery", "slave", "slave labor",
    "protest", "protest march",
    "civil rights", "civil rights movement",
    "malcolm x", "martin luther king",
]

# Guard against typos: every listed word must be an existing keyword.
for term in race_words:
    assert term in keywords, term

In [None]:
# Stems used to look for candidate sexism/sexuality-related keywords.
src = "sex|lgbt|gay|lesbian|gender|fem|wom|miso|masc|coming|closet"
for keyword_name, occurrences in keywords.items():
    if not re.search(src, keyword_name):
        continue
    # Only the first hit is printed; the loop stops right after it.
    print(keyword_name, occurrences)
    break

# Hand-picked sexism-related keywords. The order is kept as is because the
# list is later written to disk line by line.
sexism_words = [
    "male homosexuality", "sexuality", "repressed sexuality",
    "repressed homosexuality", "homosexual subtext", "homosexuality",
    "sexual abuse", "sexual identity", "teenage sexuality",
    "sexual harassment", "sexism", "battle of the sexes", "transsexual",
    "childhood sexual abuse", "bisexuality",
    "lgbt", "black lgbt", "lgbt teen", "lgbt interest", "jewish lgbt",
    "gay", "gay theme", "gay slur", "gay interest", "gay friends",
    "lesbian", "lesbian relationship",
    "gender roles",
    "feminism", "female empowerment", "femininity", "femme fatale",
    "women's rights",
    "misogynist", "misogyny",
    "masculinity", "toxic masculinity",
    "coming out", "in the closet",
]

# Guard against typos: every listed word must be an existing keyword.
for term in sexism_words:
    assert term in keywords, term

### List how many racism and sexism keywords there are for each film

In [None]:
# Per film: how many of its keywords are in the race list and in the sexism list.
# Every film gets an entry in both counters, even when the count is 0.
race_related_keywords = Counter()
sexism_related_keywords = Counter()
for title, details in detailed_results.items():
    keyword_names = [keyword['name'] for keyword in details['keywords']['keywords']]
    race_related_keywords[title] += sum(name in race_words for name in keyword_names)
    sexism_related_keywords[title] += sum(name in sexism_words for name in keyword_names)

In [None]:
# The 10 films with the most race-related keywords.
race_related_keywords.most_common(10)

In [None]:
# Store the per-film keyword counts on the detailed record itself.
# A direct lookup replaces scanning both counters for every film; as before,
# a field is only set when the film has an entry in the respective counter.
for title, details in detailed_results.items():
    if title in race_related_keywords:
        details['racism_keywords'] = race_related_keywords[title]
    if title in sexism_related_keywords:
        details['sexism_keywords'] = sexism_related_keywords[title]

In [None]:
# Spot check: the sexism keyword count stored for 'Moonlight'.
detailed_results['Moonlight']['sexism_keywords']

In [None]:
# For every film that has at least one sexism keyword, count its remaining
# (non-sexism) keywords. Despite its name, `more_race` is built from
# `sexism_words`. The following cell repeats this exact computation.
more_race = Counter()
for details in detailed_results.values():
    keyword_names = [keyword['name'] for keyword in details['keywords']['keywords']]
    if any(name in sexism_words for name in keyword_names):
        more_race.update(name for name in keyword_names if name not in sexism_words)

In [None]:
# Show co-occurring keywords: for every film with at least one sexism keyword,
# tally its other keywords. Despite its name, `more_race` is built from
# `sexism_words`.
more_race = Counter()
for details in detailed_results.values():
    keyword_names = [keyword['name'] for keyword in details['keywords']['keywords']]
    if any(name in sexism_words for name in keyword_names):
        more_race.update(name for name in keyword_names if name not in sexism_words)

more_race.most_common(10)

In [None]:
# Print the films that carry one of the listed keywords (each film at most once).
for title, details in detailed_results.items():
    if any(keyword['name'] in ['ghetto'] for keyword in details['keywords']['keywords']):
        print(title)

In [None]:
# Persist the detailed records (including the added keyword counts) as JSON.
with open(tmdb_detailed_file, mode='w') as detailed_handle:
    json.dump(detailed_results, detailed_handle)

In [None]:
# Show which fields a detailed record has, followed by an empty line.
print(detailed_results['Moonlight'].keys(), end="\n\n")

# Fields copied from each detailed record into the final table.
selected_keys = "release_date, vote_average, vote_count, sexism_keywords, racism_keywords".split(", ")
print(selected_keys)

In [None]:
# Build one flat row per film: the selected fields plus a 1 for each of its genres.
select_results = dict()

for title, details in detailed_results.items():
    row = {key: details[key] for key in selected_keys}
    row.update({genre['name']: 1 for genre in details['genres']})
    select_results[title] = row

In [None]:
# Rows are films; genres a film does not have are missing and become 0.
# The film title moves from the index into a `film_title` column.
results_df = (
    pd.DataFrame.from_dict(select_results, orient="index")
    .fillna(0)
    .convert_dtypes()
    .reset_index()
    .rename(columns={"index": "film_title"})
)

In [None]:
# First rows of the per-film table.
results_df.head()

In [None]:
# First rows of the filtered submissions, for comparison before merging.
submissions.head()

In [None]:
# Join the per-film TMDB fields onto the submissions via `film_title`.
submissions_x = pd.merge(submissions, results_df, on="film_title")

In [None]:
# First rows of the merged table.
submissions_x.head()

In [None]:
# Write the merged table as a semicolon-separated file, without the index column.
submissions_x.to_csv(submissions_tmdb_file, sep=";", index=False)

In [None]:
# Write the race-related keywords, one per line.
with open(racism_keywords_file, 'w') as f:
    f.writelines(f"{item}\n" for item in race_words)

In [None]:
# Write the sexism-related keywords, one per line.
with open(sexism_keywords_file, 'w') as f:
    f.writelines(f"{item}\n" for item in sexism_words)

In [None]:
# The sexism keywords as one comma-separated string.
", ".join(sexism_words)