### Import Required Libraries and Set Up Environment Variables

In [2]:
import sys
print(sys.path)

['c:\\Repos\\data-sourcing-challenge', 'c:\\Users\\Jake\\anaconda3\\envs\\dev\\python310.zip', 'c:\\Users\\Jake\\anaconda3\\envs\\dev\\DLLs', 'c:\\Users\\Jake\\anaconda3\\envs\\dev\\lib', 'c:\\Users\\Jake\\anaconda3\\envs\\dev', '', 'c:\\Users\\Jake\\anaconda3\\envs\\dev\\lib\\site-packages', 'c:\\Users\\Jake\\anaconda3\\envs\\dev\\lib\\site-packages\\win32', 'c:\\Users\\Jake\\anaconda3\\envs\\dev\\lib\\site-packages\\win32\\lib', 'c:\\Users\\Jake\\anaconda3\\envs\\dev\\lib\\site-packages\\Pythonwin']


In [3]:
# Dependencies
import requests
import time
from dotenv import load_dotenv
import os
import pandas as pd
import json
import numpy as np

In [4]:
# Set environment variables from the .env in the local environment
load_dotenv()

nyt_api_key = os.getenv("NYT_API_KEY")
tmdb_api_key = os.getenv("TMDB_API_KEY")

### Access the New York Times API

In [5]:
# Set the base URL
uri = f"https://api.nytimes.com/svc/search/v2/articlesearch.json?api-key={nyt_api_key}&"

# Filter for movie reviews with "love" in the headline
# section_name should be "Movies"
# type_of_material should be "Review"
filter_query = 'section_name:("Movies") AND type_of_material:("Review") AND headline:("love")'

# Use a sort filter, sort by newest
sort = "newest"

# Select the following fields to return:
# headline, web_url, snippet, source, keywords, pub_date, byline, word_count
field_list = "headline,web_url,snippet,source,keywords,pub_date,byline,word_count"

# Search for reviews published between a begin and end date
begin_date = "20130101"
end_date = "20230531"

# Build URL
url = uri + f'fq={filter_query}&sort={sort}&fl={field_list}&begindate={begin_date}&end_date={end_date}'

url

'https://api.nytimes.com/svc/search/v2/articlesearch.json?api-key=b7yQGXkHbCIfiakav77zRdZ5Nu1QjgqU&fq=section_name:("Movies") AND type_of_material:("Review") AND headline:("love")&sort=newest&fl=headline,web_url,snippet,source,keywords,pub_date,byline,word_count&begindate=20130101&end_date=20230531'

In [6]:
# Create an empty list to store the reviews
reviews_list = []
page_nbr = 1

# loop through pages 0-19
while page_nbr < 20:

    # create query with a page number
    # API results show 10 articles at a time
    # Make a "GET" request and retrieve the JSON
    response = requests.get(url + f'&page={page_nbr}')
    
    # Try and save the reviews to the reviews_list
    # add the reviews from the response to the stored reviews_list variable
    data = response.json()
    reviews_list.extend(data['response']['docs'])

    # Print the page that was just retrieved
    print(f'Checked page {page_nbr}')

    # Print the page number that had no results then break from the loop
    if len(data['response']['docs']) == 0:
        print(f'{page_nbr} is empty.')
        break;
    
    # increment page
    page_nbr += 1
    
    # Add a twelve second interval between queries to stay within API query limits
    time.sleep(12)


Checked page 1
Checked page 2
Checked page 3
Checked page 4
Checked page 5
Checked page 6
Checked page 7
Checked page 8
Checked page 9
Checked page 10
Checked page 11
Checked page 12
Checked page 13
Checked page 14
Checked page 15
Checked page 16
Checked page 17
Checked page 18
Checked page 19


In [40]:
# Preview the first 5 results in JSON format
# Use json.dumps with argument indent=4 to format data
print(json.dumps(reviews_list,indent=4))

[
    {
        "web_url": "https://www.nytimes.com/2023/01/31/movies/pamela-a-love-story-review.html",
        "snippet": "This documentary from Ryan White rewinds, to powerful effect, on Pamela Anderson\u2019s life and fame.",
        "source": "The New York Times",
        "headline": {
            "main": "\u2018Pamela, a Love Story\u2019 Review: A Frank Look Back",
            "kicker": null,
            "content_kicker": null,
            "print_headline": "Pamela, a Love Story",
            "name": null,
            "seo": null,
            "sub": null
        },
        "keywords": [
            {
                "name": "subject",
                "value": "Documentary Films and Programs",
                "rank": 1,
                "major": "N"
            },
            {
                "name": "persons",
                "value": "Anderson, Pamela (1967- )",
                "rank": 2,
                "major": "N"
            },
            {
                "name": "persons",

In [8]:
# Convert reviews_list to a Pandas DataFrame using json_normalize()
df = pd.json_normalize(reviews_list)
df.head()

Unnamed: 0,web_url,snippet,source,keywords,pub_date,word_count,headline.main,headline.kicker,headline.content_kicker,headline.print_headline,headline.name,headline.seo,headline.sub,byline.original,byline.person,byline.organization
0,https://www.nytimes.com/2023/01/31/movies/pame...,"This documentary from Ryan White rewinds, to p...",The New York Times,"[{'name': 'subject', 'value': 'Documentary Fil...",2023-01-31T12:00:05+0000,295,"‘Pamela, a Love Story’ Review: A Frank Look Back",,,"Pamela, a Love Story",,,,By Glenn Kenny,"[{'firstname': 'Glenn', 'middlename': None, 'l...",
1,https://www.nytimes.com/2023/01/19/movies/in-f...,"In Matt Carter’s gay rugby film, sports and ro...",The New York Times,"[{'name': 'subject', 'value': 'Movies', 'rank'...",2023-01-19T17:50:16+0000,281,‘In From the Side’ Review: Love and Rugby Play...,,,In From the Side,,,,By Kyle Turner,"[{'firstname': 'Kyle', 'middlename': None, 'la...",
2,https://www.nytimes.com/2023/01/19/movies/afte...,In this intelligent melodrama by the director ...,The New York Times,"[{'name': 'subject', 'value': 'Movies', 'rank'...",2023-01-19T12:00:06+0000,359,‘After Love’ Review: The Other Woman,Critic’s Pick,,After Love,,,,By Beatrice Loayza,"[{'firstname': 'Beatrice', 'middlename': None,...",
3,https://www.nytimes.com/2023/01/05/movies/alca...,"In this naturalistic drama from Spain, a famil...",The New York Times,"[{'name': 'subject', 'value': 'Movies', 'rank'...",2023-01-05T12:00:03+0000,306,‘Alcarràs’ Review: Labor of Love,,,Alcarràs,,,,By Devika Girish,"[{'firstname': 'Devika', 'middlename': None, '...",
4,https://www.nytimes.com/2022/12/15/movies/nell...,A family archive provides intimate records of ...,The New York Times,"[{'name': 'subject', 'value': 'Documentary Fil...",2022-12-15T12:00:04+0000,308,"‘Nelly & Nadine’ Review: An Unlikely Love, an ...",,,Nelly &amp; Nadine,,,,By Teo Bugbee,"[{'firstname': 'Teo', 'middlename': None, 'las...",


In [9]:
def extract_title(x):
    try:
        # find start and end index
        start_idx = x.index('‘')+1
        end_idx = x.index('’')

        # After getting blank titles and doing research
        # I found that some strings had multiple of the closing comma
        # this while loop handles finding the correct closing character if there are multiple
        while end_idx < start_idx:
            end_idx = x[end_idx+1 : len(x)].index('’') + end_idx + 1

        # build string with the title
        s = x[int(start_idx): int(end_idx)]

        # check if title is blank and set as NaN or return the title
        return s if s != '' else np.nan
    except:
        return np.nan

# Extract the title from the "headline.main" column and
# save it to a new column "title"
# Title is between unicode characters \u2018 and \u2019. 
# End string should include " Review" to avoid cutting title early

df['Title'] = df['headline.main'].apply(extract_title)

# clear rows that have a blank Title
#df.dropna(subset=['Title'], inplace=True, how='any', axis = 'rows')

# print 5 rows to see the new Title column
df.head()

# check there are no blank rows
#df['Title'].loc[df['Title'] == '']

Unnamed: 0,web_url,snippet,source,keywords,pub_date,word_count,headline.main,headline.kicker,headline.content_kicker,headline.print_headline,headline.name,headline.seo,headline.sub,byline.original,byline.person,byline.organization,Title
0,https://www.nytimes.com/2023/01/31/movies/pame...,"This documentary from Ryan White rewinds, to p...",The New York Times,"[{'name': 'subject', 'value': 'Documentary Fil...",2023-01-31T12:00:05+0000,295,"‘Pamela, a Love Story’ Review: A Frank Look Back",,,"Pamela, a Love Story",,,,By Glenn Kenny,"[{'firstname': 'Glenn', 'middlename': None, 'l...",,"Pamela, a Love Story"
1,https://www.nytimes.com/2023/01/19/movies/in-f...,"In Matt Carter’s gay rugby film, sports and ro...",The New York Times,"[{'name': 'subject', 'value': 'Movies', 'rank'...",2023-01-19T17:50:16+0000,281,‘In From the Side’ Review: Love and Rugby Play...,,,In From the Side,,,,By Kyle Turner,"[{'firstname': 'Kyle', 'middlename': None, 'la...",,In From the Side
2,https://www.nytimes.com/2023/01/19/movies/afte...,In this intelligent melodrama by the director ...,The New York Times,"[{'name': 'subject', 'value': 'Movies', 'rank'...",2023-01-19T12:00:06+0000,359,‘After Love’ Review: The Other Woman,Critic’s Pick,,After Love,,,,By Beatrice Loayza,"[{'firstname': 'Beatrice', 'middlename': None,...",,After Love
3,https://www.nytimes.com/2023/01/05/movies/alca...,"In this naturalistic drama from Spain, a famil...",The New York Times,"[{'name': 'subject', 'value': 'Movies', 'rank'...",2023-01-05T12:00:03+0000,306,‘Alcarràs’ Review: Labor of Love,,,Alcarràs,,,,By Devika Girish,"[{'firstname': 'Devika', 'middlename': None, '...",,Alcarràs
4,https://www.nytimes.com/2022/12/15/movies/nell...,A family archive provides intimate records of ...,The New York Times,"[{'name': 'subject', 'value': 'Documentary Fil...",2022-12-15T12:00:04+0000,308,"‘Nelly & Nadine’ Review: An Unlikely Love, an ...",,,Nelly &amp; Nadine,,,,By Teo Bugbee,"[{'firstname': 'Teo', 'middlename': None, 'las...",,Nelly & Nadine


In [10]:
# Extract 'name' and 'value' from items in "keywords" column
def extract_keywords(keyword_list):
    extracted_keywords = ""
    for item in keyword_list:
        # Extract 'name' and 'value'
        keyword = f"{item['name']}: {item['value']};" 
        # Append the keyword item to the extracted_keywords list
        extracted_keywords += keyword
    return extracted_keywords

# Fix the "keywords" column by converting cells from a list to a string
df['keywords'] = df['keywords'].apply(extract_keywords)
df['keywords'].head()

0    subject: Documentary Films and Programs;person...
1    subject: Movies;creative_works: In From the Si...
2    subject: Movies;creative_works: After Love (20...
3    subject: Movies;persons: Simon, Carla;creative...
4    subject: Documentary Films and Programs;person...
Name: keywords, dtype: object

In [11]:
# Create a list from the "title" column using to_list()
# These titles will be used in the query for The Movie Database
title_list = df['Title'].to_list()
title_list

['Pamela, a Love Story',
 'In From the Side',
 'After Love',
 'Alcarràs',
 'Nelly & Nadine',
 'Lady Chatterley',
 'The Sound of Christmas',
 'The Inspection',
 'Bones and All',
 'My Policeman',
 'About Fate',
 'Waiting for Bojangles',
 'I Love My Dad',
 'A Love Song',
 'Alone Together',
 'Art of Love',
 'The Wheel',
 'Thor: Love and Thunder',
 'Both Sides of the Blade',
 'Fire of Love',
 'Love & Gelato',
 'Stay Prayed Up',
 'Benediction',
 'Dinner in America',
 'In a New York Minute',
 'Anaïs in Love',
 'I Love America',
 'See You Then',
 'La Mami',
 'Love After Love',
 'Deep Water',
 'Lucy and Desi',
 'Cyrano',
 'The In Between',
 'Book of Love',
 'Lingui, the Sacred Bonds',
 'The Pink Cloud',
 'A Journal for Jordan',
 'West Side Story',
 'Aulcie',
 'Love Is Love Is Love',
 'Love Hard',
 'Bergman Island',
 'Hard Luck Love Song',
 'South of Heaven',
 'Wife of a Spy',
 'Happier Than Ever',
 'Together',
 'Annette',
 'Resort to Love',
 'Woodstock 99: Peace, Love and Rage',
 'Casanova, Las

### Access The Movie Database API

In [12]:
# Prepare The Movie Database query

tmdb_key_string = f"?api_key={tmdb_api_key}"
url_search = f"https://api.themoviedb.org/3/search/movie{tmdb_key_string}&query="
url_detail = f"https://api.themoviedb.org/3/movie/"

In [16]:
# Test out API calls

test_resp = requests.get(url_search + title_list[0].replace(' ', '+')).json()['results'][0]

test_resp2 = requests.get(url_detail + str(test_resp['id']) + tmdb_key_string).json()

#resp2['genres'] = [{'id': 99, 'name': 'Documentary'}]
#resp2['production_countries'] = [{'iso_3166_1': 'US','name': 'United States of America'}]
#resp2['spoken_languages'] = [{'english_name': 'English', 'iso_639_1': 'en', 'name': 'English'}]

test_resp2

{'adult': False,
 'backdrop_path': '/aJvgoRMrYFTMu7MrcfxYVPMRuU5.jpg',
 'belongs_to_collection': None,
 'budget': 0,
 'genres': [{'id': 99, 'name': 'Documentary'}],
 'homepage': 'https://www.netflix.com/title/81590934',
 'id': 1061671,
 'imdb_id': 'tt18376122',
 'origin_country': ['US'],
 'original_language': 'en',
 'original_title': 'Pamela, A Love Story',
 'overview': 'In her own words, through personal video and diaries, Pamela Anderson shares the story of her rise to fame, rocky romances and infamous sex tape scandal.',
 'popularity': 17.091,
 'poster_path': '/zkVnRwZWbrd55P3Tx7BZZQ1gU89.jpg',
 'production_companies': [{'id': 90561,
   'logo_path': None,
   'name': 'Tripod Media',
   'origin_country': 'US'},
  {'id': 135005,
   'logo_path': '/yHs7fh3S2JEDdYX5B0UrA5lSTjN.png',
   'name': 'Dorothy St Pictures',
   'origin_country': 'GB'}],
 'production_countries': [{'iso_3166_1': 'US',
   'name': 'United States of America'}],
 'release_date': '2023-01-30',
 'revenue': 0,
 'runtime': 

In [28]:
# static map to prevent making more callouts if I have the title stored
tmdb_movies_map = {}

In [31]:
# Create an empty list to store the results
tmdb_movies_list = []

# Create a request counter to sleep the requests after a multiple
# of 50 requests
request_counter = 1

# Loop through the titles
for title in title_list:

    if title in tmdb_movies_map:
        print('title already found.')
        continue

    # Check if we need to sleep before making a request
    if request_counter % 50 == 0:
        print('Sleeping for 1 second.')
        time.sleep(1)

    # Add 1 to the request counter
    request_counter += 1

    
    # Perform a "GET" request for The Movie Database
    resp1

    try:
        resp1 = requests.get(url_search + title.replace(' ', '+')).json()['results']
    except:
        print('Failed on search', title)
        continue

    # Include a try clause to search for the full movie details.
    # Use the except clause to print out a statement if a movie
    # is not found.
    try:
        resp1 = resp1[0]
        # Get movie id
        # Make a request for a the full movie details
        # Execute "GET" request with url
        resp2 = requests.get(url_detail + str(resp1['id']) + tmdb_key_string).json()
        
        # Extract the genre names into a list
        # Extract the spoken_languages' English name into a list
        # Extract the production_countries' name into a list
        # Add the relevant data to a dictionary and
        # append it to the tmdb_movies_list list
        data = {
            'title': title,
            'original_title': resp2['original_title'],
            'budget': resp2['budget'],
            'original_language': resp2['original_language'],
            'homepage': resp2['homepage'],
            'overview': resp2['overview'],
            'popularity': resp2['popularity'],
            'runtime': resp2['runtime'],
            'revenue': resp2['revenue'],
            'release_date': resp2['release_date'],
            'vote_average': resp2['vote_average'],
            'vote_count': resp2['vote_count'],
            'genres': resp2['genres'],
            'spoken_languages': resp2['spoken_languages'],
            'production_countries': resp2['production_countries']
        }
        
        tmdb_movies_list.append(data)
        tmdb_movies_map[title] = data
        
        # Print out the title that was found
        print(f'Found Title: "{title}"')
    except Exception as error:
        print(error, f'Title "{title}" not found.')


Found Title: "Pamela, a Love Story"
Found Title: "In From the Side"
Found Title: "After Love"
Found Title: "Alcarràs"
Found Title: "Nelly & Nadine"
Found Title: "Lady Chatterley"
Found Title: "The Sound of Christmas"
Found Title: "The Inspection"
Found Title: "Bones and All"
Found Title: "My Policeman"
Found Title: "About Fate"
Found Title: "Waiting for Bojangles"
Found Title: "I Love My Dad"
Found Title: "A Love Song"
Found Title: "Alone Together"
Found Title: "Art of Love"
Found Title: "The Wheel"
Found Title: "Thor: Love and Thunder"
Found Title: "Both Sides of the Blade"
Found Title: "Fire of Love"
Found Title: "Love & Gelato"
Found Title: "Stay Prayed Up"
Found Title: "Benediction"
Found Title: "Dinner in America"
Found Title: "In a New York Minute"
Found Title: "Anaïs in Love"
Found Title: "I Love America"
Found Title: "See You Then"
Found Title: "La Mami"
Found Title: "Love After Love"
Found Title: "Deep Water"
Found Title: "Lucy and Desi"
Found Title: "Cyrano"
Found Title: "The

In [39]:
# Preview the first 5 results in JSON format
# Use json.dumps with argument indent=4 to format data
print(json.dumps(tmdb_movies_list, sort_keys=True, indent=4))

[
    {
        "budget": 0,
        "genres": [
            {
                "id": 99,
                "name": "Documentary"
            }
        ],
        "homepage": "https://www.netflix.com/title/81590934",
        "original_language": "en",
        "original_title": "Pamela, A Love Story",
        "overview": "In her own words, through personal video and diaries, Pamela Anderson shares the story of her rise to fame, rocky romances and infamous sex tape scandal.",
        "popularity": 17.091,
        "production_countries": [
            {
                "iso_3166_1": "US",
                "name": "United States of America"
            }
        ],
        "release_date": "2023-01-30",
        "revenue": 0,
        "runtime": 113,
        "spoken_languages": [
            {
                "english_name": "English",
                "iso_639_1": "en",
                "name": "English"
            }
        ],
        "title": "Pamela, a Love Story",
        "vote_average": 7.0,

In [45]:
# Convert the results to a DataFrame
tmdb_df = pd.DataFrame(tmdb_movies_list)

#df_tmdb_df.head()
tmdb_df.shape

(186, 15)

### Merge and Clean the Data for Export

In [53]:
# Merge the New York Times reviews and TMDB DataFrames on title
grouped_df = tmdb_df.merge(df, how='inner', left_on='title', right_on='Title')
grouped_df.head()

Unnamed: 0,title,original_title,budget,original_language,homepage,overview,popularity,runtime,revenue,release_date,...,headline.kicker,headline.content_kicker,headline.print_headline,headline.name,headline.seo,headline.sub,byline.original,byline.person,byline.organization,Title
0,"Pamela, a Love Story","Pamela, A Love Story",0,en,https://www.netflix.com/title/81590934,"In her own words, through personal video and d...",17.091,113,0,2023-01-30,...,,,"Pamela, a Love Story",,,,By Glenn Kenny,"[{'firstname': 'Glenn', 'middlename': None, 'l...",,"Pamela, a Love Story"
1,In From the Side,In from the Side,0,en,http://www.infromthesidemovie.com,"Mark, a new and inexperienced rugby club membe...",14.059,134,0,2022-09-16,...,,,In From the Side,,,,By Kyle Turner,"[{'firstname': 'Kyle', 'middlename': None, 'la...",,In From the Side
2,After Love,After Love,0,en,,Set in the port town of Dover in the South-Eas...,15.106,89,0,2021-06-04,...,Critic’s Pick,,After Love,,,,By Beatrice Loayza,"[{'firstname': 'Beatrice', 'middlename': None,...",,After Love
3,After Love,After Love,0,en,,Set in the port town of Dover in the South-Eas...,15.106,89,0,2021-06-04,...,,,After Love,,,,By Glenn Kenny,"[{'firstname': 'Glenn', 'middlename': None, 'l...",,After Love
4,Alcarràs,Alcarràs,0,ca,https://www.alcarras-film.com/,"In a small village in Catalonia, the peach far...",11.358,120,0,2022-04-29,...,,,Alcarràs,,,,By Devika Girish,"[{'firstname': 'Devika', 'middlename': None, '...",,Alcarràs


In [57]:
# Remove list brackets and quotation marks on the columns containing lists
# Create a list of the columns that need fixing
columns_to_fix = ['byline.person','genres', 'spoken_languages','production_countries',]

# Create a list of characters to remove
characters_to_remove = ['[',']',"'"]

# Loop through the list of columns to fix
for col in columns_to_fix:

    # Convert the column to type 'str'
    grouped_df[col] = grouped_df[col].astype(str)

    # Loop through characters to remove
    for s in characters_to_remove:
        grouped_df[col] = grouped_df[col].str.replace(s,'')

# Display the fixed DataFrame
grouped_df[columns_to_fix].head()

Unnamed: 0,byline.person,genres,spoken_languages,production_countries
0,"{firstname: Glenn, middlename: None, lastname:...","{id: 99, name: Documentary}","{english_name: English, iso_639_1: en, name: E...","{iso_3166_1: US, name: United States of America}"
1,"{firstname: Kyle, middlename: None, lastname: ...","{id: 18, name: Drama}, {id: 10749, name: Romance}","{english_name: English, iso_639_1: en, name: E...","{iso_3166_1: GB, name: United Kingdom}"
2,"{firstname: Beatrice, middlename: None, lastna...","{id: 18, name: Drama}","{english_name: English, iso_639_1: en, name: E...","{iso_3166_1: GB, name: United Kingdom}"
3,"{firstname: Glenn, middlename: None, lastname:...","{id: 18, name: Drama}","{english_name: English, iso_639_1: en, name: E...","{iso_3166_1: GB, name: United Kingdom}"
4,"{firstname: Devika, middlename: None, lastname...","{id: 18, name: Drama}","{english_name: Catalan, iso_639_1: ca, name: C...","{iso_3166_1: ES, name: Spain}"


In [62]:
# Drop "byline.person" column
grouped_df = grouped_df.drop('byline.person', axis=1)
grouped_df.head()

Unnamed: 0,title,original_title,budget,original_language,homepage,overview,popularity,runtime,revenue,release_date,...,headline.main,headline.kicker,headline.content_kicker,headline.print_headline,headline.name,headline.seo,headline.sub,byline.original,byline.organization,Title
0,"Pamela, a Love Story","Pamela, A Love Story",0,en,https://www.netflix.com/title/81590934,"In her own words, through personal video and d...",17.091,113,0,2023-01-30,...,"‘Pamela, a Love Story’ Review: A Frank Look Back",,,"Pamela, a Love Story",,,,By Glenn Kenny,,"Pamela, a Love Story"
1,In From the Side,In from the Side,0,en,http://www.infromthesidemovie.com,"Mark, a new and inexperienced rugby club membe...",14.059,134,0,2022-09-16,...,‘In From the Side’ Review: Love and Rugby Play...,,,In From the Side,,,,By Kyle Turner,,In From the Side
2,After Love,After Love,0,en,,Set in the port town of Dover in the South-Eas...,15.106,89,0,2021-06-04,...,‘After Love’ Review: The Other Woman,Critic’s Pick,,After Love,,,,By Beatrice Loayza,,After Love
3,After Love,After Love,0,en,,Set in the port town of Dover in the South-Eas...,15.106,89,0,2021-06-04,...,Review: Staying Together ‘After Love’ and Regr...,,,After Love,,,,By Glenn Kenny,,After Love
4,Alcarràs,Alcarràs,0,ca,https://www.alcarras-film.com/,"In a small village in Catalonia, the peach far...",11.358,120,0,2022-04-29,...,‘Alcarràs’ Review: Labor of Love,,,Alcarràs,,,,By Devika Girish,,Alcarràs


In [64]:
# Note: I should have de-duplicated already when I called out to IMDB. However the NYT Reviews could have still had duplicates
#grouped_df.shape

# Delete duplicate rows and reset index
grouped_df.drop_duplicates(inplace=True)
grouped_df.reset_index()
grouped_df.shape

(188, 31)

In [65]:
# Export data to CSV without the index
grouped_df.to_csv('output/output.csv', index=False)

In [114]:
# Using this block to test the substringing of Titles

# test = '‘The Attachment Diaries’ Review: Love, Sick	'
# test = 'Review: It’s All Mirth and Taxes in ‘Love & Taxes’'
# start_idx = test.index('‘')+1

# end_idx = test.index('’')

# while end_idx < start_idx:
#     print(end_idx, start_idx)
#     end_idx = test[end_idx+1 : len(test)].index('’') + end_idx + 1



# print(start_idx, end_idx)
# test = test[int(start_idx): int(end_idx)]
# test

10 37
37 49


'Love & Taxes'