# NY Times and Twitter Data Mining

### Author: Kashish Shah and Nandana Yadla
### Course: CIS 600 Social Media and Data Mining
### Date: 12/14/2018

This application lets the user establish a correlation between reviews of movies and books published on different platforms. The **movie reviews** are obtained using the **New York Times API, Twitter API and TMDB API (The Movie Database, or TMDB, has one of the largest databases of movies)** and the **book reviews** are harvested from the **New York Times API, Twitter API and Goodreads data (Goodreads is one of the best-known platforms among avid book readers for finding reviews of a chosen book)**. A **sentiment analysis** is performed on the data harvested from these different sources. The sentiment analysis uses the **IBM Tone Analyzer API**, which provides sentiments based on the **emotional and language tone** of the text. **Plotly** is used to showcase various visualizations and to compare the reviews provided by the different platforms.

### Import all the required packages 

In [16]:
import json
import requests
from pprint import pprint
import pymongo
import pandas as pd
import plotly.plotly as py
from ipywidgets import widgets
from IPython.display import display, clear_output, Image
import plotly.graph_objs as go
from plotly.widgets import GraphWidget
import urllib.request
import requests
from bs4 import BeautifulSoup
from watson_developer_cloud import ToneAnalyzerV3
from watson_developer_cloud import WatsonApiException
import tweepy
import re
import csv

### Creation of a .json file that contains all the access keys and tokens for using different APIs in this application

In [2]:
# Every API key/token used by this notebook, gathered in one mapping.
# NOTE(review): these secrets are hard-coded in the notebook source; consider
# loading them from the environment and rotating the exposed values.
credentials = {
    'NYT_MOVIE_KEY': '69c7380ba1b64dfab0047124550ec0b2',  # New York Times API key for movie reviews
    'NYT_BOOK_KEY': '69c7380ba1b64dfab0047124550ec0b2',  # New York Times API key for book reviews
    'TMDB_MOVIE_KEY': '26dab1009eaec42afb0f19cde440f96f',  # TMDB API key for movie reviews
    'IBM_TONE_KEY': 'BvA2QcZefY7VR2yBDSN1TZhBWEJXOxUS0yyQHpmTnUJ-',  # IBM Tone Analyzer API key
    'CONSUMER_KEY': 'pT2QEM4RPKHiwLKC2JJEJlIg4',
    'CONSUMER_SECRET': 'AUF8QLLfLJdgzWMuYRpE3X6g3EGQlUUs2sKs4uOVON8AIR25gR',
    'ACCESS_KEY': '786452312-hddYCXCXcfgubjp9Loz016K2MY7SwhY7c7lcNurm',
    'ACCESS_SECRET': 'GtSaNMXWMbiZCXCt6HPFCvMi3N33bhdgqcZa2tUKGcTmZ',
}

# Serialize the mapping to credentials.json (sorted keys, 4-space indent).
serialized_credentials = json.dumps(credentials, indent=4, sort_keys=True)
with open('credentials.json', 'w') as secret_info:
    secret_info.write(serialized_credentials)

### This function makes a call to the New York Times API and gets a json response of the movies matching the input string

In [3]:
def nyt_search_movie(movie_name):
    """Search the New York Times movie-reviews endpoint for ``movie_name``.

    The API key is read from ``credentials.json`` (entry ``NYT_MOVIE_KEY``).

    Args:
        movie_name: Free-text movie title used as the ``query`` parameter.

    Returns:
        The decoded JSON body of the response.
    """
    with open('credentials.json') as cred_data:
        nyt_movie_key = json.load(cred_data)['NYT_MOVIE_KEY']
    # Hand the query to requests as parameters so titles containing spaces,
    # '&', '#', ... are percent-encoded instead of corrupting the URL. HTTPS
    # keeps the API key out of clear-text traffic.
    response = requests.get(
        "https://api.nytimes.com/svc/movies/v2/reviews/search.json",
        params={'api-key': nyt_movie_key, 'query': movie_name},
    )
    return json.loads(response.content)  # json response

## This function looks through the search result obtained for the input movie name and returns the link to the review of that particular movie

In [4]:
def get_nyt_movie_review_link(result, movie_name):
    """Return the review URL of the entry whose title equals ``movie_name``.

    Args:
        result: Decoded search response; read for ``num_results`` and a
            ``results`` list whose entries carry ``display_title`` and
            ``link.url``.
        movie_name: Title compared exactly (case-sensitive) with each
            entry's ``display_title``.

    Returns:
        The first matching entry's ``link.url``; ``''`` when the response
        reports no results or no entry matches.
    """
    if not result.get('num_results'):
        return ''
    # Walk the list itself rather than range(num_results) so a mismatch
    # between the reported count and the list length cannot raise IndexError;
    # a null 'results' value is treated as an empty list.
    for entry in result.get('results') or []:
        if entry['display_title'] == movie_name:
            return entry['link']['url']  # link of the matching review
    # No exact title match: use the same "no link" marker as the empty case.
    return ''

## This function searches TMDB for the movie name provided by the user and fetches its movie id. Using this movie id, it looks up the reviews of that movie and returns the review text as a string.

In [5]:
def tmdb_search_movie(movie_name):
    """Look up ``movie_name`` on TMDB and collect the text of its reviews.

    The API key is read from ``credentials.json`` (entry ``TMDB_MOVIE_KEY``).
    The first search hit whose ``title`` equals ``movie_name`` exactly
    supplies the movie id used for the reviews request.

    Args:
        movie_name: Movie title to search for.

    Returns:
        A ``(movie_name, review)`` tuple. ``review`` holds the ``content`` of
        every returned review joined by newlines; it is ``''`` when no search
        hit matches the title or the movie has no reviews.
    """
    with open('credentials.json') as cred_data:
        tmdb_key = json.load(cred_data)['TMDB_MOVIE_KEY']

    # Passing the query through ``params`` percent-encodes the title.
    search = requests.get(
        'https://api.themoviedb.org/3/search/movie',
        params={
            'api_key': tmdb_key,
            'language': 'en-US',
            'query': movie_name,
            'include_adult': 'false',
        },
    )
    hits = json.loads(search.content)['results']
    movie_id = next(
        (hit['id'] for hit in hits if hit['title'] == movie_name),
        None,
    )
    if movie_id is None:
        # Without a matching title there is no id to query; falling back to
        # an arbitrary id would return reviews of an unrelated movie.
        return movie_name, ''

    # make a search based on movie id
    reviews = requests.get(
        'https://api.themoviedb.org/3/movie/' + str(movie_id) + '/reviews',
        params={'api_key': tmdb_key, 'language': 'en-US'},
    )
    # Gather the reviews of all TMDB reviewers, not just the first one.
    contents = [item['content'] for item in json.loads(reviews.content)['results']]
    return movie_name, '\n'.join(contents)

## These functions are used to add a movie's name, its review link and its reviews to the MongoDB database. Further instructions on how they work are provided in the comments.

### Note: The movie reviews are added only for some 100 movies into the database because of the time required to call the API. We can easily add more movies into the database using this function. 

### However, the application performs a real-time search on the APIs when a user enters the name of a Hollywood movie and is not limited to any specific names. It might happen, though, that no review exists for some movies.

In [6]:
# The function tmdb_add_to_db() gives a response of a review from TMDB.

def tmdb_add_to_db(movie_name, movie_id):
    """Fetch the TMDB reviews for ``movie_id`` and return them with the name.

    The API key is read from ``credentials.json`` (entry ``TMDB_MOVIE_KEY``).

    Args:
        movie_name: Title passed through unchanged into the result.
        movie_id: TMDB movie id used in the reviews request.

    Returns:
        A ``(movie_name, review)`` tuple where ``review`` is the ``content``
        of every returned review joined by newlines (``''`` if there are
        none).
    """
    with open('credentials.json') as cred_data:
        tmdb_key = json.load(cred_data)['TMDB_MOVIE_KEY']

    # make a search based on movie id
    reviews = requests.get(
        'https://api.themoviedb.org/3/movie/' + str(movie_id) + '/reviews',
        params={'api_key': tmdb_key, 'language': 'en-US'},
    )
    # Gather the reviews of all TMDB reviewers, not just the first one.
    contents = [item['content'] for item in json.loads(reviews.content)['results']]
    return movie_name, '\n'.join(contents)

# The function add_to_mongoDb() is used to add these movies into the database. It has three attributes namely Name, Link and Review.
def add_to_mongoDb():
    # As written, this function connects to the MongoDB server on
    # localhost:27017, opens collection "movie_review" of database "projectdb"
    # and pretty-prints every document in it. It returns None.
    #
    # The quoted blocks below are disabled code kept as string literals:
    #   * the first builds up to 100 {Name, Link, Review} dicts from
    #     data/tmdb_movies.csv using the NYT and TMDB helpers,
    #   * the second inserts those dicts into the collection,
    #   * the third drops the whole database.
    # NOTE(review): because the first quoted block is the first statement of
    # the function, Python treats it as this function's docstring.

    # Uncomment this commented part to add more movies into the database.
    '''
    movie_names_df = pd.read_csv('data/tmdb_movies.csv')
    movie_dict = {}
    movie_link_dict = {}
    movie_review_dict = {}
    movie_dict_dblist = []

    for idx in range(0,100):
        movie_dict["Name"] = movie_names_df['title'][idx]
        nytresult = nyt_search_movie(movie_names_df['title'][idx])
        movielinkdict = get_nyt_movie_review_link(nytresult,movie_names_df['title'][idx])
        movie_link_dict['Link'] = movielinkdict
        movie_dict.update(movie_link_dict)
        name,review = tmdb_add_to_db(movie_names_df['title'][idx],movie_names_df['movie_id'][idx])
        movie_review_dict['Review'] = review
        movie_dict.update(movie_review_dict)
        movie_dict_dblist.append(movie_dict.copy())
    '''
    # Creating a mongo client and creating a database and its collection
    myclient = pymongo.MongoClient('mongodb://localhost:27017/')
    mydb = myclient["projectdb"]
    moviecol = mydb["movie_review"]
    
    # Uncomment this below command to insert more movies into a database
    # (it relies on movie_dict_dblist from the first quoted block above).
    '''x = moviecol.insert_many(movie_dict_dblist)'''
    # Dump every stored document to the cell output.
    for x in moviecol.find():
        pprint(x)
        
    # Uncomment this below command to drop the entire database
    '''myclient.drop_database("projectdb")'''
    return

## Run the following command to see output of the data stored in mongoDb database

In [None]:
# Pretty-prints every document stored in projectdb.movie_review; needs a MongoDB server on localhost:27017.
add_to_mongoDb()

## This function makes a search using New York Times API to get a review for a specific book name

In [7]:
def nyt_search_book(book_name):
    """Search the New York Times book-reviews endpoint by title.

    The API key is read from ``credentials.json`` (entry ``NYT_BOOK_KEY``).

    Args:
        book_name: Book title sent as the ``title`` parameter.

    Returns:
        The decoded JSON body of the response.
    """
    with open('credentials.json') as cred_data:
        nyt_book_key = json.load(cred_data)['NYT_BOOK_KEY']
    # Hand the title to requests as a parameter so spaces, '&', '#', ... are
    # percent-encoded instead of corrupting the URL.
    response = requests.get(
        "https://api.nytimes.com/svc/books/v3/reviews.json",
        params={'api-key': nyt_book_key, 'title': book_name},
    )
    return json.loads(response.content)

## This function returns a link to the review of the book whose name is provided by the user

In [8]:
def get_nyt_book_review_link(result, book_name):
    """Return the review URL of the entry whose title matches ``book_name``.

    Args:
        result: Decoded search response; read for ``num_results`` and a
            ``results`` list whose entries carry ``book_title`` and ``url``.
        book_name: Title to look for; compared case-insensitively with each
            entry's ``book_title``.

    Returns:
        The first matching entry's ``url``; ``''`` when the response reports
        no results or no entry matches.
    """
    if not result.get('num_results'):
        return ''
    # Lower-case both sides so the match does not depend on the caller having
    # normalised the title beforehand.
    wanted = book_name.lower()
    # Walk the list itself rather than range(num_results) so a mismatch
    # between the reported count and the list length cannot raise IndexError.
    for entry in result.get('results') or []:
        if entry['book_title'].lower() == wanted:
            return entry['url']
    # No title match: use the same "no link" marker as the empty case.
    return ''

## The following function is used to scrape text from the review links provided and get the review as text.

In [9]:
def scrape_text(url):
    """Download ``url`` and return the text of all its ``<p>`` elements.

    Args:
        url: Address of the page to scrape. Empty or ``None`` means "no
            review link" and short-circuits without a request.

    Returns:
        The paragraph texts joined by newlines, or ``None`` when no URL was
        given or downloading/parsing failed.
    """
    if not url:
        return None
    try:
        html = requests.get(url).content
        # convert html to BeautifulSoup object
        soup = BeautifulSoup(html, 'lxml')
        return '\n'.join(par.text for par in soup.find_all('p'))
    except Exception:
        # Best-effort scrape: any download/parse failure means "no text".
        # ``Exception`` (not a bare except) lets KeyboardInterrupt through.
        return None

## This function uses IBM Tone Analyzer API to identify the sentiment in the review text

In [10]:
def get_sentiment(text):
    """Run IBM Tone Analyzer on ``text`` and return its document-level result.

    The API key is read from ``credentials.json`` (entry ``IBM_TONE_KEY``).

    Args:
        text: Review text to analyse. Empty or ``None`` (e.g. a failed
            scrape) is rejected before any API call is made.

    Returns:
        The value of ``.get_result()`` on the tone response, or ``None`` when
        there is no text or the service raises ``WatsonApiException``.
    """
    if not text:
        return None
    with open('credentials.json') as cred_data:
        ibm_tone_key = json.load(cred_data)['IBM_TONE_KEY']
    tone_analyzer = ToneAnalyzerV3(
        version='2018-11-26',
        iam_apikey=ibm_tone_key,
        url='https://gateway.watsonplatform.net/tone-analyzer/api'
    )
    try:
        return tone_analyzer.tone(
            {'text': text},
            'application/json',  # getting json response
            sentences=False  # set to True to also get per-sentence sentiments
        ).get_result()
    except WatsonApiException as ex:
        print("Method failed with status code " + str(ex.code) + ": " + ex.message)
        return None

## This function extracts the list of tones and the corresponding scores from the output of the above function

In [11]:
def get_tone(sentiment):
    """Split a tone-analysis result into parallel name and score lists.

    Args:
        sentiment: Mapping with ``document_tone.tones``, a list of dicts that
            each carry ``tone_name`` and ``score``. ``None`` or a malformed
            value is accepted and treated as "no tones".

    Returns:
        ``(tone_list, tone_score)``. When no tones were detected or the input
        cannot be read, both lists are ``[0]`` - the placeholder the plotting
        code uses for "nothing to show".
    """
    try:
        tones = sentiment['document_tone']['tones']
        tone_list = [tone['tone_name'] for tone in tones]
        tone_score = [tone['score'] for tone in tones]
    except (TypeError, KeyError):
        # Missing/None analysis or unexpected schema: return the placeholder
        # so callers can always unpack the result.
        return [0], [0]
    if not tone_list:
        return [0], [0]
    return tone_list, tone_score

## The following functions are used for extracting tweets from twitter and processing those tweets to obtain as much text as possible

In [12]:
# The clean_tweet() function is used to clean the tweets that are obtained

def clean_tweet(tweet):
    """Strip mentions, URLs and non-alphanumeric characters from a tweet.

    Each ``@mention``, each ``scheme://...`` URL and every character other
    than ASCII letters, digits, space and tab is replaced by a space; the
    remaining words are then re-joined with single spaces.

    Args:
        tweet: Raw tweet text.

    Returns:
        The cleaned text (``''`` if nothing is left).
    """
    # Raw string: the pattern uses regex escapes (\w, \S) that are invalid
    # string escapes and trigger warnings in a normal string literal.
    noise = r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+://\S+)"
    return ' '.join(re.sub(noise, " ", tweet).split())

# The following function fetches the tweets
def get_tweets(text, n=50):
    """Fetch and clean tweets for the hashtag ``#<text>``.

    Twitter credentials are read from ``credentials.json`` (entries
    ``CONSUMER_KEY``, ``CONSUMER_SECRET``, ``ACCESS_KEY``, ``ACCESS_SECRET``).

    Args:
        text: Hashtag to search for, without the leading ``#``.
        n: Maximum number of tweets to fetch (defaults to 50).

    Returns:
        A list with the ``clean_tweet``-processed text of each tweet.
    """
    with open('credentials.json') as cred_data:
        info = json.load(cred_data)

    auth = tweepy.OAuthHandler(info['CONSUMER_KEY'], info['CONSUMER_SECRET'])
    # Apply the user access token as well; without it the handler only
    # carries the consumer credentials.
    auth.set_access_token(info['ACCESS_KEY'], info['ACCESS_SECRET'])
    api = tweepy.API(auth)

    # Clean the tweet text directly: str() of the UTF-8 encoded bytes would
    # yield the "b'...'" repr and leak a stray "b" plus "\x.." escape
    # fragments into every tweet.
    return [
        clean_tweet(tweet.text)
        for tweet in tweepy.Cursor(api.search, q='#' + text, rpp=100).items(n)
    ]

#The following function is used to get the review of the twitter data that is being obtained for a particular movie or a book
def get_tweets_review(name):
    """Return the tones detected in recent tweets about ``name``.

    The tweets fetched by ``get_tweets`` are merged into one text, analysed
    with ``get_sentiment`` and unpacked with ``get_tone``.

    Args:
        name: Movie or book name used as the hashtag.

    Returns:
        ``(tone_list, tone_score)``; ``([0], [0])`` when any step fails.
    """
    try:
        # Separate tweets with a space so the last word of one tweet does not
        # fuse with the first word of the next.
        merged_tweets = ' '.join(get_tweets(name))
        tone_list, tone_score = get_tone(get_sentiment(merged_tweets))
    except Exception:
        # Best-effort: Twitter/IBM failures fall back to the placeholder.
        tone_list, tone_score = [0], [0]
    return tone_list, tone_score

## The following function uses all the functions explained above to get the sentiments of the review of a movie. Go to the function and you can get an idea about what it does from the explanation stated above it.

In [13]:
def get_movie_review(name):
    """Collect the tones of a movie's reviews from NY Times, TMDB and Twitter.

    Each source is processed independently; a failure in one source yields
    the placeholder ``[0]`` lists for that source only.

    Args:
        name: Movie title as entered by the user.

    Returns:
        ``(tone_list_tmdb, tone_score_tmdb, tone_list_nyt, tone_score_nyt,
        tweet_tone_list, tweet_tone_score, name)``.
    """
    # NY Times: search -> review link -> scraped text -> tones.
    try:
        result = nyt_search_movie(name)
        link = get_nyt_movie_review_link(result, name)
        nyt_sentiment = get_sentiment(scrape_text(link))
        tone_list_nyt, tone_score_nyt = get_tone(nyt_sentiment)
    except Exception:
        # ``Exception`` instead of a bare except so Ctrl-C still interrupts.
        tone_list_nyt, tone_score_nyt = [0], [0]

    # TMDB: review text -> tones.
    try:
        movie_name, tmdb_review = tmdb_search_movie(name)
        tone_list_tmdb, tone_score_tmdb = get_tone(get_sentiment(tmdb_review))
    except Exception:
        tone_list_tmdb, tone_score_tmdb = [0], [0]

    tweet_tone_list, tweet_tone_score = get_tweets_review(name)
    return (tone_list_tmdb, tone_score_tmdb, tone_list_nyt, tone_score_nyt,
            tweet_tone_list, tweet_tone_score, name)

## The following function uses all the functions explained above to get the sentiments of the review of a book. Go to the function and you can get an idea about what it does from the explanation stated above it.

In [14]:
def get_book_review(name):
    """Collect the tones of a book's reviews from NY Times, Goodreads and Twitter.

    Each source is processed independently; a failure in one source yields
    the placeholder ``[0]`` lists for that source only.

    Args:
        name: Book title; matched case-insensitively against the ``title``
            column of ``data/goodreads.csv``.

    Returns:
        ``(tone_list_nyt, tone_score_nyt, tone_list_gr, tone_score_gr,
        tweet_tone_list, tweet_tone_score, name)``.
    """
    # NY Times: search -> review link -> scraped text -> tones.
    try:
        result = nyt_search_book(name)
        link = get_nyt_book_review_link(result, name)
        nyt_sentiment = get_sentiment(scrape_text(link))
        tone_list_nyt, tone_score_nyt = get_tone(nyt_sentiment)
    except Exception:
        # ``Exception`` instead of a bare except so Ctrl-C still interrupts.
        tone_list_nyt, tone_score_nyt = [0], [0]

    # Goodreads: the review comes from the local CSV export.
    try:
        book_details_df = pd.read_csv('data/goodreads.csv')
        # Lower-case both sides so the lookup does not depend on the caller
        # having normalised the title.
        title_matches = book_details_df['title'].str.lower() == name.lower()
        # .iloc[0] raises when no row matches; handled by the fallback below.
        gr_review = book_details_df.loc[title_matches, 'review'].iloc[0]
        tone_list_gr, tone_score_gr = get_tone(get_sentiment(gr_review))
    except Exception:
        tone_list_gr, tone_score_gr = [0], [0]

    tweet_tone_list, tweet_tone_score = get_tweets_review(name)
    return (tone_list_nyt, tone_score_nyt, tone_list_gr, tone_score_gr,
            tweet_tone_list, tweet_tone_score, name)


## The following cell manages the visualizations of different pie charts, bar graphs, table and bubble chart.

## Kindly wait for the output once you run this last cell as it takes time to load all the charts and graphs and also to fetch data from the API.
### If it doesn't show any plot then the review for that particular movie or book name doesn't exist

In [15]:
# The following functions refresh the graphs and charts whenever the user
# submits a movie/book name or switches the dropdown.
def _show_sentiments(nyt_names, nyt_scores, alt_names, alt_scores,
                     twitter_names, twitter_scores, alt_source, title_name):
    """Push one set of tone lists into the figure widgets g1, g2, g4..g8.

    ``alt_source`` is the label of the second platform ('TMDB' or
    'Goodreads'); ``title_name`` is appended to every chart title.
    """
    with g1.batch_update():
        # NY Times pie + bar
        g1.data[0].labels = nyt_names
        g1.data[0].values = nyt_scores
        g1.layout.title = 'NY Times Sentiment (%) for ' + title_name
        g2.layout.title = 'NY Times Sentiment Chart for ' + title_name
        g2.data[0].x = nyt_names
        g2.data[0].y = nyt_scores
        # second platform pie + bar
        g4.data[0].labels = alt_names
        g4.data[0].values = alt_scores
        g4.layout.title = alt_source + ' Sentiment (%) for ' + title_name
        g5.layout.title = alt_source + ' Sentiment Chart for ' + title_name
        g5.data[0].x = alt_names
        g5.data[0].y = alt_scores
        # comparison bubble chart
        g6.data[0].x = nyt_names
        g6.data[0].y = nyt_scores
        g6.data[1].x = alt_names
        g6.data[1].y = alt_scores
        g6.data[1].name = alt_source
        g6.data[2].x = twitter_names
        g6.data[2].y = twitter_scores
        g6.layout.title = 'Sentiment Comparisons for ' + title_name
        # Twitter pie + bar
        g7.data[0].labels = twitter_names
        g7.data[0].values = twitter_scores
        g7.layout.title = 'Twitter Sentiment (%) for ' + title_name
        g8.layout.title = 'Twitter Sentiment Chart for ' + title_name
        g8.data[0].x = twitter_names
        g8.data[0].y = twitter_scores


def response(change):
    """Widget callback: re-fetch the sentiments for the entered name and redraw."""
    if w.value == "Movies":  # "Movies" is selected in the dropdown
        (tmdb_names, tmdb_scores, nyt_names, nyt_scores,
         twitter_names, twitter_scores, shown_name) = get_movie_review(text_input.value)
        _show_sentiments(nyt_names, nyt_scores, tmdb_names, tmdb_scores,
                         twitter_names, twitter_scores, 'TMDB', shown_name)
        message2.value = ""
    if w.value == "Books":  # "Books" is selected in the dropdown
        (nyt_names, nyt_scores, gr_names, gr_scores,
         twitter_names, twitter_scores, shown_name) = get_book_review(text_input.value.lower())
        _show_sentiments(nyt_names, nyt_scores, gr_names, gr_scores,
                         twitter_names, twitter_scores, 'Goodreads', shown_name)

# Dropdown menu
# Dropdown that selects whether the entered name is a movie or a book
w = widgets.Dropdown(
    options= ['Movies', 'Books'],
    value='Movies', # initial selection is "Movies"
    description='Select:',
)

# Submit button
button = widgets.Button(description="Submit")

# Text box for the user to enter the name of a movie or a book
text_input = widgets.Text(
    description='Name:',
    value='Spectre', # pre-filled with a default movie name
)

# Fetch the sentiments of the default movie so every chart starts populated.
(tone_list_tmdb, tone_score_tmdb,
 tone_list_nyt, tone_score_nyt,
 tweet_tone_list, tweet_tone_score,
 movie_name) = get_movie_review('Spectre')


def _pie_trace(tone_names, tone_scores):
    """Pie chart trace of tone scores."""
    return go.Pie(labels=tone_names, values=tone_scores, opacity=0.7)


def _bar_trace(tone_names, tone_scores):
    """Bar chart trace of tone scores."""
    return go.Bar(x=tone_names, y=tone_scores, name="Sentiment", opacity=0.7)


def _bubble_trace(tone_names, tone_scores, source, rgb):
    """Bubble (large-marker scatter) trace for the comparison chart."""
    bubble = dict(symbol='circle', size=50, color=rgb, line=dict(width=2))
    return go.Scatter(
        x=tone_names,
        y=tone_scores,
        mode='markers',
        opacity=0.7,
        name=source,
        text='Sentiment Score',
        marker=bubble,
    )


# Tone name -> description rows shown in the explanation table.
_tone_glossary = [
    ('Anger', 'Anger is evoked due to humiliation, conflict or negligence. There is a verbal attack.'),
    ('Fear', 'Fear is a response to impending danger. Fear can be a mild caution or an extreme phobia.'),
    ('Joy', 'Joy (or happiness) has shades of enjoyment, satisfaction, and pleasure.'),
    ('Sadness', 'Sadness indicates a feeling of loss and disadvantage.'),
    ('Analytical', 'An analytical tone indicates reasoning and analytical attitude about things. It is often rational.'),
    ('Confident', 'A confident tone indicates a degree of certainty.'),
    ('Tentative', 'A tentative tone indicates a degree of inhibition or doubt.'),
]
_border = dict(color='#7D7F80')

trace1 = _pie_trace(tone_list_nyt, tone_score_nyt)      # NY Times pie chart
trace2 = _bar_trace(tone_list_nyt, tone_score_nyt)      # NY Times bar graph
trace3 = go.Table(                                      # sentiment name / description table
    columnorder=[1, 2],
    columnwidth=[5, 20],
    header=dict(
        values=['<b>Sentiment type</b><br>', '<b>DESCRIPTION</b>'],
        line=_border,
        fill=dict(color='#a1c3d1'),
        align=['center', 'center'],
    ),
    cells=dict(
        values=[
            [tone for tone, _ in _tone_glossary],
            [meaning for _, meaning in _tone_glossary],
        ],
        line=_border,
        fill=dict(color='#EDFAFF'),
        align=['center', 'left'],
    ),
)
trace4 = _pie_trace(tone_list_tmdb, tone_score_tmdb)    # TMDB/Goodreads pie chart
trace5 = _bar_trace(tone_list_tmdb, tone_score_tmdb)    # TMDB/Goodreads bar graph
# Bubble traces compared side by side in one chart
trace6 = _bubble_trace(tone_list_nyt, tone_score_nyt, 'NY Times', 'rgb(255, 144, 14)')
trace7 = _bubble_trace(tone_list_tmdb, tone_score_tmdb, 'TMDB', 'rgb(93, 164, 214)')
trace8 = _bubble_trace(tweet_tone_list, tweet_tone_score, 'Twitter', 'rgb(44, 160, 101)')
trace9 = _pie_trace(tweet_tone_list, tweet_tone_score)  # Twitter pie chart
trace10 = _bar_trace(tweet_tone_list, tweet_tone_score) # Twitter bar graph



def _figure(traces, title, labelled_axes=False):
    """Wrap ``traces`` in a FigureWidget titled ``title``.

    With ``labelled_axes`` the x/y axes get the 'Sentiment Name' /
    'Sentiment Score' captions used by the bar and bubble charts.
    """
    layout_options = {'title': title}
    if labelled_axes:
        layout_options['xaxis'] = dict(title='Sentiment Name')
        layout_options['yaxis'] = dict(title='Sentiment Score')
    return go.FigureWidget(data=traces, layout=go.Layout(**layout_options))


g1 = _figure([trace1], 'NY Times Sentiment(%) for ' + movie_name)
g2 = _figure([trace2], 'NY Times Sentiment Chart for ' + movie_name, labelled_axes=True)
g3 = _figure([trace3], 'Sentiment Description Table')
g4 = _figure([trace4], 'TMDB Sentiment(%) for ' + movie_name)
g5 = _figure([trace5], 'TMDB Sentiment Chart for ' + movie_name, labelled_axes=True)

g6 = _figure([trace6, trace7, trace8], 'Sentiment Comparisons for ' + movie_name, labelled_axes=True)

g7 = _figure([trace9], 'Twitter Sentiment(%) for ' + movie_name)
g8 = _figure([trace10], 'Twitter Sentiment Chart for ' + movie_name, labelled_axes=True)


# Empty HTML placeholder shown next to the submit button
message = widgets.HTML(
    value="",
)

# Second empty HTML placeholder, placed between the input row and the charts
message2 = widgets.HTML(
    value="",
)

# Re-run response() when the button is clicked or the dropdown value changes
button.on_click(response)
w.observe(response, names="value")
# Observing the text box directly is disabled; the button triggers the refresh instead.
#text_input.observe(response,names="value")
# Page layout: one row per pair of charts, stacked vertically under the
# description table, the dropdown and the input row.
container1 = widgets.HBox(children=[text_input,button,message])
container2 = widgets.HBox(children=[g1,g2]) 
container3 = widgets.HBox(children=[g4,g5])
container4 = widgets.HBox(children=[g7,g8])
container5 = widgets.HBox(children=[g6])
container6 = widgets.VBox(children=[g3,w,container1,message2,container2,container3,container4,container5])
display(container6)

VBox(children=(FigureWidget({
    'data': [{'cells': {'align': [center, left],
                        'fill':…