# Boilerplate Setup

In [1]:
import requests
import json


# Optional, enable client-side caching for TMDB
# Requires: https://httpcache.readthedocs.org/en/latest/
#from httpcache import CachingHTTPAdapter
#tmdb_api.mount('https://', CachingHTTPAdapter())
#tmdb_api.mount('http://', CachingHTTPAdapter())

# Some utilities for flattening the explain into something a bit more
# readable. Pass Explain JSON, get something readable (ironically this is what Solr's default output is :-p)
def flatten(l):
    """Flatten one level of nesting: an iterable of iterables becomes one list.

    Args:
        l: An iterable whose elements are themselves iterable.

    Returns:
        A new list holding every item of every sublist, in encounter order.
    """
    # The comprehension used to be evaluated and discarded, so the function
    # always returned None; return it.
    return [item for sublist in l for item in sublist]

def simplerExplain(explainJson, depth=0):
    """Render an explain tree as indented "value, description" lines.

    Args:
        explainJson: Dict with 'value' and 'description' keys and, optionally,
            a 'details' list of child dicts of the same shape.
        depth: Nesting level of this node; each level indents by two spaces.

    Returns:
        A string with one newline-terminated line per node, parents before
        their children.
    """
    indent = " " * (depth * 2)
    pieces = [indent + "%s, %s\n" % (explainJson['value'], explainJson['description'])]
    if 'details' in explainJson:
        # Children are rendered one level deeper, in their original order.
        pieces.extend(simplerExplain(child, depth=depth + 1)
                      for child in explainJson['details'])
    return "".join(pieces)

# 3.2.2 Indexing TMDB Movies

In [2]:
def extract(path='tmdb.json'):
    """Load the movie dump from a JSON file.

    Args:
        path: Location of the JSON file. Defaults to 'tmdb.json' in the
            current working directory, which is what the function always read.

    Returns:
        The decoded JSON document.

    Raises:
        OSError: If the file cannot be opened (e.g. FileNotFoundError).
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Open in binary mode so the json module detects the UTF-8/16/32 encoding
    # itself instead of relying on the platform's default text encoding, and
    # use a context manager so the handle is always closed.
    with open(path, 'rb') as f:
        return json.load(f)

In [87]:
def reindex(analysisSettings={}, mappingSettings={}, movieDict={}):
    """Drop and recreate the 'tmdb' index, then bulk-index every movie.

    Args:
        analysisSettings: Value placed under settings.index.analysis in the
            index-creation body.
        mappingSettings: If truthy, sent as the 'mappings' section of the
            index-creation body.
        movieDict: Mapping whose values are movie documents. Each document
            must have an 'id' key, which is used as the document _id.

    Returns:
        None. Progress messages and the bulk response are printed.
    """
    jsonHeaders = {"content-type": "application/json"}
    settings = { #A
        "settings": {
            "number_of_shards": 1, #B
            "index": {
                "analysis" : analysisSettings, #C
            }}}

    if mappingSettings:
        settings['mappings'] = mappingSettings #C

    # The delete is best-effort: its response is deliberately not inspected
    # (there may be no index to delete yet).
    requests.delete("http://localhost:9200/tmdb") #D
    # Declare the body as JSON, like every other request in this notebook;
    # without the header the server may refuse the body and the settings and
    # mappings would never be applied.
    resp = requests.put("http://localhost:9200/tmdb",
                        data=json.dumps(settings), headers=jsonHeaders)
    if not resp.ok:
        # Report rather than raise so the notebook keeps running, but do not
        # let a rejected settings/mappings body go unnoticed.
        print("index creation failed: %s %s" % (resp.status_code, resp.text))

    # Collect the lines and join once; repeated string += is quadratic.
    bulkLines = []
    print("building...")
    for cnt, movie in enumerate(movieDict.values()):
        addCmd = {"index": {"_index": "tmdb", #E
                            "_type": "movie",
                            "_id": movie["id"]}}
        bulkLines.append(json.dumps(addCmd))
        bulkLines.append(json.dumps(movie))
        if cnt % 500 == 0:
            print(f'{cnt} movies has been processed')
    # Every line, including the last, is newline-terminated.
    bulkMovies = "".join(line + "\n" for line in bulkLines)

    print("indexing...")
    # NOTE(review): only the HTTP status is printed; per-item failures, if
    # any, would be in the response body, which is not inspected here.
    resp = requests.post("http://localhost:9200/_bulk", data=bulkMovies, headers=jsonHeaders)
    print(resp)
    print("done!")


In [88]:
# Load the movie dump once; later cells pass movieDict to reindex().
movieDict = extract()

In [89]:
# addCmd = {"index": {"_index": "tmdb", #E
#                             "_type": "movie",
#                             "_id": '93837'}}

# json.dumps(addCmd) + "\n" + json.dumps(movieDict['93837']) + "\n"

In [90]:
# Rebuild the index using reindex()'s defaults (empty analysis settings, no mappings).
reindex(movieDict=movieDict)

building...
0 movies has been processed
500 movies has been processed
1000 movies has been processed
1500 movies has been processed
2000 movies has been processed
2500 movies has been processed
3000 movies has been processed
indexing...
<Response [200]>
done!


# 3.2.3 Basic Searching

In [91]:
def search(query):
    """Run `query` against the tmdb movie search endpoint and print the hits.

    Args:
        query: Request body for the search; it is serialized with json.dumps.

    Prints a tab-separated header row, then one row per hit holding its
    1-based rank, its '_score' and the 'title' of its '_source'.
    """
    response = requests.get(
        'http://localhost:9200/tmdb/movie/_search',
        data=json.dumps(query),
        headers={"content-type": "application/json"},
    ) #A
    outerHits = json.loads(response.text)['hits']
    print("Num\tRelevance Score\t\tMovie Title\t\tOverview") #B
    for rank, hit in enumerate(outerHits['hits'], start=1):
        print(f"{rank}\t{hit['_score']}\t\t{hit['_source']['title']}")


In [92]:
usersSearch = 'basketball with cartoon aliens'
# Match the user's text against both fields; 'title^10' carries a boost of 10.
query = dict(
    query=dict(
        multi_match=dict(
            query=usersSearch, #A
            fields=['title^10', 'overview'], #B
        ),
    ),
    size='20',
)
search(query)

Num	Relevance Score		Movie Title		Overview
1	86.512436		Aliens
2	72.18011		Cowboys & Aliens
3	70.14692		The Basketball Diaries
4	54.21631		Aliens vs Predator: Requiem
5	46.29787		Dances with Wolves
6	46.29787		Friends with Benefits
7	46.29787		Fire with Fire
8	40.53669		Interview with the Vampire
9	40.53669		From Russia With Love
10	40.53669		Gone with the Wind
11	40.53669		Just Go With It
12	40.53669		My Week with Marilyn
13	40.53669		From Paris with Love
14	36.050636		Die Hard: With a Vengeance
15	36.050636		Girl with a Pearl Earring
16	32.458572		The Girl with the Dragon Tattoo
17	32.458572		The Life Aquatic With Steve Zissou
18	32.458572		Twin Peaks: Fire Walk with Me
19	32.458572		You Don't Mess With the Zohan
20	32.458572		The Man with the Golden Gun


# 3.3.1 Query Validation API

In [93]:
# Send the same multi_match query to the _validate endpoint with ?explain
# and print the decoded response.
query = dict(
    query=dict(
        multi_match=dict(
            query=usersSearch,  # User's query
            fields=['title^10', 'overview'],
        ),
    ),
)
httpResp = requests.get(
    'http://localhost:9200/tmdb/movie/_validate/query?explain',
    data=json.dumps(query),
    headers={"content-type": "application/json"},
)
print(json.loads(httpResp.text))

{'_shards': {'total': 1, 'successful': 1, 'failed': 0}, 'valid': True, 'explanations': [{'index': 'tmdb', 'valid': True, 'explanation': '+((overview:basketball overview:with overview:cartoon overview:aliens) | (title:basketball title:with title:cartoon title:aliens)^10.0) #*:*'}]}


# 3.3.3 Debugging Analysis

In [94]:
# Inner Layer of the Onion -- Why did the search engine consider these movies matches? Two sides to this
# (1) What tokens are placed in the search engine?
# (2) What did the search engine attempt to match exactly?

# Explain of what's happening when we construct these terms

#resp = requests.get(elasticSearchUrl + "/tmdb/_mapping/movie/field/title?format=yaml'
# Ask the _analyze endpoint how 'Fire with Fire' is tokenized for the 'title'
# field. To name an analyzer explicitly, add e.g. analyzer='standard' (or 'english').
text = dict(text='Fire with Fire', field='title')
resp = requests.get(
    'http://localhost:9200/tmdb/_analyze?format=yaml',
    data=json.dumps(text),
    headers={"content-type": "application/json"},
)
print(resp.text)

---
tokens:
- token: "fire"
  start_offset: 0
  end_offset: 4
  type: "<ALPHANUM>"
  position: 0
- token: "with"
  start_offset: 5
  end_offset: 9
  type: "<ALPHANUM>"
  position: 1
- token: "fire"
  start_offset: 10
  end_offset: 14
  type: "<ALPHANUM>"
  position: 2



# 3.3.5 -- Solving The Matching Problem

In [None]:
def reindex(analysisSettings={}, mappingSettings={}, movieDict={}):
    """Drop and recreate the 'tmdb' index, then bulk-index every movie.

    NOTE(review): this notebook defines reindex() twice with the same
    signature; when this cell runs it replaces the earlier definition.

    Args:
        analysisSettings: Value placed under settings.index.analysis in the
            index-creation body.
        mappingSettings: If truthy, sent as the 'mappings' section of the
            index-creation body.
        movieDict: Mapping whose values are movie documents. Each document
            must have an 'id' key, which is used as the document _id.

    Returns:
        None. Progress messages and the bulk response are printed.
    """
    jsonHeaders = {"content-type": "application/json"}
    settings = { #A
        "settings": {
            "number_of_shards": 1, #B
            "index": {
                "analysis" : analysisSettings, #C
            }}}

    if mappingSettings:
        settings['mappings'] = mappingSettings #C

    # The delete is best-effort: its response is deliberately not inspected
    # (there may be no index to delete yet).
    requests.delete("http://localhost:9200/tmdb") #D
    # Declare the body as JSON, like every other request in this notebook;
    # without the header the server may refuse the body and the settings and
    # mappings would never be applied.
    resp = requests.put("http://localhost:9200/tmdb",
                        data=json.dumps(settings), headers=jsonHeaders)
    if not resp.ok:
        # Report rather than raise so the notebook keeps running, but do not
        # let a rejected settings/mappings body go unnoticed.
        print("index creation failed: %s %s" % (resp.status_code, resp.text))

    # Collect the lines and join once; repeated string += is quadratic.
    bulkLines = []
    print("building...")
    for cnt, movie in enumerate(movieDict.values()):
        addCmd = {"index": {"_index": "tmdb", #E
                            "_type": "movie",
                            "_id": movie["id"]}}
        bulkLines.append(json.dumps(addCmd))
        bulkLines.append(json.dumps(movie))
        if cnt % 500 == 0:
            print(f'{cnt} movies has been processed')
    # Every line, including the last, is newline-terminated.
    bulkMovies = "".join(line + "\n" for line in bulkLines)

    print("indexing...")
    # NOTE(review): only the HTTP status is printed; per-item failures, if
    # any, would be in the response body, which is not inspected here.
    resp = requests.post("http://localhost:9200/_bulk", data=bulkMovies, headers=jsonHeaders)
    print(resp)
    print("done!")

In [131]:
# Both text fields get the same mapping; swap 'standard' for 'english' to
# try the English analyzer instead.
mappingSettings = {
    'movie': {
        'properties': {
            fieldName: {'type': 'text', 'analyzer': 'standard'}
            for fieldName in ('title', 'overview') #A
        }
    }
}
# Alternative kept for reference: the same mapping without the 'movie' level,
# using the 'english' analyzer.
# mappingSettings = {
#         'properties': {
#            'title': { #A
#                'type': 'text',
#                'analyzer': 'english',
#                # 'search_analyzer': 'simple'
#            },
#            'overview': {
#                'type': 'text',
#                'analyzer': 'english',
#                # 'search_analyzer': 'simple'
#            }
#         }
# }
reindex(mappingSettings=mappingSettings, movieDict=movieDict)

building...
0 movies has been processed
500 movies has been processed
1000 movies has been processed
1500 movies has been processed
2000 movies has been processed
2500 movies has been processed
3000 movies has been processed
indexing...
<Response [200]>
done!


In [132]:
# Repeat the tokenization check, this time naming the 'english' analyzer
# explicitly in the request.
text = dict(text='Fire with Fire', field='title', analyzer='english')
resp = requests.get(
    'http://localhost:9200/tmdb/_analyze?format=yaml',
    data=json.dumps(text),
    headers={"content-type": "application/json"},
)
print(resp.text)

---
tokens:
- token: "fire"
  start_offset: 0
  end_offset: 4
  type: "<ALPHANUM>"
  position: 0
- token: "fire"
  start_offset: 10
  end_offset: 14
  type: "<ALPHANUM>"
  position: 2



## Repeat the search

In [148]:
usersSearch = 'basketball with cartoon aliens'
# Same search as before, but with the title boost lowered to 0.1, the
# 'english' analyzer named in the query, and up to 100 results requested.
query = dict(
    query=dict(
        multi_match=dict(
            query=usersSearch, #A
            fields=['title^0.1', 'overview'], #B
            analyzer='english',
        ),
    ),
    size='100',
)
search(query)

Num	Relevance Score		Movie Title		Overview
1	7.3902683		The Flintstones
2	7.27343		Speed Racer
3	6.175305		Aliens vs Predator: Requiem
4	5.4407673		Invasion of the Body Snatchers
5	5.3837976		Space Jam
6	5.3837976		The Darkest Hour
7	5.3280087		Slither
8	5.273364		The Thing
9	5.273364		Escape from Planet Earth
10	5.056947		Independence Day
11	5.056947		Edge of Tomorrow
12	4.967671		The Last Starfighter
13	4.9201345		Avatar
14	4.909315		The X Files
15	4.7387524		Titan A.E.
16	4.695477		The Day the Earth Stood Still
17	4.6529856		Scary Movie 3
18	4.604123		Attack the Block
19	4.604123		Ghosts of Mars
20	4.49044		Outlander
21	4.4515624		Under the Skin
22	4.4515624		The Hitchhiker's Guide to the Galaxy
23	4.444916		Star Trek IV: The Voyage Home
24	4.444916		Battleship
25	4.338867		Contact
26	4.338867		Justice League: War
27	4.266855		The Host
28	4.1971936		Star Trek: Insurrection
29	4.1971936		Scary Movie 4
30	4.00122		Predators
31	4.00122		Lifted
32	3.9399004		Lilo & Stitch
33	3.8804312		

# 3.4.1 Decomposing Relevance Score With Lucene’s Explain

In [149]:
# Show the current contents of the query dict before 'explain' is added to it.
query

{'query': {'multi_match': {'query': 'basketball with cartoon aliens',
   'fields': ['title^0.1', 'overview'],
   'analyzer': 'english'}},
 'size': '100'}

In [151]:
# Turn on per-hit explanations. This mutates the existing `query` dict in
# place, so later cells that reuse `query` will also carry 'explain'.
query['explain'] = True
httpResp = requests.get('http://localhost:9200/tmdb/movie/_search', data=json.dumps(query), headers={"content-type": "application/json"})
print(httpResp)
jsonResp = json.loads(httpResp.text)
# print(json.dumps(jsonResp['hits']['hits'][0]['_explanation'], indent=True))
# The hit list is indexed from 0; in the recorded output, index 4 is
# "Space Jam". The index is hard-coded, so it depends on the current ranking.
print("Explain for %s" % jsonResp['hits']['hits'][4]['_source']['title'])
print(simplerExplain(jsonResp['hits']['hits'][4]['_explanation']))
# Earlier experiments, left disabled (the first ones use Python 2 print syntax):
# print "Explain for %s" % jsonResp['hits']['hits'][1]['_source']['title']
# print simplerExplain(jsonResp['hits']['hits'][1]['_explanation'])
# print "Explain for %s" % jsonResp['hits']['hits'][2]['_source']['title']
# print simplerExplain(jsonResp['hits']['hits'][2]['_explanation'])
# print "Explain for %s" % jsonResp['hits']['hits'][3]['_source']['title']
# print simplerExplain(jsonResp['hits']['hits'][3]['_explanation'])
# print("Explain for %s" % jsonResp['hits']['hits'][10]['_source']['title'])
# print(simplerExplain(jsonResp['hits']['hits'][10]['_explanation']))

# In the recorded output, index 36 is "Aliens"; this raises IndexError if
# fewer than 37 hits come back.
print("Explain for %s" % jsonResp['hits']['hits'][36]['_source']['title'])
print(simplerExplain(jsonResp['hits']['hits'][36]['_explanation']))


<Response [200]>
Explain for Space Jam
5.3837976, max of:
  5.3837976, sum of:
    5.3837976, weight(overview:alien in 1357) [PerFieldSimilarity], result of:
      5.3837976, score(freq=1.0), computed as boost * idf * tf from:
        2.2, boost
        3.9714398, idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:
          57, n, number of documents containing term
          3050, N, total number of documents with field
        0.6161949, tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:
          1.0, freq, occurrences of term within document
          1.2, k1, term saturation parameter
          0.75, b, length normalization parameter
          19.0, dl, length of field
          52.963608, avgdl, average length of field

Explain for Aliens
3.7667217, max of:
  3.7667217, sum of:
    3.7667217, weight(overview:alien in 454) [PerFieldSimilarity], result of:
      3.7667217, score(freq=1.0), computed as boost * idf * tf from:
        2.2, boost
        3.97143

# 3.4.4	Fixing Space Jam vs Alien Ranking

In [82]:
# Rebuild the query from scratch: title boost 0.1, no analyzer override, no
# explicit size, and explanations requested.
query = dict(
    query=dict(
        multi_match=dict(
            query=usersSearch,  # User's query
            fields=['title^0.1', 'overview'],
        ),
    ),
    explain=True,
)
search(query)


Num	Relevance Score		Movie Title		Overview
1	1.0016364		Space Jam
2	0.29594672		Grown Ups
3	0.28491083		Speed Racer
4	0.28491083		The Flintstones
5	0.2536686		White Men Can't Jump
6	0.2536686		Coach Carter
7	0.21968345		Semi-Pro
8	0.20324169		The Thing
9	0.1724563		Meet Dave
10	0.16911241		Teen Wolf
