In [1]:
import numpy as np # linear algebra
import os # accessing directory structure
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

### Data is downloaded from: https://grouplens.org/datasets/movielens/

In [2]:
# Row cap handed to pd.read_csv for every table; None reads each file in full.
nRowsRead = None
# Dataset variant; selects the ../data/movie-lens-{size}/ directory.
size = "large"

# Load the three MovieLens tables in order (movies, ratings, tags) with the
# same parsing options for each file.
movies, ratings, tags = (
    pd.read_csv(csv_path, delimiter=',', nrows = nRowsRead)
    for csv_path in (
        f'../data/movie-lens-{size}/movies.csv',
        f'../data/movie-lens-{size}/ratings.csv',
        f'../data/movie-lens-{size}/tags.csv',
    )
)

In [3]:
# Dimensions of the movies table as a (rows, columns) tuple.
movies.shape

(62423, 3)

Let's take a quick look at what the data looks like:

In [4]:
# Lift the column cap so wide frames are rendered without elided columns.
pd.options.display.max_columns = None

# Preview the first five rows of each table, in load order.
display(movies.head(), ratings.head(), tags.head())


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


Unnamed: 0,userId,movieId,tag,timestamp
0,3,260,classic,1439472355
1,3,260,sci-fi,1439472256
2,4,1732,dark comedy,1573943598
3,4,1732,great dialogue,1573943604
4,4,7569,so bad it's good,1573943455


### first time setup
- pipenv install "elasticsearch<8.0.0"
- docker pull docker.elastic.co/elasticsearch/elasticsearch:7.10.0
- docker run --name elasticsearch -p 9200:9200 -p 9300:9300 -e "discovery.type=single-node" docker.elastic.co/elasticsearch/elasticsearch:7.10.0

### second time and beyond
- docker stop elasticsearch
- docker start elasticsearch

In [5]:
from elasticsearch import Elasticsearch, helpers

# Connect to Elasticsearch with HTTP (not HTTPS)
es = Elasticsearch(
    ['http://localhost:9200']  # Use HTTP instead of HTTPS
)

# Define the index name
index_name = 'movies_index_large'

# Drop any previous copy of the index so that re-running this cell rebuilds it
# from scratch instead of appending duplicate documents.
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)
    print(f"Index '{index_name}' deleted.")
else:
    print(f"Index '{index_name}' does not exist.")

# Create the index
print(f"Index '{index_name}' creating...")

# ignore=400 is kept from the original cell: a 400 response from the create
# call does not raise.
es.indices.create(index=index_name, ignore=400)

# Lazily build one bulk action per movie row. Each action carries the same
# three fields the per-document version indexed: title, movieId and genres.
actions = (
    {
        '_index': index_name,
        '_source': {
            'title': title,
            'movieId': movie_id,
            'genres': genres,
        },
    }
    for title, movie_id, genres in zip(
        movies['title'], movies['movieId'], movies['genres']
    )
)

# Send the documents in batches of 500 through the bulk API instead of issuing
# one HTTP request per document. streaming_bulk yields one result per document
# and raises on a failed document by default, so `i` counts indexed documents.
for i, _ in enumerate(helpers.streaming_bulk(es, actions, chunk_size=500)):
    if i % 500 == 0:
        print(f'indexing the {i}th docs')

# Make every indexed document visible to search before the next cell runs;
# without an explicit refresh the most recent writes may not be searchable yet.
es.indices.refresh(index=index_name)
print(f"Index '{index_name}' created.")

Index 'movies_index_large' does not exist.
Index 'movies_index_large' creating...
indexing the 0th docs
indexing the 500th docs
indexing the 1000th docs
indexing the 1500th docs
indexing the 2000th docs
indexing the 2500th docs
indexing the 3000th docs
indexing the 3500th docs
indexing the 4000th docs
indexing the 4500th docs
indexing the 5000th docs
indexing the 5500th docs
indexing the 6000th docs
indexing the 6500th docs
indexing the 7000th docs
indexing the 7500th docs
indexing the 8000th docs
indexing the 8500th docs
indexing the 9000th docs
indexing the 9500th docs
indexing the 10000th docs
indexing the 10500th docs
indexing the 11000th docs
indexing the 11500th docs
indexing the 12000th docs
indexing the 12500th docs
indexing the 13000th docs
indexing the 13500th docs
indexing the 14000th docs
indexing the 14500th docs
indexing the 15000th docs
indexing the 15500th docs
indexing the 16000th docs
indexing the 16500th docs
indexing the 17000th docs
indexing the 17500th docs
indexi

In [6]:
def search_results_es(query_text, es, index_name, top_k):
    """Run a multi-field text search and return the matching movies.

    The query is a ``multi_match`` over ``title`` (boost 2) and ``genres``
    (boost 0.5).

    Args:
        query_text: Free-text search string.
        es: Client object exposing ``search(index=..., query=..., size=...)``.
        index_name: Name of the index to search.
        top_k: Value passed as ``size``, i.e. the number of hits requested.

    Returns:
        A list of ``(title, movieId, genres)`` tuples, one per hit, in the
        order the hits were returned. A field missing from a hit's
        ``_source`` is reported as ``None``.
    """
    multi_match = {
        "query": query_text,
        "fields": ["title^2", "genres^0.5"],
    }
    response = es.search(
        index=index_name,
        query={"multi_match": multi_match},
        size=top_k,
    )

    matches = []
    for hit in response['hits']['hits']:
        source = hit['_source']
        matches.append(
            (source.get('title'), source.get('movieId'), source.get('genres'))
        )
    return matches


In [7]:
# Smoke-test the search helper: request up to ten hits for a one-word query.
query_text = "toy"
search_results = search_results_es(
    query_text=query_text,
    es=es,
    index_name=index_name,
    top_k=10,
)
print(search_results)

[('Toy, The (1982)', 4929, 'Comedy'), ('Toy Story (1995)', 1, 'Adventure|Animation|Children|Comedy|Fantasy'), ('Toy Soldiers (1991)', 5843, 'Action|Drama'), ('Toy Soldiers (1984)', 139263, 'Action|Drama|Thriller'), ('Toy Masters (2014)', 143537, 'Documentary'), ('Toy Reanimator (2002)', 153234, 'Fantasy|Sci-Fi'), ('Toy Gun (2018)', 199484, 'Action|Comedy|Crime'), ('Toy Story 2 (1999)', 3114, 'Adventure|Animation|Children|Comedy|Fantasy'), ('Toy Story 3 (2010)', 78499, 'Adventure|Animation|Children|Comedy|Fantasy|IMAX'), ('Christmas Toy, The (1986)', 80141, 'Children|Musical')]


# Load model 

In [8]:
from elasticsearch import Elasticsearch

# Name of the index that holds the movie documents.
index_name = 'movies_index_large'

# Client for the local Elasticsearch node, reached over plain HTTP (not HTTPS).
es = Elasticsearch(hosts=['http://localhost:9200'])


In [9]:
def search_results_es(query_text, es, index_name, top_k):
    """Run a multi-field text search and return the matching movies.

    The query is a ``multi_match`` over ``title`` (boost 2) and ``genres``
    (boost 0.5).

    Args:
        query_text: Free-text search string.
        es: Client object exposing ``search(index=..., query=..., size=...)``.
        index_name: Name of the index to search.
        top_k: Value passed as ``size``, i.e. the number of hits requested.

    Returns:
        A list of ``(title, movieId, genres)`` tuples, one per hit, in the
        order the hits were returned. A field missing from a hit's
        ``_source`` is reported as ``None``.
    """
    multi_match = {
        "query": query_text,
        "fields": ["title^2", "genres^0.5"],
    }
    response = es.search(
        index=index_name,
        query={"multi_match": multi_match},
        size=top_k,
    )

    matches = []
    for hit in response['hits']['hits']:
        source = hit['_source']
        matches.append(
            (source.get('title'), source.get('movieId'), source.get('genres'))
        )
    return matches


In [10]:
# Smoke-test the search helper: request up to ten hits for a one-word query.
query_text = "toy"
search_results = search_results_es(
    query_text=query_text,
    es=es,
    index_name=index_name,
    top_k=10,
)
print(search_results)

[('Toy, The (1982)', 4929, 'Comedy'), ('Toy Story (1995)', 1, 'Adventure|Animation|Children|Comedy|Fantasy'), ('Toy Soldiers (1991)', 5843, 'Action|Drama'), ('Toy Soldiers (1984)', 139263, 'Action|Drama|Thriller'), ('Toy Masters (2014)', 143537, 'Documentary'), ('Toy Reanimator (2002)', 153234, 'Fantasy|Sci-Fi'), ('Toy Gun (2018)', 199484, 'Action|Comedy|Crime'), ('Toy Story 2 (1999)', 3114, 'Adventure|Animation|Children|Comedy|Fantasy'), ('Toy Story 3 (2010)', 78499, 'Adventure|Animation|Children|Comedy|Fantasy|IMAX'), ('Christmas Toy, The (1986)', 80141, 'Children|Musical')]
