In [1]:
from elasticsearch import Elasticsearch, helpers
from getpass import getpass

# Connect to the Elastic Cloud server.
# Both values are read with getpass so they are not echoed or stored in the notebook.
ELASTIC_CLOUD_ID = getpass("Elastic Cloud ID: ")
ELASTIC_API_KEY = getpass("Elastic API Key: ")

# Create an Elasticsearch client using the provided credentials
es = Elasticsearch(
    cloud_id=ELASTIC_CLOUD_ID,  # cloud id can be found under deployment management
    api_key=ELASTIC_API_KEY,  # API key for connecting to Elastic, found under Deployments - Security
)

# This import was indented by two spaces, which raises IndentationError for a
# top-level statement; it is now flush with the left margin.
from elasticsearch.client import MlClient


In [40]:
# Name of the index used by the cells below.
index_name = "20_news"

# Settings shared by every analysed text field in the mapping.
_english_text = {"type": "text", "analyzer": "english"}

# Field mappings: `description` and `name` turn on fielddata, while `category`
# carries an extra `keyword` sub-field.
mappings = {
    "properties": {
        "description": {**_english_text, "fielddata": True},
        "category": {
            **_english_text,
            "fields": {
                "keyword": {"type": "keyword", "ignore_above": 512},
            },
        },
        "name": {**_english_text, "fielddata": True},
    }
}

In [3]:
# Small hand-written sample: one (name, category, description) triple per item.
_sample_rows = [
    (
        "Tomato",
        "fruit",
        "The tomato is the edible berry of the plant Solanum lycopersicum, commonly known as the tomato plant. The species originated in western South America, Mexico, and Central America. The Nahuatl word tomatl gave rise to the Spanish word tomate, from which the English word tomato derives. Its domestication and use as a cultivated food may have originated with the indigenous peoples of Mexico",
    ),
    (
        "Potato",
        "vegetable",
        "The potato is a starchy root vegetable native to the Americas that is consumed as a staple food in many parts of the world. Potatoes are tubers of the plant Solanum tuberosum, a perennial in the nightshade family Solanaceae.",
    ),
    (
        "Baguette",
        "bread",
        "A baguette is a long, thin type of bread of French origin that is commonly made from basic lean dough (the dough, not the shape, is defined by French law). It is distinguishable by its length and crisp crust. ",
    ),
]

# Expand each triple into a document dict with the keys in their original order.
data = [
    {"name": name, "category": category, "description": description}
    for name, category, description in _sample_rows
]

In [None]:
from datasets import load_dataset

# Load the "SetFit/20_newsgroups" dataset through `datasets.load_dataset`.
dataset = load_dataset("SetFit/20_newsgroups")

In [41]:
# Create the index only when it is missing, so that re-running this cell
# (or the whole notebook) does not fail because the index already exists.
if not es.indices.exists(index=index_name):
    es.indices.create(index=index_name)

def generate_docs(data, index_name):
    """Yield one bulk action per document, tagged with its target index.

    Args:
        data: Iterable of mapping-like documents.
        index_name: Value stored under the ``_index`` key of every action.

    Yields:
        A new dict containing the document's fields plus ``_index``.
    """
    for element in data:
        # Build a fresh dict rather than calling element.update(), so the
        # caller's documents are not modified as a side effect.
        yield {**element, "_index": index_name}

# Index the training split with the bulk helper.
load = helpers.bulk(es, generate_docs(dataset["train"], index_name))

# Refresh explicitly so the documents just indexed are searchable by the very
# next request, instead of relying on the index's periodic refresh.
es.indices.refresh(index=index_name)

In [53]:
# Search the index without a query and print the stored source of each hit.
response = es.search(index=index_name)
hits = response["hits"]["hits"]
for hit in hits:
    print(hit["_source"])

{'text': 'I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.', 'label': 7, 'label_text': 'rec.autos'}
{'text': "A fair number of brave souls who upgraded their SI clock oscillator have\nshared their experiences for this poll. Please send a brief message detailing\nyour experiences with the procedure. Top speed attained, CPU rated speed,\nadd on cards and adapters, heat sinks, hour of usage per day, floppy disk\nfunctionality with 800 and 1.4 m floppies are especially requested.\n\nI will be summarizing in the next two days, so please add to the network\nknowledge

In [59]:
# Use the first hit of the current `response` as the reference document.
first_hit = response["hits"]["hits"][0]
example = first_hit["_source"]
print(example["text"], example["label_text"], sep="\n")

I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.
rec.autos


In [60]:
# more_like_this clause: compare the example's text against the listed fields.
mlt_clause = {
    "fields": ["text", "label_text"],
    "like": example["text"],
    "min_term_freq": 1,
    "max_query_terms": 20,
}
query = {"more_like_this": mlt_clause}

In [61]:
# Run the query and print the label of every returned document.
response = es.search(index=index_name, query=query)
for hit in response["hits"]["hits"]:
    source = hit["_source"]
    print(source["label_text"])

rec.autos
rec.autos
rec.autos
misc.forsale
rec.autos
rec.autos
rec.autos
rec.autos
rec.autos
rec.motorcycles


In [62]:
from operator import itemgetter
def get_best_category(response, field="category"):
    """Return the label with the highest summed score across the hits.

    Args:
        response: Search response; ``response['hits']['hits']`` is iterated
            and each hit must provide ``_score`` and ``_source[field]``.
        field: Key inside ``_source`` that holds the label(s). Defaults to
            ``"category"``.

    Returns:
        The label whose accumulated ``_score`` is largest (the first one seen
        wins a tie), or ``None`` when the response has no hits.

    Raises:
        KeyError: If a hit's ``_source`` does not contain ``field``.
    """
    totals = {}
    for hit in response["hits"]["hits"]:
        score = hit["_score"]
        labels = hit["_source"][field]
        # A plain string is a single label; iterating it directly would
        # credit every individual character instead of the label itself.
        if isinstance(labels, str):
            labels = [labels]
        for label in labels:
            totals[label] = totals.get(label, 0) + score
    if not totals:
        # No hits: return None instead of failing on an unassigned variable.
        return None
    # max() keeps the first maximal key in insertion order.
    return max(totals, key=totals.get)

In [None]:
# Pass the latest search response; calling the function with no argument
# raises TypeError because `response` is a required parameter.
# NOTE(review): the hits printed above store their label under `label_text` —
# confirm that this matches the field get_best_category reads.
get_best_category(response)