# TF-IDF Based Search

## I. Simple Example


### Importing Libraries

In [1]:
%reset -f

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.feature_extraction.text import TfidfTransformer

### Prepare corpus

In [2]:
a = 'sie trinkt kaffee und sie liest bücher'
b = 'sie trinkt tee'
c = 'sie kauft bücher'

df = pd.DataFrame({'corpus': [a,b,c]})
df.index.name='id'
df

Unnamed: 0_level_0,corpus
id,Unnamed: 1_level_1
0,sie trinkt kaffee und sie liest bücher
1,sie trinkt tee
2,sie kauft bücher


### Prepare tfidf-matrix

In [3]:
data = df['corpus'].tolist()
model = TfidfVectorizer(norm=None)
model.fit(data)
X = model.transform(data)

dcount = pd.DataFrame(X.toarray(), columns = model.get_feature_names_out())
dcount.index.name='id'
dcount

Unnamed: 0_level_0,bücher,kaffee,kauft,liest,sie,tee,trinkt,und
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,1.287682,1.693147,0.0,1.693147,2.0,0.0,1.287682,1.693147
1,0.0,0.0,0.0,0.0,1.0,1.693147,1.287682,0.0
2,1.287682,0.0,1.693147,0.0,1.0,0.0,0.0,0.0


### Prepare query

In [4]:
query = ["er trinkt kaffee und tee"]
#query = ["er kauft limonade"]
query

['er trinkt kaffee und tee']

In [5]:
Xq = model.transform(query)
print(type(Xq))

#  columns = model.get_feature_names_out()
pd.DataFrame(columns=dcount.columns, data = Xq.toarray())



<class 'scipy.sparse._csr.csr_matrix'>


Unnamed: 0,bücher,kaffee,kauft,liest,sie,tee,trinkt,und
0,0.0,1.693147,0.0,0.0,0.0,1.693147,1.287682,1.693147


### Search Result

In [6]:
Xa = X.toarray()
Xqa = Xq.toarray()

print('Xa shape: ', Xa.shape)
display(Xa)
print()
print('Xqa shape: ', Xqa.shape)
display(Xqa)

Xa shape:  (3, 8)


array([[1.28768207, 1.69314718, 0.        , 1.69314718, 2.        ,
        0.        , 1.28768207, 1.69314718],
       [0.        , 0.        , 0.        , 0.        , 1.        ,
        1.69314718, 1.28768207, 0.        ],
       [1.28768207, 0.        , 1.69314718, 0.        , 1.        ,
        0.        , 0.        , 0.        ]])


Xqa shape:  (1, 8)


array([[0.        , 1.69314718, 0.        , 0.        , 0.        ,
        1.69314718, 1.28768207, 1.69314718]])

In [7]:
result = Xqa @ Xa.T
print('result: ')
result

result: 


array([[7.39161987, 4.52487249, 0.        ]])

In [8]:
df['result'] = result[0]
df

Unnamed: 0_level_0,corpus,result
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,sie trinkt kaffee und sie liest bücher,7.39162
1,sie trinkt tee,4.524872
2,sie kauft bücher,0.0


# II. Simple Search Engine

## Part 1: Prepare Data

### Preparation

In [9]:
%reset -f

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.feature_extraction.text import TfidfTransformer

a = 'sie trinkt kaffee und sie liest bücher'
b = 'sie trinkt tee'
c = 'sie kauft bücher'

df = pd.DataFrame({'corpus': [a,b,c]})
df.index.name='id'
df

Unnamed: 0_level_0,corpus
id,Unnamed: 1_level_1
0,sie trinkt kaffee und sie liest bücher
1,sie trinkt tee
2,sie kauft bücher



### Crawler



In [10]:
%reset -f
import pandas as pd
import wikipediaapi # vers <0.6; otherwise add user agent: https://pypi.org/project/Wikipedia-API/
import re

In [11]:
# crawler data

LANG = 'en' # 'de'
# read data from internet
READDATA = False

In [12]:
if LANG == 'en':
    # repositories
    #fn_crawler_datapq = 'data/corpuspq.gzip'
    fn_corpus = 'data/corpus.csv'
    articles = ['Desk pad','Data Science','Artificial intelligence', 'Somaliland', 
                 'Natural language processing', 'Arabian Sea', 'Suez Canal', 
                 'Statistics','Dependent and independent variables', 'Gulf of Aden', 
                 'Machine Learning','European Central Bank','Bank','Financial technology', 
                 'International Monetary Fund','Basketball','Swimming','Tennis']

elif LANG == 'de':
    # repositories
    #fn_crawler_datapq = 'data/corpuspq_de.gzip'
    fn_corpus = 'data/corpus_de.csv'
    articles_de = ['Schreibunterlage', 'Data Science', 'Künstliche Intelligenz', 'Somaliland', 
               'Verarbeitung natürlicher Sprache', 'Arabisches Meer', 'Suezkanal', 
               'Statistik', 'Abhängige und unabhängige Variable', 'Golf von Aden',
               'Maschinelles Lernen', 'Europäische Zentralbank', 'Bank', 
               'Finanztechnologie', 'Internationaler Währungsfonds', 'Basketball', 
               'Schwimmen beim Menschen', 'Tennis']

wiki = wikipediaapi.Wikipedia(LANG)

In [13]:
def filter_text(text):
    """
    remove non latin characters; remove "\n"
    input: string
    return: cleaned string
    """
    text = re.sub('\n', ' ', text)
    pattern = r'[^\u0020-\u017F]'
    text = re.sub(pattern, '', text)
    return text
    

In [14]:
# crawler

wiki_lst=[]
title_lst=[]

if READDATA:
    for i, article in enumerate(articles):
        print(f'{i}: {article}')
        page = wiki.page(article)
        text = filter_text(page.text)
        wiki_lst.append(text)
        title_lst.append(page.title)

    pd.DataFrame({'article': wiki_lst, 'title': title_lst}).to_csv(fn_corpus,  index = None)  
    #df.to_parquet(crawler_datapq, engine = 'fastparquet', compression='gzip')


### Indexer

In [15]:
%reset -f
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer


### Load data

In [16]:
LANG = 'en'

if LANG == 'en':
    # repositories
    #crawler_data = 'data/corpus_raw.csv'
    #fn_crawler_datapq = 'data/corpuspq.gzip'
    fn_corpus = 'data/corpus.csv'

elif LANG == 'de':
    # repositories
    #crawler_data = 'data/corpus_raw.csv'
    #fn_crawler_datapq = 'data/corpuspq_de.gzip'
    fn_corpus = 'data/corpus_de.csv'


In [17]:
corpus = pd.read_csv(fn_corpus)
corpus

Unnamed: 0,article,title
0,A desk pad or blotter is a table protector use...,Desk pad
1,Data science is an interdisciplinary academic ...,Data science
2,Artificial intelligence (AI) is intelligencepe...,Artificial intelligence
3,"Somaliland, officially the Republic of Somalil...",Somaliland
4,Natural language processing (NLP) is an interd...,Natural language processing
5,"The Arabian Sea (Arabic: , romanized: Al-Bahr...",Arabian Sea
6,"The Suez Canal (Egyptian Arabic: , Qanāt el S...",Suez Canal
7,"Statistics (from German: Statistik, orig. ""des...",Statistics
8,Dependent and independent variables are variab...,Dependent and independent variables
9,"The Gulf of Aden (Arabic: , Somali: Gacanka C...",Gulf of Aden


In [18]:
corpus.shape

(18, 2)

### Optional: Tokenizer

In [19]:
import spacy

TOKENIZER = True
FILTER = True

if TOKENIZER: 
    # keeping only tagger component needed for lemmatization
    nlp = spacy.load('en_core_web_sm',  disable=["parser", "ner"])


2023-11-19 19:52:25.589430: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-19 19:52:26.477256: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-11-19 19:52:26.477392: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory
2023-11-19 19:52:27.285882: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2023-11-19 19:52:27.285918: W tensorflow/stream_executor/cuda/cuda_driver.cc:263] failed call to cuInit: UNKNOWN

In [20]:
%%time
# 15 sec for 45'000 words 

if TOKENIZER:
    corpus['lemma'] = corpus['article'].apply(lambda row: " ".join([w.lemma_ for w in nlp(row)]))


CPU times: user 13.5 s, sys: 1.1 s, total: 14.6 s
Wall time: 14.6 s


In [21]:
corpus

Unnamed: 0,article,title,lemma
0,A desk pad or blotter is a table protector use...,Desk pad,a desk pad or blotter be a table protector use...
1,Data science is an interdisciplinary academic ...,Data science,datum science be an interdisciplinary academic...
2,Artificial intelligence (AI) is intelligencepe...,Artificial intelligence,artificial intelligence ( AI ) be intelligence...
3,"Somaliland, officially the Republic of Somalil...",Somaliland,"Somaliland , officially the Republic of Somali..."
4,Natural language processing (NLP) is an interd...,Natural language processing,natural language processing ( NLP ) be an inte...
5,"The Arabian Sea (Arabic: , romanized: Al-Bahr...",Arabian Sea,"the Arabian Sea ( Arabic : , romanize : Al -..."
6,"The Suez Canal (Egyptian Arabic: , Qanāt el S...",Suez Canal,"the Suez Canal ( Egyptian Arabic : , Qanāt e..."
7,"Statistics (from German: Statistik, orig. ""des...",Statistics,"statistic ( from german : Statistik , orig . ""..."
8,Dependent and independent variables are variab...,Dependent and independent variables,dependent and independent variable be variable...
9,"The Gulf of Aden (Arabic: , Somali: Gacanka C...",Gulf of Aden,"the Gulf of Aden ( Arabic : , Somali : Gacan..."


### Optional: Filter

* remove numbers (only numbers, not numbers combined with letters)

Token shorter than 3 letters will be removed by tf-idf method below, including punctuation.

In [22]:
if FILTER and TOKENIZER:
    # remove numbers
    corpus['lemma'] = corpus['lemma'].replace(r'\b[0-9.].*?\b', '', regex = True)#corpus

corpus

Unnamed: 0,article,title,lemma
0,A desk pad or blotter is a table protector use...,Desk pad,a desk pad or blotter be a table protector use...
1,Data science is an interdisciplinary academic ...,Data science,datum science be an interdisciplinary academic...
2,Artificial intelligence (AI) is intelligencepe...,Artificial intelligence,artificial intelligence ( AI ) be intelligence...
3,"Somaliland, officially the Republic of Somalil...",Somaliland,"Somaliland , officially the Republic of Somali..."
4,Natural language processing (NLP) is an interd...,Natural language processing,natural language processing ( NLP ) be an inte...
5,"The Arabian Sea (Arabic: , romanized: Al-Bahr...",Arabian Sea,"the Arabian Sea ( Arabic : , romanize : Al -..."
6,"The Suez Canal (Egyptian Arabic: , Qanāt el S...",Suez Canal,"the Suez Canal ( Egyptian Arabic : , Qanāt e..."
7,"Statistics (from German: Statistik, orig. ""des...",Statistics,"statistic ( from german : Statistik , orig . ""..."
8,Dependent and independent variables are variab...,Dependent and independent variables,dependent and independent variable be variable...
9,"The Gulf of Aden (Arabic: , Somali: Gacanka C...",Gulf of Aden,"the Gulf of Aden ( Arabic : , Somali : Gacan..."


In [23]:
# count number of words (by counting spaces)
corpus['article'].str.count(' ').sum()

106806


### Create Index

In [24]:
%%time

# cutting tfidf: the larger the more common and stop words
SPECF = 0.9

if TOKENIZER:
    docs= corpus['lemma'].tolist()
else:
    docs= corpus['article'].tolist()

# Create Term-Document Matrix with TF-IDF weighting
vectorizer = TfidfVectorizer(max_df = SPECF) # , norm=None)
X = vectorizer.fit_transform(docs)

# Create a DataFrame
df = pd.DataFrame(X.T.toarray(), index=vectorizer.get_feature_names_out())
print(df.shape)
df.tail()
df.iloc[3000:3020,0:5]
df

(9041, 18)
CPU times: user 159 ms, sys: 0 ns, total: 159 ms
Wall time: 159 ms


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
a2,0.067623,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0
a3,0.067623,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0
aaai,0.000000,0.0,0.005345,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.005262,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0
aateye,0.000000,0.0,0.000000,0.003877,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0
aau,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.003764,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
études,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.004016,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0
être,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.004117,0.0,0.0,0.0,0.000000,0.0,0.0
österreichischer,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.004016,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0
čapek,0.000000,0.0,0.012214,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0


### Save Index data

* Save model data
* Save tf-idf matrix


In [25]:
import joblib

SAVEINDEX = False

if LANG == 'en':
    # model
    fn_mod = "data/simple_search.pkl"
    # tfidf matrix
    fn_dfidf = "data/simple_search.csv"

elif LANG == 'de':
    # model
    fn_mod = "data/simple_search_de.pkl"
    # tfidf matrix
    fn_dfidf = "data/simple_search_de.csv"

# document addresses
#fn_corpus = "data/simle_search_corp.csv"

print(f'Parameter: LANG: {LANG}, SPECF: {SPECF}, TOKENIZER: {TOKENIZER}, FILTER: {FILTER}, tfidf-size: {df.shape}')

if SAVEINDEX:
    joblib.dump(vectorizer, fn_mod) 
    df.to_csv(fn_dfidf)
    print(f'Saved: {fn_mod} and {fn_dfidf}')
else:
    print('(not saved)')
        
#corpusDst[["title"]].to_csv(fn_corpus, index=None)


Parameter: LANG: en, SPECF: 0.9, TOKENIZER: True, FILTER: True, tfidf-size: (9041, 18)
(not saved)


## Part 2: Query data

### Load Index

In [26]:
%reset -f
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import joblib


In [27]:
LANG = 'en'

Load model data and tfidf-matrix

In [28]:
# load trained model, ie ifidf matrix

if LANG == 'en':
    # model
    fn_mod = "data/simple_search.pkl"
    # tfidf matrix
    fn_dfidf = "data/simple_search.csv"
    # corpus; required if not only document number is needed but title and text as well
    fn_corpus = 'data/corpus.csv'
    
elif LANG == 'de':
    # model
    fn_mod = "data/simple_search_de.pkl"
    # tfidf matrix
    fn_dfidf = "data/simple_search_de.csv"
    # corpus
    fn_corpus = 'data/corpus_de.csv'

vectorizer = joblib.load(fn_mod)
tfidf = pd.read_csv(fn_dfidf, index_col=0)
corpus = pd.read_csv(fn_corpus)


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [29]:
tfidf

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
00,0.0,0.0,0.000000,0.003040,0.0,0.000000,0.003125,0.00000,0.0,0.000000,0.000000,0.003237,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
000,0.0,0.0,0.005157,0.029503,0.0,0.018911,0.023593,0.00000,0.0,0.009997,0.005098,0.001745,0.014489,0.007198,0.003625,0.000000,0.006556,0.013736
0001,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.00000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.004272,0.000000,0.000000,0.000000
0003,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.00727,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
001,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.00000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.004272,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
études,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.003972,0.00000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
être,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.00000,0.0,0.000000,0.000000,0.004114,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
österreichischer,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.003972,0.00000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
čapek,0.0,0.0,0.012154,0.000000,0.0,0.000000,0.000000,0.00000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


### Query

In [30]:
# raw demo
#q = ['djibouti waterway']
#q = ['has djibouti a waterway']
q = ['sag mir alles, was du über den waterway in djibouti weisst']

q_vec = vectorizer.transform(q)
search_results = (q_vec @ tfidf)[0]
si = search_results.argsort().tolist()[::-1]
list(zip(si, search_results[si]))

[(9, 0.08644678929441071),
 (6, 0.0197867104200396),
 (3, 0.011023250932349729),
 (7, 0.0038684275756352485),
 (17, 0.0),
 (1, 0.0),
 (2, 0.0),
 (4, 0.0),
 (5, 0.0),
 (8, 0.0),
 (16, 0.0),
 (10, 0.0),
 (11, 0.0),
 (12, 0.0),
 (13, 0.0),
 (14, 0.0),
 (15, 0.0),
 (0, 0.0)]

In [31]:
def get_similar_articles(q, df, vec, verbose=False): 
    """
    Find documents which fit to query. Higher ranking when more token fit to query ("OR"-query)
    
    input:
    q: query string
    df: tfidf matrix
    vec: tfidf transformer
    output:
    (document index, weight)-vector
    
    convert query q into tfidf-vector; calculate query-vector @ tfidf-matrix    
    """
    q = [' '.join(q)]
    if verbose:   print('q OR: ', q)
    q_vec = vec.transform(q) # .toarray().reshape(df.shape[0],)
    search_results = (q_vec @ df)[0]
    si = search_results.argsort().tolist()[::-1]
    return list(zip(si, search_results[si]))

def get_similar_articles_and(q_and, df, vec, verbose = False): 
    """
    Find documents which answer query. 
    All token must be present in result documents ("AND"-query). No result if query contains common or "stop" words.
    
    input:
    q: query string
    df: tfidf matrix
    vec: tfidf transformer
    output:
    (document index, weight)-vector
    
    Convert query q into tfidf-vector; calculate query-vector @ tfidf-matrix.
    Handle each element of querystring separately and multiply the results.
    Documents missing a query element has weight of null, same with product of results.
    """

    search_results = np.ones(df.shape[1]).reshape(-1,)
    #q_list = q_and.split()
    q_list = q_and
    for qi in q_list:
        q = [qi]
        if verbose:  print('qi AND: ', q)
        q_vec = vec.transform(q) # .toarray().reshape(df.shape[0],)
        resv = (q_vec @ df)[0]
        resv_norm = np.linalg.norm(resv)
        if resv_norm < 1e-5: 
            resv_norm = 1e10 
        search_results *= resv /resv_norm
    si = search_results.argsort().tolist()[::-1]
    return list(zip(si, search_results[si]))



def process_result(sim_sorted):
    """
    nice output of results
    input: dictionary{document_index: similarity}
        print(index, similarity, document title)
    """
    global corpus

    for k, v in sim_sorted:
        if v != 0.0:
            print(f"Weight: {v:.5f} in {k:2d}:  {corpus.iloc[k,1]}")
    return None
            
def answer_question(q):
    """
    process query string and print answer
    input: query string
    output: none
    print answer via function process_results
    """
    q_list = q.lower().split()
    searchlist=[]
    for qi in q_list:
        if '*' in qi: 
            qiq = qi.replace('*', '.*')
            searchlist.append(' '.join(list(tfidf[tfidf.index.str.match(fr'{qiq}') == True].index)))
        else:
            #print(qi)
            searchlist.append(qi)
    return searchlist

def demo_query(q, verbose = False):
    """
    Print statistical data for query q

    input: query string
    output: None
    """

    print("Query: ", q)

    q_new = answer_question(q)
    if verbose: print(q_new)
    print('AND')
    process_result(get_similar_articles_and(q_new, tfidf, vectorizer, verbose))
    print('OR')
    process_result(get_similar_articles(q_new, tfidf, vectorizer, verbose))
    print()
    

In [32]:
querylist = [ 'Djibouti Erythraean',  ' djibouti waterway', ' djibouti water*', 
             ' Har*sa Soma*', 'djibo* eryth* ',
             'tell me more about Somalia']

for q in querylist:
    demo_query(q, False)

print('*'*30)
print('verbose: True:')
for q in querylist:
    demo_query(q, True)

Query:  Djibouti Erythraean
AND
Weight: 0.74604 in  9:  Gulf of Aden
OR
Weight: 0.11667 in  9:  Gulf of Aden
Weight: 0.03678 in  5:  Arabian Sea
Weight: 0.01674 in  3:  Somaliland

Query:   djibouti waterway
AND
Weight: 0.93483 in  9:  Gulf of Aden
OR
Weight: 0.13125 in  9:  Gulf of Aden
Weight: 0.01721 in  6:  Suez Canal
Weight: 0.01674 in  3:  Somaliland

Query:   djibouti water*
AND
Weight: 0.37546 in  9:  Gulf of Aden
Weight: 0.01494 in  3:  Somaliland
OR
Weight: 0.09124 in 16:  Swimming
Weight: 0.07327 in  9:  Gulf of Aden
Weight: 0.01730 in  6:  Suez Canal
Weight: 0.01462 in  3:  Somaliland
Weight: 0.00987 in  5:  Arabian Sea

Query:   Har*sa Soma*
AND
Weight: 0.99510 in  3:  Somaliland
OR
Weight: 0.39443 in  3:  Somaliland
Weight: 0.03402 in  9:  Gulf of Aden
Weight: 0.00810 in  5:  Arabian Sea
Weight: 0.00072 in  6:  Suez Canal

Query:  djibo* eryth* 
AND
Weight: 0.82931 in  9:  Gulf of Aden
OR
Weight: 0.07890 in  9:  Gulf of Aden
Weight: 0.02139 in  5:  Arabian Sea
Weight: 0.0