# Query optimization by using pages that link to the main webpage

Example:
- Search for "kevala.care" -site:kevala.care (this should return only relevant results)
- Search for Kevala -"kevala.care" -site:kevala.care (this may include irrelevant results)
- Fit a basic classifier to discriminate between the two result sets, then pull out the keywords that most strongly indicate an irrelevant result so they can be excluded from the query


Notes:
- This might work for companies whose name has near-match confounders (Kevala, Akasa), but it would not work for companies that don't have confounders (Singularity 6).

In [1]:
from core import Seed, init

# Run core's init() before creating the Seed; what it configures is not visible in this notebook.
init()

In [2]:
# Seed for the company under study; later cells read seed.company and seed.domain to build queries.
seed = Seed.init("Kevala", domain="kevala.care")



In [None]:
from utils.google_search import search

num_results = 80

# Pages that mention the domain but are not hosted on it: used as the "relevant" examples.
positive_query = f'"{seed.domain}" -site:{seed.domain}'
# Pages that match the company name without mentioning the domain: may contain unrelated hits.
negative_query = f'{seed.company} -"{seed.domain}" -site:{seed.domain}'

positive_results = list(search(positive_query, num=num_results))
negative_results = list(search(negative_query, num=num_results))

positive_results, negative_results

In [None]:
# Turn both result sets into one labelled text dataset (title + snippet per row).
import pandas as pd

relevant_rows = [
    {"text": f"{hit.title} {hit.snippet}", "label": "relevant"}
    for hit in positive_results
]
irrelevant_rows = [
    {"text": f"{hit.title} {hit.snippet}", "label": "irrelevant"}
    for hit in negative_results
]
# Relevant rows first, then irrelevant rows.
data = relevant_rows + irrelevant_rows

df = pd.DataFrame(data)
df

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegressionCV

# TF-IDF over unigrams with English stop words removed; min_df / max_df bound how rare or
# how common a term may be across the documents before it is dropped.
tfidf = TfidfVectorizer(
    min_df=2,
    max_df=0.3,
    ngram_range=(1, 1),
    stop_words="english",
)
# Logistic regression with built-in cross-validation, left at its defaults.
logreg = LogisticRegressionCV()

model = make_pipeline(tfidf, logreg)
model.fit(df["text"], df["label"])

In [None]:
# Show the label order the fitted pipeline learned; it is needed to read the sign of the
# coefficients inspected in the following cells.
# NOTE(review): for a binary model, positive weights should favour the second class
# listed here - confirm against the scikit-learn docs.
model.classes_

In [None]:
# Pair every vocabulary term from the vectorizer with its coefficient from the classifier.
vectorizer_step = model.steps[0][1]
classifier_step = model.steps[1][1]
word_weights = pd.Series(
    classifier_step.coef_[0],
    index=vectorizer_step.get_feature_names_out(),
)

# The 20 lowest weights (ascending sort, so these come first).
word_weights.sort_values().head(20)

In [None]:
# Show the 20 largest weights (sort_values() is ascending, so tail() is the top end).
word_weights.sort_values().tail(20)

In [None]:
# Try doing the news search with the related: operator
related_query = f"{seed.company} related:{seed.domain}"

related_results = list(search(related_query, num=num_results))
related_results

In [None]:
# Candidate exclusion terms: the five lowest-weighted words from the classifier.
distractor_terms = word_weights.sort_values().head(5).index
query_refinement = " ".join(f"-intitle:{term}" for term in distractor_terms)

# NOTE: query_refinement is built for inspection only; it is not appended to the query below.
# The company name comes from the seed so this cell stays in sync with the earlier queries.
results = list(search(f'"{seed.company}" "{seed.domain}" news -site:{seed.domain}', num=20))
results

# Notes

- Generating minus operators didn't help much and it's fiddly
    - Using title plus snippet seemed best
    - Removing a small number of distractors seemed best
- For Kevala, the best option was to search for {seed.company} "{seed.domain}", although that will filter out some news websites.