# Query optimization by using pages that link to the main webpage

Example:
- Search for "kevala.care" -site:kevala.care (this returns only relevant results)
- Search for Kevala -"kevala.care" -site:kevala.care (this may include irrelevant results)
- Fit a basic classifier to discriminate between the two result sets, then pull out some of the strongest negative keywords


Notes:
- This might work for companies whose names have near-matches (Kevala, Akasa) but would not work for companies that don't have confounders (Singularity 6)

In [1]:
from core import Seed, init

# Run the project's setup hook before any other helper is used.
# NOTE(review): what init() configures is not visible in this notebook — confirm in `core`.
init()

In [2]:
# Seed for the target company: its name plus its own domain.
# Later cells read seed.company and seed.domain when building search queries.
seed = Seed.init("Kevala", domain="kevala.care")



In [18]:
from google_search import search

# Number of hits requested for each of the two queries.
num_results = 80

# Pages that mention the domain but are not hosted on it: the "relevant" set.
positive_query = f'"{seed.domain}" -site:{seed.domain}'
# Pages that mention the company name but not the domain: may include
# unrelated namesakes, so this is the "irrelevant" set.
negative_query = f'{seed.company} -"{seed.domain}" -site:{seed.domain}'

positive_results = list(search(positive_query, num=num_results))
negative_results = list(search(negative_query, num=num_results))

# Echo both lists so the notebook displays them.
positive_results, negative_results

[32m2024-09-25 13:19:08.926[0m | [34m[1mDEBUG   [0m | [36mgoogle_search[0m:[36msearch[0m:[36m58[0m - [34m[1mGoogle search results: {'kind': 'customsearch#search', 'url': {'type': 'application/json', 'template': 'https://www.googleapis.com/customsearch/v1?q={searchTerms}&num={count?}&start={startIndex?}&lr={language?}&safe={safe?}&cx={cx?}&sort={sort?}&filter={filter?}&gl={gl?}&cr={cr?}&googlehost={googleHost?}&c2coff={disableCnTwTranslation?}&hq={hq?}&hl={hl?}&siteSearch={siteSearch?}&siteSearchFilter={siteSearchFilter?}&exactTerms={exactTerms?}&excludeTerms={excludeTerms?}&linkSite={linkSite?}&orTerms={orTerms?}&dateRestrict={dateRestrict?}&lowRange={lowRange?}&highRange={highRange?}&searchType={searchType}&fileType={fileType?}&rights={rights?}&imgSize={imgSize?}&imgType={imgType?}&imgColorType={imgColorType?}&imgDominantColor={imgDominantColor?}&alt=json'}, 'queries': {'request': [{'title': 'Google Custom Search - "kevala.care" -site:kevala.care', 'totalResults': '73', '

([SearchResult(title='Kevala | LinkedIn', link='https://www.linkedin.com/company/kevala-care', snippet='Mar 14, 2022 ... For more information, visit www.kevala.care. Industry: Software Development. Company size: 11-50 employees. Headquarters: Seattle,\xa0...', formattedUrl='https://www.linkedin.com/company/kevala-care'),
  SearchResult(title='Working at Kevala Care | Glassdoor', link='https://www.glassdoor.com/Overview/Working-at-Kevala-Care-EI_IE5145912.11,22.htm', snippet="See what employees say it's like to work at Kevala Care. Salaries, reviews, and more - all posted by employees working at Kevala Care.", formattedUrl='https://www.glassdoor.com/.../Working-at-Kevala-Care-EI_IE5145912.11,...'),
  SearchResult(title='Kevala', link='https://www.facebook.com/KevalaCare/', snippet='... and patient care. KEVALA.CARE. Kevala. From managing patient care to supporting providers... \U000f0925 · \U000f0926 · \U000f0927. Kevala profile picture. Kevala. Aug 7, 2023\U000f078b\U000f17e0.', format

In [28]:
# Turn the two result lists into one labelled dataset (one row per result title).
import pandas as pd

relevant_rows = [
    {"text": f"{hit.title}", "label": "relevant"} for hit in positive_results
]
irrelevant_rows = [
    {"text": f"{hit.title}", "label": "irrelevant"} for hit in negative_results
]

# Relevant rows come first, followed by the irrelevant ones.
data = relevant_rows + irrelevant_rows

df = pd.DataFrame(data)
df

Unnamed: 0,text,label
0,Kevala | LinkedIn,relevant
1,Working at Kevala Care | Glassdoor,relevant
2,Kevala,relevant
3,Kevala - Crunchbase Company Profile & Funding,relevant
4,Working at Kevala: Employee Reviews | Indeed.com,relevant
...,...,...
100,Multi-State Transportation Electrification Imp...,irrelevant
101,Cerebral hemodynamic correlates of executive f...,irrelevant
102,"Haveli Bistro's Lunch Special Tickets, Fri, 21...",irrelevant
103,http://scholar.google.com/scholar_lookup?&titl...,irrelevant


In [29]:
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegressionCV

# Unigram TF-IDF over the titles, with English stop words removed.
# min_df=2 ignores terms seen in fewer than two titles; max_df=0.3 ignores
# terms that occur in more than 30% of them.
vectorizer = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 1),
    min_df=2,
    max_df=0.3,
)
# Logistic regression with its built-in cross-validation, default settings.
classifier = LogisticRegressionCV()

model = make_pipeline(vectorizer, classifier)
model.fit(df["text"], df["label"])

In [30]:
# Check the label order: for a two-class logistic regression the single row of
# coefficients refers to classes_[1], so positive weights favour that class.
model.classes_

array(['irrelevant', 'relevant'], dtype=object)

In [31]:
# Pull the fitted stages back out of the pipeline by position:
# step 0 is the TF-IDF vectorizer, step 1 is the logistic regression.
fitted_vectorizer = model.steps[0][1]
fitted_classifier = model.steps[1][1]

# One weight per vocabulary term, indexed by the term itself.
vocabulary = fitted_vectorizer.get_feature_names_out()
word_weights = pd.Series(fitted_classifier.coef_[0], index=vocabulary)

# Ascending sort, so the most negative weights are shown first.
word_weights.sort_values().head(20)

health           -0.000085
ceramics         -0.000074
organic          -0.000072
lb               -0.000068
sesame           -0.000063
retreat          -0.000063
iherb            -0.000063
oil              -0.000061
com              -0.000056
oz               -0.000055
grid             -0.000055
transportation   -0.000054
studio           -0.000054
wellness         -0.000053
home             -0.000050
amadea           -0.000047
butter           -0.000046
review           -0.000046
lacey            -0.000041
llc              -0.000041
dtype: float64

In [32]:
# The 20 largest weights, in ascending order (strongest term last).
word_weights.sort_values().tail(20)

owens          0.000030
working        0.000032
wilson         0.000034
profile        0.000039
12             0.000048
funding        0.000052
series         0.000053
number         0.000070
care           0.000075
staffing       0.000076
employees      0.000076
competitors    0.000076
healthcare     0.000076
software       0.000080
senior         0.000082
vice           0.000083
agency         0.000097
hiring         0.000109
company        0.000117
linkedin       0.000143
dtype: float64

In [9]:
# Variant of the news search that uses the related: operator on the seed domain.
related_query = f"{seed.company} related:{seed.domain}"

related_results = list(search(related_query, num=num_results))
related_results

[32m2024-09-25 13:11:13.213[0m | [34m[1mDEBUG   [0m | [36mgoogle_search[0m:[36msearch[0m:[36m58[0m - [34m[1mGoogle search results: {'kind': 'customsearch#search', 'url': {'type': 'application/json', 'template': 'https://www.googleapis.com/customsearch/v1?q={searchTerms}&num={count?}&start={startIndex?}&lr={language?}&safe={safe?}&cx={cx?}&sort={sort?}&filter={filter?}&gl={gl?}&cr={cr?}&googlehost={googleHost?}&c2coff={disableCnTwTranslation?}&hq={hq?}&hl={hl?}&siteSearch={siteSearch?}&siteSearchFilter={siteSearchFilter?}&exactTerms={exactTerms?}&excludeTerms={excludeTerms?}&linkSite={linkSite?}&orTerms={orTerms?}&dateRestrict={dateRestrict?}&lowRange={lowRange?}&highRange={highRange?}&searchType={searchType}&fileType={fileType?}&rights={rights?}&imgSize={imgSize?}&imgType={imgType?}&imgColorType={imgColorType?}&imgDominantColor={imgDominantColor?}&alt=json'}, 'queries': {'request': [{'title': 'Google Custom Search - Kevala related:kevala.care', 'totalResults': '61400', 'se

[SearchResult(title='Terms of Service', link='https://www.kevala.care/terms-of-service', snippet='Aug 2, 2022 ... The Kevala Platform and its associated software are of U.S. origin. ... Join the Kevala Care NetworkReferral ProgramContract Opportunities.', formattedUrl='https://www.kevala.care/terms-of-service'),
 SearchResult(title='Kevala | LinkedIn', link='https://www.linkedin.com/company/kevala-care', snippet='Mar 14, 2022 ... Kevala is reshaping the way healthcare organizations manage and engage their internal and external care teams. Our adaptive scheduling\xa0...', formattedUrl='https://www.linkedin.com/company/kevala-care'),
 SearchResult(title='Agency Terms of Service', link='https://www.kevala.care/agency-terms-of-service', snippet='Sep 1, 2022 ... The Kevala Platform and its associated software are of U.S. origin. ... Join the Kevala Care NetworkReferral ProgramContract Opportunities.', formattedUrl='https://www.kevala.care/agency-terms-of-service'),
 SearchResult(title='Todd

In [38]:
# The five most negative classifier weights are treated as distractor terms.
distractor_terms = word_weights.sort_values().head(5).index
# Exclusion clause (-intitle:<term> ...) built from the distractors.
# NOTE: this clause is not appended to the query below; the notes at the end
# of the notebook record that generated minus operators didn't help much.
query_refinement = " ".join(f"-intitle:{term}" for term in distractor_terms)

# Use seed.company rather than a hard-coded name so this cell follows the seed,
# like the other search cells. For the current seed the query is unchanged.
results = list(search(f'"{seed.company}" "{seed.domain}" news -site:{seed.domain}', num=20))
results

[32m2024-09-25 13:27:34.034[0m | [34m[1mDEBUG   [0m | [36mgoogle_search[0m:[36msearch[0m:[36m58[0m - [34m[1mGoogle search results: {'kind': 'customsearch#search', 'url': {'type': 'application/json', 'template': 'https://www.googleapis.com/customsearch/v1?q={searchTerms}&num={count?}&start={startIndex?}&lr={language?}&safe={safe?}&cx={cx?}&sort={sort?}&filter={filter?}&gl={gl?}&cr={cr?}&googlehost={googleHost?}&c2coff={disableCnTwTranslation?}&hq={hq?}&hl={hl?}&siteSearch={siteSearch?}&siteSearchFilter={siteSearchFilter?}&exactTerms={exactTerms?}&excludeTerms={excludeTerms?}&linkSite={linkSite?}&orTerms={orTerms?}&dateRestrict={dateRestrict?}&lowRange={lowRange?}&highRange={highRange?}&searchType={searchType}&fileType={fileType?}&rights={rights?}&imgSize={imgSize?}&imgType={imgType?}&imgColorType={imgColorType?}&imgDominantColor={imgDominantColor?}&alt=json'}, 'queries': {'request': [{'title': 'Google Custom Search - "Kevala" "kevala.care" news -site:kevala.care', 'totalRes

[SearchResult(title='Kevala Announces Release of Next Generation Healthcare ...', link='https://www.businesswire.com/news/home/20231004282387/en/Kevala-Announces-Release-of-Next-Generation-Healthcare-Scheduling-Solution', snippet='Oct 4, 2023 ... For more information, visit www.kevala.care. Contacts. Kalley Anderson Kevala Technologies, Inc. press@kevala.care\xa0...', formattedUrl='https://www.businesswire.com/news/.../Kevala-Announces-Release-of-Next-...'),
 SearchResult(title='Kevala | LinkedIn', link='https://www.linkedin.com/company/kevala-care', snippet='Mar 14, 2022 ... For more information, visit www.kevala.care. Industry: Software Development. Company size: 11-50 employees. Headquarters: Seattle,\xa0...', formattedUrl='https://www.linkedin.com/company/kevala-care'),
 SearchResult(title='High Alpha Invests in Kevala | High Alpha', link='https://www.highalpha.com/news/high-alpha-invests-in-kevala', snippet='Jan 26, 2021 ... kevala.care. About Costanoa Ventures. Costanoa Ventures 

# Notes

- Generating minus operators didn't help much, and it's fiddly
    - Using title plus snippet seemed best
    - Removing only a small number of distractors seemed best
- For Kevala, the best option was to search {seed.company} "{seed.domain}". That will filter out some news websites, though