# Query optimization by using pages that link to the main webpage

Example:
- Search for `"kevala.care" -site:kevala.care` (this returns only relevant results)
- Search for `Kevala -"kevala.care" -site:kevala.care` (this may include irrelevant results)
- Fit a basic classifier to discriminate between the two result sets, then pull out some of the strongest negative keywords


Notes:
- This might work for companies that have a near-match (Kevala, Akasa), but it would not work for companies that don't have confounders (Singularity 6)

In [2]:
from core import Seed, init

# Call the project's `init()` (imported from `core` above) before any other code runs.
# NOTE(review): what init() sets up is not visible from this notebook — confirm in `core`.
init()

In [50]:
# Company under study: its name plus its website domain.
# Later cells read `seed.company` and `seed.domain` to build the search queries.
seed = Seed.init(
    "Singularity 6",
    domain="singularity6.com",
)



In [51]:
from google_search import search

# Number of results requested from each query.
num_results = 40

# Query 1 (treated as the relevant set): the quoted domain string, excluding
# pages on the company's own site.
positive_query = f'"{seed.domain}" -site:{seed.domain}'
positive_results = [hit for hit in search(positive_query, num=num_results)]

# Query 2 (treated as the possibly-irrelevant set): the company name, excluding
# pages that contain the quoted domain string and pages on the company's own site.
negative_query = f'{seed.company} -"{seed.domain}" -site:{seed.domain}'
negative_results = [hit for hit in search(negative_query, num=num_results)]

# Display both result lists as the cell output.
positive_results, negative_results

[32m2024-09-20 20:16:04.098[0m | [34m[1mDEBUG   [0m | [36mgoogle_search[0m:[36msearch[0m:[36m58[0m - [34m[1mGoogle search results: {'kind': 'customsearch#search', 'url': {'type': 'application/json', 'template': 'https://www.googleapis.com/customsearch/v1?q={searchTerms}&num={count?}&start={startIndex?}&lr={language?}&safe={safe?}&cx={cx?}&sort={sort?}&filter={filter?}&gl={gl?}&cr={cr?}&googlehost={googleHost?}&c2coff={disableCnTwTranslation?}&hq={hq?}&hl={hl?}&siteSearch={siteSearch?}&siteSearchFilter={siteSearchFilter?}&exactTerms={exactTerms?}&excludeTerms={excludeTerms?}&linkSite={linkSite?}&orTerms={orTerms?}&dateRestrict={dateRestrict?}&lowRange={lowRange?}&highRange={highRange?}&searchType={searchType}&fileType={fileType?}&rights={rights?}&imgSize={imgSize?}&imgType={imgType?}&imgColorType={imgColorType?}&imgDominantColor={imgDominantColor?}&alt=json'}, 'queries': {'request': [{'title': 'Google Custom Search - "singularity6.com" -site:singularity6.com', 'totalResults

([SearchResult(title='Singularity 6 (@Singularity6) / X', link='https://twitter.com/singularity6', snippet='@DaybreakGames. to continue growing Palia! Read more here: Singularity 6, Daybreak Game Company Cozy Up in Studio Acquisition · From singularity6.com. 7. 25. 54.', formattedUrl='https://twitter.com/singularity6'),
  SearchResult(title='Singularity 6 | LinkedIn', link='https://www.linkedin.com/company/singularity6', snippet='About us ; Website: http://www.singularity6.com. External link for Singularity 6 ; Industry: Computer Games ; Company size: 51-200 employees ; Headquarters: Los\xa0...', formattedUrl='https://www.linkedin.com/company/singularity6'),
  SearchResult(title='Daybreak Acquires Singularity 6 Announcement | Daybreak Game ...', link='https://www.daybreakgames.com/news/dbg-s6-acquisition-announcement', snippet='Jul 1, 2024 ... For more information, please visit https://www.singularity6.com/. For press inquiries: Yu Sian Tan / Javi Carlos press@singularity6.com.', forma

In [52]:
# create a dataset from this
import pandas as pd

# One training sample per search hit: the text is "<title> <snippet>" and the
# label records which query produced the hit.
relevant_rows = [
    {"text": f"{hit.title} {hit.snippet}", "label": "relevant"}
    for hit in positive_results
]
irrelevant_rows = [
    {"text": f"{hit.title} {hit.snippet}", "label": "irrelevant"}
    for hit in negative_results
]

# Relevant rows first, then irrelevant rows, in the order the searches returned them.
data = relevant_rows + irrelevant_rows

df = pd.DataFrame(data)
df

Unnamed: 0,text,label
0,Singularity 6 (@Singularity6) / X @DaybreakGam...,relevant
1,Singularity 6 | LinkedIn About us ; Website: h...,relevant
2,Daybreak Acquires Singularity 6 Announcement |...,relevant
3,List of 137 Singularity 6 Employees - Find Ema...,relevant
4,"Ramping Up with Some Tech Tests May 18, 2023 ....",relevant
...,...,...
75,Singularity 6 Closes $30M Series B for Highly ...,irrelevant
76,Singularity 6 Lays Off 35% of Staff After Pali...,irrelevant
77,"Singularity 6: Revenue, Competitors, Alternati...",irrelevant
78,Sixth Singularity - Divine Realm of the Round ...,irrelevant


In [53]:
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegressionCV

# TF-IDF features over single words (unigrams) with English stop words removed.
# min_df=2 drops terms seen in fewer than 2 documents; max_df=0.3 drops terms
# seen in more than 30% of documents.
vectorizer = TfidfVectorizer(
    min_df=2,
    max_df=0.3,
    ngram_range=(1, 1),
    stop_words="english",
)

# Cross-validated logistic regression, all settings left at library defaults.
classifier = LogisticRegressionCV()

# Chain the two steps and fit on the labelled search-result text.
model = make_pipeline(vectorizer, classifier)
model.fit(df["text"], df["label"])

In [54]:
# Pull the two fitted pipeline steps out by position: vectorizer first, classifier second.
tfidf_step = model.steps[0][1]
logreg_step = model.steps[1][1]

# Pair each vocabulary term with its coefficient from the fitted classifier.
# NOTE(review): negative weights presumably push towards "irrelevant" (scikit-learn
# sorts class labels, so coef_[0] would refer to "relevant") — confirm via `classes_`.
word_weights = pd.Series(
    logreg_step.coef_[0],
    index=tfidf_step.get_feature_names_out(),
)

# The 20 lowest weights, most negative first.
word_weights.sort_values().iloc[:20]

developer     -1.477504
corporation   -1.093277
trademarks    -0.923038
million       -0.862997
world         -0.859945
daybreak      -0.788374
layoffs       -0.763948
angeles       -0.759056
launch        -0.756790
30m           -0.749721
laying        -0.728910
based         -0.717521
chief         -0.686394
team          -0.685202
rating        -0.678967
round         -0.650323
staff         -0.619462
online        -0.614910
education     -0.603885
games         -0.593407
dtype: float64

In [55]:
# The 20 highest weights in ascending order (strongest last).
# NOTE(review): the top tokens in the recorded output (www, http, https, email) look
# like URL/contact fragments, plausibly because the first query requires the domain
# string to appear in the page — consider stripping URLs before vectorizing.
word_weights.sort_values().iloc[-20:]

51              0.713455
media           0.757299
website         0.811865
contact         0.812183
joined          0.815985
architecture    0.824827
welcome         0.843572
linkedin        0.846435
visit           0.850462
240             0.851629
https           0.853990
overview        0.880568
company         0.971643
cozy            0.976630
information     1.018669
http            1.043439
news            1.053787
software        1.186085
email           1.604606
www             2.392220
dtype: float64