In [43]:
import pandas as pd
import requests
from tqdm import tqdm
import re
pd.set_option('display.max_columns', 100)

### get data

In [67]:
df = pd.read_csv('../DATA/similar_company_finder/company_house_data_part_1_7.csv')
df = df.rename(columns={'CompanyName': 'name', 'SICCode.SicText_1': 'sector_long'})
df = df[['name', 'sector_long']]
df = df[df['sector_long'] != 'None Supplied']  # remove samples with no sector provided

  df = pd.read_csv('../DATA/similar_company_finder/company_house_data_part_1_7.csv')


In [71]:
df['name'] = df['name'].apply(lambda x: x.lower())
df['name'] = df['name'].apply(lambda x: re.sub('ltd|limited', '', x))  # remove ltd or limited
df['name'] = df['name'].apply(lambda x: re.sub('[!?]', '', x))  # remove all ! and ? characters
df['name'] = df['name'].apply(lambda x: re.sub('\A\s+', '', x))  # remove all leading whitespace
df['name'] = df['name'].apply(lambda x: re.sub('\s+\Z', '', x))  # remove trailing whitespace
df = df[df['name'] != '']  # remove all rows whit ambigious 'ltd' company name - found manually (note ltd was already removed)
df['name'].head()

0           heal ur tech
3    big impact graphics
4                goberub
5               nfogenie
6                  nnov8
Name: name, dtype: object

In [68]:
df.sample(2)

Unnamed: 0,name,sector_long
529071,BARK 'N' RENDER LTD,56290 - Other food services
556971,BEA HERBERT LIMITED,"74909 - Other professional, scientific and tec..."


# Extract Sector

In [69]:
df['sector_id'] = df['sector_long'].apply(lambda x: x.split(' - ')[0]).astype(int)
df['sector'] = df['sector_long'].apply(lambda x: x.split(' - ')[1])
df = df.drop(labels=['sector_long'], axis=1)

In [72]:
df

Unnamed: 0,name,sector_id,sector
0,heal ur tech,33140,Repair of electrical equipment
3,big impact graphics,18129,Printing n.e.c.
4,goberub,62020,Information technology consultancy activities
5,nfogenie,58290,Other software publishing
6,nnov8,62090,Other information technology service activities
...,...,...,...
849994,candour consultancy,62090,Other information technology service activities
849995,candour consultants,96090,Other service activities n.e.c.
849996,candour consulting,70229,Management consultancy activities other than f...
849997,candour corporate finance,64999,Financial intermediation not elsewhere classified


# Fetch URL name

### test api for getting URLS from names

In [45]:
company_name = "standard chartered" 
base_url = "https://autocomplete.clearbit.com"
query_url = "/v1/companies/suggest?query="
facts = f"\"{company_name}\""
first_response = requests.get(base_url+query_url+facts)
response_list=first_response.json()
response_list

[{'name': 'Standard Chartered',
  'domain': 'sc.com',
  'logo': 'https://logo.clearbit.com/sc.com'},
 {'name': 'Standard Chartered Singapore Marathon',
  'domain': 'singaporemarathon.com',
  'logo': 'https://logo.clearbit.com/singaporemarathon.com'},
 {'name': 'Standard Chartered',
  'domain': 'ssesdurham.org',
  'logo': 'https://logo.clearbit.com/ssesdurham.org'}]

In [38]:
re.sub('aa|bb', '', 'aa bbb ccc bb')

' b ccc '

### get URLS

In [47]:
company_to_url = {}
for name in tqdm(df['CompanyName'][:500]):
    response = requests.get(base_url+query_url+ f"\"{name}\"" )
    info_list = response.json()
    if len(info_list) > 0:
        info = info_list[0]
        company_to_url[name] = info['domain']

  for name in tqdm(df['CompanyName'][:500]):
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [03:01<00:00,  2.76it/s]


### Disadvantages of this solution:
1. There are a lot of false positives - the website we find may not match the company
2. the process is slow. Quering 500 company names took 3 minutes

This is good for now to create a temporary solution to develop other parts of the project. But this needs to be explored in more detail.

In [48]:
company_to_url

{'nnov8': 'nnov.ru',
 'nspired': 'nspiredbusinesssolutions.com',
 'obac': 'obacafe.eu',
 't drop': 'tdropmobile.com',
 '" bora " 2': 'boras.se',
 '"1 c o "': 'costco.com',
 '"2 ecoute"': 'ecouterradioenligne.com',
 '"20-20 voice" cancer': '2020voicecancer.org',
 '"a" ceramics': 'a-ceramics.co.uk',
 '"a" concept': 'aconceptproject.com',
 '"and breathe"': 'andbreathewellbeing.com',
 '"avicenna-med"': 'avicennamed.com.ua',
 '"b" cool': 'bcool.as',
 '"basi pilates & movement by yvette "': 'basipilates.it',
 '"bigmac "': 'bigmachineparts.com',
 '"cartref ni"': 'cartrefni.com',
 '"cook with me"': 'cookwithmeg.com',
 '"d&c transport and logistic': 'discord.com',
 '"el al" israel airlines': 'elal.com',
 '"i&gm company"': 'indeed.com',
 '"jm"clean': 'cleannwa.com',
 '"k" line (europe)': 'kline-europe.com',
 '"little world"': 'littleworldofsatoshi.com',
 '"m&e gb "': 'msn.com',
 '"m-shine" cleaning service': 'mshinecleaningservices.com',
 '"marcin & co "': 'marciniwuc.com',
 '"mothers\' union"':