In [1]:
import os, sys, json
import numpy as np
import pandas as pd
import re
import ast
from pprint import pprint



def move_working_dir_to_repo_root(repo_name="orgsync"):
    """
    Move the current working directory to the root of the repository.

    Walks upwards from the current working directory until it reaches a
    directory whose name equals ``repo_name`` (compared case-insensitively),
    makes that directory the working directory and prints it.

    Args:
        repo_name: Name of the repository's root directory. Case is ignored.

    Raises:
        FileNotFoundError: If no directory between the current working
            directory and the filesystem root is named ``repo_name``. The
            working directory is left unchanged in that case.
    """
    start_dir = os.getcwd()
    # Lower-case both sides of the comparison so "OrgSync" and "orgsync" are equivalent.
    target_name = repo_name.lower()

    current_dir = start_dir
    while os.path.basename(current_dir).lower() != target_name:
        parent_dir = os.path.dirname(current_dir)
        # At the filesystem root dirname() returns its argument unchanged;
        # without this check the loop would never terminate.
        if parent_dir == current_dir:
            raise FileNotFoundError(
                f"No directory named {repo_name!r} found at or above {start_dir!r}"
            )
        current_dir = parent_dir

    os.chdir(current_dir)
    print("Current working directory: ", os.getcwd())

# Work from the repository root so the relative data path below resolves.
move_working_dir_to_repo_root(repo_name="orgsync")

# Path (relative to the repository root) of the organisation name / postcode table.
data_dir = "data/geolocations/orgs_names_postcodes.csv"

# Load the table and keep only the rows that have a postcode value.
df = pd.read_csv(data_dir).dropna(subset=['postcode'])

Current working directory:  c:\Users\dec2g\GitHub\OrgSync


In [2]:
# Preview the first rows (up to 50) of the loaded table.
df.head(50)

Unnamed: 0,dataset,id,name,short_name,postcode,city
0,cordis,323260,nuclear decommissioning authority - nda,nda,ca24 3hu,moor row
1,cordis,219438,south west tourism limited,south west tourism,ex2 5wt,exeter
2,cordis,219438,welsh government,welsh government,cf10 3nq,cardiff
3,cordis,226103,terrasalus limited,terrasalus,le15 9el,oakham
4,cordis,282558,ol pharma partners ltd,pharmivation,sg4 7dp,weston herts
5,cordis,601714,forsite diagnostics ltd,forsite diagnostics,yo41 1lz,york
6,cordis,241839,st george's hospital medical school,,sw17 0re,london
7,cordis,201720,genetic alliance uk ltd,gauk,n1 3qp,london
8,cordis,601852,fetal medicine foundation,fetal medicine found,w1g 6bg,london
9,cordis,241873,centre of the cell,centre of the cell,e1 2ab,london


In [3]:
# Normalise the raw postcode text before it is handed to the parser.
# Every step is idempotent, so re-running this cell is safe, but note that
# `df['postcode']` is overwritten and the raw text is not kept.
df['postcode'] = (
    df['postcode']
    # convert postcode to lowercase
    .str.lower()
    # Replace each run of non-alphanumeric characters (punctuation, underscores,
    # repeated whitespace) with a single space. `regex=True` has to be explicit:
    # pandas >= 2.0 defaults to regex=False, which would look for the literal
    # text `\W+` and change nothing. A space is used as the replacement, not '',
    # so the separator between the two halves of the postcode survives.
    .str.replace(r'[\W_]+', ' ', regex=True)
    # remove any leading or trailing whitespace
    .str.strip()
)

df.head()

Unnamed: 0,dataset,id,name,short_name,postcode,city
0,cordis,323260,nuclear decommissioning authority - nda,nda,ca24 3hu,moor row
1,cordis,219438,south west tourism limited,south west tourism,ex2 5wt,exeter
2,cordis,219438,welsh government,welsh government,cf10 3nq,cardiff
3,cordis,226103,terrasalus limited,terrasalus,le15 9el,oakham
4,cordis,282558,ol pharma partners ltd,pharmivation,sg4 7dp,weston herts


# Note: some postcodes contain the letter "o" where the digit "0" was intended, and we are not catching these yet. We could recover them later.

In [4]:
from uk_postcodes_parsing import ukpostcode

# Try the parser on one known postcode before applying it to the whole column.
corpus = "ca24 3hu"
matches = ukpostcode.parse_from_corpus(corpus)
parsed = matches[0]

# Show the parsed object itself, its type, and its attributes as a plain dict.
for item in (parsed, type(parsed), vars(parsed)):
    print(item)
# (is_in_ons_postcode_directory=True, fix_distance=0, original='ca24 3hu', postcode='CA24 3HU', incode='3HU', outcode='CA24', area='CA', district='CA24', sub_district=None, sector='CA24 3', unit='HU')

def parse(postcode):
    """Return the parser's matches for the given postcode text.

    Thin wrapper around ``ukpostcode.parse_from_corpus``. The result is treated
    below as a sequence that is empty when no postcode was recognised.
    """
    return ukpostcode.parse_from_corpus(postcode)


def _first_match_as_dict(postcode):
    """Parse ``postcode`` once; return the first match's attributes as a dict, or None if there is no match."""
    matches = parse(postcode)
    return matches[0].__dict__ if len(matches) > 0 else None


# Parse every row exactly once. Calling the parser a second time just to test
# whether the result is empty would double the cost of this step.
df["parsed_postcode"] = df["postcode"].apply(_first_match_as_dict)

df.head(50)

Postcode(is_in_ons_postcode_directory=True, fix_distance=0, original='ca24 3hu', postcode='CA24 3HU', incode='3HU', outcode='CA24', area='CA', district='CA24', sub_district=None, sector='CA24 3', unit='HU')
<class 'uk_postcodes_parsing.ukpostcode.Postcode'>
{'original': 'ca24 3hu', 'postcode': 'CA24 3HU', 'incode': '3HU', 'outcode': 'CA24', 'area': 'CA', 'district': 'CA24', 'sub_district': None, 'sector': 'CA24 3', 'unit': 'HU', 'fix_distance': 0, 'is_in_ons_postcode_directory': True}


Unnamed: 0,dataset,id,name,short_name,postcode,city,parsed_postcode
0,cordis,323260,nuclear decommissioning authority - nda,nda,ca24 3hu,moor row,"{'original': 'ca24 3hu', 'postcode': 'CA24 3HU..."
1,cordis,219438,south west tourism limited,south west tourism,ex2 5wt,exeter,"{'original': 'ex2 5wt', 'postcode': 'EX2 5WT',..."
2,cordis,219438,welsh government,welsh government,cf10 3nq,cardiff,"{'original': 'cf10 3nq', 'postcode': 'CF10 3NQ..."
3,cordis,226103,terrasalus limited,terrasalus,le15 9el,oakham,"{'original': 'le15 9el', 'postcode': 'LE15 9EL..."
4,cordis,282558,ol pharma partners ltd,pharmivation,sg4 7dp,weston herts,"{'original': 'sg4 7dp', 'postcode': 'SG4 7DP',..."
5,cordis,601714,forsite diagnostics ltd,forsite diagnostics,yo41 1lz,york,"{'original': 'yo41 1lz', 'postcode': 'YO41 1LZ..."
6,cordis,241839,st george's hospital medical school,,sw17 0re,london,"{'original': 'sw17 0re', 'postcode': 'SW17 0RE..."
7,cordis,201720,genetic alliance uk ltd,gauk,n1 3qp,london,"{'original': 'n1 3qp', 'postcode': 'N1 3QP', '..."
8,cordis,601852,fetal medicine foundation,fetal medicine found,w1g 6bg,london,"{'original': 'w1g 6bg', 'postcode': 'W1G 6BG',..."
9,cordis,241873,centre of the cell,centre of the cell,e1 2ab,london,"{'original': 'e1 2ab', 'postcode': 'E1 2AB', '..."


In [5]:
# All rows for postcode CB2 1TN; the cleaned text column is lower-case, hence "cb2 1tn".
is_cb2_1tn = df['postcode'].eq("cb2 1tn")
df_cb2 = df.loc[is_cb2_1tn]
df_cb2.head(50)

Unnamed: 0,dataset,id,name,short_name,postcode,city,parsed_postcode
8503,cordis,613368,the chancellor masters and scholars of the uni...,,cb2 1tn,cambridge,"{'original': 'cb2 1tn', 'postcode': 'CB2 1TN',..."
8506,cordis,234217,the chancellor masters and scholars of the uni...,,cb2 1tn,cambridge,"{'original': 'cb2 1tn', 'postcode': 'CB2 1TN',..."
8507,cordis,617391,the chancellor masters and scholars of the uni...,,cb2 1tn,cambridge,"{'original': 'cb2 1tn', 'postcode': 'CB2 1TN',..."
8508,cordis,302490,the chancellor masters and scholars of the uni...,,cb2 1tn,cambridge,"{'original': 'cb2 1tn', 'postcode': 'CB2 1TN',..."
8509,cordis,238017,the chancellor masters and scholars of the uni...,,cb2 1tn,cambridge,"{'original': 'cb2 1tn', 'postcode': 'CB2 1TN',..."
8511,cordis,627433,the chancellor masters and scholars of the uni...,,cb2 1tn,cambridge,"{'original': 'cb2 1tn', 'postcode': 'CB2 1TN',..."
8512,cordis,251834,the chancellor masters and scholars of the uni...,,cb2 1tn,cambridge,"{'original': 'cb2 1tn', 'postcode': 'CB2 1TN',..."
8513,cordis,322140,the chancellor masters and scholars of the uni...,,cb2 1tn,cambridge,"{'original': 'cb2 1tn', 'postcode': 'CB2 1TN',..."
8514,cordis,260872,the chancellor masters and scholars of the uni...,,cb2 1tn,cambridge,"{'original': 'cb2 1tn', 'postcode': 'CB2 1TN',..."
8517,cordis,624067,the chancellor masters and scholars of the uni...,,cb2 1tn,cambridge,"{'original': 'cb2 1tn', 'postcode': 'CB2 1TN',..."


In [6]:
# Discard the rows for which no parsed postcode was stored (None values).
has_parsed_postcode = df['parsed_postcode'].notna()
df = df[has_parsed_postcode]
df.head(50)

Unnamed: 0,dataset,id,name,short_name,postcode,city,parsed_postcode
0,cordis,323260,nuclear decommissioning authority - nda,nda,ca24 3hu,moor row,"{'original': 'ca24 3hu', 'postcode': 'CA24 3HU..."
1,cordis,219438,south west tourism limited,south west tourism,ex2 5wt,exeter,"{'original': 'ex2 5wt', 'postcode': 'EX2 5WT',..."
2,cordis,219438,welsh government,welsh government,cf10 3nq,cardiff,"{'original': 'cf10 3nq', 'postcode': 'CF10 3NQ..."
3,cordis,226103,terrasalus limited,terrasalus,le15 9el,oakham,"{'original': 'le15 9el', 'postcode': 'LE15 9EL..."
4,cordis,282558,ol pharma partners ltd,pharmivation,sg4 7dp,weston herts,"{'original': 'sg4 7dp', 'postcode': 'SG4 7DP',..."
5,cordis,601714,forsite diagnostics ltd,forsite diagnostics,yo41 1lz,york,"{'original': 'yo41 1lz', 'postcode': 'YO41 1LZ..."
6,cordis,241839,st george's hospital medical school,,sw17 0re,london,"{'original': 'sw17 0re', 'postcode': 'SW17 0RE..."
7,cordis,201720,genetic alliance uk ltd,gauk,n1 3qp,london,"{'original': 'n1 3qp', 'postcode': 'N1 3QP', '..."
8,cordis,601852,fetal medicine foundation,fetal medicine found,w1g 6bg,london,"{'original': 'w1g 6bg', 'postcode': 'W1G 6BG',..."
9,cordis,241873,centre of the cell,centre of the cell,e1 2ab,london,"{'original': 'e1 2ab', 'postcode': 'E1 2AB', '..."


In [7]:
# Inspect the last rows of the frame after the unparsed rows were dropped.
df.tail()

Unnamed: 0,dataset,id,name,short_name,postcode,city,parsed_postcode
106305,gtr,01F2924C-FFB4-481B-B8F0-31234D33F0FA,university of exeter,,ex4 4sb,,"{'original': 'ex4 4sb', 'postcode': 'EX4 4SB',..."
106306,gtr,0433C2E4-BD74-4E19-A10F-04877DD9411B,spearhead marketing limited,,cb8 8jp,,"{'original': 'cb8 8jp', 'postcode': 'CB8 8JP',..."
106307,gtr,0648C484-FB54-4EE8-8AA5-0CCC73A1F54C,bgr training limited,,g41 3qs,,"{'original': 'g41 3qs', 'postcode': 'G41 3QS',..."
106310,gtr,0D4FDCE2-32C8-4FC1-A582-331BEE2B5C1F,disyn biotc,,wa5 3pz,,"{'original': 'wa5 3pz', 'postcode': 'WA5 3PZ',..."
106311,gtr,E15F7794-DE6A-4BF3-814D-075FDAE2E332,x-io technologies limited,,bs4 2js,,"{'original': 'bs4 2js', 'postcode': 'BS4 2JS',..."


In [8]:
# get type of parsed_postcode
# Use positional access: rows have been dropped without resetting the index,
# so the label 0 is not guaranteed to exist and `[0]` could raise a KeyError.
print(type(df["parsed_postcode"].iloc[0]))

<class 'dict'>


In [9]:
# Show the last 10 rows of the frame.
df.tail(10)

Unnamed: 0,dataset,id,name,short_name,postcode,city,parsed_postcode
106297,gtr,DC85C5E8-F95E-48A9-B3E8-23110BF82DF9,ticker limited,,gu8 6bq,,"{'original': 'gu8 6bq', 'postcode': 'GU8 6BQ',..."
106298,gtr,DDBC0327-903E-4CC0-970F-231601349D9E,ekaterra r&d,,mk44 1lq,,"{'original': 'mk44 1lq', 'postcode': 'MK44 1LQ..."
106300,gtr,DE0B2CDD-AF6A-4E4B-83C2-0A70968D934B,hdr uk,,nw1 2be,,"{'original': 'nw1 2be', 'postcode': 'NW1 2BE',..."
106301,gtr,DE2A3BD5-4503-420D-AC8A-1C14E0AC401F,sartorius stedim lab limited,,gl10 3ut,,"{'original': 'gl10 3ut', 'postcode': 'GL10 3UT..."
106303,gtr,E17A6DDD-8D5F-4E58-8AB0-2628D5266F1E,recon waste management ltd,,bt62 1ux,,"{'original': 'bt62 1ux', 'postcode': 'BT62 1UX..."
106305,gtr,01F2924C-FFB4-481B-B8F0-31234D33F0FA,university of exeter,,ex4 4sb,,"{'original': 'ex4 4sb', 'postcode': 'EX4 4SB',..."
106306,gtr,0433C2E4-BD74-4E19-A10F-04877DD9411B,spearhead marketing limited,,cb8 8jp,,"{'original': 'cb8 8jp', 'postcode': 'CB8 8JP',..."
106307,gtr,0648C484-FB54-4EE8-8AA5-0CCC73A1F54C,bgr training limited,,g41 3qs,,"{'original': 'g41 3qs', 'postcode': 'G41 3QS',..."
106310,gtr,0D4FDCE2-32C8-4FC1-A582-331BEE2B5C1F,disyn biotc,,wa5 3pz,,"{'original': 'wa5 3pz', 'postcode': 'WA5 3PZ',..."
106311,gtr,E15F7794-DE6A-4BF3-814D-075FDAE2E332,x-io technologies limited,,bs4 2js,,"{'original': 'bs4 2js', 'postcode': 'BS4 2JS',..."


In [10]:
# Keep the cleaned input text under `original_postcode` so it does not clash
# with the `postcode` field produced by the parser.
df = df.rename(columns={"postcode": "original_postcode"})

# Expand each parsed dict into one column per field.
df_normalised = pd.json_normalize(df['parsed_postcode'])

# Renumber both frames from 0 so the column-wise concat pairs rows by position
# rather than by the gappy index labels left behind by the earlier dropna calls.
df_combined = pd.concat(
    [frame.reset_index(drop=True) for frame in (df, df_normalised)],
    axis=1,
)

# save
df_combined.to_csv("data/geolocations/orgs_names_postcodes_cleaned.csv", index=False)

In [11]:
# Inspect the last rows (up to 50) of the combined frame, including the parsed-field columns.
df_combined.tail(50)

Unnamed: 0,dataset,id,name,short_name,original_postcode,city,parsed_postcode,original,postcode,incode,outcode,area,district,sub_district,sector,unit,fix_distance,is_in_ons_postcode_directory
72478,gtr,A623CD38-0CB2-4672-B4CB-70D35D3B0083,disruptieve ltd,,ec2a 4ne,,"{'original': 'ec2a 4ne', 'postcode': 'EC2A 4NE...",ec2a 4ne,EC2A 4NE,4NE,EC2A,EC,EC2,EC2A,EC2A 4,NE,0,True
72479,gtr,A8E654D3-E2CB-4730-8755-639FE48000FD,legrand electric limited,,b19 2lf,,"{'original': 'b19 2lf', 'postcode': 'B19 2LF',...",b19 2lf,B19 2LF,2LF,B19,B,B19,,B19 2,LF,0,True
72480,gtr,ABA2B172-B299-4585-B5D8-601DDE5F620E,quic (uk) limited,,ll13 7gw,,"{'original': 'll13 7gw', 'postcode': 'LL13 7GW...",ll13 7gw,LL13 7GW,7GW,LL13,LL,LL13,,LL13 7,GW,0,True
72481,gtr,ABE0BBA0-6595-4B2E-894B-563D5F8D9F93,grow-wellbeing cic,,ch41 6nd,,"{'original': 'ch41 6nd', 'postcode': 'CH41 6ND...",ch41 6nd,CH41 6ND,6ND,CH41,CH,CH41,,CH41 6,ND,0,True
72482,gtr,ACDFD9C2-230E-4063-B9A7-1660AC7ECB99,sector health ltd,,bn1 1ad,,"{'original': 'bn1 1ad', 'postcode': 'BN1 1AD',...",bn1 1ad,BN1 1AD,1AD,BN1,BN,BN1,,BN1 1,AD,0,True
72483,gtr,ACE761D4-BBA1-45BF-A495-20A7D4273204,parallel wireless uk limited,,gu47 0qa,,"{'original': 'gu47 0qa', 'postcode': 'GU47 0QA...",gu47 0qa,GU47 0QA,0QA,GU47,GU,GU47,,GU47 0,QA,0,True
72484,gtr,B4915CAB-7E45-44A7-9E07-6D48D792D88E,diamond light source limited,,ox11 0de,,"{'original': 'ox11 0de', 'postcode': 'OX11 0DE...",ox11 0de,OX11 0DE,0DE,OX11,OX,OX11,,OX11 0,DE,0,True
72485,gtr,B4DE2E42-A68E-4D90-B9D5-3FBDF00D0447,censis,,g1 1rd,,"{'original': 'g1 1rd', 'postcode': 'G1 1RD', '...",g1 1rd,G1 1RD,1RD,G1,G,G1,,G1 1,RD,0,True
72486,gtr,F8BF467D-E283-482B-A3A3-51B1B23F8C2D,cassels farm limited,,ba12 0qq,,"{'original': 'ba12 0qq', 'postcode': 'BA12 0QQ...",ba12 0qq,BA12 0QQ,0QQ,BA12,BA,BA12,,BA12 0,QQ,0,True
72487,gtr,FE006F58-88C3-40CF-A595-4FCDB3D8A21F,planning democracy,,ky11 3lg,,"{'original': 'ky11 3lg', 'postcode': 'KY11 3LG...",ky11 3lg,KY11 3LG,3LG,KY11,KY,KY11,,KY11 3,LG,0,True


In [12]:
# All rows for postcode CB2 1TN, matched on the parser's upper-case `postcode` column.
is_cb2_1tn = df_combined['postcode'].eq("CB2 1TN")
df_cb2 = df_combined.loc[is_cb2_1tn]
df_cb2.head()

Unnamed: 0,dataset,id,name,short_name,original_postcode,city,parsed_postcode,original,postcode,incode,outcode,area,district,sub_district,sector,unit,fix_distance,is_in_ons_postcode_directory
8386,cordis,613368,the chancellor masters and scholars of the uni...,,cb2 1tn,cambridge,"{'original': 'cb2 1tn', 'postcode': 'CB2 1TN',...",cb2 1tn,CB2 1TN,1TN,CB2,CB,CB2,,CB2 1,TN,0,True
8389,cordis,234217,the chancellor masters and scholars of the uni...,,cb2 1tn,cambridge,"{'original': 'cb2 1tn', 'postcode': 'CB2 1TN',...",cb2 1tn,CB2 1TN,1TN,CB2,CB,CB2,,CB2 1,TN,0,True
8390,cordis,617391,the chancellor masters and scholars of the uni...,,cb2 1tn,cambridge,"{'original': 'cb2 1tn', 'postcode': 'CB2 1TN',...",cb2 1tn,CB2 1TN,1TN,CB2,CB,CB2,,CB2 1,TN,0,True
8391,cordis,302490,the chancellor masters and scholars of the uni...,,cb2 1tn,cambridge,"{'original': 'cb2 1tn', 'postcode': 'CB2 1TN',...",cb2 1tn,CB2 1TN,1TN,CB2,CB,CB2,,CB2 1,TN,0,True
8392,cordis,238017,the chancellor masters and scholars of the uni...,,cb2 1tn,cambridge,"{'original': 'cb2 1tn', 'postcode': 'CB2 1TN',...",cb2 1tn,CB2 1TN,1TN,CB2,CB,CB2,,CB2 1,TN,0,True


# We can now look at the organisations with postcodes that specify sub_district and those that don't.
Those that don't are likely to be rough approximations of the location of the organisation. We can also match on incode, outcode, area, district and sub_district. 

We can also see in "is_in_ons_postcode_directory" whether a postcode is no longer in use (will be set to False). Could indicate an org has moved. 

Now we can try matching...
1. Get df_id_postcode = df[["id", "postcode"]]

Get list of ids for each postcode

In [13]:
# Only the id and the parsed postcode are needed for the grouping.
df_id_pc = df_combined.loc[:, ["id", "postcode"]].copy()

# One row per postcode, with every id recorded at that postcode collected into a list.
ids_by_postcode = df_id_pc.groupby("postcode")["id"].apply(list)
df_grouped = ids_by_postcode.reset_index(name="org_ids")
df_grouped.head(50)



Unnamed: 0,postcode,org_ids
0,A9 2VW,[6B678B02-3DAB-41F1-BA5E-54B510304C7F]
1,A9 8HF,[F971E743-BECD-4A6A-B547-8E26C6F7C2EB]
2,AA1 1AA,[B439F284-54EB-4933-BA31-3797FBE211CC]
3,AB10 1AB,"[7ED73A50-A205-49E0-A635-2CB09B1555E3, 4BD11E7..."
4,AB10 1BL,[825A0B07-A723-4AFE-BF05-03B79719FA6E]
5,AB10 1DQ,"[673664, 0EB554AB-6EC9-4283-ACF2-28165A3D5CA8]"
6,AB10 1FE,[77944907-3066-483B-A141-EB76BCBB0814]
7,AB10 1FQ,[F8C26AE4-AC08-4020-BA02-111087D5AC4F]
8,AB10 1FR,"[2C2EF413-6B81-4F48-BE30-58F0BF3291F0, D2CEE12..."
9,AB10 1FW,"[196C380E-218E-4E8B-89DA-1B84D05AD536, C1FBF68..."


Remove any postcodes with fewer than 2 ids.


In [14]:
# Keep only the postcodes whose id list has more than one entry.
has_multiple_ids = df_grouped["org_ids"].map(len) > 1
df_grouped = df_grouped[has_multiple_ids]


In [15]:
# Preview the grouped frame after single-id postcodes were removed.
df_grouped.head()

Unnamed: 0,postcode,org_ids
3,AB10 1AB,"[7ED73A50-A205-49E0-A635-2CB09B1555E3, 4BD11E7..."
5,AB10 1DQ,"[673664, 0EB554AB-6EC9-4283-ACF2-28165A3D5CA8]"
8,AB10 1FR,"[2C2EF413-6B81-4F48-BE30-58F0BF3291F0, D2CEE12..."
9,AB10 1FW,"[196C380E-218E-4E8B-89DA-1B84D05AD536, C1FBF68..."
10,AB10 1FY,"[303467, 278192, 621228, 690713, 671426, 73558..."


In [16]:
# what postcode has the most ids and how many does it have?
# Count the ids per postcode once and reuse the counts for both the maximum and the row filter.
ids_per_postcode = df_grouped["org_ids"].apply(len)
max_ids = ids_per_postcode.max()
max_ids_postcode = df_grouped[ids_per_postcode == max_ids]
print(max_ids)
print(max_ids_postcode)


1621
     postcode                                            org_ids
3399  CB2 1TN  [613368, 234217, 617391, 302490, 238017, 62743...


# One Cambridge postcode (CB2 1TN) has 1,621 ids, and it is not yet clear why

Next step is to look within those groups and see if there are multiple org names. 

Similarly, groupby org names, and look for different postcodes. 