In [3]:
import pandas as pd
from fuzzywuzzy import process, fuzz



In [4]:
# Load the organisations export and replace every missing value with an empty string.
df = (
    pd.read_json('diversityorgs.tech.json')
    .fillna('')
)

In [5]:
# Print a summary of the loaded frame: index range, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 494 entries, 0 to 493
Data columns (total 17 columns):
 #   Column                                   Non-Null Count  Dtype 
---  ------                                   --------------  ----- 
 0   name                                     494 non-null    object
 1   url                                      494 non-null    object
 2   links                                    494 non-null    object
 3   technology_focus                         494 non-null    object
 4   organization_logo                        494 non-null    object
 5   id                                       494 non-null    object
 6   diversity_focus                          494 non-null    object
 7   city                                     494 non-null    object
 8   parent_organization                      494 non-null    object
 9   global_org_url_from_parent_organization  494 non-null    object
 10  twitter                                  494 non-null    objec

In [6]:
# Trim surrounding whitespace from the text columns (in place on df),
# then report how many distinct values each one holds afterwards.
for col in ('name', 'city'):
    df[col] = df[col].str.strip()
    print(f'Number of unique values in {col}: {df[col].nunique()}')

Number of unique values in name: 494
Number of unique values in city: 367


In [7]:
# This shows where duplicates may exist
def replacements(data=None, column='city', min_score=80, max_score=99, limit=5):
    """Pair each unique value of a column with its closest fuzzy matches.

    Every unique value is scored against the full list of unique values using
    ``fuzz.token_sort_ratio``, and only pairs whose score lies in
    ``[min_score, max_score]`` (inclusive) are returned. With the default
    ``max_score`` of 99, a score of 100 -- e.g. a value compared with itself --
    is excluded, so the result lists near-duplicates only.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame to inspect. When omitted, the notebook-level ``df`` is used,
        which keeps existing ``replacements()`` calls working.
    column : str, default 'city'
        Column whose unique values are compared with one another.
    min_score, max_score : int, default 80 and 99
        Inclusive bounds on the similarity score of the pairs to keep.
    limit : int, default 5
        Maximum number of candidates requested from ``process.extract`` for
        each value (5 is that function's own default).

    Returns
    -------
    pandas.DataFrame
        Columns ``city_sort`` (the value that was looked up), ``match_sort``
        (the candidate it was compared with) and ``score_sort`` (their
        similarity score). The column names stay the same whatever ``column``
        is, so code written against them keeps working.
    """
    source = df if data is None else data
    unique_values = source[column].unique().tolist()
    # One (queried value, candidate, score) tuple per candidate returned.
    scored_pairs = [
        (value,) + match
        for value in unique_values
        for match in process.extract(
            value, unique_values, scorer=fuzz.token_sort_ratio, limit=limit
        )
    ]
    similarity_sort = pd.DataFrame(
        scored_pairs, columns=['city_sort', 'match_sort', 'score_sort']
    )
    in_range = similarity_sort['score_sort'].between(min_score, max_score)
    return similarity_sort[in_range]

In [8]:
# Group the near-duplicate pairs by the matched value, then show only the
# matched values that collected at least 4 pairs.
groups = replacements().groupby(['match_sort'])
groups.count().loc[lambda counts: counts['city_sort'] >= 4]



Unnamed: 0_level_0,city_sort,score_sort
match_sort,Unnamed: 1_level_1,Unnamed: 2_level_1
Washington D.C.,4,4
"Washington, D.C., US",4,4
"Washington, US",4,4


In [10]:
# Build the table of near-duplicate pairs, then show the pairs whose matched
# value contains "Washington".
unique_list = replacements()
unique_list.loc[unique_list['match_sort'].str.contains('Washington')]



Unnamed: 0,city_sort,match_sort,score_sort
126,"Washington, DC",Washington D.C.,89
127,"Washington, DC","Washington, US",85
128,"Washington, DC","Washington, D.C., US",80
961,"Washington, US","Washington, D.C., US",87
962,"Washington, US","Washington, DC",85
963,"Washington, US","Washington D.C., USA",84
964,"Washington, US",Washington D.C.,81
1176,Washington D.C.,"Washington, D.C., US",90
1177,Washington D.C.,"Washington, DC",89
1178,Washington D.C.,"Washington D.C., USA",88


In [None]:
def unify(df, match_series, city_sort_value):
    """
    Rewrite near-duplicate city spellings to one chosen value.

    Looks up the rows of match_series whose 'match_sort' equals
    city_sort_value, collects their 'city_sort' entries, and overwrites
    df['city'] with city_sort_value wherever it holds one of those entries.
    df is modified in place.

    Returns the rows of df whose 'city' equals city_sort_value after the
    rewrite (rows that already had that value are included).
    """
    is_target = match_series['match_sort'].eq(city_sort_value)
    variants = match_series.loc[is_target, 'city_sort']
    needs_rewrite = df['city'].isin(variants)
    df.loc[needs_rewrite, ['city']] = city_sort_value
    now_unified = df['city'].eq(city_sort_value)
    return df.loc[now_unified]

In [12]:
# Pick the spelling to keep and rewrite every city that was fuzzy-matched to it
# (this changes df in place).
# NOTE(review): the result lists "Girls in Tech Seattle" under "Washington, DC" --
# presumably its city was a "Washington, US"-style value; confirm before trusting the merge.
unify(df=df, match_series=unique_list, city_sort_value='Washington, DC')

Unnamed: 0,name,url,links,technology_focus,organization_logo,id,diversity_focus,city,parent_organization,global_org_url_from_parent_organization,twitter,github,website,meetup,region,notes,donate
31,PyLadies Washington,https://pyladies.com/locations/dc,"[https://www.meetup.com/dc-pyladies/, https://...",[Python],https://pyladies.com/assets/images/pyladies_dc...,a4f6c0f5ce522ef6eb339699c006fd9236b0370f,[Women],"Washington, DC",PyLadies,https://pyladies.com,,,dc,,,,
172,GirlDevelopIt DC,http://www.meetup.com/Girl-Develop-It-DC/,"[http://www.meetup.com/Girl-Develop-It-DC/, ht...",[Youth Education],https://pbs.twimg.com/profile_images/123317612...,df8806b63b5411b3db2059813b680e2cbb464e05,[Women],"Washington, DC",Girl Develop It!,https://girldevelopit.com,,,,,AMER-East,,
173,Out in Tech Washington DC,https://outintech.com,,[General Technology],https://kjaymiller.s3-us-west-2.amazonaws.com/...,8a186e5e412b53daae7498a4fbea25f8a9a8d1a4,[LGBTQIA+],"Washington, DC",Out in Tech,http://outintech.com/subscribe,,,,,AMER-East,,
192,WISP DC,https://www.wisporg.com/pagedc,,[Cyber Security],https://kjaymiller.s3-us-west-2.amazonaws.com/...,690d5c1b082251ecf9cd20ffbb5d4067a5d0009b,[Women],"Washington, DC",Women in Security and Privacy,https://www.wisporg.com/,,,,,AMER-East,,
201,BDPA DC,https://bdpadc.org,[https://twitter.com/@bdpadc],[General Technology],https://bdpa.org/wp-content/uploads/2020/07/bd...,7b7b65c4c483e551230743336891399e53d2077e,[BIPOC],"Washington, DC",Black Data Processing Association,https://www.bdpa.org,,,,,AMER-East,,
241,Girls in Tech Seattle,https://seattle.girlsintech.org,,[General Technology],https://kjaymiller.s3-us-west-2.amazonaws.com/...,90f3e05716263bbe5ec3f6118237c1d4b08230c1,[Women],"Washington, DC",Girls in Tech,https://girlsintech.com,,,,,,,
289,Blacks in Technology - Washington D.C.,https://www.meetup.com/Washington-Information-...,,[General Technology],https://kjaymiller.s3-us-west-2.amazonaws.com/...,a9b0405ffc7907bc8d9bbd79d7d009c857722592,[BIPOC],"Washington, DC",Blacks in Technology Foundation,https://blacksintechnology.net,,,,,,,
329,Girls in Tech Washington,https://dc.girlsintech.org,,[General Technology],https://kjaymiller.s3-us-west-2.amazonaws.com/...,a4581bb0b786f7ef173263e3b87add36702841a4,[Women],"Washington, DC",Girls in Tech,https://girlsintech.com,,,,,,,
393,Blacks in Cybersecurity UDC,https://www.blacksincyberconf.com,,[Security],https://static.wixstatic.com/media/05d08c_80fb...,ad3f93e1254208567cdc2d6c2e15934a7b1029b5,[BIPOC],"Washington, DC",Blacks in CyberSecurity,https://www.blacksincyberconf.com,,,,,AMER-East,,
395,Blacks in Cybersecurity Howard University,https://www.blacksincyberconf.com,,[Security],https://static.wixstatic.com/media/05d08c_80fb...,ef39965bda129f81aec6a13b15fd516389498a1f,[BIPOC],"Washington, DC",Blacks in CyberSecurity,https://www.blacksincyberconf.com,,,,,AMER-East,,
