In [1]:
import pandas as pd
from fuzzywuzzy import process, fuzz



In [2]:
# Load the organisations file, then blank out missing values with ''
orgs_loaded = pd.read_json('updated_diversity_orgs.json')
df = orgs_loaded.fillna('')

In [3]:
# Print a summary of the frame: index range, column names, non-null counts, dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 494 entries, 0 to 493
Data columns (total 17 columns):
 #   Column                                   Non-Null Count  Dtype 
---  ------                                   --------------  ----- 
 0   name                                     494 non-null    object
 1   url                                      494 non-null    object
 2   links                                    494 non-null    object
 3   technology_focus                         494 non-null    object
 4   organization_logo                        494 non-null    object
 5   id                                       494 non-null    object
 6   diversity_focus                          494 non-null    object
 7   city                                     494 non-null    object
 8   parent_organization                      494 non-null    object
 9   global_org_url_from_parent_organization  494 non-null    object
 10  twitter                                  494 non-null    object

In [4]:
# Trim surrounding whitespace in the text columns, then report how many
# distinct values each one holds.
for col in ('name', 'city'):
    df[col] = df[col].str.strip()
    print(f'Number of unique values in {col}: {df[col].nunique()}')

Number of unique values in name: 494
Number of unique values in city: 324


In [5]:
# This shows where duplicates may exist
def replacements(df=df):
    """Find city spellings that look like near-duplicates of one another.

    Every distinct value of ``df['city']`` is fuzzy-matched against the full
    list of distinct values with ``fuzz.token_sort_ratio``. ``process.extract``
    is called without ``limit``, so the library's default cap on matches per
    query applies.

    Returns a DataFrame with the columns ``city_sort`` (the city used as the
    query), ``match_sort`` (the candidate it was matched to) and
    ``score_sort`` (the similarity score), keeping only rows whose score lies
    between 80 and 99 inclusive -- perfect-score pairs are left out.
    """
    candidates = df['city'].unique().tolist()
    rows = []
    for query in candidates:
        for hit in process.extract(query, candidates, scorer=fuzz.token_sort_ratio):
            # Prefix each extract() result with the city that was queried
            rows.append((query,) + hit)
    scored = pd.DataFrame(rows, columns=['city_sort', 'match_sort', 'score_sort'])
    near_duplicate = scored['score_sort'].between(80, 99)
    return scored[near_duplicate]

In [6]:
# Count, per matched spelling, how many other spellings point at it
groups = replacements().groupby(by=['match_sort'])
group_sizes = groups.count()
group_sizes[group_sizes['city_sort'] >= 1]



Unnamed: 0_level_0,city_sort,score_sort
match_sort,Unnamed: 1_level_1,Unnamed: 2_level_1
"Belfast, United Kingdom",1,1
Birmingham,1,1
"Birmingham, USA",1,1
"Bristol, United Kingdom",1,1
"Cincinnati, United States",1,1
"Delhi, India",1,1
Denver,1,1
"Denver, CO",1,1
"Indianapolis, United States",1,1
"Knoxville, Tennessee",1,1


In [7]:
# Build the similarity table, then show the rows whose matched spelling
# contains the city of interest
unique_list = replacements()
city = 'New York'
mentions_city = unique_list['match_sort'].str.contains(city)
unique_list[mentions_city]



Unnamed: 0,city_sort,match_sort,score_sort


In [8]:
def unify(df, match_series, city_sort_value):
    """
    Rewrite similar city spellings in ``df`` to one chosen spelling.

    ``match_series`` is a frame with ``city_sort`` and ``match_sort`` columns.
    Every ``city_sort`` value paired with ``match_sort == city_sort_value`` is
    treated as a variant; rows of ``df`` whose ``city`` is one of those
    variants get ``city`` set to ``city_sort_value``.

    ``df`` is modified in place and nothing is returned.
    """
    paired_with_target = match_series['match_sort'] == city_sort_value
    variants = match_series.loc[paired_with_target, 'city_sort']
    rows_to_fix = df['city'].isin(variants)
    df.loc[rows_to_fix, ['city']] = city_sort_value

In [9]:
# Canonical city spellings that near-duplicate values should be rewritten to.
# Every entry ends with a comma: adjacent string literals without one are
# silently concatenated into a single element.
cities = [
    'Washington, DC',
    'Atlanta, GA',
    'Austin, TX',
    'Boston, MA',
    'Los Angeles, CA',
    'Houston, TX',
    'Cleveland, OH',
    'San Diego, CA',
    'Dallas, TX',
    'Portland, OR',
    'Raleigh-Durham, NC',
    'San Francisco, CA',
    'Seattle, WA',
    'Rio De Janeiro, Brazil',
    'London, UK',
    'Twin Cities, USA',
    'São Paulo, Brazil',
    'Tel Aviv, Israel',
    'New York City, NY',
    ]

In [10]:
# For each canonical spelling in `cities`, rewrite the rows of df whose city
# was paired with that spelling in `unique_list` (the similarity table built
# by replacements() in an earlier cell). unify() modifies df in place.
for city in cities:
    unify(df, unique_list, city)

In [11]:
# Brute-force override: every row whose city contains the text 'London'
# is relabelled, regardless of what else the value says
london_rows = df['city'].str.contains('London')
df.loc[london_rows, ['city']] = "London, United Kingdom"

In [12]:
# Spot-check the result: display every row whose city contains "London"
df[df['city'].str.contains("London")]

Unnamed: 0,name,url,links,technology_focus,organization_logo,id,diversity_focus,city,parent_organization,global_org_url_from_parent_organization,twitter,github,website,meetup,region,notes,donate
1,Coding Black Females,https://codingblackfemales.com,[https://www.instagram.com/codingblackfemales/...,"[General Technology, Job Board]",https://codingblackfemales.com/img/cbf_logo.png,a4dc6f83a6217fbb023ea38d5dc740e2ca440bca,"[Women, BIPOC]","London, United Kingdom",,,,,,,,,
6,UK Black Tech - London,https://ukblacktech.com/london-hub/,"[https://www.instagram.com/ukblacktech/, https...","[General Technology, Job Board]",https://ukblacktech.com/wp-content/uploads/201...,5dc2304d7517bb8ea6977f8515ca8996ec8f38b3,[BIPOC],"London, United Kingdom",UK Black Tech,,,,,,,,
10,YSYS,https://www.thisisysys.com,"[https://twitter.com/ThisIsYSYS, https://www.l...","[Job Board, General Technology, Industry Netwo...",,343b6e3e20d7ccecb4d81626fc7f793b0e18d1ff,,"London, United Kingdom",,,,,,,,,
22,GirlCode Berlin,https://www.girl-code.co.uk/,[https://join.slack.com/t/girlcodetalk/shared_...,[General Technology],https://static.wixstatic.com/media/5ca287_17f9...,doc-6039409131f00721401abfa3,[Women],"London, United Kingdom",GirlCode,https://www.girl-code.co.uk/,,,,,,,
26,GirlCode London,https://www.girl-code.co.uk/,[https://join.slack.com/t/girlcodetalk/shared_...,[General Technology],https://static.wixstatic.com/media/5ca287_17f9...,doc-6039409131f00721401abfa1,[Women],"London, United Kingdom",GirlCode,https://www.girl-code.co.uk/,,,,,,,
106,PyLadies London,https://pyladies.com/locations/,"[https://www.meetup.com/pyladieslondon/, https...",[Python],https://pyladies.com/assets/images/pyladies_lo...,22b304741361277ebdda632af1ffacab51cbee24,[Women],"London, United Kingdom",PyLadies,https://pyladies.com,,,,,,,
114,Women Who Go - London,http://www.meetup.com/Women-Who-Go-London/,"[http://www.meetup.com/Women-Who-Go-London/, h...",[Go],http://www.womenwhogo.org/assets/images/wwglog...,613e79eb04c2c35d30057eae70d927bab68f2210,[Women],"London, United Kingdom",Women Who Go,https://womenwhogo.org,,,,,,,
119,London Latinas in Tech,https://www.latinasintech.org/chapter/london,,[General Technology],https://www.latinasintech.org/wp-content/uploa...,c2b888e89cd2835ba42d46378b15defb3b25bb8f,"[Latinx, Women]","London, United Kingdom",Latinas in Tech,https://www.latinasintech.org/,,,,,EMEA-UKN,,
124,Django Girls London,http://djangogirls.org/london/,[https://twitter.com/@DjangoGirlsLDN],"[Django, Python]",https://github.com/DjangoGirls/resources/blob/...,5c7946ee2e6f269718bfc167f00d049917286932,[Women],"London, United Kingdom",Django Girls,http://djangogirls.org,,,,,EMEA-UKN,,
188,DjangoGirls PyCon UK,https://djangogirls.org/pyconuk/,[https://twitter.com/@DjangoGirlsPyUK],"[Django, Python]",https://github.com/DjangoGirls/resources/blob/...,ba3345202e68e2cf817a6bfab143942e2e9ceb0e,[Women],"London, United Kingdom",Django Girls,http://djangogirls.org,,,,,EMEA-UKN,,


In [13]:
# Save your work as a JSON array of row records.
# NOTE: this writes to the same path the notebook loads from, so the input
# file is overwritten with the cleaned data.
df.to_json('updated_diversity_orgs.json', orient="records")