In [2]:
import pandas as pd
from fuzzywuzzy import process, fuzz



In [3]:
# Load the organisation records from the local JSON export and replace
# missing values with empty strings.
df = (
    pd.read_json('diversityorgs.tech.json')
    .fillna('')
)

In [4]:
# Print the frame's column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 494 entries, 0 to 493
Data columns (total 17 columns):
 #   Column                                   Non-Null Count  Dtype 
---  ------                                   --------------  ----- 
 0   name                                     494 non-null    object
 1   url                                      494 non-null    object
 2   links                                    494 non-null    object
 3   technology_focus                         494 non-null    object
 4   organization_logo                        494 non-null    object
 5   id                                       494 non-null    object
 6   diversity_focus                          494 non-null    object
 7   city                                     494 non-null    object
 8   parent_organization                      494 non-null    object
 9   global_org_url_from_parent_organization  494 non-null    object
 10  twitter                                  494 non-null    objec

In [5]:
# Trim surrounding whitespace from the free-text columns (in place on `df`),
# then report how many distinct values each column holds.
for col in ('name', 'city'):
    df[col] = df[col].str.strip()
    print(f'Number of unique values in {col}: {df[col].nunique()}')

Number of unique values in name: 494
Number of unique values in city: 367


In [126]:
# This shows where duplicates may exist
def replacements(cities=None, min_score=80, max_score=99, limit=5):
    """Find pairs of city spellings that look alike but are not identical.

    Every unique city is fuzzy-matched against the full list of unique cities
    with ``fuzz.token_sort_ratio``; pairs whose score lies in
    ``[min_score, max_score]`` are returned as candidate duplicates.

    Parameters
    ----------
    cities : iterable of str, optional
        City names to compare. When omitted, the ``city`` column of the
        notebook-level ``df`` is used.
    min_score : int, default 80
        Lowest score (inclusive) for a pair to be reported.
    max_score : int, default 99
        Highest score (inclusive) for a pair to be reported. The default of 99
        leaves out score-100 pairs, e.g. a city matched against itself.
    limit : int, default 5
        Maximum number of matches requested from ``process.extract`` per city.
        NOTE: with the default, at most five candidates are considered for
        each city, so a city with many near-duplicates may be under-reported.

    Returns
    -------
    pandas.DataFrame
        One row per reported pair with columns ``city_sort`` (the query city),
        ``match_sort`` (the matched city) and ``score_sort`` (the score).
    """
    source = df['city'] if cities is None else pd.Series(list(cities), dtype=object)
    unique_city = source.unique().tolist()

    # process.extract yields (match, score) tuples; prefix each with the query
    # city so every record reads (query, match, score).
    score_sort = [
        (city,) + match
        for city in unique_city
        for match in process.extract(
            city, unique_city, scorer=fuzz.token_sort_ratio, limit=limit
        )
    ]

    # Create a dataframe from the tuples
    similarity_sort = pd.DataFrame(
        score_sort, columns=['city_sort', 'match_sort', 'score_sort']
    )
    in_window = similarity_sort['score_sort'].between(min_score, max_score)
    return similarity_sort[in_window]

In [129]:
# Count, per matched spelling, how many other spellings point at it, and show
# only the matched spellings with at least four candidates.
groups = replacements().groupby(by=['match_sort'])
match_counts = groups.count()
match_counts[match_counts['city_sort'] >= 4]



Unnamed: 0_level_0,city_sort,score_sort
match_sort,Unnamed: 1_level_1,Unnamed: 2_level_1
Washington D.C.,4,4
"Washington, D.C., US",4,4
"Washington, US",4,4


In [123]:
# Get the candidate-duplicate table and check it against a value:
# keep only the rows whose matched spelling contains 'Houston'.
unique_list = replacements()
unique_list[unique_list['match_sort'].str.contains('Houston')]

Unnamed: 0,city_sort,match_sort,score_sort
281,"Houston, Texas","Houston, TX",87
981,Houston,"Houston, TX",82
1341,"Houston, TX","Houston, Texas",87
1342,"Houston, TX",Houston,82


In [124]:
def unify(df, match_series, city_sort_value):
    """Rewrite variant city spellings in ``df`` to one chosen spelling.

    Rows of ``match_series`` whose ``match_sort`` equals ``city_sort_value``
    supply the variant spellings (their ``city_sort`` values). Every row of
    ``df`` whose ``city`` is one of those variants has its ``city`` overwritten
    with ``city_sort_value``. ``df`` is modified in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``city`` column; mutated by this call.
    match_series : pandas.DataFrame
        Table with ``city_sort`` and ``match_sort`` columns.
    city_sort_value : str
        The spelling to standardise on.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose ``city`` equals ``city_sort_value`` after the
        rewrite.
    """
    points_at_target = match_series['match_sort'] == city_sort_value
    variants = match_series.loc[points_at_target, 'city_sort']

    # Overwrite through .loc so the assignment lands on `df` itself.
    uses_variant = df['city'].isin(variants)
    df.loc[uses_variant, 'city'] = city_sort_value

    has_target = df['city'] == city_sort_value
    return df.loc[has_target]

In [125]:
# Standardise every Houston variant on 'Houston, TX'. unify() needs the table
# of candidate matches, so call replacements() rather than passing the function.
unify(df, replacements(), 'Houston, TX')

Unnamed: 0,name,url,links,technology_focus,organization_logo,id,diversity_focus,city,parent_organization,global_org_url_from_parent_organization,twitter,github,website,meetup,region,notes,donate
62,Girl Develop It! Houston,http://girldevelopit.com/,[https://twitter.com/@gdihouston],[Youth Education],https://pbs.twimg.com/profile_images/123317612...,ece4e2c91e2fe42f6ccc5bc5b33a72498ca7c013,[Women],"Houston, TX",Girl Develop It!,https://girldevelopit.com,,,,,AMER-C-South,,
170,Houston Latinas in Tech,https://www.latinasintech.org/chapter/houston,,[General Technology],https://www.latinasintech.org/wp-content/uploa...,7cbcbeffc60d73407ad573263dddeb622a04d592,"[Latinx, Women]","Houston, TX",Latinas in Tech,https://www.latinasintech.org/,,,,,AMER-C-South,,
255,Blacks in Technology - Houston,https://www.meetup.com/bithouston/,,[General Technology],https://kjaymiller.s3-us-west-2.amazonaws.com/...,ca08bad6905889e97f1088a3a54ffc14e96c4737,[BIPOC],"Houston, TX",Blacks in Technology Foundation,https://blacksintechnology.net,,,,,,,
314,BUILT Houston,https://www.meetup.com/blacks-united-in-leadin...,[https://www.meetup.com/blacks-united-in-leadi...,[General Technology],https://builtinternational.org/wp-content/uplo...,a1488eaa73993cea89b28162ccede9a7b983beed,[BIPOC],"Houston, TX",Blacks United in Leading Technology International,https://builtinternational.org,,,,,,,
347,PyLadies Houston,https://pyladies.com/locations/,"[https://www.meetup.com/Houston_PyLadies/, htt...",[Python],https://pyladies.com/assets/images/pylady_geek...,b966e1bcfac096984cf62f2d440caf61a56ab0ff,[Women],"Houston, TX",PyLadies,https://pyladies.com,,,,,,,
374,Blacks in Cybersecurity University of Houston,https://www.blacksincyberconf.com,,[Security],https://static.wixstatic.com/media/05d08c_80fb...,3e3fad5cded6fe12d4ae60e8e79d2634b679ce84,[BIPOC],"Houston, TX",Blacks in CyberSecurity,https://www.blacksincyberconf.com,,,,,AMER-C-South,,
399,Tech for the Culture,,[https://twitter.com/@techculturetx],[General Technology],,86dad0c1191ca3c70cb6980e5d48d10a0bb10dad,[BIPOC],"Houston, TX",,,,,,,AMER-C-South,Reggie (One of the organizers) just recently j...,
