In [1]:
# modules we'll use
import pandas as pd
import numpy as np

# helpful modules
import fuzzywuzzy
from fuzzywuzzy import process
import chardet

# read in all our data
professors = pd.read_csv("PakistanIntellectualCapital.csv")

# set seed for reproducibility
np.random.seed(0)

professors.head()

Unnamed: 0,S#,Teacher Name,University Currently Teaching,Department,Province University Located,Designation,Terminal Degree,Graduated from,Country,Year,Area of Specialization/Research Interests,"Other Information""",Unnamed: 12
0,0,Dr. Ihsan Ullah,University of Balochistan,Computer Science & IT,Balochistan,Assistant Professor,PhD,,France,,P2P Networks,"Telecommunication Systems""""",
1,1,Dr. Atiq Ahmed,University of Balochistan,Computer Science & IT,Balochistan,Associate Professor,PhD,,France,,Wireless Networks,Telecommunication Systems,
2,2,Dr. Abdul Basit,University of Balochistan,Computer Science & IT,Balochistan,Assistant Professor,PhD,Asian Institute of Technology,Thailand,,Software Engineering & DBMS,",\r\n3,Mr. Muhammad Khalid Badini,University o...",
3,4,Dr. Waheed Noor,University of Balochistan,Computer Science & IT,Balochistan,Assistant Professor,PhD,Asian Institute of Technology,Thailand,,DBMS,",\r\n5,Dr. Junaid Baber,University of Balochis...",
4,11,Mr. Jalaluddin,University of Balochistan,Computer Science & IT,Balochistan,Lecturer,Mphil,,,,Statistics & Computer Architecture,",\r\n12,Dr. Liaquat,University of Balochistan,...",


In [2]:
# get all the unique values in the 'Country' column
countries = professors['Country'].fillna("notacountry").unique()

# sort them alphabetically and then take a closer look
countries.sort()
countries

array([' Engineering and Management Sciences""', ' usOfa', 'France',
       'Pak istan', 'Pakistan', 'Thailand', 'notacountry', 'us ofa',
       'usof a', 'usofa'], dtype=object)

In [3]:
# convert to lower case
professors['Country'] = professors['Country'].str.lower()
# remove trailing white spaces
professors['Country'] = professors['Country'].str.strip()
# replace NaN
professors['Country'] = professors['Country'].fillna("notacountry")

professors['Country'].unique()

array(['france', 'thailand', 'notacountry',
       'engineering and management sciences""', 'usofa', 'pakistan',
       'us ofa', 'usof a', 'pak istan'], dtype=object)

In [4]:
# get the top 10 closest matches to "usofa"
matches = fuzzywuzzy.process.extract("usofa", countries, limit=10, scorer=fuzzywuzzy.fuzz.token_sort_ratio)

# take a look at them
matches

[(' usOfa', 100),
 ('usofa', 100),
 ('usof a', 73),
 ('us ofa', 55),
 ('France', 36),
 ('Pakistan', 31),
 ('Pak istan', 29),
 ('notacountry', 25),
 ('Thailand', 15),
 (' Engineering and Management Sciences""', 5)]

In [5]:
# function to replace rows in the provided column of the provided dataframe
# that match the provided string above the provided ratio with the provided string
def replace_matches_in_column(df, column, string_to_match, min_ratio = 47):
    # get a list of unique strings
    strings = df[column].unique()
    
    # get the top 10 closest matches to our input string
    matches = fuzzywuzzy.process.extract(string_to_match, strings, 
                                         limit=10, scorer=fuzzywuzzy.fuzz.token_sort_ratio)

    # only get matches with a ratio > 90
    close_matches = [matches[0] for matches in matches if matches[1] >= min_ratio]

    # get the rows of all the close matches in our dataframe
    rows_with_matches = df[column].isin(close_matches)

    # replace all rows with close matches with the input matches 
    df.loc[rows_with_matches, column] = string_to_match
    
    # let us know the function's done
    print("All done!")

In [6]:
# use the function we just wrote to replace close matches to "south korea" with "south korea"
replace_matches_in_column(df=professors, column='Country', string_to_match="usofa")

professors['Country'].unique()

All done!


array(['france', 'thailand', 'notacountry',
       'engineering and management sciences""', 'usofa', 'pakistan',
       'pak istan'], dtype=object)