## Inconsistent Data Entry
Dataset: Pakistan intellectual capital

In [1]:
# importing necessary modules
import pandas as pd
import numpy as np
import fuzzywuzzy
from fuzzywuzzy import process
import chardet

In [2]:
# loading data
capital = pd.read_csv("Datasets/pakistan_intellectual_capital.csv")
capital.head()

Unnamed: 0.1,Unnamed: 0,S#,Teacher Name,University Currently Teaching,Department,Province University Located,Designation,Terminal Degree,Graduated from,Country,Year,Area of Specialization/Research Interests,Other Information
0,2,3,Dr. Abdul Basit,University of Balochistan,Computer Science & IT,Balochistan,Assistant Professor,PhD,Asian Institute of Technology,Thailand,,Software Engineering & DBMS,
1,4,5,Dr. Waheed Noor,University of Balochistan,Computer Science & IT,Balochistan,Assistant Professor,PhD,Asian Institute of Technology,Thailand,,DBMS,
2,5,6,Dr. Junaid Baber,University of Balochistan,Computer Science & IT,Balochistan,Assistant Professor,PhD,Asian Institute of Technology,Thailand,,"Information processing, Multimedia mining",
3,6,7,Dr. Maheen Bakhtyar,University of Balochistan,Computer Science & IT,Balochistan,Assistant Professor,PhD,Asian Institute of Technology,Thailand,,"NLP, Information Retrieval, Question Answering...",
4,24,25,Samina Azim,Sardar Bahadur Khan Women's University,Computer Science,Balochistan,Lecturer,BS,Balochistan University of Information Technolo...,Pakistan,2005.0,VLSI Electronics DLD Database,


In [3]:
# unique values of Country column
countries = capital['Country'].unique()
countries.sort()
countries

array([' Germany', ' New Zealand', ' Sweden', ' USA', 'Australia',
       'Austria', 'Canada', 'China', 'Finland', 'France', 'Greece',
       'HongKong', 'Ireland', 'Italy', 'Japan', 'Macau', 'Malaysia',
       'Mauritius', 'Netherland', 'New Zealand', 'Norway', 'Pakistan',
       'Portugal', 'Russian Federation', 'Saudi Arabia', 'Scotland',
       'Singapore', 'South Korea', 'SouthKorea', 'Spain', 'Sweden',
       'Thailand', 'Turkey', 'UK', 'USA', 'USofA', 'Urbana', 'germany'],
      dtype=object)

In [4]:
# converting values to lower case and removing extra white spaces
capital['Country'] = capital['Country'].str.lower()
capital['Country'] = capital['Country'].str.strip()

In [5]:
countries = capital['Country'].unique()
countries.sort()
countries

array(['australia', 'austria', 'canada', 'china', 'finland', 'france',
       'germany', 'greece', 'hongkong', 'ireland', 'italy', 'japan',
       'macau', 'malaysia', 'mauritius', 'netherland', 'new zealand',
       'norway', 'pakistan', 'portugal', 'russian federation',
       'saudi arabia', 'scotland', 'singapore', 'south korea',
       'southkorea', 'spain', 'sweden', 'thailand', 'turkey', 'uk',
       'urbana', 'usa', 'usofa'], dtype=object)

In [6]:
# finding matches for south korea
matches = fuzzywuzzy.process.extract("south korea", countries, limit=10, scorer=fuzzywuzzy.fuzz.token_sort_ratio)
matches

[('south korea', 100),
 ('southkorea', 48),
 ('saudi arabia', 43),
 ('norway', 35),
 ('austria', 33),
 ('ireland', 33),
 ('pakistan', 32),
 ('portugal', 32),
 ('scotland', 32),
 ('australia', 30)]

In [7]:
# function to find matching of a particular value
def replace_matches_in_column(df, column, string_to_match, min_ratio = 47):
    strings = df[column].unique()
    
    matches = fuzzywuzzy.process.extract(string_to_match, strings, 
                                         limit=10, scorer=fuzzywuzzy.fuzz.token_sort_ratio)

    close_matches = [matches[0] for matches in matches if matches[1] >= min_ratio]

    rows_with_matches = df[column].isin(close_matches)
 
    df.loc[rows_with_matches, column] = string_to_match
    
    print("All done!")

In [8]:
replace_matches_in_column(df=capital, column='Country', string_to_match="south korea")

All done!


In [9]:
countries = capital['Country'].unique()
countries.sort()
countries

array(['australia', 'austria', 'canada', 'china', 'finland', 'france',
       'germany', 'greece', 'hongkong', 'ireland', 'italy', 'japan',
       'macau', 'malaysia', 'mauritius', 'netherland', 'new zealand',
       'norway', 'pakistan', 'portugal', 'russian federation',
       'saudi arabia', 'scotland', 'singapore', 'south korea', 'spain',
       'sweden', 'thailand', 'turkey', 'uk', 'urbana', 'usa', 'usofa'],
      dtype=object)

In [10]:
# getting Graduted from column 
loc = capital['Graduated from'].unique()
loc.sort()
loc

array([' Columbia University', ' Delft University of Technology',
       ' Iowa State University', ' University of Central Florida',
       ' University of Innsbruck',
       ' University of Texas at Arlington (UTA)', ' University of Turin',
       'Abasyn University', 'Abdul Wali Khan University, Mardan',
       'Abdus Salam School of Mathematical Sciences,GC University',
       'Agricultural University Peshawar', 'Allama Iqbal Open University',
       'Asian Institute of Technology', 'Aston University, Birmingham',
       'Australian National University, Caneberra', 'BUKC',
       'Bahauddin Zakariya University', 'Bahria University',
       'Bahria University,Islamabad',
       'Balochistan University of Information Technology, Engineering and Management Sciences',
       'Barani Institute of Information Technology',
       'Beaconhouse National University', 'Beihang University',
       'Beijing Institute of Technology',
       'Beijing Institute of Technology Beijing',
       'Beiji

In [11]:
capital['Graduated from'] = capital['Graduated from'].str.lower()
capital['Graduated from'] = capital['Graduated from'].str.strip()

In [12]:
replace_matches_in_column(df=capital, column='Graduated from', string_to_match="capital university of science & technology")

All done!


In [13]:
replace_matches_in_column(df=capital, column='Graduated from', string_to_match="government college university,faisalabad")

All done!


In [14]:
loc = capital['Graduated from'].unique()
loc.sort()
loc

array(['abasyn university', 'abdul wali khan university, mardan',
       'abdus salam school of mathematical sciences,gc university',
       'agricultural university peshawar',
       'asian institute of technology', 'aston university, birmingham',
       'australian national university, caneberra',
       'bahauddin zakariya university', 'bahria university',
       'bahria university,islamabad',
       'balochistan university of information technology, engineering and management sciences',
       'barani institute of information technology',
       'beaconhouse national university', 'beihang university',
       'beijing institute of technology',
       'beijing institute of technology beijing',
       'beijing university of posts & telecommunications',
       'biztek institute of business & technology,karachi',
       'blekinge institute of technology', 'boston university',
       'brock university canada', 'brunel university', 'bukc',
       'california state university',
       'cap