How to clean up inconsistent text entries

In [3]:
import pandas as pd
import numpy as np

In [4]:
import fuzzywuzzy
from fuzzywuzzy import process 
import chardet

In [6]:
# First attempt: read the CSV with pandas' default decoding (raises the UnicodeDecodeError shown below)
data = pd.read_csv("PakistanSuicideAttacks Ver 11 (30-November-2017).csv")

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa0 in position 0: invalid start byte

The read failed with a UnicodeDecodeError, which means the file is not UTF-8 encoded.
So, which character encoding does it actually use?

In [14]:
# Read up to the first 1,000,000 bytes in binary mode, then let chardet guess the encoding
with open("PakistanSuicideAttacks Ver 11 (30-November-2017).csv", 'rb') as rawdata:
    leading_bytes = rawdata.read(1000000)
result = chardet.detect(leading_bytes)
result

{'confidence': 0.751324958743095, 'encoding': 'ISO-8859-2'}

In [15]:
# Re-read the CSV, this time decoding it with the encoding reported by chardet
data = pd.read_csv(
    "PakistanSuicideAttacks Ver 11 (30-November-2017).csv",
    encoding = 'ISO-8859-2')

The data loaded successfully with the detected encoding.

In [16]:
# Show one randomly chosen row as a quick sanity check of the load
data.sample()

Unnamed: 0,S#,Date,Islamic Date,Blast Day Type,Holiday Type,Time,City,Latitude,Longitude,Province,...,Targeted Sect if any,Killed Min,Killed Max,Injured Min,Injured Max,No. of Suicide Blasts,Explosive Weight (max),Hospital Names,Temperature(C),Temperature(F)
495,496,Saturday-November-25-2017,6 Rabi Al-Awwal 1439 A.H,Working Day,,7:00 AM,Quetta,30.139626,66.981476,Balochistan,...,,4.0,4.0,16.0,22,1.0,10KG,Civil Hospital(Quetta),29.0,84.0


### Preliminary text pre-processing

Clean up the 'City' column to make sure there are no data entry inconsistencies in it.

In [17]:
# Distinct values of the City column, ordered alphabetically
cities = np.sort(data['City'].unique())

cities

array(['ATTOCK', 'Attock ', 'Bajaur Agency', 'Bannu', 'Bhakkar ', 'Buner',
       'Chakwal ', 'Chaman', 'Charsadda', 'Charsadda ', 'D. I Khan',
       'D.G Khan', 'D.G Khan ', 'D.I Khan', 'D.I Khan ', 'Dara Adam Khel',
       'Dara Adam khel', 'Fateh Jang', 'Ghallanai, Mohmand Agency ',
       'Gujrat', 'Hangu', 'Haripur', 'Hayatabad', 'Islamabad',
       'Islamabad ', 'Jacobabad', 'KURRAM AGENCY', 'Karachi', 'Karachi ',
       'Karak', 'Khanewal', 'Khuzdar', 'Khyber Agency', 'Khyber Agency ',
       'Kohat', 'Kohat ', 'Kuram Agency ', 'Lahore', 'Lahore ',
       'Lakki Marwat', 'Lakki marwat', 'Lasbela', 'Lower Dir', 'MULTAN',
       'Malakand ', 'Mansehra', 'Mardan', 'Mohmand Agency',
       'Mohmand Agency ', 'Mohmand agency', 'Mosal Kor, Mohmand Agency',
       'Multan', 'Muzaffarabad', 'North Waziristan', 'North waziristan',
       'Nowshehra', 'Orakzai Agency', 'Peshawar', 'Peshawar ', 'Pishin',
       'Poonch', 'Quetta', 'Quetta ', 'Rawalpindi', 'Sargodha',
       'Sehwan town',

In [19]:
# Number of distinct city spellings before any cleaning
len(cities)

93

Inconsistent data entry examples:
- 'ATTOCK', 'Attock '
- 'Charsadda', 'Charsadda '
- 'Lakki Marwat', 'Lakki marwat'
- 'Mohmand Agency', 'Mohmand Agency ', 'Mohmand agency', 'Mosal Kor, Mohmand Agency'
<br>
<br>

To fix this:

In [18]:
# Normalise the city names in one pass: lower-case everything,
# then strip leading/trailing white space
data['City'] = data['City'].str.lower().str.strip()

In [20]:
# Recompute the distinct city names now that the column has been lower-cased and stripped
cities = data['City'].unique()

In [21]:
# Number of distinct city names after lower-casing and stripping
len(cities)

67

After converting all text to lowercase and stripping the surrounding white space, the number of unique values dropped from 93 to 67.


In [22]:
# Display the distinct city names that remain
cities

array(['islamabad', 'karachi', 'quetta', 'rawalpindi', 'north waziristan',
       'kohat', 'attock', 'sialkot', 'lahore', 'swat', 'hangu', 'bannu',
       'lasbela', 'malakand', 'peshawar', 'd.i khan', 'lakki marwat',
       'tank', 'gujrat', 'charsadda', 'kuram agency', 'shangla',
       'bajaur agency', 'south waziristan', 'haripur', 'sargodha',
       'nowshehra', 'mohmand agency', 'dara adam khel', 'khyber agency',
       'mardan', 'bhakkar', 'orakzai agency', 'buner', 'd.g khan',
       'pishin', 'chakwal', 'upper dir', 'muzaffarabad', 'totalai',
       'multan', 'lower dir', 'sudhanoti', 'poonch', 'mansehra', 'karak',
       'swabi', 'shikarpur', 'sukkur', 'chaman', 'd. i khan', 'khanewal',
       'fateh jang', 'taftan', 'tirah valley', 'wagah', 'zhob',
       'kurram agency', 'taunsa', 'jacobabad', 'shabqadar-charsadda',
       'khuzdar', 'ghallanai, mohmand agency', 'hayatabad',
       'mosal kor, mohmand agency', 'sehwan town',
       'tangi, charsadda district'], dtype=object

However, some inconsistencies still remain, such as 'd.i khan' and 'd. i khan'.

In [24]:
# Fuzzy matching: score every distinct city name against 'd.i khan'
# (token-set ratio) and keep the 5 best-scoring candidates
matches = fuzzywuzzy.process.extract(
    'd.i khan',
    cities,
    limit=5,
    scorer=fuzzywuzzy.fuzz.token_set_ratio,
)

matches

[('d.i khan', 100),
 ('d. i khan', 100),
 ('d.g khan', 88),
 ('khanewal', 50),
 ('sudhanoti', 47)]

In [28]:
# replace str in City col that are similar to 'd.i khan'

def replace_matches(dataframe, colName, stringToMatch, minRatio, limit = 5):
    """Replace near-duplicate strings in a column with one canonical spelling.

    The distinct non-null values of ``dataframe[colName]`` are scored against
    ``stringToMatch`` with fuzzywuzzy's ``token_set_ratio``. Every row whose
    value is among the ``limit`` best-scoring candidates and scores at least
    ``minRatio`` is overwritten with ``stringToMatch``. The dataframe is
    modified in place.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the column to clean; it is mutated in place.
    colName : str
        Label of the column whose values are compared and replaced.
    stringToMatch : str
        Canonical spelling; both the fuzzy-match query and the replacement.
    minRatio : int
        Inclusive score threshold: candidates with ratio >= minRatio are replaced.
    limit : int or None, default 5
        Maximum number of best-scoring candidates to consider. Pass ``None``
        to consider every distinct value of the column.

    Returns
    -------
    None
        Prints "done" once the replacement has been applied.
    """
    # get unique strings from the column; missing values (NaN/None) are not
    # strings and cannot be scored, so they are left out and stay untouched
    uniqueStrInCol = dataframe[colName].dropna().unique()

    # get the closest matches to the target string (at most `limit` of them)
    candidates = fuzzywuzzy.process.extract(stringToMatch, uniqueStrInCol, limit = limit,
                                            scorer = fuzzywuzzy.fuzz.token_set_ratio)

    # each candidate starts with (matched string, score); keep the strings
    # whose score reaches the threshold (ratio >= minRatio)
    closeMatches = [candidate[0] for candidate in candidates if candidate[1] >= minRatio]

    # boolean mask of the rows whose value is one of the close matches
    closeMatches_rows = dataframe[colName].isin(closeMatches)
    # --> df.loc[rows, column]: overwrite those rows with the canonical spelling
    dataframe.loc[closeMatches_rows, colName] = stringToMatch

    print("done")

In [29]:
# replace every City value whose match ratio against 'd.i khan' is >= 90 with 'd.i khan'
replace_matches(data, 'City', 'd.i khan' , 90)

done


Let's check if it worked

In [31]:
# Re-derive the distinct city names, report how many remain, and list them alphabetically
cities = np.sort(data['City'].unique())
print(len(cities))
cities

66


array(['attock', 'bajaur agency', 'bannu', 'bhakkar', 'buner', 'chakwal',
       'chaman', 'charsadda', 'd.g khan', 'd.i khan', 'dara adam khel',
       'fateh jang', 'ghallanai, mohmand agency', 'gujrat', 'hangu',
       'haripur', 'hayatabad', 'islamabad', 'jacobabad', 'karachi',
       'karak', 'khanewal', 'khuzdar', 'khyber agency', 'kohat',
       'kuram agency', 'kurram agency', 'lahore', 'lakki marwat',
       'lasbela', 'lower dir', 'malakand', 'mansehra', 'mardan',
       'mohmand agency', 'mosal kor, mohmand agency', 'multan',
       'muzaffarabad', 'north waziristan', 'nowshehra', 'orakzai agency',
       'peshawar', 'pishin', 'poonch', 'quetta', 'rawalpindi', 'sargodha',
       'sehwan town', 'shabqadar-charsadda', 'shangla', 'shikarpur',
       'sialkot', 'south waziristan', 'sudhanoti', 'sukkur', 'swabi',
       'swat', 'taftan', 'tangi, charsadda district', 'tank', 'taunsa',
       'tirah valley', 'totalai', 'upper dir', 'wagah', 'zhob'], dtype=object)

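Now the 'd. i khan' variant is gone and only "d.i khan" remains for that city, and we didn't have to change anything by hand. Other near-duplicates that are still present (for example 'kuram agency' and 'kurram agency') can be merged the same way.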