## Interesting usage of the spell checker and TextBlob libraries for NLP

TextBlob can also perform sentiment analysis, although that is not the objective here

In [96]:
import pandas as pd
import numpy as np
from manual_spellchecker import spell_checker
from textblob import TextBlob

# Column labels are supplied explicitly through `names`, so every line of the
# semicolon-separated file is read as a data row.
column_names = [
    'Product', 'Age', 'Gender', 'Education', 'MaritalStatus',
    'Contract', 'Usage', 'Fitness', 'Income', 'Miles',
]
df = pd.read_csv('data-to-clean.csv', sep=';', names=column_names)

In [97]:
# Summary statistics of the numeric columns: a quick sanity check on the load.
df.describe()

Unnamed: 0,Age,Education,Usage,Fitness,Income,Miles
count,180.0,180.0,180.0,180.0,180.0,180.0
mean,28.788889,15.572222,3.455556,3.311111,53719.577778,103.194444
std,6.943498,1.617055,1.084797,0.958869,16506.684226,51.863605
min,18.0,12.0,2.0,1.0,29562.0,21.0
25%,24.0,14.0,3.0,3.0,44058.75,66.0
50%,26.0,16.0,3.0,3.0,50596.5,94.0
75%,33.0,16.0,4.0,4.0,58668.0,114.75
max,50.0,21.0,7.0,5.0,104581.0,360.0


In [98]:
# Look at the first rows to eyeball the raw text columns (Gender, Contract, ...).
df.head()

Unnamed: 0,Product,Age,Gender,Education,MaritalStatus,Contract,Usage,Fitness,Income,Miles
0,TM195,18,Male,14,Single,Business,3,4,29562,112
1,TM195,19,Male,15,Single,Salried,2,3,31836,75
2,TM195,19,Femal,14,Partnered,Salaried,4,3,30699,66
3,TM195,19,Male,12,Single,Salaried,3,3,32973,85
4,TM195,20,Male,13,Partnered,Sallrried,4,2,35247,47


In [99]:
# Build a checker on the 'Gender' column of df and run it; the run prints the
# total number of suspected spelling errors.
sc = spell_checker(df, 'Gender')
sc.spell_check()


Analyzing suspected errors


  0%|          | 0/180 [00:00<?, ?it/s]


Total suspected errors =  11


## Where are the errors?

Spell checker allows us to retrieve potential errors

In [100]:
# List the values that the spell_check() run flagged as suspected errors.
sc.get_all_errors()

['Femal',
 'Femal',
 'Femal',
 'Femal',
 'Mal',
 'Femal',
 'Mal',
 'mal',
 'Femal',
 'Mal',
 'femal']

## TextBlob usage

Note: do not forget to convert the text to lowercase before correcting it

In [101]:
# Lowercase first, then spell-correct each *distinct* value once and map the
# result back onto the column. The column holds only a handful of distinct
# strings, so this avoids one TextBlob.correct() call per row. Missing values
# stay missing instead of raising on `.lower()`.
gender_lower = df['Gender'].str.lower()
gender_corrections = {
    value: str(TextBlob(value).correct())
    for value in gender_lower.dropna().unique()
}
df['Gender'] = gender_lower.map(gender_corrections)

In [102]:
# Display the corrected column; a bare last expression is rendered by the
# notebook, so no print() is needed.
df['Gender']

0        male
1        male
2      female
3        male
4        male
        ...  
175      male
176      male
177      male
178      male
179      male
Name: Gender, Length: 180, dtype: object


## Check values for Gender

In [103]:
# Distinct values left in the column after the TextBlob correction.
df['Gender'].unique()


array(['male', 'female', 'mal'], dtype=object)

## Observations

'female' is now correct,
but 'mal' is still present: it is not an expected value, even though it is an English word
=> neither the spell checker nor TextBlob will help us here
=> we need to replace 'mal' with 'male' ourselves


In [104]:
# Manual fix-up: overwrite the leftover 'mal' entries with 'male'; every other
# row keeps its current value.
is_mal = df['Gender'] == 'mal'
df.loc[is_mal, 'Gender'] = 'male'

## Check the values for Gender again

In [106]:
# Confirm which distinct values remain after the manual 'mal' -> 'male' fix.
df['Gender'].unique()

array(['male', 'female'], dtype=object)

## Save to csv

In [105]:
# index=False: the index is only the automatic row counter created at load
# time; writing it would add an unnamed leading column that shows up as
# 'Unnamed: 0' when the file is read back.
df.to_csv('data-cleaned.csv', index=False)