In [32]:
import pandas as pd
# Read the survey export into a DataFrame and preview its first five rows.
df = pd.read_csv("Survey_data.csv")
df.head()

Unnamed: 0,ID,Age,Income,Property,Num of pets in household
0,14578,75,20000,N,1.0
1,14615,72,32000,N,2.0
2,14652,54,2000,no,4.0
3,14689,81,29000,no,1.0
4,14726,37,2000,yes,4.0


In [33]:
# shape is (rows, columns): 103 rows and 5 columns here.
# This is a quick way to check for duplicates when there is a unique ID column:
# 103 rows should also mean 103 unique IDs.
df.shape

(103, 5)

In [34]:
# Count the distinct values in the ID column and compare with the 103 rows.
# The answer is 100, fewer than the row count, so duplicated entries are present.
df["ID"].nunique()

100

In [35]:
# Flag each row that exactly repeats an earlier row (True = duplicate).
# On a large data set this full boolean Series does not single the duplicates out.
df.duplicated()

0      False
1      False
2      False
3      False
4      False
       ...  
98      True
99     False
100    False
101    False
102    False
Length: 103, dtype: bool

In [36]:
# Use the duplicated() flags as a boolean mask so that only the rows evaluated
# as True (the repeat occurrences) are kept, in a DataFrame of their own.
duplicates = df.loc[df.duplicated(keep="first")]
print(duplicates)


       ID  Age  Income Property  Num of pets in household
29  15614   73   46000       NO                       2.0
80  17464   64   24000      YES                       1.0
98  18093   77   31000      YES                       4.0


In [37]:
# Remove the duplicated rows now (the first occurrence of each is kept).
df = df.drop_duplicates()

# Now there are only 100 rows (down from 103), so the duplicates have been found and removed.
df.shape

(100, 5)

In [38]:
# Now looking at incomplete records: count the missing values in each column.
df.isnull().sum()

ID                          0
Age                         0
Income                      0
Property                    0
Num of pets in household    2
dtype: int64

Next we need to work out why values are missing in this column. It is hard to draw conclusions from the data alone, so the question is whether the gaps are MCAR (missing completely at random) or MAR (missing at random).

So we can handle the missing values with techniques such as deletion (dropping the incomplete rows) or mean imputation.

In [39]:
# Treating the gaps as MCAR, so mean imputation is used.
# First we need the average number of pets (mean() skips the missing values by default).

df["Num of pets in household"].mean()


2.306122448979592

The mean is a decimal, but the number of pets must be a whole number, so we need to round it.

In [40]:
# Take the column mean first, then round it to zero decimal places and keep
# the result in a variable for the imputation step.
pets_mean = df["Num of pets in household"].mean()
avg_num_pets = round(pets_mean, 0)

In [41]:
# Impute the missing pet counts with the rounded mean held in avg_num_pets.
# The variable is passed unquoted: fillna("avg_num_pets") would write that literal
# text into the gaps and turn the numeric column into a mix of numbers and strings.
# The dict form limits the fill to the pets column so no other column can be touched.
df = df.fillna({"Num of pets in household": avg_num_pets})

In [43]:
# The Property column mixes cases and abbreviations (e.g. NO, Y, yes, N),
# so we need to standardise the values; list the distinct ones first.
df["Property"].unique()

array(['N', 'no', 'yes', 'YES', 'NO', 'Y'], dtype=object)

In [44]:
# There are 6 different spellings of the answer in this column.
# First convert them all to the same (lower) case.
df["Property"] = df["Property"].str.lower()

In [45]:
# Now we only have 4 unique entries.
df["Property"].unique()

array(['n', 'no', 'yes', 'y'], dtype=object)

In [47]:
# "n" and "y" are abbreviations of "no" and "yes", so map them onto the full words.
# We could do this with string indexing or with the replace function; replace is used here.
# The replacement is limited to the Property column and assigned back, rather than
# calling df.replace(..., inplace=True) on the whole frame, so a matching value in
# any other column is left alone.
df["Property"] = df["Property"].replace({"n": "no", "y": "yes"})

# now we only have 2 unique entries: "no" and "yes"
df["Property"].unique()

array(['no', 'yes'], dtype=object)

In [None]:
#In larger data sets, we can use other strategies to clean data rather than replacing