### Tasks
Part 1: Upload the data

Part 2: Analyze your data and create a plan for data preparation

Part 3: Data cleansing (missing values, outliers, duplicates, data consistency)

Part 4: Encode categorical data

Part 5: Upload the deliverables to GitHub

In [111]:
import pandas as pd
import numpy as np
# Load the transactions CSV. low_memory=False turns off pandas' chunked parsing,
# which avoids mixed-type inference within a column.
df = pd.read_csv(
    'data_cleaning.csv',
    low_memory=False,
)

### Data Inspection

In [112]:
# (number of rows, number of columns) of the loaded frame
df.shape

(30, 7)

In [113]:
# Summary of the index, columns, non-null counts, dtypes and memory usage.
# Conclusion: BirthYear and Profession each have 2 missing values (28 of 30 non-null);
# BirthYear is float64 rather than an integer, most likely because of those NaN values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   TransactionID  30 non-null     int64  
 1   ClientID       30 non-null     int64  
 2   BirthYear      28 non-null     float64
 3   Amount         30 non-null     int64  
 4   Profession     28 non-null     object 
 5   Department     30 non-null     int64  
 6   Risk           30 non-null     object 
dtypes: float64(1), int64(4), object(2)
memory usage: 1.8+ KB


In [114]:
# Preview the first 10 rows.
df.head(10)
# Conclusion: every previewed row has Department 78, Profession labels differ in
# letter case (manager / Manager), and Amount contains an outlier (2399090).

Unnamed: 0,TransactionID,ClientID,BirthYear,Amount,Profession,Department,Risk
0,4,34985,1923.0,5670,manager,78,Low
1,16,34997,1923.0,2399090,developer,78,High
2,25,35006,1923.0,33050,HR,78,High
3,12,34993,1939.0,23430,professor,78,Low
4,21,35002,1939.0,16770,manager,78,Low
5,11,34992,1944.0,21210,researcher,78,Medium
6,20,35001,1944.0,14550,student,78,Medium
7,3,34984,1945.0,3450,student,78,Medium
8,19,35000,1949.0,12330,barmen,78,High
9,9,34990,1953.0,16770,Manager,78,Medium


In [115]:
# List the column labels.
df.columns

Index(['TransactionID', 'ClientID', 'BirthYear', 'Amount', 'Profession',
       'Department', 'Risk'],
      dtype='object')

In [116]:
# Show the distinct values of the Profession column.
# Labels that need harmonising:
#   'manager', 'Manager'
#   'student', 'etudient', 'Student'
#   'bdm', 'BDM'
#   'HR', 'hr'
df["Profession"].unique()

array(['manager', 'developer', 'HR', 'professor', 'researcher', 'student',
       'barmen', 'Manager', 'bdm', nan, 'hr', 'etudient', 'BDM',
       'Hairdresser', 'Student', 'Driver', 'sailer'], dtype=object)

In [117]:
# Show the distinct values of the Risk column.
df["Risk"].unique()
# Good quality: only three consistently spelled labels (Low, High, Medium).

array(['Low', 'High', 'Medium'], dtype=object)

### CLEANING ACTION PLAN 

1. Deal with NaN values
2. Formatting BirthYear type into integer
3. Clean Profession labels 
4. Check consistency & duplicates
5. Remove outliers

In [118]:
# Step 1 of the plan: locate the rows with a missing BirthYear or Profession.
birthyear_missing = df['BirthYear'].isnull()
profession_missing = df['Profession'].isnull()
null_displ = df[birthyear_missing | profession_missing]
null_displ

Unnamed: 0,TransactionID,ClientID,BirthYear,Amount,Profession,Department,Risk
15,28,35008,1967.0,46370,,78,High
16,29,35008,1976.0,50810,,78,Medium
28,22,34987,,18990,sailer,78,High
29,7,34988,,12330,Manager,78,Medium


In [119]:
# Replace a missing BirthYear with the placeholder 0 and a missing Profession
# with the label 'unknown'.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on df[col]: an in-place method on a column selection
# is chained assignment, which does not update df under pandas Copy-on-Write
# (pandas 2.2 warns about this pattern).
df['BirthYear'] = df['BirthYear'].fillna(0)
df['Profession'] = df['Profession'].fillna('unknown')

In [120]:
# Re-run the missing-value check; an empty result means no nulls are left.
birthyear_missing = df['BirthYear'].isnull()
profession_missing = df['Profession'].isnull()
null_displ = df[birthyear_missing | profession_missing]
null_displ

Unnamed: 0,TransactionID,ClientID,BirthYear,Amount,Profession,Department,Risk


In [121]:
# 2. Formatting BirthYear type into integer: check the current dtype first.
df['BirthYear'].dtype

dtype('float64')

In [122]:
# Cast BirthYear to the default integer type, then confirm the resulting dtype.
birth_year_as_int = df['BirthYear'].astype('int')
df['BirthYear'] = birth_year_as_int
df['BirthYear'].dtype

dtype('int32')

In [123]:
# Clean Profession labels
# 'manager', 'Manager'
# 'student','etudient','Student'
# 'bdm','BDM'

In [124]:
# Normalise both text columns: cast to str, then lower-case, so that variants
# such as 'BDM' / 'bdm' collapse into a single label.
for text_col in ('Profession', 'Risk'):
    df[text_col] = df[text_col].astype('str').str.lower()

In [125]:
# Check the distinct Profession labels after lower-casing.
df['Profession'].unique()

array(['manager', 'developer', 'hr', 'professor', 'researcher', 'student',
       'barmen', 'bdm', 'unknown', 'etudient', 'hairdresser', 'driver',
       'sailer'], dtype=object)

In [132]:
# Map the misspelt variant 'etudient' onto the canonical label 'student'.
label_fixes = {'etudient': 'student'}
df['Profession'] = df['Profession'].replace(label_fixes)

In [133]:
# Confirm that 'etudient' no longer appears among the Profession labels.
df['Profession'].unique()

array(['manager', 'developer', 'hr', 'professor', 'researcher', 'student',
       'barmen', 'bdm', 'unknown', 'hairdresser', 'driver', 'sailer'],
      dtype=object)

In [134]:
# Check consistency & duplicates
# tackling illegal workers
# removing no birthyear individuals
# removing duplicates

In [144]:
# Collect the index labels of the rows to discard:
#   * non-students with a BirthYear after 2004, or
#   * rows whose BirthYear is 0.
not_student = df['Profession'] != 'student'
born_after_2004 = df['BirthYear'] > 2004
birthyear_is_zero = df['BirthYear'] == 0
drop_rows = df[(not_student & born_after_2004) | birthyear_is_zero].index
drop_rows

Int64Index([26, 27, 28, 29], dtype='int64')

In [145]:
# Remove the flagged rows; the result is a new frame and df is left untouched.
df_filtered = df.drop(index=drop_rows)

In [146]:
# Inspect the frame after the flagged rows have been dropped.
df_filtered

Unnamed: 0,TransactionID,ClientID,BirthYear,Amount,Profession,Department,Risk
0,4,34985,1923,5670,manager,78,low
1,16,34997,1923,2399090,developer,78,high
2,25,35006,1923,33050,hr,78,high
3,12,34993,1939,23430,professor,78,low
4,21,35002,1939,16770,manager,78,low
5,11,34992,1944,21210,researcher,78,medium
6,20,35001,1944,14550,student,78,medium
7,3,34984,1945,3450,student,78,medium
8,19,35000,1949,12330,barmen,78,high
9,9,34990,1953,16770,manager,78,medium


In [155]:
# Count the transactions per client and rank clients by that count, highest
# first, to spot ClientIDs that occur more than once.
tx_per_client = df_filtered.groupby('ClientID').agg({'TransactionID': 'count'})
pivot = tx_per_client.sort_values(by='TransactionID', ascending=False)
pivot.head()

Unnamed: 0_level_0,TransactionID
ClientID,Unnamed: 1_level_1
35008,3
34987,2
34989,2
34991,2
34997,1


In [158]:
# Clients that appear in more than one transaction. Selecting on the count
# (rather than taking a fixed number of top rows) keeps this correct when the
# number of repeated clients in the data changes. pivot is already sorted by
# count, so the order of the resulting index is unchanged.
sus_clients = pivot[pivot['TransactionID'] > 1].index
sus_clients

Int64Index([35008, 34987, 34989, 34991], dtype='int64', name='ClientID')

In [160]:
# Show every remaining transaction of the flagged clients, ordered by ClientID.
# The mask is built from df_filtered itself: a mask built from df has a
# different index and only works here through index alignment in .loc.
is_sus_client = df_filtered['ClientID'].isin(sus_clients)
df_filtered.loc[is_sus_client].sort_values(by="ClientID", ascending=False)

Unnamed: 0,TransactionID,ClientID,BirthYear,Amount,Profession,Department,Risk
14,27,35008,1967,41930,bdm,78,low
15,28,35008,1967,46370,unknown,78,high
16,29,35008,1976,50810,unknown,78,medium
19,10,34991,1988,18990,bdm,78,low
20,30,34991,1988,55250,bdm,78,high
10,8,34989,1958,14550,hr,78,high
13,24,34989,1967,27870,hr,78,medium
11,6,34987,1967,10110,manager,78,medium
12,15,34987,1967,30090,manager,78,low
