# Pandas Data Cleaning

Some methods: `.isnull()`, `.drop()`, `.fillna()`, `._get_numeric_data()`, `.astype()`, `.str.replace()`, `.drop_duplicates()`, `.apply()`

---

In [None]:
import pandas as pd

---

### Working with `null` values

__NOTE:__ In Pandas, "null" simply means a missing value: `.isnull()` (and its alias `.isna()`) flags `NaN`, `None` and `NaT` alike. `NaN` is the marker for missing or undefined numeric data, while `None` should, in theory, represent missing or undefined non-numeric data. In practice, although `NaN` is not always a meaningful representation for non-numeric data, it is a convenient and efficient way to represent missing values uniformly across all data types in Pandas.

In [None]:
# Importing data

df1 = pd.read_csv('datasets/transaction_dataset.csv') 
# https://www.kaggle.com/vagifa/ethereum-frauddetection-dataset

df1.head()

In [None]:
# Explore dataset

df1.info()

In [None]:
# Null analysis

df1.isnull()

In [None]:
# Count the null values in each column.
# .isnull() yields a boolean frame; .sum() adds up the True values column by column.

null_cols = df1.isnull().sum()
print(type(null_cols))  # shows which container type .sum() returned
null_cols

In [None]:
# Series slicing with a boolean mask: keep only the entries with at least one null

null_cols[null_cols > 0]

---

In [None]:
# Null analysis (null percentage)
# Computed straight from df1 (not from the previous `null_cols`) so that
# re-running this cell never rescales an already-scaled Series.

null_cols = df1.isnull().mean() * 100   # share of nulls per column, in percent
null_cols = null_cols[null_cols > 0]    # keep only the columns that actually contain nulls
null_cols

In [None]:
# Null analysis: labels of the entries in `null_cols` that exceed the threshold

threshold = 5
null_filter = null_cols.gt(threshold)
droped_cols = null_cols.loc[null_filter].index.tolist()
droped_cols

In [None]:
# Null analysis (columns to keep): every column that is not listed in `droped_cols`

cols = df1.columns
#print(cols)     # Always check!!!

# sort=False keeps df1's original column order; by default .difference()
# tries to return the labels sorted, which would reorder the columns.
no_nulls_cols = cols.difference(droped_cols, sort=False)
no_nulls_cols

---

#### Dropping columns

In [None]:
# Drop unwanted columns by label; the result is a new frame, df1 itself is not modified

df_droped_nulls = df1.drop(columns=droped_cols)
df_droped_nulls.info()

In [None]:
# Suggested alternative: select the columns to keep instead of dropping the rest

df_no_nulls = df1[no_nulls_cols]
df_no_nulls.info()

---

#### Filling nulls

In [None]:
# Importing data

df2 = pd.read_csv('datasets/vehicles.csv')
df2.info()

> __But first, let's do some serious Slicing!!!__

In [None]:
# Slicing using .loc[]: boolean row mask first, column labels second

null_displ = df2.loc[
    df2['displ'].isna(),
    ['year', 'make', 'model', 'trany', 'drive', 'fuelType', 'cylinders', 'displ'],
]
null_displ

In [None]:
# Slicing using .iloc[]
# .iloc[] is purely positional and rejects a boolean Series as a row mask,
# so the mask is handed over as a plain NumPy boolean array.
# NOTE(review): the integers are assumed to be the positions of
# year, make, model, trany, drive, fuelType, cylinders, displ -- confirm with df2.columns

null_displ = df2.iloc[df2['displ'].isnull().to_numpy(), [63, 46, 47, 57, 24, 30, 22, 23]]
null_displ

In [None]:
# Slicing using []: filter the rows first, then pick the columns

null_displ = df2[df2['displ'].isna()][
    ['year', 'make', 'model', 'trany', 'drive', 'fuelType', 'cylinders', 'displ']
]
null_displ

In [None]:
# Filling nulls with a relevant value: missing 'displ' and 'cylinders' become 0
# NOTE(review): 0 presumably stands for "no displacement / no cylinders" -- confirm it suits the data

df2[['displ', 'cylinders']] = df2[['displ', 'cylinders']].fillna(0)

In [None]:
# Rows whose displacement equals 0, restricted to the descriptive columns
zero_displ = df2.loc[
    df2['displ'] == 0,
    ['year', 'make', 'model', 'trany', 'drive', 'fuelType', 'cylinders', 'displ'],
]
zero_displ

In [None]:
# Solving specific error

error = df2[(df2['cylinders']==0) & (df2['displ']!=0)]
error[['year', 'make', 'model', 'trany', 'drive','fuelType','cylinders', 'displ']]

In [None]:
# Overwrite 'cylinders' only on the rows with 0 cylinders but a non-zero displacement
# NOTE(review): 4 is hard-coded -- presumably the right figure for the affected vehicle(s); confirm
df2.loc[(df2['cylinders']==0) & (df2['displ']!=0), 'cylinders'] = 4

In [None]:
df2[['year', 'make', 'model', 'trany', 'drive','fuelType','cylinders', 'displ']].iloc[21506]

---

### Get numeric data

In [None]:
# Get dataframe numeric data
# NOTE: the leading underscore marks _get_numeric_data() as a private pandas method

df1._get_numeric_data()

In [None]:
# Equivalent form using the slicing operator
# The numeric labels are looked up once; calling _get_numeric_data() inside the
# comprehension would rebuild the numeric frame for every column of df1.

numeric_cols = df1._get_numeric_data().columns
df1[[x for x in df1.columns if x in numeric_cols]]

In [None]:
# Get dataframe NON numeric data: drop every numeric column by label

df1.drop(columns=df1._get_numeric_data().columns)

---

### Changing columns data types

In [None]:
# Importing time series data

df3 = pd.read_csv('datasets/power_consumption_es.csv') 
# Source: https://www.kaggle.com/francoisraucent/western-europe-power-consumption

df3.info()

In [None]:
df3.head()

In [None]:
# Changing data types: cast both timestamp columns to datetime64[ns]

for col in ('start', 'end'):
    df3[col] = df3[col].astype('datetime64[ns]')

In [None]:
df3.info()

---

### Simple string transformation (string replace)

In [None]:
# Importing data

df4 = pd.read_csv('datasets/club.csv', index_col=0) 
# https://www.kaggle.com/sanjeetsinghnaik/football-club-market-value-2021

df4.info()

In [None]:
df4.head()

> __But first, the `.unique()` method!!!__

In [None]:
# Print the distinct values of every column, one column per paragraph
for col, values in df4.items():
    print(col, values.unique(), '\n')

In [None]:
# Alphabetical list of the distinct club names (sorted() already returns a list)
teams = sorted(df4['Club Name'].unique())
teams

In [None]:
# String replace
# regex=False makes this a literal substring replacement regardless of the
# default that the installed pandas version uses for `regex`.

df4['Club Name'] = df4['Club Name'].str.replace('Juventus FC', 'Vecchia Signora', regex=False)

In [None]:
juventus = df4[df4['Club Name'] == 'Vecchia Signora'][['Club Name', 'Market Value Of Club In Millions(£)']]
juventus

---

### Drop duplicate values

In [None]:
# Drop duplicates

print(df4.shape)
df4.head()

In [None]:
sample = df4[['Club Name', 'Competition Name', 'Squad Size']]
print(sample.shape)
sample.head()

In [None]:
# Eliminate fully duplicated rows (i.e.: there are none)

sample = sample.drop_duplicates()
sample.shape

In [None]:
# Check the options: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.drop_duplicates.html
# subset       -> only these columns are compared when looking for duplicates
# keep         -> 'first' retains the first row of each duplicated group
# ignore_index -> False leaves the surviving rows with their original index labels

sample = sample.drop_duplicates(subset=['Competition Name', 'Squad Size'],
                                keep='first',
                                ignore_index=False)
sample.shape

In [None]:
# Bear in mind...

sample.index

---

### Apply...your new BFF!!!

In [None]:
# Importing data

df5 = pd.read_csv('datasets/nft_sales.csv') 
# https://www.kaggle.com/hemil26/nft-collections-dataset

df5.info()

In [None]:
df5.head()

In [None]:
# Let's try the long way...

df5_bis = df5.copy()
df5_bis

In [None]:
# First attempt at casting. NOTE(review): presumably expected to fail here, since
# 'Sales' is still text at this point (later cells strip '$' and ',' from it) -- confirm
df5_bis['Sales'] = df5_bis['Sales'].astype('int64')

In [None]:
# Strip the currency symbol. regex=False forces a literal match: read as a
# regular expression, '$' is the end-of-string anchor, not the dollar sign.
df5_bis['Sales'] = df5_bis['Sales'].str.replace('$', '', regex=False)

In [None]:
df5_bis.head()

In [None]:
# Second attempt at casting. NOTE(review): presumably still fails, as the ','
# characters are only removed from 'Sales' in a later cell -- confirm
df5_bis['Sales'] = df5_bis['Sales'].astype('int64')

In [None]:
# Strip the thousands separators. ',' is a plain character, so a literal
# (non-regex) replacement is all that is needed.
df5_bis['Sales'] = df5_bis['Sales'].str.replace(',', '', regex=False)

In [None]:
df5_bis.head()

In [None]:
df5_bis['Sales'] = df5_bis['Sales'].astype('int64')

In [None]:
df5_bis.info()

In [None]:
# Now using apply

def clean_money(money):
    """Strip the currency formatting from a single 'Sales' value.

    Args:
        money: One cell of the column, e.g. ``'$1,234'``.

    Returns:
        The same text without ``'$'`` and ``','`` (``'1234'``), ready to be
        cast with ``.astype('int64')``. Non-string values (``None``, ``NaN``,
        numbers) are returned untouched.
    """
    if not isinstance(money, str):
        return money  # nothing to strip; leave missing/numeric cells as they are
    return money.replace('$', '').replace(',', '')

In [None]:
df5['Sales'] = df5['Sales'].apply(clean_money)

In [None]:
df5.head()

In [None]:
df5['Sales'] = df5['Sales'].astype('int64')

---

__Other resources to consider:__

- https://github.com/ResidentMario/missingno
