# Pandas Data Cleaning

Methods covered in this notebook: `.isnull()`, `.drop()`, `.fillna()`, `._get_numeric_data()`, `.astype()`, `.str.replace()`, `.drop_duplicates()`, `.apply()`

In [None]:
import pandas as pd

In [None]:
# Importing data
# Load the transactions CSV into df1 and preview its first rows.

df1 = pd.read_csv('datasets/transaction_dataset.csv') # https://www.kaggle.com/vagifa/ethereum-frauddetection-dataset
df1.head()

In [None]:
# Explore dataset
# info() prints every column with its non-null count and dtype.

df1.info()

In [None]:
# Null analysis
# isnull() returns a boolean frame of the same shape: True where a value is missing.

df1.isnull()

In [None]:
# Summing the boolean mask per column gives the number of missing values in each column.
df1.isnull().sum()

In [None]:
# Missing-value count per column; display only the columns that have at least one.
null_cols = df1.isna().sum()
null_cols.loc[null_cols.gt(0)]

---

In [None]:
# Null analysis deep dive
# Percentage of missing values for each column that has at least one null.
# The counts are recomputed from df1 in this cell, so re-running it does not
# rescale values that were already converted to percentages.

null_cols = df1.isnull().sum()
null_cols = null_cols[null_cols > 0] / len(df1) * 100
null_cols

In [None]:
# Collect the labels of the columns whose null_cols value is above `threshold`.
threshold = 5
null_filter = null_cols.gt(threshold)
drop_cols = null_cols.loc[null_filter].index.tolist()
drop_cols

In [None]:
# Column labels of df1 that are not listed in drop_cols.
# NOTE: Index.difference sorts its result by default, so the labels are not
# necessarily in df1's original column order.
remain_columns = df1.columns.difference(drop_cols)
remain_columns

---

In [None]:
# Drop columns
# Build a new frame without the columns listed in drop_cols, then inspect it.

df_no_nulls = df1.drop(columns=drop_cols)
df_no_nulls.info()

---

In [None]:
# Filling nulls
# Load the vehicles CSV into df2 and list its columns, non-null counts and dtypes.

df2 = pd.read_csv('datasets/vehicles.csv')
df2.info()

In [None]:
# Rows where 'displ' is missing, restricted to a handful of columns.
null_displ = df2.loc[
    df2['displ'].isna(),
    ['year', 'make', 'model', 'trany', 'drive', 'fuelType', 'cylinders', 'displ'],
]
null_displ

In [None]:
# Replace missing 'displ' and 'cylinders' values with 0; no other column is filled.
df2 = df2.fillna(value={'displ': 0, 'cylinders': 0})

In [None]:
# Rows whose 'displ' equals 0, restricted to a handful of columns.
zero_displ = df2.loc[
    df2['displ'] == 0,
    ['year', 'make', 'model', 'trany', 'drive', 'fuelType', 'cylinders', 'displ'],
]
zero_displ

In [None]:
# Solving specific error
# Inconsistent rows: 'cylinders' is 0 while 'displ' is not 0.

error = df2.loc[df2['cylinders'].eq(0) & df2['displ'].ne(0)]
error.loc[:, ['year', 'make', 'model', 'trany', 'drive', 'fuelType', 'cylinders', 'displ']]

In [None]:
# Every row with 0 cylinders but a non-zero 'displ' gets 'cylinders' set to 4.
# NOTE(review): 4 is hard-coded and applied to all matching rows — confirm it is right for each.
df2.loc[df2['cylinders'].eq(0) & df2['displ'].ne(0), 'cylinders'] = 4

In [None]:
# Show the selected columns for the row at position 21506.
# NOTE(review): 21506 is a hard-coded position — presumably a row changed by the
# cylinders fix; confirm against the dataset.
df2[['year', 'make', 'model', 'trany', 'drive','fuelType','cylinders', 'displ']].iloc[21506]

---

In [None]:
# Get numeric data

# Alternatives that keep the non-numeric columns instead:
# df1[[x for x in df1.columns if x not in df1._get_numeric_data().columns]]
# df1.drop(df1._get_numeric_data().columns, axis=1)
# NOTE: _get_numeric_data() is underscore-prefixed, i.e. not part of pandas' public API.
df1._get_numeric_data()

---

In [None]:
# Changing data types
# Load the power-consumption CSV into df3 and inspect the dtypes pandas inferred.

df3 = pd.read_csv('datasets/power_consumption_es.csv') # https://www.kaggle.com/francoisraucent/western-europe-power-consumption
df3.info()

In [None]:
# Preview the first rows of df3.
df3.head()

In [None]:
# Convert the 'start' and 'end' columns to datetime64[ns] with one astype call.
df3 = df3.astype({'start': 'datetime64[ns]', 'end': 'datetime64[ns]'})

In [None]:
# Inspect the dtypes again after the conversion of 'start' and 'end'.
df3.info()

---

In [None]:
# Replacing strings
# Load the football-club CSV into df4 and inspect its columns and dtypes.

df4 = pd.read_csv('datasets/club.csv') # https://www.kaggle.com/sanjeetsinghnaik/football-club-market-value-2021
df4.info()

In [None]:
# Preview the first rows of df4.
df4.head()

In [None]:
# Sorted list of the distinct club names (sorted() accepts the array directly).
teams = sorted(df4['Club Name'].unique())
teams

In [None]:
# Literal (non-regex) replacement of 'Juventus FC' inside the club names.
df4['Club Name'] = df4['Club Name'].str.replace(
    'Juventus FC', 'Vecchia Signora', regex=False
)

In [None]:
# Rows carrying the replaced club name, shown with name and market value only.
juventus = df4.loc[df4['Club Name'].eq('Vecchia Signora')]
juventus.loc[:, ['Club Name', 'Market Value Of Club In Millions(Â£)']]

---

In [None]:
# Drop duplicates
# (rows, columns) of df4 before any de-duplication.

df4.shape

In [None]:
# Work on a three-column subset of df4; print its shape and preview the first rows.
sample = df4[['Club Name', 'Competition Name', 'Squad Size']]
print(sample.shape)
sample.head()

In [None]:
# Remove rows that are identical across all columns of sample, then check the new shape.
sample = sample.drop_duplicates()
sample.shape

In [None]:
# Treat rows as duplicates when they share 'Competition Name' and 'Squad Size':
# keep='first' retains the first row of each group, and ignore_index=False
# leaves the surviving rows with their existing index labels.
sample = sample.drop_duplicates(subset=['Competition Name', 'Squad Size'], keep='first', ignore_index=False)
sample.shape

In [None]:
# Show the index labels of the rows that remain in sample.
sample.index

---

In [None]:
# Apply
# Load the NFT sales CSV into df5 and inspect its columns and dtypes.

df5 = pd.read_csv('datasets/nft_sales.csv') # https://www.kaggle.com/hemil26/nft-collections-dataset
df5.info()

In [None]:
# Preview the first rows of df5.
df5.head()

In [None]:
# First attempt to cast 'Sales' to int64, on the column as loaded.
# NOTE(review): presumably expected to fail while the values still contain '$'
# and ',' (both are stripped in later cells) — confirm.
df5['Sales'] = df5['Sales'].astype('int64')

In [None]:
# Strip the dollar sign from 'Sales'. '$' is a regex metacharacter (the
# end-of-string anchor), so a literal match is requested explicitly instead of
# relying on the installed pandas version's default for `regex`.
df5['Sales'] = df5['Sales'].str.replace('$', '', regex=False)

In [None]:
# Preview 'Sales' after the '$' replacement.
df5.head()

In [None]:
# Second attempt to cast 'Sales' to int64, after the '$' replacement.
# NOTE(review): presumably still fails while the values contain ',' (removed in
# a later cell) — confirm.
df5['Sales'] = df5['Sales'].astype('int64')

In [None]:
# Remove every comma from the 'Sales' strings; ',' needs no regex, so match it literally.
df5['Sales'] = df5['Sales'].str.replace(',', '', regex=False)

In [None]:
# Preview 'Sales' after the ',' replacement.
df5.head()

In [None]:
# Third attempt to cast 'Sales' to int64, now that '$' and ',' have been replaced.
df5['Sales'] = df5['Sales'].astype('int64')

In [None]:
# Inspect the dtypes again to check the 'Sales' column after the cast.
df5.info()

In [None]:
def clean_money(money):
    """Convert a money-formatted value such as ``'$1,234'`` to a number.

    Args:
        money: A string that may contain ``'$'`` signs and ``','``
            separators, or any non-string value.

    Returns:
        An ``int`` when the cleaned text is a whole number, otherwise a
        ``float``. Non-string inputs (already-numeric values, ``None``,
        NaN) are returned unchanged.

    Raises:
        ValueError: If the text left after stripping is not a number.
    """
    if not isinstance(money, str):
        # Nothing to strip: keeps the function safe to apply to a column
        # that has already been converted to a numeric dtype.
        return money
    digits = money.replace('$', '').replace(',', '').strip()
    try:
        return int(digits)
    except ValueError:
        # Not a whole number (e.g. '1234.50'); float() raises ValueError
        # itself if the text is not numeric at all.
        return float(digits)

In [None]:
# Call clean_money on every value of 'Sales' and store the results back in the column.
df5['Sales'] = df5['Sales'].apply(clean_money)

In [None]:
# Preview df5 after applying clean_money to 'Sales'.
df5.head()

---

__Other resources to consider:__

- https://github.com/ResidentMario/missingno

- https://pyod.readthedocs.io/en/latest/