# Data Preprocessing / Cleaning 
- Renaming variables
- Data type conversion
- Encoding values
- Merging data sets
- Converting units
- Handling missing data
- Handling outliers

In [None]:
import pandas as pd

In [None]:
# Column labels for the hepatitis data file, listed in file order.
col_names = [
    "Class",
    "AGE",
    "SEX",
    "STEROID",
    "ANTIVIRALS",
    "FATIGUE",
    "MALAISE",
    "ANOREXIA",
    "LIVER BIG",
    "LIVER FIRM",
    "SPLEEN PALPABLE",
    "SPIDERS",
    "ASCITES",
    "VARICES",
    "BILIRUBIN",
    "ALK PHOSPHATE",
    "SGOT",
    "ALBUMIN",
    "PROTIME",
    "HISTOLOGY",
]

# Supplying `names` makes pandas treat every line of the file as data
# (no line is consumed as a header row).
hepatitis_path = '../data/hepatitis.data'
df = pd.read_csv(hepatitis_path, names=col_names)

## Exploring Data 

In [None]:
# First five rows: quick visual check of the loaded frame.
df.head()

In [None]:
# Last five rows.
df.tail() 

In [None]:
# (number of rows, number of columns).
df.shape

In [None]:
# dtype pandas inferred for each column.
df.dtypes

In [None]:
# Per-column dtype and non-null count, plus memory usage.
df.info() 

## Handling Columns 

In [None]:
# Look at the frame before touching the column labels.
df.head() 

In [None]:
# Current column labels.
df.columns

In [None]:
# The .str methods below return a NEW Index; `df` is not modified until the
# result is assigned back to `df.columns` further down.
df.columns.str.lower() 

In [None]:
# Title Case preview.
df.columns.str.title()

In [None]:
# Only the first character upper-cased.
df.columns.str.capitalize() 

In [None]:
# Spaces replaced by underscores.
df.columns.str.replace(" ", "_")

In [None]:
# Both steps chained: lower-case, then spaces -> underscores.
df.columns.str.lower().str.replace(" ", "_")

In [None]:
# Still the original labels: nothing has been assigned yet.
df.columns

In [None]:
# Apply the normalisation by assigning the new Index back to the frame.
df.columns = df.columns.str.lower().str.replace(" ", "_")

In [None]:
# Confirm the labels are now lower-case with underscores.
df.columns

In [None]:
df.columns[df.dtypes == 'object']

In [None]:
sum(df.dtypes == 'object')

In [None]:
df.columns[df.dtypes != 'object']

In [None]:
sum(df.dtypes != 'object')

## Renaming Columns and Recoding

In [None]:
# Rebind `df` to the gender/height/weight/index data set; the first line of
# the CSV is used as the header (read_csv default).
df = pd.read_csv("../data/500_Person_Gender_Height_Weight_Index.csv")

In [None]:
# First five rows.
df.head() 

In [None]:
# Last five rows.
df.tail() 

In [None]:
# (rows, columns).
df.shape

In [None]:
# Inferred dtype per column.
df.dtypes

In [None]:
# dtypes and non-null counts in one summary.
df.info() 

In [None]:
# Distinct labels present in the Gender column.
df.Gender.unique()

In [None]:
# Absolute frequency of each Gender label.
df.Gender.value_counts()

In [None]:
# Relative frequency (proportions that sum to 1).
df.Gender.value_counts(normalize=True)

In [None]:
# Recode Gender to a numeric indicator: "Male" -> 1, "Female" -> 0.
#
# The result is assigned back to the column rather than calling
# `df.Gender.replace(..., inplace=True)`. An in-place call on a column that was
# pulled out of the frame operates on an intermediate object: pandas 2.x emits
# a FutureWarning for it, and with Copy-on-Write enabled `df` is left unchanged.
df["Gender"] = df["Gender"].replace(to_replace=["Male", "Female"], value=[1, 0])

In [None]:
df.Gender.value_counts()

In [None]:
df.head() 

In [None]:
df.rename(columns = {"Index": "BMICat"}, inplace = True)

In [None]:
df.head() 

In [None]:
df.BMICat.value_counts()

## Changing DataType with astype() / pd.to_numeric / pd.to_datetime

In [None]:
import pandas as pd 

In [None]:
# Rebind `df` to the covid19 data set.
df = pd.read_csv('../data/covid19.csv')

In [None]:
# First five rows.
df.head()

In [None]:
# Last five rows.
df.tail() 

In [None]:
# dtypes and non-null counts: shows which columns need converting.
df.info()

In [None]:
# Preview only: dtype of Confirmed after casting to int. The converted Series
# is not stored, so `df` keeps its original dtype.
# NOTE(review): astype('int') raises if the column holds NaN; confirm
# Confirmed has no missing values in this file.
df['Confirmed'].astype('int').dtype

In [None]:
# Same idea with pd.to_numeric, which picks a numeric dtype itself.
# Again a preview: nothing is assigned back.
pd.to_numeric(df.Confirmed).dtype

In [None]:
# Current dtype of the 'Last Update' column.
df['Last Update'].dtypes 

In [None]:
# Parse 'Last Update' into datetimes. The parsed Series is only displayed,
# not written back to the frame.
pd.to_datetime(df['Last Update'])

### Intro to NA Values

In [None]:
import numpy as np
import pandas as pd 

In [None]:
# Reload the covid19 data so this section starts from the raw file.
df = pd.read_csv('../data/covid19.csv')

In [None]:
# First five rows.
df.head() 

In [None]:
# Non-null counts per column give a first hint of where values are missing.
df.info()

In [None]:
# check missing values 
# Boolean frame, same shape as df: True where a cell is missing.
df.isnull() 

In [None]:
# Missing-value count per column.
df.isnull().sum() 

In [None]:
# isna() is the same check as isnull().
df.isna()

In [None]:
# axis=0 sums down the rows, giving one count per column.
df.isna().sum(axis = 0)

In [None]:
# axis=1 reduces across the columns: True for rows with at least one
# missing cell.
df.isna().any(axis = 1)

In [None]:
# Use that row mask to show only the rows containing a missing value.
df[df.isna().any(axis = 1)]

In [None]:
# Inverse mask: True where a cell is present.
df.notna()

In [None]:
# Number of populated cells in each row.
df.notna().sum(axis = 1)

In [None]:
# Per column: True only if the column has no missing values at all.
df.notna().all(axis = 0)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Heatmap of the "value present" mask: one colour for populated cells, the
# other for missing ones.
_, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(df.notna(), ax=ax)
plt.show()

In [None]:
df['Province/State'].value_counts(dropna = False)

In [None]:
df['Province/State'].replace(to_replace= "Missing Data", value = np.nan, inplace= True)

In [None]:
df.info()

In [None]:
# Rebind `df` to the titanic data set.
df = pd.read_csv('../data/titanic.csv')

In [None]:
# dtypes and non-null counts per column.
df.info() 

In [None]:
# Cell-level mask: True where a value is missing.
df.isnull() 

In [None]:
# Missing-value count per column.
df.isnull().sum() 

In [None]:
# Turn the literal placeholder "Missing Data" in `age` into a real NaN.
#
# Assigned back to the column rather than `df['age'].replace(..., inplace=True)`:
# the in-place form works on an intermediate Series, warns on pandas 2.x, and
# leaves `df` untouched under Copy-on-Write.
df['age'] = df['age'].replace(to_replace="Missing Data", value=np.nan)
df.head()

In [None]:
df.isnull().sum() 

In [None]:
plt.figure(figsize = (12,8))
sns.heatmap(df.notna())
plt.show()

In [None]:
df.notna().sum(axis = 1)

In [None]:
df.notna().all(axis = 0)

### Removing Missing Values with dropna()

In [None]:
# Reload the covid19 data so the section starts from the raw file.
df = pd.read_csv('../data/covid19.csv')

In [None]:
# Non-null counts per column before dropping anything.
df.info()

In [None]:
# Rows whose Province/State is missing.
df[df['Province/State'].isna()]

In [None]:
# Frequency of every Province/State value, NaN included.
df['Province/State'].value_counts(dropna = False)

In [None]:
# Shape before dropping.
df.shape

In [None]:
# Drop, in place, every row that has at least one missing value.
# NOTE(review): because this mutates `df`, the dropna() variants in the cells
# below run on a frame that already has no NaN and will all report the same
# shape. If those cells are meant to compare strategies on the raw data, make
# this a non-mutating preview or reload the file first.
df.dropna(inplace=True) 

In [None]:
# Missing-value count per column after the drop.
df.isnull().sum() 

In [None]:
# Rows dropped when ANY value in the row is missing.
df.dropna(axis = 0, how = "any").shape

In [None]:
# Columns dropped when ANY value in the column is missing.
df.dropna(axis = 1, how = "any").shape

In [None]:
# Rows dropped only when ALL of their values are missing.
df.dropna(axis = 0, how = "all").shape

In [None]:
# Columns dropped only when ALL of their values are missing.
df.dropna(axis = 1, how = "all").shape

In [None]:
# Keep rows that have at least 8 non-missing values.
df.dropna(axis = 0, thresh = 8).shape

In [None]:
# Keep columns that have at least 500 non-missing values.
df.dropna(axis = 1, thresh = 500).shape

In [None]:
# Apply the column threshold in place.
# NOTE(review): if `df` has fewer than 500 rows at this point, every column
# falls below the threshold and is removed; confirm the row count.
df.dropna(axis = 1, thresh = 500, inplace = True)

In [None]:
# Frame after the column filter.
df.head()

In [None]:
# Shape after the column filter.
df.shape

In [None]:
# Only consider the three count columns when deciding which rows to drop.
# Requires those columns to still exist after the in-place filter above.
df.dropna(axis = 0, subset = ["Confirmed", "Deaths", "Recovered"], how = "any").shape

### Replacing Missing Values with fillna()

In [None]:
# Load the titanic data under its own name for the imputation examples.
titanic = pd.read_csv('../data/titanic.csv')

In [None]:
# First ten rows.
titanic.head(10)

In [None]:
# dtypes and non-null counts per column.
titanic.info()

In [None]:
# Mean of the age column (NaN values are skipped by default).
titanic.age.mean()

In [None]:
mean = round(titanic.age.mean(),1)
mean

In [None]:
titanic.age.fillna(mean, inplace = True)

In [None]:
titanic.isnull().sum() 

In [None]:
median = round(titanic.age.median(),1)
median

In [None]:
titanic.age.fillna(median, inplace = True)

In [None]:
titanic.age.fillna(2, inplace = True)

In [None]:
titanic

In [None]:
titanic.isnull().sum() 

### Detection of Duplicates

In [None]:
# Toy one-column frame with repeated letters ("c" twice, "g" three times),
# used to demonstrate duplicate detection.
alphabet = pd.DataFrame(
    {"Alphabet": ["a", "b", "c", "c", "d", "e", "f", "g", "g", "g"]}
)

In [None]:
# Display the toy frame.
alphabet

In [None]:
# Boolean Series: True for rows that repeat an earlier row.
alphabet.duplicated()

In [None]:
# keep="first" is the default: the first occurrence is NOT flagged, later
# repeats are.
alphabet.duplicated(keep = "first")

In [None]:
# keep="last": the last occurrence is NOT flagged, earlier repeats are.
alphabet.duplicated(keep = "last")

In [None]:
# Use the mask to show only the rows flagged as duplicates.
alphabet[alphabet.duplicated(keep = "first")]

### Handling / Removing Duplicates

In [None]:
# Reload the titanic data for the duplicate-removal example.
df = pd.read_csv('../data/titanic.csv')

In [None]:
# Last five rows.
df.tail()

In [None]:
# Number of rows that exactly repeat an earlier row.
df.duplicated().sum()

In [None]:
# The duplicated rows themselves.
df[df.duplicated()]

In [None]:
# Returns a NEW frame without duplicates; `df` itself is not changed.
df.drop_duplicates() 

In [None]:
# Still lists the duplicates, because the previous cell did not modify `df`.
df[df.duplicated()]

In [None]:
# Now remove the duplicates from `df` itself.
df.drop_duplicates(inplace=True) 

In [None]:
# Should be 0 after the in-place removal.
df.duplicated().sum()

### Categorical Data

In [None]:
# Rebind `df` to the lung-capacity workbook.
# NOTE(review): reading a legacy .xls file needs an Excel engine such as xlrd
# to be installed; confirm it is available in this environment.
df = pd.read_excel('../data/LungCapData.xls')

In [None]:
# First five rows.
df.head()

In [None]:
# dtypes and non-null counts per column.
df.info()

In [None]:
# Disabled export of the `titanic` frame from the imputation section;
# unrelated to the lung-capacity data loaded above.
#titanic.to_csv("titanic_clean.csv", index = False)

In [None]:
# Number of distinct values per column: low counts suggest categorical data.
df.nunique()

In [None]:
# Summary (count / unique / top / freq) of the two text columns.
df[["Gender", "Smoke"]].describe()

In [None]:
df.Gender = df.Gender.astype("category")

In [None]:
df.Smokes = df.Smoke.astype("category")

In [None]:
df.info()