In [None]:
import pandas as pd
import numpy as np

# Load data

In [None]:
# Source CSV for the Iris dataset. header=None makes pandas treat the first
# line as data, so the column names are supplied explicitly.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = pd.read_csv(
    url,
    header=None,
    names=['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class'],
)

# Preview the first rows to confirm the columns were parsed as expected.
iris.head()

# Data cleaning

## Missing values

### Is there any missing value in the dataframe?

In [None]:
# True if at least one cell anywhere in the frame is missing
# (axis=None reduces over rows and columns in a single step).
iris.isna().any(axis=None)

### Let's set the values of rows 10 to 29 of the column 'petal_length' to NaN

In [None]:
# Positional rows 10..29 (the slice end is exclusive); the column position is
# looked up by name rather than hard-coded.
iris.iloc[10:30, iris.columns.get_loc('petal_length')] = np.nan

### Which column has the maximum number of missing values?

In [None]:
# Count NaNs per column, then return the label of the column with the highest
# count (the first such column if several are tied).
iris.isna().sum().idxmax()

### Try to substitute the NaN values with two methods:
- replace null values with column mean (apply it to a copy of the dataframe)
- replace null values with 1.0



In [None]:
# Method 1: on a copy, replace each numeric column's NaNs with that column's
# mean. fillna returns a new frame, so the result must be assigned back;
# otherwise the filled values are thrown away and the copy keeps its NaNs.
iris_copy = iris.copy(deep=True)
iris_copy = iris_copy.fillna(iris_copy.mean(numeric_only=True))

# Method 2: replace every NaN in the original frame with the constant 1.0.
iris.fillna(1.0, inplace=True)

### Set the first 3 rows as NaN

In [None]:
# Blank out every column of the first three rows (positions 0, 1 and 2).
iris.iloc[:3] = np.nan
iris

### Delete the rows that have all NaN

In [None]:
# Remove, in place, only those rows in which every column is NaN;
# rows with just some missing values are kept.
iris.dropna(axis='index', how='all', inplace=True)

### Reset the index so it begins with 0 again

In [None]:
# reset_index(drop=True) renumbers the existing rows 0..n-1 and discards the
# old labels. reindex is the wrong tool here: it looks rows up by label, so
# asking for labels 0..146 would re-create empty rows for any labels that were
# dropped and lose rows whose labels fall outside that range. It also hard-codes
# the row count.
iris.reset_index(drop=True, inplace=True)
iris.head()

## Duplicates

### Does the dataframe contain duplicated rows? If any, visualize all duplicated rows (don't omit first or last occurrences)

In [None]:
# keep=False flags every member of a duplicate group, so both the first and
# the last occurrences are included in the selection.
iris_duplicate_rows = iris.loc[iris.duplicated(keep=False)]
iris_duplicate_rows

### Which row is the most repeated?

In [None]:
# Group identical rows (all columns as the key) and count each group's size.
# Sorting in descending order puts the most repeated row first, so the answer
# can be read off the top of the output instead of scanning unsorted counts.
(
    iris_duplicate_rows
    .groupby(list(iris_duplicate_rows.columns))
    .size()
    .sort_values(ascending=False)
)

### Drop duplicated rows

In [None]:
# drop_duplicates returns a new frame by default; without inplace=True (or an
# assignment) the duplicates would remain in `iris` for every later cell.
# Only the first occurrence of each duplicated row is kept.
iris.drop_duplicates(inplace=True)
iris

## Detect outliers, e.g., values that are higher than the 85th percentile or lower than the 25th percentile.

In [None]:
# Work on the four measurement columns only (label slice, both ends inclusive).
iris_copy = iris.loc[:, 'sepal_length':'petal_width'].copy()

# Per-column thresholds.
upper_bound = iris_copy.quantile(0.85)
lower_bound = iris_copy.quantile(0.25)

# A row is selected only when *every* measurement is beyond the threshold
# (all(axis=1)); switch to any(axis=1) to flag rows with at least one extreme value.
above_upper = iris_copy[(iris_copy > upper_bound).all(axis=1)]
below_lower = iris_copy[(iris_copy < lower_bound).all(axis=1)]

# A cell only displays its last expression, so the two selections are stacked
# into one labelled frame; otherwise the high-side result is silently discarded.
pd.concat([above_upper, below_lower], keys=['above_85th', 'below_25th'])

# Data transformation

## Replace class values by removing "Iris-" prefix (use a dictionary)

In [None]:
# Old label -> new label lookup used to strip the "Iris-" prefix.
map_classes = {
    'Iris-setosa': 'setosa',
    'Iris-versicolor': 'versicolor',
    'Iris-virginica': 'virginica',
}

# Values that are not keys of the dictionary are left unchanged by replace.
iris['class'] = iris['class'].replace(to_replace=map_classes)
iris

## Delete columns
Delete for example class column

In [None]:
# Remove the 'class' column from the frame in place.
# Note: re-running this cell raises KeyError because the column is already gone.
iris.drop(columns='class', inplace=True)

## How to normalize all columns in a dataframe?
- Normalize all columns of df by subtracting the column mean and divide by standard deviation.
- Range all columns of df such that the minimum value in each column is 0 and max is 1.

In [None]:
# Standardisation: (value - column mean) / column standard deviation.
iris_mean = iris.mean(numeric_only=True)
iris_std = iris.std(numeric_only=True)

# Frame-level arithmetic: the per-column statistics align on column labels.
out1 = iris.sub(iris_mean).div(iris_std).round(2)

# Alternative: the same computation applied one column at a time.
out1 = iris.apply(lambda column: column.sub(column.mean()).div(column.std()).round(2))
out1



In [None]:
# Min-max scaling: (value - column min) / (column max - column min), so each
# column's minimum maps to 0 and its maximum maps to 1. Using (max - value) in
# the numerator would invert the scale (minimum -> 1, maximum -> 0).
out2 = ((iris - iris.min()) / (iris.max() - iris.min())).round(2)

# Alternative: the same scaling applied one column at a time.
out2 = iris.apply(lambda x: ((x - x.min()) / (x.max() - x.min())).round(2))
out2

## Binning and discretization
Discretize dataframe columns in 4 bins and get the new value frequency distribution

In [None]:
# Bin every column into 4 intervals; apply forwards the bins=4 keyword to
# pd.cut, which is called once per column.
discrete_iris = iris.apply(pd.cut, bins=4)
discrete_iris

In [None]:

# Iterate over the actual columns instead of a hard-coded range(4), which only
# happened to match the current column count (and is easy to confuse with the
# number of bins). For each column print its distinct intervals, then the
# relative frequency of each interval.
for _, binned_column in discrete_iris.items():
    print(binned_column.unique())
    print(binned_column.value_counts(normalize=True))

## Binarize categorical data (dummy variables)
Based on the previous result, binarize all dataframe columns

In [None]:
# One indicator column per (original column, interval) pair of the binned frame.
pd.get_dummies(data=discrete_iris)