# Cleaning data

## Remove Rows that contain empty cells

In [None]:
import pandas as pd

# Load the semicolon-separated data set and keep only the rows in which
# every column holds a value.
df = pd.read_csv('data.csv', sep=";").dropna()

# Render the cleaned frame as text and show it.
rendered = df.to_string()
print(rendered)

## Replace Empty Values

In [None]:
import pandas as pd

df = pd.read_csv('data.csv', sep=";")

# Replace all empty cells with the number 130
df.fillna(130, inplace=True)

# Replace NULL values in the "Calories" column only with the number 130.
# A {column: value} mapping is passed to DataFrame.fillna rather than calling
# df["Calories"].fillna(..., inplace=True): that chained form operates on an
# intermediate Series, which raises a chained-assignment warning in recent
# pandas releases and leaves df unchanged when Copy-on-Write is enabled.
# NOTE: every empty cell was already filled by the call above, so in this
# cell the column-specific call has nothing left to replace; it is shown as
# the alternative for filling a single column.
df.fillna({"Calories": 130}, inplace=True)

print(df.to_string())

## Get Mean, Median, or Mode value

- Mean: the average value (the sum of all values divided by the number of values).
- Median: the value in the middle after all values have been sorted in ascending order.
- Mode: the value that appears most frequently.

In [None]:
import pandas as pd

df = pd.read_csv('data.csv', sep=";")

# Work on the "Calories" column once and derive all three statistics from it.
calories = df["Calories"]

# mode() returns a Series of the most frequent value(s); take the entry at label 0.
x, y, z = calories.mean(), calories.median(), calories.mode()[0]

print(x, y, z)

## Convert column to Date Format

In [None]:
import pandas as pd

df = pd.read_csv('data.csv', sep=";")

# Parse the "Date" column into pandas datetime values, then store the parsed
# column back under the same name.
parsed_dates = pd.to_datetime(df['Date'])
df['Date'] = parsed_dates

print(df.to_string())

## Fixing Wrong Data

In [None]:
import pandas as pd

df = pd.read_csv('data.csv', sep=";")

# Replacing Values: overwrite the single "Duration" cell at row label 7.
df.loc[7, 'Duration'] = 45

# Replace with some boundaries for legal values: cap every "Duration" above
# 120 at 120. A boolean mask updates all matching rows in one vectorized
# assignment instead of a Python-level loop over the index.
df.loc[df["Duration"] > 120, "Duration"] = 120

# Removing Rows: drop every row whose "Duration" exceeds 120.
# NOTE: after the cap above no value is greater than 120, so this step finds
# nothing to drop here; it is the alternative to capping the values.
df.drop(df[df["Duration"] > 120].index, inplace=True)

## Removing Duplicates

In [None]:
import pandas as pd

df = pd.read_csv('data.csv', sep=";")

# Boolean Series: True for every row that is a duplicate, otherwise False.
duplicate_flags = df.duplicated()
print(duplicate_flags)

# Discard the duplicated rows from the frame itself.
df.drop_duplicates(inplace=True)