<a href="https://colab.research.google.com/github/daniel-falk/ai-ml-principles-exercises/blob/main/ML-training/intro-to-libraries/intro_to_pandas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pandas, a library for tabular data
`pandas` is a library that is used to handle tabular data, read and write data files, etc.

In [None]:
import pandas
import numpy as np

In [None]:
# A data frame gives every column a name, and each column may hold a
# different kind of data (floats and booleans here).
x = np.linspace(0, 2 * np.pi, 10)
y = np.sin(x)

columns = {"x": x, "sin(x)": y, "is positive": y >= 0}
pandas.DataFrame(columns)

In [None]:
# Pandas can also be used to read e.g. CSV data files.
# The sample below has one header row followed by one row per person.
# Rows must not carry trailing whitespace: read_csv keeps it as part of the
# last field, which would turn the hobby "painting" into "painting  ".

csv_data = """
Name,Age,Height,Gender,Hobby
Daniel F.,31,179,male,sailing
Jennifer H.,41,163,female,running
Patrik K.,19,192,male,painting
"""

In [None]:
from io import StringIO

# Wrap the CSV text in an in-memory, file-like object so that it can be
# handed to functions that expect an open file.
csv_file = StringIO(initial_value=csv_data)

In [None]:
# Parse the CSV content into a data frame and display it.
table = pandas.read_csv(filepath_or_buffer=csv_file)
table

In [None]:
# Show the data type that was inferred for each column.
table.dtypes

In [None]:
# Show only the first row of the table.
table.head(n=1)

In [None]:
# Show only the last two rows of the table.
table.tail(n=2)

In [None]:
# Return a copy of the table with the rows ordered by ascending height.
table.sort_values("Height")

# Indexing values in a data frame
The frame (table) can be indexed by row or column to extract a smaller frame.

In [None]:
# Keep every row, but only the "Name" and "Age" columns.
table.loc[:, ["Name", "Age"]]

In [None]:
# Element-wise comparison: one boolean per row, True where Age exceeds 20.
table["Age"].gt(20)

In [None]:
# A boolean mask can be used as an index to keep only the matching rows.
is_over_20 = table["Age"] > 20
table.loc[is_over_20]

In [None]:
# iloc can be used to address values based on their integer position
# (row number first, then column number); here the top-left cell
table.iloc[0,0]

In [None]:
# Keep only the rows whose hobby is one of the listed values.
wanted_hobbies = ["sailing", "running"]
table.loc[table["Hobby"].isin(wanted_hobbies)]

# Statistics on tables
`pandas` contains many methods to calculate statistical information about table rows and columns.

In [None]:
# Summary statistics (count, mean, spread, quartiles) for the numeric columns.
table.describe()

In [None]:
# Average of the numeric columns (Age, Height) for each gender.
# numeric_only=True is needed because pandas >= 2.0 no longer silently drops
# the text columns (Name, Hobby) and raises a TypeError when asked to
# average them.
table.groupby(["Gender"]).mean(numeric_only=True)

In [None]:
# Display the full table again for reference.
table

In [None]:
# Count how many unique values there are in each column.
# nunique is the built-in way to do this; dropna=False counts a missing value
# as a value of its own, matching what np.unique would report.
table.nunique(dropna=False).to_frame()

# Modifying the data
The data in a frame (table) can be modified.

In [None]:
# Overwrite a single cell, addressed by row label 0 and column "Name".
table.at[0, "Name"] = "Anders F."
table

In [None]:
# Let's assume that we know that the age of Jennifer is incorrect,
# we don't know the true age so instead we set it to a not a number value.
# Age is read from the CSV as an integer column, and integers cannot hold NaN,
# so convert the column to float explicitly first. Relying on pandas to upcast
# silently is deprecated (FutureWarning since pandas 2.1).
table["Age"] = table["Age"].astype(float)
table.loc[table["Name"] == "Jennifer H.", "Age"] = np.nan
table

In [None]:
# Later we can deal with NaN values either by replacing (imputing) them or by
# dropping all the rows that contain a NaN value, as done here
table.dropna(axis=0, how="any", inplace=True)
table