%%

documentation<br>
- quickstart<br>
    - https://pandas.pydata.org/pandas-docs/stable/user_guide/10min.html<br>
- interesting (optional) reads<br>
    - performance https://pandas.pydata.org/pandas-docs/stable/user_guide/enhancingperf.html

In [None]:
import numpy  as np
import pandas as pd

%%<br>
 dataframe creation<br>
manually (rarely used)

In [None]:
# Build a small DataFrame column by column from a dict.
# The scalar entries ("A", "B", "F") are repeated for every row; the row count
# (4) comes from the array-like entries ("C", "D", "E").
df = pd.DataFrame(
    {
        "A": 1.0,                                                   # float scalar
        "B": pd.Timestamp("20130102"),                              # timestamp scalar
        "C": pd.Series(1, index=[0, 1, 2, 3], dtype="float32"),     # Series with an explicit index
        "D": np.array([3] * 4, dtype="int32"),                      # numpy array
        "E": pd.Categorical(["test", "train", "test", "train"]),    # categorical values
        "F": "foo",                                                 # string scalar
    }
)

from data/index/columns (sometimes useful if you collect data yourself)

In [None]:
# Build a DataFrame from an explicit value matrix, row labels and column labels.
df = pd.DataFrame(
    np.random.random((1000, 4)),            # values: 1000 x 4 random floats; cells are NaN if omitted
    index=100 * np.arange(1000),            # row labels 0, 100, 200, ...; defaults to range(num_rows)
    columns=["x", "y", "width", "height"],  # column labels; defaults to range(num_cols)
)

from file

In [None]:
# Load the dataset from a CSV file into a DataFrame. The path is relative, so
# it is resolved against the current working directory (run this from the
# directory that contains `data/`).
df = pd.read_csv("data/california_housing_train.csv")

%%<br>
 data exploration

In [None]:
df.head(n=10)           # returns the top `n` rows (`n` defaults to 5 when omitted)

In [None]:
print(df.head(n=10))    # IMPORTANT! Most pandas operations do NOT work in place,
                        # they return copies. Some operations (like sorting)
                        # have a parameter `inplace` to control this behaviour

In [None]:
df.describe()           # returns useful per-column aggregate data like
                        # min/max, mean and so on

%%<br>
 data selection

In [None]:
df["longitude"]   # returns a column; NOTE: when run as a notebook cell this is not displayed,
                  # since it is not the last expression of the cell
print(df.loc[0])  # label-based: returns the row with index value 0 (might not be the first)
print(df.iloc[0]) # position-based: returns the first row of the dataset (might not have index value 0)

In [None]:
# Boolean mask with one True/False entry per row: True where the row's
# longitude is below the mean longitude of the whole column.
# The mask is deliberately NOT called `filter`: that name would shadow Python's
# builtin `filter()` for every cell executed afterwards.
below_mean_longitude = df["longitude"] < df["longitude"].mean()
df[below_mean_longitude]                          # keeps only the rows where the mask is True
df[df["median_income"] > 3]                       # the condition can also be written inline

%%<br>
 simple operations

In [None]:
df.mean()                   # the mean of all columns
                            # NOTE(review): presumably every column is numeric at this point; with
                            # text columns recent pandas needs `numeric_only=True` -- confirm
df["median_income"].std()   # standard deviation of the `median_income` column

many things are going on in the next line of code

In [None]:
df["mean_household_occupant"] = df["population"] / df["households"]
# 1. we are creating a new column on the fly
# 2. we are performing an element-wise division between the columns `population` and `households`,
#    numpy style
# 3. we are living a fantasy where everyone has a home

In [None]:
# Label every row "South" or "North" depending on whether its latitude lies
# below the mean latitude of the dataset, and store the labels as a new column.
latitude_threshold = df["latitude"].mean()
df["latitude_categorical"] = [
    "South" if latitude < latitude_threshold else "North"
    for latitude in df["latitude"]
]
# %%
## database-style operations

In [None]:
# Group the rows by their "North"/"South" label and compute, for the latitude
# and longitude columns of each group, the count, median, mean and standard deviation.
(
    df.groupby("latitude_categorical")[["latitude", "longitude"]]
    .agg(["count", "median", "mean", "std"])
)