### transformations du dataframe

In [None]:
import pandas as pd
import numpy as np
from time import time
# Random generator seeded with the current Unix time truncated to whole seconds,
# so two runs started within the same second draw the same numbers.
rng = np.random.default_rng(seed=int(time()))
# Show the pandas version in use.
pd.__version__

In [None]:
# Penguins sample dataset from the seaborn-data repository.
# Fetched over HTTPS rather than plain HTTP so the download cannot be
# tampered with in transit and no redirect is needed.
url = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv"
penguins_df = pd.read_csv(url, encoding="utf8")
penguins_df

In [None]:
# Inspect a given portion of a dataframe, delimited by an offset and a limit.
def analyse_df(df: pd.DataFrame, offset: int, limit: int):
    """Return a positional window of rows from *df*.

    df: the dataframe to look into.
    offset: position (0-based) of the first row to return.
    limit: number of rows to return; fewer rows come back when the
        window runs past the end of the dataframe.
    """
    window = slice(offset, offset + limit)
    return df.iloc[window]

# Show the function's type annotations and its docstring.
print(analyse_df.__annotations__, analyse_df.__doc__)

# 20 rows of the penguins dataframe, starting at position 100.
analyse_df(penguins_df, 100, 20)


#### transformations "arithmétiques"

In [None]:
# Rename the two columns first so that their names match the units they are
# about to hold, then convert the values in place.
unit_renames = {
    "body_mass_g": "body_mass_kg",
    "flipper_length_mm": "flipper_length_percent",
}
penguins_df.rename(columns=unit_renames, inplace=True)

# grams -> kilograms
penguins_df["body_mass_kg"] = penguins_df["body_mass_kg"].div(1000)

# Flipper length as a percentage of the longest flipper, rounded to 1 decimal.
maxi = penguins_df["flipper_length_percent"].max()
penguins_df["flipper_length_percent"] = (
    penguins_df["flipper_length_percent"] / maxi * 100
).round(1)

# Display only: species A->Z, longest flippers first within each species.
penguins_df.sort_values(
    by=["species", "flipper_length_percent"],
    ascending=[True, False],
)

### agrégats et percentiles

In [None]:
# Mean of every numeric column (non-numeric columns are left out).
penguins_df.mean(numeric_only=True)
# Same aggregation, restricted to the float64 columns only.
penguins_df.select_dtypes(include=["float64"]).mean()

In [None]:
# 100 draws from a normal distribution (mean 10, standard deviation 2),
# then its minimum, quartiles and maximum.
arr = rng.normal(loc=10, scale=2, size=100)
np.percentile(arr, q=[0, 25, 50, 75, 100])

In [None]:
# Terciles: the values below which lie the smallest 1/3 and 2/3 of the
# observations (equivalently, above which lie the largest 2/3 and 1/3),
# together with the minimum (q=0) and the maximum (q=1).
# Exact fractions are used instead of the 0.33 / 0.66 approximations.
penguins_df.quantile(numeric_only=True, q=[0, 1 / 3, 2 / 3, 1])

In [None]:
# Deciles => [0, 0.1, 0.2, ..., 1], first built in plain Python...
[r / 10 for r in range(11)]
# ...then with NumPy's vectorised generalisation of range.
deciles = np.arange(11) / 10
penguins_df["body_mass_kg"].quantile(deciles)

In [None]:
# Named wrapper around .quantile(0.25), usable as an aggregation function.
def quartile(s: pd.Series):
    """Return the first quartile (25 % quantile) of the series *s*."""
    first_quartile = s.quantile(q=0.25)
    return first_quartile

# Factory for "mean of the n largest values" aggregations.
# This is a closure: the returned function remembers the n it was built with.
def avg_top(n: int):
    """Build a function returning the mean of the *n* largest values of a series.

    The returned function is named ``avg_top_<n>`` so that it gets a readable
    label when used in an aggregation.
    """
    def mean_of_largest(s: pd.Series):
        ranked = s.sort_values(ascending=False)
        return ranked.iloc[:n].mean()

    mean_of_largest.__name__ = f"avg_top_{n}"
    return mean_of_largest

def avg_top_5(s: pd.Series):
    """Return the mean of the five largest values of the series *s*."""
    ranked = s.sort_values(ascending=False)
    five_largest = ranked.iloc[:5]
    return five_largest.mean()

# Apply a different list of aggregations to each column.
penguins_df.agg({
    # "std" is given by name: passing np.std to .agg() is deprecated in
    # recent pandas, where it is currently remapped to the pandas method and
    # is announced to switch to the NumPy callable (a different ddof) later.
    "body_mass_kg": ["mean", "std"],
    # Equivalent with the hand-written helper: [avg_top_5, quartile]
    "bill_depth_mm": [avg_top(5), quartile],
})

In [None]:
# Bill volume, modelled as a cone: 1/3 * height * pi * (D/2)**2,
# with height = bill length and D = bill depth; rounded to a whole number.
volume_bill_mm3 = (
    1 / 3
    * penguins_df["bill_length_mm"]
    * np.pi
    * (penguins_df["bill_depth_mm"] / 2) ** 2
).round()
volume_bill_mm3
# To store it as a new column: penguins_df["volume_bill_mm3"] = volume_bill_mm3

In [None]:
## APPLY method: same cone volume, computed row by row (axis=1).
bill_volume_by_row = penguins_df.apply(
    lambda r: 1 / 3 * r["bill_length_mm"] * np.pi * (r["bill_depth_mm"] / 2) ** 2,
    axis=1,
)
# DataFrame.insert raises ValueError when the column already exists, so the
# column is inserted at position 4 only the first time and simply refreshed
# afterwards (which also makes the cell safe to re-run).
if "volume_bill_mm3" in penguins_df.columns:
    penguins_df["volume_bill_mm3"] = bill_volume_by_row
else:
    penguins_df.insert(4, column="volume_bill_mm3", value=bill_volume_by_row)
penguins_df

### transformation de chaînes de caractères

In [None]:
# Load the users file: semicolon-separated, first column used as the index,
# and the placeholders "???" and "--" read as missing values.
df = pd.read_csv(
    "users.csv",
    sep=";",
    index_col=0,
    na_values=["???", "--"],
    encoding="utf8",
)
df

In [None]:
# One last name per row, in row order: the list must have exactly len(df) items.
lastnames = ["Hendrix", "Baez", "McCartney"]
# Append the last name to the existing name, separated by a single space.
df["name"] = df["name"].str.cat(lastnames, sep=" ")

In [None]:
# Split "First Last" on the first run of whitespace only (n=1), so that a
# multi-word last name stays whole instead of losing its trailing words.
# One-step alternative:
#   df[["name", "last_name"]] = df["name"].str.split(n=1, expand=True)
split_df = df["name"].str.split(n=1, expand=True)
df["name"] = split_df[0]
# `columns=` is required here: a bare mapping is applied to the index labels,
# which would leave the "name" column untouched.
df.rename(columns={"name": "firstname"}, inplace=True)
# Put the last name right after the first column.
df.insert(1, "last_name", split_df[1])
df


In [None]:
# Zip code: either a digit other than 2 followed by four digits, or "2"
# followed by A, B or a digit and then three digits
# (presumably French postal codes, 2A/2B included; confirm against the data).
zipcode_patt = r"[013456789][0-9]{4}|2[AB0-9][0-9]{3}"
# Each named group becomes a column of the extracted dataframe.
address_df = df["address"].str.extract(
    rf"(?P<street>.*) (?P<zipcode>{zipcode_patt}) (?P<city>.*)"
)

In [None]:
# Remove the raw address column from df (in place).
df.drop(columns=["address"], inplace=True)
# Side-by-side view of df and the extracted address parts; the result is
# only displayed, it is not assigned to any variable.
pd.concat((df, address_df), axis="columns")