### transformations du dataframe

In [61]:
import pandas as pd
import numpy as np
from time import time
rng = np.random.default_rng(seed=int(time()))
pd.__version__

'2.1.4'

In [62]:
url = "http://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv"
penguins_df = pd.read_csv(
    url,
    encoding="utf8"
)
penguins_df

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE
...,...,...,...,...,...,...,...
339,Gentoo,Biscoe,,,,,
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,FEMALE
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,MALE
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,FEMALE


In [63]:
# analyser une portion donnée déterminée par un offset et une limit
def analyse_df(df: pd.DataFrame, offset: int, limit: int):
    """
    df: dataframe ....
    """
    return df.iloc[offset:offset + limit]

print(analyse_df.__annotations__, analyse_df.__doc__)

analyse_df(penguins_df, 100, 20)


{'df': <class 'pandas.core.frame.DataFrame'>, 'offset': <class 'int'>, 'limit': <class 'int'>} 
    df: dataframe ....
    


Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
100,Adelie,Biscoe,35.0,17.9,192.0,3725.0,FEMALE
101,Adelie,Biscoe,41.0,20.0,203.0,4725.0,MALE
102,Adelie,Biscoe,37.7,16.0,183.0,3075.0,FEMALE
103,Adelie,Biscoe,37.8,20.0,190.0,4250.0,MALE
104,Adelie,Biscoe,37.9,18.6,193.0,2925.0,FEMALE
105,Adelie,Biscoe,39.7,18.9,184.0,3550.0,MALE
106,Adelie,Biscoe,38.6,17.2,199.0,3750.0,FEMALE
107,Adelie,Biscoe,38.2,20.0,190.0,3900.0,MALE
108,Adelie,Biscoe,38.1,17.0,181.0,3175.0,FEMALE
109,Adelie,Biscoe,43.2,19.0,197.0,4775.0,MALE


#### transformation "arithmétiques"

In [64]:
penguins_df.rename(columns={
    "body_mass_g": "body_mass_kg",
    "flipper_length_mm": "flipper_length_percent"
}, inplace=True)
penguins_df["body_mass_kg"] /= 1000
maxi = penguins_df["flipper_length_percent"].max()
penguins_df["flipper_length_percent"] = np.around((penguins_df["flipper_length_percent"] / maxi * 100), 1)
penguins_df.sort_values(by=["species", "flipper_length_percent"], ascending=[True, False])

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_percent,body_mass_kg,sex
129,Adelie,Torgersen,44.1,18.0,90.9,4.000,MALE
95,Adelie,Dream,40.8,18.9,90.0,4.300,MALE
91,Adelie,Dream,41.1,18.1,88.7,4.300,MALE
101,Adelie,Biscoe,41.0,20.0,87.9,4.725,MALE
90,Adelie,Dream,35.7,18.0,87.4,3.550,FEMALE
...,...,...,...,...,...,...,...
280,Gentoo,Biscoe,45.3,13.8,90.0,4.200,FEMALE
328,Gentoo,Biscoe,43.3,14.0,90.0,4.575,FEMALE
252,Gentoo,Biscoe,45.1,14.5,89.6,5.050,FEMALE
318,Gentoo,Biscoe,48.4,14.4,87.9,4.625,FEMALE


### aggrégats pourcentiles

In [65]:
penguins_df.mean(numeric_only=True)
penguins_df.select_dtypes(include=["float64"]).mean()

bill_length_mm            43.921930
bill_depth_mm             17.151170
flipper_length_percent    86.977193
body_mass_kg               4.201754
dtype: float64

In [66]:
arr = rng.normal(10, 2, size=100)
np.percentile(arr, q=[0, 25, 50, 75, 100])

array([ 5.56457593,  8.27231157,  9.91364302, 11.55218808, 14.61569825])

In [67]:
# je veux les valeurs audessous j'ai un 1/3 (coresp 2/3) les plus petits et au dessus 2/3 (crresp. 1/3) les plus grands
penguins_df.quantile(numeric_only=True, q=[0,0.33,0.66,1])

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_percent,body_mass_kg
0.0,32.1,13.1,74.5,2.7
0.33,40.753,16.153,83.1,3.7
0.66,46.606,18.206,90.5,4.5
1.0,59.6,21.5,100.0,6.3


In [68]:
# décliles => [0, 0.1, 0.2, ..., 1]
list(map(lambda r: r/10, range(11)))
# généralisation de la range
deciles = np.arange(11) / 10
penguins_df["body_mass_kg"].quantile(q=deciles)

0.0    2.700
0.1    3.300
0.2    3.475
0.3    3.650
0.4    3.800
0.5    4.050
0.6    4.300
0.7    4.650
0.8    4.950
0.9    5.400
1.0    6.300
Name: body_mass_kg, dtype: float64

In [69]:
# créer une fonction pour le quartile àà partir de .quantile(0.25)
def quartile(s: pd.Series):
    return s.quantile(0.25)

# créer une fonction de retourner la moyenne des 5 premiers les plus grands
# "fermeture" en python
def avg_top(n: int):
    def f(s: pd.Series):
        return s.sort_values(ascending=False).head(n).mean()
    f.__name__ = f"avg_top_{n}"
    return f

def avg_top_5(s: pd.Series):
    return s.sort_values(ascending=False).head(5).mean()

penguins_df.agg({
    "body_mass_kg": ["mean", np.std],
    # "bill_depth_mm": [avg_top_5, quartile]
    "bill_depth_mm": [avg_top(5), quartile]
})

  penguins_df.agg({


Unnamed: 0,body_mass_kg,bill_depth_mm
mean,4.201754,
std,0.801955,
avg_top_5,,21.22
quartile,,15.6


In [70]:
# volume du bec i.e cône = 1/3 * hauteur * PI * D/2 **2
volume_bill_mm3 = np.around(1/3 * penguins_df["bill_length_mm"] * np.pi * (penguins_df["bill_depth_mm"]/2)**2)
volume_bill_mm3
# penguins_df["volume_bill_mm3"] = volume_bill_mm3

0      3580.0
1      3131.0
2      3418.0
3         NaN
4      3579.0
        ...  
339       NaN
340    2505.0
341    3252.0
342    2592.0
343    3386.0
Length: 344, dtype: float64

In [74]:
# penguins_df["volume_bill_mm3"] = penguins_df.apply(lambda r: 1/3 * r["bill_length_mm"] * np.pi * (r["bill_depth_mm"]/2)**2, axis=1)
penguins_df.insert(4, value=volume_bill_mm3, column="volume_bill_mm3")
penguins_df

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,volume_bill_mm3,flipper_length_percent,body_mass_kg,sex
0,Adelie,Torgersen,39.1,18.7,3580.0,78.4,3.75,MALE
1,Adelie,Torgersen,39.5,17.4,3131.0,80.5,3.80,FEMALE
2,Adelie,Torgersen,40.3,18.0,3418.0,84.4,3.25,FEMALE
3,Adelie,Torgersen,,,,,,
4,Adelie,Torgersen,36.7,19.3,3579.0,83.5,3.45,FEMALE
...,...,...,...,...,...,...,...,...
339,Gentoo,Biscoe,,,,,,
340,Gentoo,Biscoe,46.8,14.3,2505.0,93.1,4.85,FEMALE
341,Gentoo,Biscoe,50.4,15.7,3252.0,96.1,5.75,MALE
342,Gentoo,Biscoe,45.2,14.8,2592.0,91.8,5.20,FEMALE


### transformation de chaines de caractères

In [84]:
df = pd.read_csv(
    "users.csv",
    sep=";",
    encoding="utf8",
    index_col=0,
    na_values=["???", "--"]
)
df

Unnamed: 0,name,age,address,size
user1,jimmy,28,"2 rue de la rép, 44000 NANTES",1.73
user2,Joan,33,12 bd Haussmann 75009 Paris,
user3,Paul,76,"10, chemin des lilas, 13002 MaRSEILLE",1.85


In [85]:
lastnames = ["Hendrix", "Baez", "McCartney"]
df["name"] = df["name"] + " " + lastnames 

In [90]:
# df[["name", "latname"]] = df["name"].str.split(expand=True)
split_df = df["name"].str.split(expand=True)
df["name"] = split_df[0]
df.rename({"name": "firstname"}, inplace=True)
df.insert(1, "last_name", split_df[1])
df


KeyError: 1

In [92]:
zipcode_patt = "[013456789][0-9]{4}|2[AB0-9][0-9]{3}"
address_df = df["address"].str.extract("(?P<street>.*) (?P<zipcode>"+ zipcode_patt +") (?P<city>.*)")

In [93]:
df.drop(columns="address", inplace=True)
pd.concat([df, address_df], axis=1)

Unnamed: 0,name,last_name,age,size,street,zipcode,city
user1,jimmy,Hendrix,28,1.73,"2 rue de la rép,",44000,NANTES
user2,Joan,Baez,33,,12 bd Haussmann,75009,Paris
user3,Paul,McCartney,76,1.85,"10, chemin des lilas,",13002,MaRSEILLE
