### Introduction à PANDAS

In [2]:
import pandas as pd
import numpy as np
from time import time
# Fixed seed: every "Restart Kernel -> Run All" produces the same random draws.
# A wall-clock seed (int(time())) differs on each run and makes results
# impossible to reproduce.
SEED = 42
rng = np.random.default_rng(seed=SEED)
# Last expression of the cell: shows the installed pandas version.
pd.__version__


'2.1.4'

#### Instanciation d'un DataFrame pandas

In [3]:
# Column labels, in the order the values appear inside each row.
# NOTE: "size" is also the name of the DataFrame.size attribute, so this
# column has to be read with df["size"], not df.size.
columns = ["name", "age", "address", "size"]

# Sample rows: one inner list per user, values in `columns` order.
data = [
    ["jimmy", 28, "2 rue de la rép, 44000 NANTES", 1.73],
    ["Joan", 33, "12 bd Haussmann 75009 Paris", 1.56],
    ["Paul", 76, "10, chemin des lilas, 13002 MaRSEILLE", 1.85],
]

# Row labels: "user1", "user2", ... one per row of `data`.
index = [f"user{n}" for n in range(1, len(data) + 1)]

In [39]:
# Row-oriented construction: each inner list of `data` becomes one row,
# labelled by `index`; `columns` names the fields.
df = pd.DataFrame(data, index=index, columns=columns)
# Other quick inspections that can be enabled:
# print(df.ndim, df.shape, df.size)
# print(df.dtypes)
# Prints a summary: index range, per-column non-null count and dtype, memory use.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3 entries, user1 to user3
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   name     3 non-null      object 
 1   age      3 non-null      int64  
 2   address  3 non-null      object 
 3   size     3 non-null      float64
dtypes: float64(1), int64(1), object(2)
memory usage: 120.0+ bytes


In [24]:
# Column-oriented construction: a dict mapping each column label to the list
# of its values.
# zip(*data) transposes the rows while keeping every value's Python type.
# Going through np.array(data) would coerce the mixed str/int/float rows to a
# single string dtype, so `age` and `size` would end up as text columns.
col_data = [list(col) for col in zip(*data)]
col_set = dict(zip(columns, col_data))
# Last expression of the cell: the frame built from the column mapping.
pd.DataFrame(data=col_set, index=index)

Unnamed: 0,name,age,address,size
user1,jimmy,28,"2 rue de la rép, 44000 NANTES",1.73
user2,Joan,33,12 bd Haussmann 75009 Paris,1.56
user3,Paul,76,"10, chemin des lilas, 13002 MaRSEILLE",1.85


In [38]:
# Record-oriented construction: one dict per row, i.e. the shape of a JSON
# array of objects.
# A plain comprehension keeps each value's Python type. np.apply_along_axis
# first converts `data` to a NumPy array, which for mixed str/int/float rows
# is a string array, so every field would come back as text.
records = [dict(zip(columns, row)) for row in data]
# Bound to its own name so the labelled `df` built earlier is not overwritten:
# the cells below that export `df` rely on its user1..user3 row labels.
df_records = pd.DataFrame.from_records(
    records,
    # index=index
)
df_records

Unnamed: 0,name,age,address,size
0,jimmy,28,"2 rue de la rép, 44000 NANTES",1.73
1,Joan,33,12 bd Haussmann 75009 Paris,1.56
2,Paul,76,"10, chemin des lilas, 13002 MaRSEILLE",1.85


#### écriture vs lecture

In [26]:
# Write the frame as semicolon-separated, UTF-8 encoded text.
# The row labels are written too (default behaviour); when the index carries
# no meaning, add index=False to leave it out of the file.
df.to_csv("users.csv", sep=";", encoding="utf8")

In [33]:
# Read the file back with the same separator and encoding it was written with.
# Only fields 0, 1, 2 and 4 are loaded (field 3 is skipped); field 0 is used
# as the index, which restores the saved row labels.
# Equivalent tuple form: usecols=(0, 1, 2, 4)
df = pd.read_csv(
    "users.csv",
    sep=";",
    encoding="utf8",
    usecols=[0, 1, 2, 4],
    index_col=0,
)
df

Unnamed: 0,name,age,size
user1,jimmy,28,1.73
user2,Joan,33,1.56
user3,Paul,76,1.85


In [43]:
# Export as a JSON array holding one object per row (orient="records");
# this layout does not include the row labels.
# Other layouts that can be used instead:
#   orient="split"  -> separate "index", "columns" and "data" entries
#   orient="index"  -> one entry per row label
#   orient="table"  -> data together with a schema description
df.to_json(
    "users.json",
    orient="records",
    indent=2,
    force_ascii=False,  # write non-ASCII characters as-is, not as \u escapes
)