In [2]:
import pandas as pd
from os import path

In [3]:
def get_geo_tuple(row):
    """Pair up the two geo-point fields of a row.

    Args:
        row: Mapping-like record (for instance a DataFrame row) exposing the
            keys "geo_point_2d_a" and "geo_point_2d_b".

    Returns:
        A 2-tuple ``(geo_point_2d_a, geo_point_2d_b)``.
    """
    first_component = row["geo_point_2d_a"]
    second_component = row["geo_point_2d_b"]
    return first_component, second_component

def get_mesures_tuple(row):
    """Pair up the two measurement fields of a row.

    Args:
        row: Mapping-like record (for instance a DataFrame row) exposing the
            keys "circonference_cm" and "hauteur_m".

    Returns:
        A 2-tuple ``(circonference_cm, hauteur_m)``.
    """
    circumference = row["circonference_cm"]
    height = row["hauteur_m"]
    return circumference, height

def better_name_developpement(row):
    """Translate a row's development-stage code into a readable label.

    Args:
        row: Mapping-like record (for instance a DataFrame row) exposing the
            key "stade_developpement".

    Returns:
        The label associated with the code ("U", "A", "M", "J" or "JA").
        A missing value (None / NaN) yields "Inconnu". Any other value is
        returned unchanged, so nothing is silently turned into None and
        applying the function to already-translated labels is harmless.
    """
    labels_by_code = {
        "U": "Inconnu",
        "A": "Adulte",
        "M": "Mature",
        "J": "Jeune",
        "JA": "Jeune Adulte",
    }
    code = row["stade_developpement"]
    # Scalar missing values never match a key, so handle them explicitly.
    if code is None or (isinstance(code, float) and pd.isna(code)):
        return "Inconnu"
    return labels_by_code.get(code, code)


In [4]:
# Load the raw semicolon-separated file, then print its column summary.
df = pd.read_csv(
    path.join("..", "data/raw", "paris_threes_raw.csv"),
    sep=";",
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200137 entries, 0 to 200136
Data columns (total 18 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   id                   200137 non-null  int64  
 1   type_emplacement     200137 non-null  object 
 2   domanialite          200136 non-null  object 
 3   arrondissement       200137 non-null  object 
 4   complement_addresse  30902 non-null   object 
 5   numero               0 non-null       float64
 6   lieu                 200137 non-null  object 
 7   id_emplacement       200137 non-null  object 
 8   libelle_francais     198640 non-null  object 
 9   genre                200121 non-null  object 
 10  espece               198385 non-null  object 
 11  variete              36777 non-null   object 
 12  circonference_cm     200137 non-null  int64  
 13  hauteur_m            200137 non-null  int64  
 14  stade_developpement  132932 non-null  object 
 15  remarquable      

In [5]:
# Preview the first two rows of df.
df.head(2)

Unnamed: 0,id,type_emplacement,domanialite,arrondissement,complement_addresse,numero,lieu,id_emplacement,libelle_francais,genre,espece,variete,circonference_cm,hauteur_m,stade_developpement,remarquable,geo_point_2d_a,geo_point_2d_b
0,99874,Arbre,Jardin,PARIS 7E ARRDT,,,MAIRIE DU 7E 116 RUE DE GRENELLE PARIS 7E,19,Marronnier,Aesculus,hippocastanum,,20,5,,0.0,48.85762,2.320962
1,99875,Arbre,Jardin,PARIS 7E ARRDT,,,MAIRIE DU 7E 116 RUE DE GRENELLE PARIS 7E,20,If,Taxus,baccata,,65,8,A,,48.857656,2.321031


# First impressions 

The dataset is composed of **18 features** & **200 137 instances**.  
Each observation contains data about a tree: what the tree is (descriptive information) and where it is (geographic information).  
We have a lot of null values, and some columns seem to be useless (a single unique value, or mergeable with another column), so let's do some exploration.

In [6]:
# Show the distinct values first, then remove the column and re-print the summary.
print(pd.unique(df["type_emplacement"]))
df.drop(columns=["type_emplacement"], inplace=True)
df.info()

['Arbre']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200137 entries, 0 to 200136
Data columns (total 17 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   id                   200137 non-null  int64  
 1   domanialite          200136 non-null  object 
 2   arrondissement       200137 non-null  object 
 3   complement_addresse  30902 non-null   object 
 4   numero               0 non-null       float64
 5   lieu                 200137 non-null  object 
 6   id_emplacement       200137 non-null  object 
 7   libelle_francais     198640 non-null  object 
 8   genre                200121 non-null  object 
 9   espece               198385 non-null  object 
 10  variete              36777 non-null   object 
 11  circonference_cm     200137 non-null  int64  
 12  hauteur_m            200137 non-null  int64  
 13  stade_developpement  132932 non-null  object 
 14  remarquable          137039 non-null  float64
 15  geo_poi

"type_emplacement" was the only feature with only one distinct value.  
Now let's look at some columns we can tune and transform.

In [7]:

# df["geo_points"] = df.apply(get_geo_tuple, axis=1)
# df["mesurement_cm"] = df.apply(get_mesures_tuple, axis=1)
# df.drop(["geo_point_2d_a","geo_point_2d_b", "circonference_cm", "hauteur_m"],axis=1,inplace=True)
# df.head(2)

# Left disabled: Power BI can't operate on tuple columns.

In [8]:
# List the distinct development-stage codes (missing values included).
print(pd.unique(df["stade_developpement"]))

[nan 'A' 'J' 'M' 'JA']


# Valeurs nulles
Maintenant, réfléchissons à la façon de combler les valeurs nulles.

In [9]:
# Assign the filled Series back instead of calling fillna(inplace=True) on
# df["..."]: the chained in-place form acts on an intermediate object and is
# not guaranteed to update df (it does not under pandas Copy-on-Write).
df["stade_developpement"] = df["stade_developpement"].fillna("U")
# Replace the short codes with readable labels, row by row.
df["stade_developpement"] = df.apply(better_name_developpement, axis=1)

In [10]:
# Remove the "numero" column from df in place.
df.drop(columns=["numero"], inplace=True)

In [11]:
# Fill missing address complements with a single space. The result is assigned
# back because fillna(inplace=True) on df["..."] works on an intermediate
# object and is not guaranteed to update df (pandas Copy-on-Write).
df["complement_addresse"] = df["complement_addresse"].fillna(" ")

In [12]:
# Fill the missing values of these four columns with "Inconnu" in one call.
# A per-column dict on the frame replaces four chained
# df["..."].fillna(..., inplace=True) calls, which act on intermediate objects
# and are not guaranteed to update df (pandas Copy-on-Write).
df = df.fillna(
    {
        "libelle_francais": "Inconnu",
        "genre": "Inconnu",
        "espece": "Inconnu",
        "variete": "Inconnu",
    }
)

In [13]:
# Fill missing "remarquable" flags with 0. The result is assigned back because
# fillna(inplace=True) on df["..."] works on an intermediate object and is not
# guaranteed to update df (pandas Copy-on-Write).
df["remarquable"] = df["remarquable"].fillna(0)

In [14]:
# Take the "lieu" of the first row whose "domanialite" is missing, then list
# the "domanialite" values recorded for every row sharing that "lieu".
# A single .loc[mask, column] lookup replaces the chained df[mask][column] form.
lieu_missing = df.loc[df["domanialite"].isna(), "lieu"].iloc[0]
print(df.loc[df["lieu"] == lieu_missing, "domanialite"].unique())
# The only non-null value printed for that place is "Jardin", so use it as the
# fill value. Assign the result back: fillna(inplace=True) on df["..."] is not
# guaranteed to update df (pandas Copy-on-Write).
df["domanialite"] = df["domanialite"].fillna("Jardin")


['Jardin' nan]


In [15]:
# Print the column summary of df (non-null counts and dtypes).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200137 entries, 0 to 200136
Data columns (total 16 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   id                   200137 non-null  int64  
 1   domanialite          200137 non-null  object 
 2   arrondissement       200137 non-null  object 
 3   complement_addresse  200137 non-null  object 
 4   lieu                 200137 non-null  object 
 5   id_emplacement       200137 non-null  object 
 6   libelle_francais     200137 non-null  object 
 7   genre                200137 non-null  object 
 8   espece               200137 non-null  object 
 9   variete              200137 non-null  object 
 10  circonference_cm     200137 non-null  int64  
 11  hauteur_m            200137 non-null  int64  
 12  stade_developpement  200137 non-null  object 
 13  remarquable          200137 non-null  float64
 14  geo_point_2d_a       200137 non-null  float64
 15  geo_point_2d_b   

In [16]:
# Preview the first five rows of df.
df.head(5)

Unnamed: 0,id,domanialite,arrondissement,complement_addresse,lieu,id_emplacement,libelle_francais,genre,espece,variete,circonference_cm,hauteur_m,stade_developpement,remarquable,geo_point_2d_a,geo_point_2d_b
0,99874,Jardin,PARIS 7E ARRDT,,MAIRIE DU 7E 116 RUE DE GRENELLE PARIS 7E,19,Marronnier,Aesculus,hippocastanum,Inconnu,20,5,Inconnu,0.0,48.85762,2.320962
1,99875,Jardin,PARIS 7E ARRDT,,MAIRIE DU 7E 116 RUE DE GRENELLE PARIS 7E,20,If,Taxus,baccata,Inconnu,65,8,Adulte,0.0,48.857656,2.321031
2,99876,Jardin,PARIS 7E ARRDT,,MAIRIE DU 7E 116 RUE DE GRENELLE PARIS 7E,21,If,Taxus,baccata,Inconnu,90,10,Adulte,0.0,48.857705,2.321061
3,99877,Jardin,PARIS 7E ARRDT,,MAIRIE DU 7E 116 RUE DE GRENELLE PARIS 7E,22,Erable,Acer,negundo,Inconnu,60,8,Adulte,0.0,48.857722,2.321006
4,99878,Jardin,PARIS 17E ARRDT,,PARC CLICHY-BATIGNOLLES-MARTIN LUTHER KING,000G0037,Arbre à miel,Tetradium,daniellii,Inconnu,38,0,Inconnu,0.0,48.890435,2.315289


In [17]:
# Write df to CSV; index=False keeps the row index out of the file.
df.to_csv(
    path.join("..", "data/transform", "paris_threes_transformed.csv"),
    index=False,
)