In [1]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the mushroom dataset from its CSV file (path relative to the notebook).
chemin_fichier_csv = "./champignons.csv"
champignons = pd.read_csv(filepath_or_buffer=chemin_fichier_csv)

# Preview the first rows of the freshly loaded frame.
print(champignons.head(n=5))

  TYPE         COLOR       SHAPE     SURFACE
0    I          Pale    Polypore      Smooth
1    E         White      Convex      Smooth
2    E  White-Yellow      Convex      Smooth
3    E         Brown  Bellshaped  FlatScales
4    E           NaN      Convex     Fibrous


In [3]:
# Recode the values of the TYPE column as numbers; missing values become -1.
codes_type = {"E": 0, "I": 1, "P": 2}

# Distribution before recoding (NaN included).
print(champignons["TYPE"].value_counts(dropna=False))

champignons["TYPE"] = champignons["TYPE"].replace(codes_type).fillna(-1)

# Preview and distribution after recoding.
print(champignons.head())
print(champignons["TYPE"].value_counts(dropna=False))

TYPE
E      515
P      405
I      228
NaN     36
Name: count, dtype: int64
   TYPE         COLOR       SHAPE     SURFACE
0   1.0          Pale    Polypore      Smooth
1   0.0         White      Convex      Smooth
2   0.0  White-Yellow      Convex      Smooth
3   0.0         Brown  Bellshaped  FlatScales
4   0.0           NaN      Convex     Fibrous
TYPE
 0.0    515
 2.0    405
 1.0    228
-1.0     36
Name: count, dtype: int64


In [4]:
# Turn the categorical columns (SHAPE / SURFACE) into binary indicator columns.
def ajout_indicateur_colonne(df, nom_colonne):
    """Add one 0/1 indicator column per distinct token of a "-"-separated column.

    Each cell of ``df[nom_colonne]`` is split on "-". For every distinct token
    found in the column, a new column named ``f"{nom_colonne}_{token}"`` is
    added to ``df`` in place, holding 1 where the row's token list contains
    that exact token and 0 otherwise (missing cells yield 0 everywhere).

    Args:
        df: DataFrame to extend; it is modified in place.
        nom_colonne: Name of the string column to expand.

    Returns:
        None.
    """
    # Split once and reuse: each cell becomes a list of tokens (or stays NaN).
    jetons_par_ligne = df[nom_colonne].str.split("-")
    valeur_unique = pd.unique(jetons_par_ligne.explode().dropna())

    for value in valeur_unique:
        # Exact token membership rather than str.contains(): a substring/regex
        # match would flag "FlatScales" for the token "Flat" and would treat
        # characters such as "." or "(" in a token as regex syntax.
        est_present = jetons_par_ligne.apply(
            lambda jetons, cible=value: isinstance(jetons, list) and cible in jetons
        )
        df[f"{nom_colonne}_{value}"] = est_present.astype(int)

# Expand both categorical columns into indicators, then discard the originals.
for colonne in ("SHAPE", "SURFACE"):
    ajout_indicateur_colonne(champignons, colonne)

champignons = champignons.drop(columns=["SHAPE", "SURFACE"])

print(champignons.head())

   TYPE         COLOR  SHAPE_Polypore  SHAPE_Convex  SHAPE_Bellshaped  \
0   1.0          Pale               1             0                 0   
1   0.0         White               0             1                 0   
2   0.0  White-Yellow               0             1                 0   
3   0.0         Brown               0             0                 1   
4   0.0           NaN               0             1                 0   

   SHAPE_Depressed  SHAPE_CupFungi  SHAPE_CoralFungi  SHAPE_Conical  \
0                0               0                 0              0   
1                0               0                 0              0   
2                0               0                 0              0   
3                0               0                 0              0   
4                0               0                 0              0   

   SHAPE_SiddarthMachado  ...  SURFACE_Velvety  SURFACE_AlanRockefeller  \
0                      0  ...                0             

In [5]:
# Individual colors present in the dataset: combined values such as
# "White-Yellow" are split on "-" and missing values are ignored.
composantes_couleur = champignons["COLOR"].str.split("-").explode().dropna()
couleurs_individuelles = composantes_couleur.unique()

print(couleurs_individuelles)
print(f"Nombre total de couleurs uniques : {len(couleurs_individuelles)}")

['Pale' 'White' 'Yellow' 'Brown' 'Pink' 'Purple' 'Tan' 'Orange' 'Gray'
 'Red' 'Black' 'Blue' 'Green' 'Violet' 'Lilac']
Nombre total de couleurs uniques : 15


In [6]:
# Reference table giving an RGB triplet for every individual color name.
_COLONNES_RGB = ("Color", "R", "G", "B")
_TABLE_RGB = (
    ("Pale", 240, 221, 215),
    ("White", 255, 255, 255),
    ("Yellow", 255, 255, 0),
    ("Brown", 165, 42, 42),
    ("Pink", 255, 192, 203),
    ("Purple", 128, 0, 128),
    ("Tan", 210, 180, 140),
    ("Orange", 255, 165, 0),
    ("Gray", 128, 128, 128),
    ("Red", 255, 0, 0),
    ("Black", 0, 0, 0),  # Dark
    ("Green", 0, 128, 0),
    ("Blue", 0, 0, 255),
    ("Violet", 238, 130, 238),
    ("Lilac", 200, 162, 200),
)

# One record (dict) per color, keyed by the column names above.
colors_list = [dict(zip(_COLONNES_RGB, ligne)) for ligne in _TABLE_RGB]

colors_df = pd.DataFrame(colors_list)

print(colors_df)

     Color    R    G    B
0     Pale  240  221  215
1    White  255  255  255
2   Yellow  255  255    0
3    Brown  165   42   42
4     Pink  255  192  203
5   Purple  128    0  128
6      Tan  210  180  140
7   Orange  255  165    0
8     Gray  128  128  128
9      Red  255    0    0
10   Black    0    0    0
11   Green    0  128    0
12    Blue    0    0  255
13  Violet  238  130  238
14   Lilac  200  162  200


In [7]:
# Table of every distinct combined color, i.e. COLOR values that contain
# at least one "-" (e.g. "Red-Green").
a_plusieurs_couleurs = champignons["COLOR"].str.count("-").gt(0)
couleurs_combinees = champignons[a_plusieurs_couleurs]
combinaisons_uniques = couleurs_combinees["COLOR"].unique()
colors = pd.DataFrame({"Combined_Color": combinaisons_uniques})

print(colors)

   Combined_Color
0    White-Yellow
1     White-Brown
2      Pink-Brown
3    Purple-Brown
4      White-Pale
..            ...
90     Green-Pink
91     Gray-White
92       Red-Pale
93    Gray-Yellow
94     Purple-Red

[95 rows x 1 columns]


In [8]:
# Resolve each combined color to RGB: look up its first two components in
# colors_df and average them channel by channel.
# NOTE(review): only components 0 and 1 are read, so a value with three or
# more "-"-separated colors ignores the extra ones; the merges use the default
# inner join, so a combination with a component absent from colors_df is dropped.
composantes = colors["Combined_Color"].str.split("-")
colors = colors.assign(Color1=composantes.str[0], Color2=composantes.str[1])

# First join brings R/G/B of the first component, second join adds R_2/G_2/B_2.
merge_1 = colors.merge(colors_df, left_on="Color1", right_on="Color", suffixes=("", "_1"))
merge_2 = merge_1.merge(colors_df, left_on="Color2", right_on="Color", suffixes=("", "_2"))

for canal in ("R", "G", "B"):
    merge_2[f"{canal}_mean"] = merge_2[[canal, f"{canal}_2"]].mean(axis=1)

colonnes_finales = ["Combined_Color", "R_mean", "G_mean", "B_mean"]
colors = merge_2[colonnes_finales]

print(colors)

   Combined_Color  R_mean  G_mean  B_mean
0    White-Yellow   255.0   255.0   127.5
1     Pink-Yellow   255.0   223.5   101.5
2     Pale-Yellow   247.5   238.0   107.5
3    Brown-Yellow   210.0   148.5    21.0
4     Gray-Yellow   191.5   191.5    64.0
..            ...     ...     ...     ...
90    Brown-Green    82.5    85.0    21.0
91     Gray-Green    64.0   128.0    64.0
92   Orange-Green   127.5   146.5     0.0
93   Yellow-Green   127.5   191.5     0.0
94     Blue-Green     0.0    64.0   127.5

[95 rows x 4 columns]


In [9]:
# Replace the textual COLOR column with numeric R / G / B columns.
#
# The lookup must cover BOTH kinds of COLOR values:
#   * single colors ("Pale", "White", ...) -> taken from colors_df
#   * combined colors ("White-Yellow", ...) -> averaged values from colors
# Merging against the combined table alone would leave every single-color
# mushroom unmatched and give it the -255 "missing" sentinel.
palette_simple = colors_df.rename(columns={"Color": "COLOR"})[["COLOR", "R", "G", "B"]]
palette_combinee = colors.rename(
    columns={"Combined_Color": "COLOR", "R_mean": "R", "G_mean": "G", "B_mean": "B"}
)[["COLOR", "R", "G", "B"]]

# One row per COLOR value so the left merge can never duplicate mushrooms.
palette = pd.concat([palette_simple, palette_combinee], ignore_index=True).drop_duplicates(subset="COLOR")

champignons = champignons.merge(palette, how="left", on="COLOR", validate="many_to_one")
champignons = champignons.drop(columns=["COLOR"])

# Rows whose COLOR was missing (or unknown to the palette) get the -255 sentinel.
champignons[["R", "G", "B"]] = champignons[["R", "G", "B"]].fillna(-255)

print(champignons)

      TYPE  SHAPE_Polypore  SHAPE_Convex  SHAPE_Bellshaped  SHAPE_Depressed  \
0      1.0               1             0                 0                0   
1      0.0               0             1                 0                0   
2      0.0               0             1                 0                0   
3      0.0               0             0                 1                0   
4      0.0               0             1                 0                0   
...    ...             ...           ...               ...              ...   
1179  -1.0               0             0                 0                0   
1180  -1.0               0             0                 0                0   
1181  -1.0               0             0                 0                0   
1182  -1.0               0             0                 0                0   
1183  -1.0               0             0                 0                0   

      SHAPE_CupFungi  SHAPE_CoralFungi  SHAPE_Conic