# Pre-processed Data: Managing Missing Values

To handle missing values in the dataset, you can use the following Python code:

# Data Cleaning
Drop the columns that are not needed (`EmployeeCount` and `Over18`).

In [1]:
def delete_columns(df):
    """Return a new DataFrame without the ``EmployeeCount`` and ``Over18`` columns.

    Names that are not present in ``df`` are skipped, so calling this on a
    frame that lacks one or both columns does not raise.
    """
    unwanted = ('EmployeeCount', 'Over18')  # columns to remove
    present = [name for name in unwanted if name in df.columns]
    return df.drop(columns=present)


Transform non-numerical data into numerical data:

In [2]:
# Integer code assigned to each label of every categorical column.
_CATEGORY_CODES = {
    'Attrition': {'Yes': 1, 'No': 0},
    'BusinessTravel': {
        'Non-Travel': 0,
        'Travel_Rarely': 1,
        'Travel_Frequently': 2,
    },
    'Department': {
        'Sales': 0,
        'Research & Development': 1,
        'Human Resources': 2,
    },
    'EducationField': {
        'Life Sciences': 0,
        'Medical': 1,
        'Marketing': 2,
        'Technical Degree': 3,
        'Human Resources': 4,
        'Other': 5,
    },
    'Gender': {'Female': 0, 'Male': 1},
    'JobRole': {
        'Sales Executive': 0,
        'Research Scientist': 1,
        'Laboratory Technician': 2,
        'Manufacturing Director': 3,
        'Healthcare Representative': 4,
        'Manager': 5,
        'Sales Representative': 6,
        'Research Director': 7,
        'Human Resources': 8,
    },
    'MaritalStatus': {'Single': 0, 'Married': 1, 'Divorced': 2},
}


def non_numerical_columns(df):
    """Replace the text labels of the categorical columns with integer codes.

    The columns handled are the keys of ``_CATEGORY_CODES``. A column that is
    absent from ``df`` is skipped instead of raising ``KeyError``. Values that
    are already one of the integer codes are kept as they are, so applying the
    function twice gives the same result as applying it once. Any other value
    becomes NaN, because that is what ``Series.map`` yields for keys missing
    from the mapping.

    Args:
        df: DataFrame to encode. It is modified in place.

    Returns:
        The same ``df`` object, with the encoded columns.
    """
    for column, codes in _CATEGORY_CODES.items():
        if column not in df.columns:
            continue
        # Mapping every code to itself keeps already-encoded data unchanged.
        lookup = {**codes, **{code: code for code in codes.values()}}
        df[column] = df[column].map(lookup)
    return df

Combine these two steps:

In [3]:
import pandas as pd

def test_non_numerical_columns():
    """Load ``general_data.csv``, clean and encode it, then print its summary."""
    df = pd.read_csv('general_data.csv')
    df = delete_columns(df)
    df = non_numerical_columns(df)
    # DataFrame.info() prints the summary itself and returns None, so wrapping
    # it in print() only appended a stray "None" line to the output.
    df.info()


test_non_numerical_columns()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4410 entries, 0 to 4409
Data columns (total 22 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Age                      4410 non-null   int64  
 1   Attrition                4410 non-null   int64  
 2   BusinessTravel           4410 non-null   int64  
 3   Department               4410 non-null   int64  
 4   DistanceFromHome         4410 non-null   int64  
 5   Education                4410 non-null   int64  
 6   EducationField           4410 non-null   int64  
 7   EmployeeID               4410 non-null   int64  
 8   Gender                   4410 non-null   int64  
 9   JobLevel                 4410 non-null   int64  
 10  JobRole                  4410 non-null   int64  
 11  MaritalStatus            4410 non-null   int64  
 12  MonthlyIncome            4410 non-null   int64  
 13  NumCompaniesWorked       4391 non-null   float64
 14  PercentSalaryHike       

# Categorical data: filling the gaps in the data with the mode
Check for missing data:

In [4]:
import pandas as pd

file_path = 'general_data.csv'
data = pd.read_csv(file_path)

# Report, for every column, how many values are missing and which share of
# the column that represents (as a percentage rounded to two decimals).
for col in data.columns:
    column = data[col]
    n_missing = column.isna().sum()
    pct_missing = round(n_missing / len(column) * 100, 2)
    print(f"Colonne: {col}, Valeurs manquantes: {n_missing} ,taux valuers manquantes: {pct_missing}%")



Colonne: Age, Valeurs manquantes: 0 ,taux valuers manquantes: 0.0%
Colonne: Attrition, Valeurs manquantes: 0 ,taux valuers manquantes: 0.0%
Colonne: BusinessTravel, Valeurs manquantes: 0 ,taux valuers manquantes: 0.0%
Colonne: Department, Valeurs manquantes: 0 ,taux valuers manquantes: 0.0%
Colonne: DistanceFromHome, Valeurs manquantes: 0 ,taux valuers manquantes: 0.0%
Colonne: Education, Valeurs manquantes: 0 ,taux valuers manquantes: 0.0%
Colonne: EducationField, Valeurs manquantes: 0 ,taux valuers manquantes: 0.0%
Colonne: EmployeeCount, Valeurs manquantes: 0 ,taux valuers manquantes: 0.0%
Colonne: EmployeeID, Valeurs manquantes: 0 ,taux valuers manquantes: 0.0%
Colonne: Gender, Valeurs manquantes: 0 ,taux valuers manquantes: 0.0%
Colonne: JobLevel, Valeurs manquantes: 0 ,taux valuers manquantes: 0.0%
Colonne: JobRole, Valeurs manquantes: 0 ,taux valuers manquantes: 0.0%
Colonne: MaritalStatus, Valeurs manquantes: 0 ,taux valuers manquantes: 0.0%
Colonne: MonthlyIncome, Valeurs manq

Drop the columns that have more than 40% missing values, then fill the remaining missing values with the column mode.

In [5]:
import pandas as pd

def fill_missing_columns(df, threshold=0.4):
    """Drop sparse columns, then fill the remaining gaps with each column's mode.

    Args:
        df: DataFrame to process. It is not modified; ``drop`` returns a new
            frame and all filling happens on that new frame.
        threshold: Fraction of missing values (between 0 and 1) above which a
            column is dropped. Defaults to 0.4.

    Returns:
        A new DataFrame without the dropped columns and without missing
        values in the columns that were kept.
    """
    # Drop the columns whose share of missing values exceeds the threshold.
    missing_percentage = df.isna().sum() / len(df)
    columns_to_drop = missing_percentage[missing_percentage > threshold].index
    print(f"Colonnes supprimées (plus de {threshold*100}% de valeurs manquantes) : {list(columns_to_drop)}")
    df = df.drop(columns=columns_to_drop)

    # Fill the missing values of the remaining columns with the column mode.
    for column in df.columns:
        if df[column].isna().any():
            # mode() may return several values; the first one is used.
            mode_value = df[column].mode()[0]
            # Assign the result back instead of calling fillna(inplace=True)
            # on df[column]: that chained form works on a temporary object
            # and leaves df unchanged when pandas Copy-on-Write is active.
            df[column] = df[column].fillna(mode_value)
    return df


Test

In [None]:
import pandas as pd

def test_non_numerical_columns():
    """Run the whole pipeline on ``general_data.csv`` and print the first rows.

    The file is loaded, cleaned with ``delete_columns``, encoded with
    ``non_numerical_columns`` and completed with ``fill_missing_columns``.
    The pandas display limits on rows and columns are lifted before printing;
    these are global options and stay changed after the call.
    """
    raw = pd.read_csv('general_data.csv')
    df = fill_missing_columns(non_numerical_columns(delete_columns(raw)))

    # Lift the display limits so that no row or column is elided.
    for option in ('display.max_rows', 'display.max_columns'):
        pd.set_option(option, None)

    # Only the first rows are printed (DataFrame.head()).
    print(df.head())


test_non_numerical_columns()