# Carbon Dioxide Prediction Challenge

## 0. Préalable

Importation des modules

In [1]:
import numpy as np
import pandas as pd
# visualization
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
# training
from sklearn.model_selection import train_test_split
# metrics
from sklearn.metrics import accuracy_score
# encoding
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
# scaling
from sklearn.preprocessing import MinMaxScaler, StandardScaler

Chargement du jeu de données

In [None]:
# Folder holding the CSV files; both splits are read from it in one pass.
data_path = 'data/'
train, test = (
    pd.read_csv(data_path + file_name)
    for file_name in ('Train.csv', 'Test.csv')
)

In [None]:
# Preview the first rows only instead of rendering the whole training frame.
train.head()

In [None]:
# Preview the first rows only instead of rendering the whole test frame.
test.head()

## 1. Analyse exploratoire

### 1.1. Analyse de la forme

La target de notre dataset est la variable **bank_account**.

Dimension du dataset

In [None]:
# Report the (rows, columns) shape of each split.
for label, frame in (
    ('Dimension du train set :', train),
    ('Dimension du test set :', test),
):
    print(label, frame.shape)

Dimension du train set : (23524, 13)
Dimension du test set : (10086, 12)


Types des variables

In [None]:
# Summary of the training set: column names, non-null counts and dtypes.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23524 entries, 0 to 23523
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   country                 23524 non-null  object
 1   year                    23524 non-null  int64 
 2   uniqueid                23524 non-null  object
 3   bank_account            23524 non-null  object
 4   location_type           23524 non-null  object
 5   cellphone_access        23524 non-null  object
 6   household_size          23524 non-null  int64 
 7   age_of_respondent       23524 non-null  int64 
 8   gender_of_respondent    23524 non-null  object
 9   relationship_with_head  23524 non-null  object
 10  marital_status          23524 non-null  object
 11  education_level         23524 non-null  object
 12  job_type                23524 non-null  object
dtypes: int64(3), object(10)
memory usage: 2.3+ MB


Analyse des variables qualitatives

In [None]:
# Count, number of distinct values, most frequent value and its frequency
# for every object-typed column.
train.describe(include=['object'])

In [None]:
# Frequency table of each qualitative (object-typed) column.
# The `uniqueid` identifier is skipped: its counts are not a category distribution.
qualitative_columns = (
    train.select_dtypes(include='object')
    .columns
    .drop('uniqueid', errors='ignore')
)
for colonne in qualitative_columns:
    print(colonne)
    print(train[colonne].value_counts(), '\n')

Analyse des variables quantitatives

In [None]:
# Descriptive statistics restricted to the numeric columns.
train.describe(include=['number'])

Analyse des données manquantes

In [None]:
# Number of missing entries per column.
train.isna().sum()

### 1.2. Analyse du fond

Analyse et visualisation de la target (bank_account)

In [None]:
# Bar chart of how many rows fall in each class of the target.
sns.catplot(
    x="bank_account",
    kind="count",
    data=train,
)
plt.show()

Compréhension des différentes variables

In [None]:
# Variable-definition schema, read from the same data folder as the train/test files.
variables_definitions = pd.read_csv(data_path + 'VariableDefinitions.csv')
variables_definitions

## 2. Pré-traitement

Fonction réalisant le pré-traitement de notre dataset

In [None]:
def preprocessing_data(data: pd.DataFrame) -> pd.DataFrame:
    ''' Réalise la phase de pré-traitement d'un dataset à savoir l'encodage 
    des variables discrètes et la normalisation de toutes les variables 
    '''
    # copie du dataset
    data1 = data.copy()
    # suppression de la variable d'identifiants uniques
    data1.drop('uniqueid', axis=1, inplace=True)
    # la liste des colonnes discrètes
    object_columns = list(data1.select_dtypes(include='object').columns)
    # la liste des colonnes continues
    number_columns = list(data1.select_dtypes(include='number').columns)
    # encodage des variables discrètes
    encoder = OrdinalEncoder()
    data1[object_columns] = encoder.fit_transform(data1[object_columns])
    # normalisation
    scaler = MinMaxScaler()
    data1[data1.columns] = scaler.fit_transform(data1)
    return data1

In [None]:
processed_train = preprocessing_data(train)
processed_test = preprocessing_data(test)
# processed_test.head(3)