In [2]:
import numpy as np
import pandas as pd



from IPython.display import display

# Render every column of a DataFrame instead of truncating wide tables.
pd.set_option('display.max_columns', None)

In [3]:
# Load the raw CSV into the notebook-wide `data` frame.
csv_path = '../data/Covid-19-Algeria.csv'
data = pd.read_csv(csv_path)

# (rows, columns) of the table just loaded.
data.shape

(167, 8)

In [4]:
# List the header names exactly as they were read from the CSV.
data.columns

Index(['date ', 'Wilaya', 'Cas suspects ', 'Cas confirmés (Cumulés)',
       'Nombre de décés ', 'Nombre de patients rétablis ',
       'Nouveau cas au niveau de l'Algérie ',
       'Décés au niveau de l'Algérie (Cumul)'],
      dtype='object')

In [5]:
def _normalise(label):
    """Lower-case and trim a header, then replace spaces, 'é' and apostrophes."""
    cleaned = label.lower().strip()
    for old, new in ((' ', '_'), ('é', 'e'), ("'", '_')):
        cleaned = cleaned.replace(old, new)
    return cleaned


old_columns_names = data.columns.tolist()
new_columns_names = list(map(_normalise, old_columns_names))

# Pair each raw header with its normalised form and rename in place.
replace_columns_names = dict(zip(old_columns_names, new_columns_names))
data.rename(columns=replace_columns_names, inplace=True)

In [6]:
# Summarise dtypes and non-null counts per column after the renaming step.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 167 entries, 0 to 166
Data columns (total 8 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   date                                  167 non-null    object 
 1   wilaya                                167 non-null    object 
 2   cas_suspects                          3 non-null      float64
 3   cas_confirmes_(cumules)               166 non-null    float64
 4   nombre_de_deces                       163 non-null    float64
 5   nombre_de_patients_retablis           64 non-null     float64
 6   nouveau_cas_au_niveau_de_l_algerie    162 non-null    float64
 7   deces_au_niveau_de_l_algerie_(cumul)  164 non-null    float64
dtypes: float64(6), object(2)
memory usage: 10.6+ KB


In [7]:
# Distinct raw date values (not displayed: this is not the cell's last expression).
data['date'].unique()
# Parse the day/month/year strings into datetime values.
data['date'] = pd.to_datetime(data['date'], format='%d/%m/%Y')

In [8]:
# Re-check the dtypes to confirm the `date` column conversion.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 167 entries, 0 to 166
Data columns (total 8 columns):
 #   Column                                Non-Null Count  Dtype         
---  ------                                --------------  -----         
 0   date                                  167 non-null    datetime64[ns]
 1   wilaya                                167 non-null    object        
 2   cas_suspects                          3 non-null      float64       
 3   cas_confirmes_(cumules)               166 non-null    float64       
 4   nombre_de_deces                       163 non-null    float64       
 5   nombre_de_patients_retablis           64 non-null     float64       
 6   nouveau_cas_au_niveau_de_l_algerie    162 non-null    float64       
 7   deces_au_niveau_de_l_algerie_(cumul)  164 non-null    float64       
dtypes: datetime64[ns](1), float64(6), object(1)
memory usage: 10.6+ KB


In [9]:
# Normalise wilaya names: trim whitespace, apply title case, and correct the
# known misspellings. The correction is restricted to the `wilaya` column so
# that no other column can be affected by the replacement, and the vectorised
# `.str` accessor tolerates missing values.
WILAYA_SPELLING_FIXES = {'Tbessa': 'Tebessa', 'Ouergla': 'Ouargla'}
data['wilaya'] = (
    data['wilaya']
    .str.strip()
    .str.title()
    .replace(WILAYA_SPELLING_FIXES)
)

# NOTE(review): the displayed result still contains spellings such as
# 'Canstantine' and 'Tizi Ouazou' — confirm whether they belong in the mapping.
# Sorted distinct names, shown to check the result visually.
data.wilaya.sort_values().unique()

array(['Adrar', 'Ain Defla', 'Ain Timouchent', 'Alger', 'Annaba', 'Batna',
       'Bechar', 'Bejaia', 'Biskra', 'Blida', 'Bordj Bou Arreridj',
       'Bouira', 'Boumerdes', 'Canstantine', 'Chlef', 'Djelfa',
       'El Bayad', 'El Oued', 'El Tarf', 'Gherdaia', 'Guelma', 'Illizi',
       'Jijel', 'Khenchla', 'Leghouat', "M'Sila", 'Mascara', 'Medea',
       'Mila', 'Mostaganem', 'Naama', 'Oran', 'Ouargla', 'Oum El Bouaghi',
       'Relizane', 'Sidi Bel Abbas', 'Skikda', 'Souk Ahras', 'Sétif',
       'Tebessa', 'Tiaret', 'Tipaza', 'Tissemssilt', 'Tizi Ouazou',
       'Tlemcen'], dtype=object)

In [10]:
# Shape of the array of distinct wilaya names, i.e. how many there are.
data['wilaya'].sort_values().unique().shape

(45,)

### Missing Values:

`data.isnull().sum()` gives the count of missing values in each column.

In [11]:
# Number of missing entries per column.
data.isna().sum()

date                                      0
wilaya                                    0
cas_suspects                            164
cas_confirmes_(cumules)                   1
nombre_de_deces                           4
nombre_de_patients_retablis             103
nouveau_cas_au_niveau_de_l_algerie        5
deces_au_niveau_de_l_algerie_(cumul)      3
dtype: int64

* As we can see, most of the values in the `cas_suspects` column are **missing**: only 3 values are non-missing. We can drop this column because it is not informative.

In [15]:
# Drop the `cas_suspects` column, which holds only 3 non-null values.
# errors='ignore' keeps this cell re-runnable once the column is already gone
# (without it a second run raises KeyError).
data.drop(columns='cas_suspects', inplace=True, errors='ignore')

* The confirmed-cases column `cas_confirmes_(cumules)` contains only one missing value; we can substitute it with a suitable value after inspecting the surrounding records below.

In [36]:
# Sum the numeric columns per date, ordered chronologically.
# numeric_only=True keeps the text column `wilaya` out of the aggregation on
# every pandas version (pandas >= 2.0 no longer drops non-numeric columns
# silently, so the default would try to aggregate the names as well).
data.groupby('date', as_index=False).sum(numeric_only=True).sort_values(by='date')

Unnamed: 0,date,cas_confirmes_(cumules),nombre_de_deces,nombre_de_patients_retablis,nouveau_cas_au_niveau_de_l_algerie,deces_au_niveau_de_l_algerie_(cumul)
0,2002-03-15,2.0,0.0,0.0,11.0,4.0
1,2020-02-25,1.0,0.0,0.0,1.0,0.0
2,2020-02-28,1.0,0.0,0.0,0.0,0.0
3,2020-02-29,1.0,0.0,0.0,0.0,0.0
4,2020-03-01,3.0,0.0,0.0,2.0,0.0
5,2020-03-02,5.0,0.0,0.0,2.0,0.0
6,2020-03-03,228.0,12.0,10.0,185.0,105.0
7,2020-03-06,17.0,0.0,0.0,0.0,0.0
8,2020-03-07,19.0,0.0,0.0,2.0,0.0
9,2020-03-08,20.0,0.0,0.0,1.0,0.0


In [27]:
# Overwrite the value -1 in the confirmed-cases column with 1.
# NOTE(review): no visible cell writes -1 into this column — presumably a
# placeholder introduced elsewhere; confirm before relying on this step.
confirmed_col = 'cas_confirmes_(cumules)'
data.loc[data[confirmed_col] == -1, confirmed_col] = 1

In [28]:
# Show every record whose wilaya is Blida.
data[data['wilaya'] == 'Blida']

Unnamed: 0,date,wilaya,cas_confirmes_(cumules),nombre_de_deces,nombre_de_patients_retablis,nouveau_cas_au_niveau_de_l_algerie,deces_au_niveau_de_l_algerie_(cumul)
0,2020-02-25,Blida,1.0,0.0,0.0,1.0,0.0
1,2020-02-28,Blida,1.0,0.0,0.0,0.0,0.0
2,2020-02-29,Blida,1.0,0.0,0.0,0.0,0.0
3,2020-03-01,Blida,3.0,0.0,0.0,2.0,0.0
4,2020-03-02,Blida,5.0,0.0,0.0,2.0,0.0
5,2020-03-03,Blida,5.0,0.0,0.0,0.0,0.0
6,2020-05-05,Blida,17.0,0.0,0.0,12.0,0.0
7,2020-03-06,Blida,17.0,0.0,0.0,0.0,0.0
8,2020-03-07,Blida,19.0,0.0,0.0,2.0,0.0
9,2020-03-08,Blida,20.0,0.0,0.0,1.0,0.0
