## Importing the libraries


In [119]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

## Importing the dataset


In [120]:
df = pd.read_csv('dataset/operations.csv')

In [121]:
df.head(10)

Unnamed: 0,date_operation,libelle,montant,solde_avt_ope,categ
0,2023-03-31,DON XX XX XX XX XX XX XX,-1.44,1515.25,AUTRE
1,2023-04-03,CARTE XX XX RAPT XX,-24.0,1513.81,TRANSPORT
2,2023-04-03,CARTE XX XX RAPT XX,-73.0,1489.81,TRANSPORT
3,2023-04-03,VIREMENT XX XX XX XX XX XX XX XX XX XX XX XX,676.0,1416.81,AUTRE
4,2023-04-03,VIREMENT XX XX XX XX XX XX,4.8,2092.81,AUTRE
5,2023-04-03,CARTE XX XX XX XX,-14.39,2097.61,AUTRE
6,2023-04-05,CARTE XX XX XX XX XX,-15.2,2083.22,AUTRE
7,2023-04-05,CARTE XX XX XX XX,-12.0,2068.02,AUTRE
8,2023-04-05,PRELEVEMENT XX TELEPHONE XX XX,-7.02,2056.02,FACTURE TELEPHONE
9,2023-04-05,CARTE XX XX LES ANCIENS ROBINSON XX,-6.8,2049.0,COURSES


In [122]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 309 entries, 0 to 308
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   date_operation  309 non-null    object 
 1   libelle         309 non-null    object 
 2   montant         307 non-null    float64
 3   solde_avt_ope   309 non-null    float64
 4   categ           308 non-null    object 
dtypes: float64(2), object(3)
memory usage: 12.2+ KB


In [123]:
df.shape

(309, 5)

In [124]:
df.describe()


Unnamed: 0,montant,solde_avt_ope
count,307.0,309.0
mean,-45.911889,3396.117799
std,874.240605,666.18028
min,-15000.0,1416.81
25%,-20.495,3010.76
50%,-9.6,3453.96
75%,-2.71,3783.86
max,1071.6,4709.31


In [125]:
df.isnull().sum()

date_operation    0
libelle           0
montant           2
solde_avt_ope     0
categ             1
dtype: int64

## Cleaning Data

In [126]:
df['montant'].fillna(df['montant'].mean(),inplace=True)  
df['solde_avt_ope'].fillna(df['solde_avt_ope'].mean(),inplace=True) 
df = df[df['categ'].notna()]

In [127]:
df.isnull().sum()

date_operation    0
libelle           0
montant           0
solde_avt_ope     0
categ             0
dtype: int64

In [128]:
print(df['libelle'].value_counts())
print("-"*30)
print(df['montant'].value_counts())
print("-"*30)
print(df['categ'].value_counts())

libelle
CARTE XX XX XX XX XX                            68
CARTE XX XX XX XX                               43
CARTE XX XX XX XX XX XX                         35
CARTE XX XX LES ANCIENS ROBINSON XX             21
CARTE XX XX CHEZ LUC XX                         16
CARTE XX XX L'EPICERIE DEMBAS XX XX             16
CARTE XX XX TOUPTIPRI XX                        12
RETRAIT XX XX XX                                 9
CARTE XX XX RAPT XX                              8
DON XX XX XX XX XX XX XX                         7
FORFAIT COMPTE SUPERBANK XX XX XX XX             7
VIREMENT XX XX XX XX XX XX XX XX XX XX XX XX     7
CARTE XX XX LA LOUVE XX XX                       7
XX XX VIREMENT XX XX XX                          6
PRELEVEMENT XX TELEPHONE XX XX                   6
VIREMENT PERMANENT LOYER                         6
CARTE XX XX XX XX XX XX XX XX                    5
VIREMENT XX XX XX XX XX XX XX                    5
CARTE XX XX VELOC XX XX                          4
CARTE XX XX LA CCNCF XX

In [129]:
df['libelle'] = df['libelle'].str.replace('X' ,'')

In [130]:
df.head()

Unnamed: 0,date_operation,libelle,montant,solde_avt_ope,categ
0,2023-03-31,DON,-1.44,1515.25,AUTRE
1,2023-04-03,CARTE RAPT,-24.0,1513.81,TRANSPORT
2,2023-04-03,CARTE RAPT,-73.0,1489.81,TRANSPORT
3,2023-04-03,VIREMENT,676.0,1416.81,AUTRE
4,2023-04-03,VIREMENT,4.8,2092.81,AUTRE


In [131]:
print(df['libelle'].value_counts())

libelle
CARTE                            68
CARTE                            43
CARTE                            35
CARTE   LES ANCIENS ROBINSON     21
CARTE   CHEZ LUC                 16
CARTE   L'EPICERIE DEMBAS        16
CARTE   TOUPTIPRI                12
RETRAIT                           9
CARTE   RAPT                      8
DON                               7
FORFAIT COMPTE SUPERBANK          7
VIREMENT                          7
CARTE   LA LOUVE                  7
  VIREMENT                        6
PRELEVEMENT  TELEPHONE            6
VIREMENT PERMANENT LOYER          6
CARTE                             5
VIREMENT                          5
CARTE   VELOC                     4
CARTE   LA CCNCF                  4
CARTE   LA CCNCF                  3
VIREMENT                          3
CARTE                             2
CARTE     LA CCNCF                2
VIREMENT                          1
                                  1
  CARTE                           1
CARTE               

In [132]:
df['libelle'] = df['libelle'].str.replace(' ' ,'')

In [133]:
print(df['libelle'].value_counts())

libelle
CARTE                      155
VIREMENT                    23
CARTELESANCIENSROBINSON     21
CARTECHEZLUC                16
CARTEL'EPICERIEDEMBAS       16
CARTETOUPTIPRI              12
CARTELACCNCF                 9
RETRAIT                      9
CARTERAPT                    8
DON                          7
FORFAITCOMPTESUPERBANK       7
CARTELALOUVE                 7
PRELEVEMENTTELEPHONE         6
VIREMENTPERMANENTLOYER       6
CARTEVELOC                   4
                             1
PRELEVEMENT                  1
Name: count, dtype: int64


In [138]:
df.loc[df['libelle'] == "CARTELESANCIENSROBINSON", 'libelle'] = "CARTE LES ANCIENS ROBINSON"
df.loc[df['libelle'] == "CARTECHEZLUC" , 'libelle'] = "CARTE CHEZ LUCN"
df.loc[df['libelle'] == "CARTEL'EPICERIEDEMBAS" , 'libelle'] = "CARTE L'EPICERIE DEMBAS"
df.loc[df['libelle'] == "CARTETOUPTIPRI" , 'libelle'] = "CARTE TOUPTIPRI"
df.loc[df['libelle'] == "CARTELACCNCF" , 'libelle'] = "CARTE LA CCNCF"
df.loc[df['libelle'] == "CARTERAPT" , 'libelle'] = "CARTE RAPT"
df.loc[df['libelle'] == "FORFAITCOMPTESUPERBANK" , 'libelle'] = "FORFAIT COMPTE SUPERBANKT"
df.loc[df['libelle'] == "CARTELALOUVE" , 'libelle'] = "CARTE LA LOUVE"
df.loc[df['libelle'] == "PRELEVEMENTTELEPHONE" , 'libelle'] = "PRELEVEMENT TELEPHONEE"
df.loc[df['libelle'] == "VIREMENTPERMANENTLOYER" , 'libelle'] = "VIREMENT PERMANENT LOYER"
df.loc[df['libelle'] == "CARTEVELOC" , 'libelle'] = "CARTE VELOC"
df.loc[df['libelle'] == "" , 'libelle'] = "CARTE"

In [139]:
df.iloc[  :20 , :]

Unnamed: 0,date_operation,libelle,montant,solde_avt_ope,categ
0,2023-03-31,DON,-1.44,1515.25,AUTRE
1,2023-04-03,CARTE RAPT,-24.0,1513.81,TRANSPORT
2,2023-04-03,CARTE RAPT,-73.0,1489.81,TRANSPORT
3,2023-04-03,VIREMENT,676.0,1416.81,AUTRE
4,2023-04-03,VIREMENT,4.8,2092.81,AUTRE
5,2023-04-03,CARTE,-14.39,2097.61,AUTRE
6,2023-04-05,CARTE,-15.2,2083.22,AUTRE
7,2023-04-05,CARTE,-12.0,2068.02,AUTRE
8,2023-04-05,PRELEVEMENT TELEPHONEE,-7.02,2056.02,FACTURE TELEPHONE
9,2023-04-05,CARTE LES ANCIENS ROBINSON,-6.8,2049.0,COURSES


In [140]:
print(df['libelle'].value_counts())

libelle
CARTE                         156
VIREMENT                       23
CARTE LES ANCIENS ROBINSON     21
CARTE CHEZ LUCN                16
CARTE L'EPICERIE DEMBAS        16
CARTE TOUPTIPRI                12
RETRAIT                         9
CARTE LA CCNCF                  9
CARTE RAPT                      8
DON                             7
FORFAIT COMPTE SUPERBANKT       7
CARTE LA LOUVE                  7
PRELEVEMENT TELEPHONEE          6
VIREMENT PERMANENT LOYER        6
CARTE VELOC                     4
PRELEVEMENT                     1
Name: count, dtype: int64


In [141]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 308 entries, 0 to 308
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   date_operation  308 non-null    object 
 1   libelle         308 non-null    object 
 2   montant         308 non-null    float64
 3   solde_avt_ope   308 non-null    float64
 4   categ           308 non-null    object 
dtypes: float64(2), object(3)
memory usage: 14.4+ KB


In [142]:
df.head()

Unnamed: 0,date_operation,libelle,montant,solde_avt_ope,categ
0,2023-03-31,DON,-1.44,1515.25,AUTRE
1,2023-04-03,CARTE RAPT,-24.0,1513.81,TRANSPORT
2,2023-04-03,CARTE RAPT,-73.0,1489.81,TRANSPORT
3,2023-04-03,VIREMENT,676.0,1416.81,AUTRE
4,2023-04-03,VIREMENT,4.8,2092.81,AUTRE
