In [1]:
# packages
import pandas as pd

In [2]:
# Load the 'TOUT' worksheet of the workbook into a DataFrame
df_tout = pd.read_excel(io='Facilis.xlsm', sheet_name='TOUT')

# Report (rows, columns), then preview the first two rows
print(df_tout.shape)
df_tout.head(n=2)

(1092, 20)


Unnamed: 0,Fournisseurs,n° facture,Montant,Date de facture,Date d'échéance,n° sem,Mis en paie.,Remarques,Unnamed: 8,Unnamed: 9,Unnamed: 10,29,Unnamed: 12,1093,factures,n° sem.1,Unnamed: 16,Unnamed: 17,Unnamed: 18,9
0,Capsule Corp,115968610,74.0,2021-03-29,2021-04-12,15,2021-12-26 19:33:52,,,,,,,,,,,,,
1,Cyberdyne,21040827,1008.0,2021-04-09,2021-04-12,15,2021-12-26 19:34:04,,,,,,,,,,,,,


In [3]:
# Keep the labels of the first 7 columns (the useful ones) as a plain list
columns_mask = df_tout.columns[:7].tolist()
columns_mask

['Fournisseurs',
 'n° facture',
 'Montant',
 'Date de facture',
 "Date d'échéance",
 'n° sem',
 'Mis en paie.']

In [4]:
# New table restricted to the 7 useful columns.
# .copy() makes dt_tout an explicitly independent frame: it is modified in a
# later cell, and those writes must neither touch df_tout nor trigger
# SettingWithCopyWarning.
dt_tout = df_tout.loc[:, columns_mask].copy()
dt_tout.shape

(1092, 7)

In [5]:
# Preview the first three rows of the trimmed table
dt_tout.head(n=3)

Unnamed: 0,Fournisseurs,n° facture,Montant,Date de facture,Date d'échéance,n° sem,Mis en paie.
0,Capsule Corp,115968610,74.0,2021-03-29,2021-04-12,15,2021-12-26 19:33:52
1,Cyberdyne,21040827,1008.0,2021-04-09,2021-04-12,15,2021-12-26 19:34:04
2,Geugène Industrie,2103006,2842.94,2021-03-15,2021-04-14,15,2021-12-26 19:34:05


In [6]:
# Number of missing values in each column
dt_tout.isna().sum()

Fournisseurs        0
n° facture          0
Montant             0
Date de facture     0
Date d'échéance     0
n° sem              0
Mis en paie.       68
dtype: int64

The `Mis en paie.` column has 68 missing values, which could be replaced by their corresponding values.
Here, we are interested in adding each supplier's e-mail address, which is found in the "Frs" sheet.

In [7]:
# Build the placeholder used to fill the missing 'Mis en paie.' values.
# To keep things simple, every missing value gets the same datetime, parsed
# from a compact YYYYMMDDhhmmss string.
string = '17000707073000'
timestamp_layout = '%Y%m%d%H%M%S'
first_try = pd.to_datetime(string, format=timestamp_layout)  # pandas Timestamp
first_try

Timestamp('1700-07-07 07:30:00')

In [8]:
# Replace every missing 'Mis en paie.' value with the placeholder timestamp.
# The filled column is assigned back to dt_tout instead of calling
# fillna(..., inplace=True) on the selected Series: the in-place form acts on
# an intermediate object and is not guaranteed to update dt_tout itself
# (pandas 2.x warns about it; with copy-on-write it has no effect).
dt_tout['Mis en paie.'] = dt_tout['Mis en paie.'].fillna(first_try)

# Check the remaining missing values per column
dt_tout.isnull().sum()

Fournisseurs       0
n° facture         0
Montant            0
Date de facture    0
Date d'échéance    0
n° sem             0
Mis en paie.       0
dtype: int64

In [9]:
# Load the 'Frs' worksheet of the same workbook into a DataFrame
df_frs = pd.read_excel(io='Facilis.xlsm', sheet_name='Frs')

# Report (rows, columns), then preview the first three rows
print(df_frs.shape)
df_frs.head(n=3)

(46, 10)


Unnamed: 0,Tous les founisseurs présents feuille 'Tout',Unnamed: 1,Unnamed: 2,Fournisseurs à proposer,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Fournisseurs,E-MAIL
0,Aaltra,,,Aaltra,,,,,Big Jerry Cab Co,compta@bjcc.fr
1,Altra Automotive,,,Altra Automotive,,,,,Crack Frères,contact@crack.fr
2,Beau-Line,,,Beau-Line,,,,,Eurodiscount,contact@eurodiscount.fr


In [10]:
# Give the first column a shorter label; rename() returns a new frame that is
# bound back to df_frs
shorter_labels = {"Tous les founisseurs présents feuille 'Tout'": "Fournisseurs feuille 'Tout'"}
df_frs = df_frs.rename(columns=shorter_labels)

df_frs.head(n=2)

Unnamed: 0,Fournisseurs feuille 'Tout',Unnamed: 1,Unnamed: 2,Fournisseurs à proposer,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Fournisseurs,E-MAIL
0,Aaltra,,,Aaltra,,,,,Big Jerry Cab Co,compta@bjcc.fr
1,Altra Automotive,,,Altra Automotive,,,,,Crack Frères,contact@crack.fr


In [11]:
# All column labels of the 'Frs' table, from which the useful ones are picked
mask = df_frs.columns.tolist()

# mask1: first and fourth labels -- mask2: last two labels
mask1, mask2 = [*mask[:1], *mask[3:4]], mask[-2:]

mask1, mask2

(["Fournisseurs feuille 'Tout'", 'Fournisseurs à proposer'],
 ['Fournisseurs', 'E-MAIL'])

In [12]:
## Split the "Frs" table into 2

# table 1 --> "suggested_dt": the two "suggested providers" columns (mask1)
suggested_dt = df_frs[mask1]

print(suggested_dt.shape)
suggested_dt.head(n=2)

(46, 2)


Unnamed: 0,Fournisseurs feuille 'Tout',Fournisseurs à proposer
0,Aaltra,Aaltra
1,Altra Automotive,Altra Automotive


In [13]:
# table 2 --> "coord_dt": the supplier / e-mail "coordinates" columns (mask2)
coord_dt = df_frs[mask2]

print(coord_dt.shape)
coord_dt.head(n=2)

(46, 2)


Unnamed: 0,Fournisseurs,E-MAIL
0,Big Jerry Cab Co,compta@bjcc.fr
1,Crack Frères,contact@crack.fr


In [14]:
# Locate the row of the supplier named "Word Company"
coord_dt.loc[coord_dt["Fournisseurs"].eq("Word Company")]

Unnamed: 0,Fournisseurs,E-MAIL
10,Word Company,compt@wordco.com


In [15]:
# Final coordinates table: keep index labels 0 through 10
# (label-based .loc slicing includes the end label)
last_kept_label = 10
coord_dt = coord_dt.loc[:last_kept_label]
coord_dt

Unnamed: 0,Fournisseurs,E-MAIL
0,Big Jerry Cab Co,compta@bjcc.fr
1,Crack Frères,contact@crack.fr
2,Eurodiscount,contact@eurodiscount.fr
3,Geugène Industrie,contact@geugene.fr
4,Lacuna Inc,contact@lacunainc.fr
5,Nelson & Murdock,contact@nelsonmurdock.fr
6,Rosen Corporation,contact@rosencorporation.fr
7,Sofroco-Gedec,contact@sofroco-gedec.fr
8,Vladis Entreprise,contact@vladisentreprise.fr
9,Luthor Corp,contact@luther.fr


In [16]:
# Left-join the invoices (sheet 'TOUT') with the supplier e-mails
# (table 'coord_dt', taken from sheet 'Frs') on the supplier name.
# how='left' keeps every invoice row; suppliers with no match get NaN in E-MAIL.
# validate='many_to_one' makes pandas raise MergeError if a supplier appears
# more than once in coord_dt, which would otherwise silently duplicate invoices.
concat = pd.merge(
    dt_tout,
    coord_dt,
    how='left',
    on=['Fournisseurs'],
    validate='many_to_one',
)
concat.shape

(1092, 8)

In [17]:
# First 5 rows of the merged table
concat.head(n=5)

Unnamed: 0,Fournisseurs,n° facture,Montant,Date de facture,Date d'échéance,n° sem,Mis en paie.,E-MAIL
0,Capsule Corp,115968610,74.0,2021-03-29,2021-04-12,15,2021-12-26 19:33:52,
1,Cyberdyne,21040827,1008.0,2021-04-09,2021-04-12,15,2021-12-26 19:34:04,
2,Geugène Industrie,2103006,2842.94,2021-03-15,2021-04-14,15,2021-12-26 19:34:05,contact@geugene.fr
3,Luthor Corp,F1920074,1325.0,2021-01-15,2021-04-15,15,2021-12-26 19:34:15,contact@luther.fr
4,Luthor Corp,F1920110,444.0,2021-01-15,2021-04-15,15,2021-12-26 19:34:16,contact@luther.fr


In [18]:
# Last 5 rows of the merged table
concat.tail(n=5)

Unnamed: 0,Fournisseurs,n° facture,Montant,Date de facture,Date d'échéance,n° sem,Mis en paie.,E-MAIL
1087,Luthor Corp,F1926614,22.0,2021-12-10,2022-03-10,10,1700-07-07 07:30:00,contact@luther.fr
1088,Luthor Corp,F1926615,92603.0,2021-12-10,2022-03-10,10,1700-07-07 07:30:00,contact@luther.fr
1089,Luthor Corp,F1926616,48.0,2021-12-10,2022-03-10,10,1700-07-07 07:30:00,contact@luther.fr
1090,Luthor Corp,F1926617,700.0,2021-12-10,2022-03-10,10,1700-07-07 07:30:00,contact@luther.fr
1091,Luthor Corp,F1926618,75.0,2021-12-10,2022-03-10,10,1700-07-07 07:30:00,contact@luther.fr


In [19]:
# Number of missing values in each column of the merged table
concat.isna().sum()

Fournisseurs         0
n° facture           0
Montant              0
Date de facture      0
Date d'échéance      0
n° sem               0
Mis en paie.         0
E-MAIL             218
dtype: int64

218 `E-MAIL` values are missing, because some suppliers' e-mail addresses are not given.

* **We can extract the table of rows whose values are not null**
* **We can fill the missing values using another file (csv, xls, pdf, ...), another sheet, or user input**

In [20]:
# Rows of the merged table whose E-MAIL is missing
email_is_missing = concat["E-MAIL"].isna()
concat.loc[email_is_missing]

Unnamed: 0,Fournisseurs,n° facture,Montant,Date de facture,Date d'échéance,n° sem,Mis en paie.,E-MAIL
0,Capsule Corp,115968610,74.00,2021-03-29,2021-04-12,15,2021-12-26 19:33:52,
1,Cyberdyne,21040827,1008.00,2021-04-09,2021-04-12,15,2021-12-26 19:34:04,
19,Drake & Sweeney,FAC7757,780.00,2021-03-17,2021-04-16,15,2021-12-26 19:34:30,
20,Slusho,FV-18156651,1343.00,2020-12-08,2021-04-16,15,2021-12-26 19:34:30,
21,Pyramid Transnational,2021/5855,322043.02,2021-03-19,2021-04-19,16,2021-12-26 19:37:45,
...,...,...,...,...,...,...,...,...
944,Beaumont-Liégard,FTY-1258,2504.00,2021-12-18,2022-01-17,3,2021-12-22 18:42:30,
945,Factory Mode,abc128,20.00,2021-12-18,2022-01-17,3,2021-12-22 18:42:30,
946,Rekall,128,139.00,2021-12-18,2022-01-17,3,2021-12-22 18:42:30,
1009,Sienar Technologies,2021-17258,99195.00,2021-12-06,2022-02-04,5,2021-12-22 21:33:42,


In [21]:
# Rows of the merged table whose E-MAIL is present
email_is_present = concat["E-MAIL"].notna()
concat.loc[email_is_present]

Unnamed: 0,Fournisseurs,n° facture,Montant,Date de facture,Date d'échéance,n° sem,Mis en paie.,E-MAIL
2,Geugène Industrie,2103006,2842.94,2021-03-15,2021-04-14,15,2021-12-26 19:34:05,contact@geugene.fr
3,Luthor Corp,F1920074,1325.00,2021-01-15,2021-04-15,15,2021-12-26 19:34:15,contact@luther.fr
4,Luthor Corp,F1920110,444.00,2021-01-15,2021-04-15,15,2021-12-26 19:34:16,contact@luther.fr
5,Luthor Corp,F1920112,757.00,2021-01-15,2021-04-15,15,2021-12-26 19:34:17,contact@luther.fr
6,Luthor Corp,F1920113,925.00,2021-01-15,2021-04-15,15,2021-12-26 19:34:17,contact@luther.fr
...,...,...,...,...,...,...,...,...
1087,Luthor Corp,F1926614,22.00,2021-12-10,2022-03-10,10,1700-07-07 07:30:00,contact@luther.fr
1088,Luthor Corp,F1926615,92603.00,2021-12-10,2022-03-10,10,1700-07-07 07:30:00,contact@luther.fr
1089,Luthor Corp,F1926616,48.00,2021-12-10,2022-03-10,10,1700-07-07 07:30:00,contact@luther.fr
1090,Luthor Corp,F1926617,700.00,2021-12-10,2022-03-10,10,1700-07-07 07:30:00,contact@luther.fr


In [22]:
# Ask the user for a supplier name and its e-mail address (keyboard input).
# Surrounding whitespace is stripped: the supplier name is later compared for
# exact equality with the 'Fournisseurs' column, so a stray leading/trailing
# space would match no row and the update would silently do nothing.
val_fournisseur = input("Veuillez entrer le nom de fournisseur: ").strip()  # e.g. Capsule Corp
val_email = input("Veuillez entrer son email: ").strip()  # e.g. contact@capsule-corp.fr

# Echo the values that will be used
print("\n")
print("Fournisseur : ", val_fournisseur, " , Email : ", val_email )

Veuillez entrer le nom de fournisseur: Capsule Corp
Veuillez entrer son email: contact@capsule-corp.fr


Fournisseur :  Capsule Corp  , Email :  contact@capsule-corp.fr


In [23]:
# Write the e-mail typed by the user into the E-MAIL cell of every row whose
# supplier name equals the one entered
column_name = "E-MAIL"
new_value = val_email
boolean_condition = concat["Fournisseurs"].eq(val_fournisseur)

concat.loc[boolean_condition, column_name] = new_value

In [24]:
# Display the merged table to check the E-MAIL value written from the user input
concat

Unnamed: 0,Fournisseurs,n° facture,Montant,Date de facture,Date d'échéance,n° sem,Mis en paie.,E-MAIL
0,Capsule Corp,115968610,74.00,2021-03-29,2021-04-12,15,2021-12-26 19:33:52,contact@capsule-corp.fr
1,Cyberdyne,21040827,1008.00,2021-04-09,2021-04-12,15,2021-12-26 19:34:04,
2,Geugène Industrie,2103006,2842.94,2021-03-15,2021-04-14,15,2021-12-26 19:34:05,contact@geugene.fr
3,Luthor Corp,F1920074,1325.00,2021-01-15,2021-04-15,15,2021-12-26 19:34:15,contact@luther.fr
4,Luthor Corp,F1920110,444.00,2021-01-15,2021-04-15,15,2021-12-26 19:34:16,contact@luther.fr
...,...,...,...,...,...,...,...,...
1087,Luthor Corp,F1926614,22.00,2021-12-10,2022-03-10,10,1700-07-07 07:30:00,contact@luther.fr
1088,Luthor Corp,F1926615,92603.00,2021-12-10,2022-03-10,10,1700-07-07 07:30:00,contact@luther.fr
1089,Luthor Corp,F1926616,48.00,2021-12-10,2022-03-10,10,1700-07-07 07:30:00,contact@luther.fr
1090,Luthor Corp,F1926617,700.00,2021-12-10,2022-03-10,10,1700-07-07 07:30:00,contact@luther.fr


In [25]:
# Reading data from a PDF file

In [26]:
pip install PyPDF2

You should consider upgrading via the '/Users/kenzaelhoussaini/.pyenv/versions/3.8.6/envs/lewagon/bin/python3.8 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [27]:
import io

import PyPDF2

# Read the whole PDF into memory inside a `with` block so the file handle is
# closed immediately (the original left it open for the kernel's lifetime).
# The reader then works on the in-memory copy, which stays valid when pages
# are accessed in later cells.
# Note: only 3 e-mails were created in this PDF.
with open('Fournisseurs_coord.pdf', 'rb') as pdf_on_disk:
    pdfFileObj = io.BytesIO(pdf_on_disk.read())
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)


In [28]:
# Get the first page (index 0) of the PDF and extract its raw text
first_page_index = 0
pageObj = pdfReader.getPage(first_page_index)
page1 = pageObj.extractText()
page1

'Fournisseurs\n \nE\n-\nMAIL\n \nCyberdyne\n \ncontact@cyberdyne.fr\n \nSlusho\n \ncontact@slusho.fr\n \nRekall\n \ncontact@rekall.fr\n \n \n'

In [29]:
# Transform the text: delete every '\n \n \n' sequence
# (here, the trailing filler at the end of the page)
page1 = ''.join(page1.split('\n \n \n'))
page1

'Fournisseurs\n \nE\n-\nMAIL\n \nCyberdyne\n \ncontact@cyberdyne.fr\n \nSlusho\n \ncontact@slusho.fr\n \nRekall\n \ncontact@rekall.fr'

In [30]:
# Turn each '\n \n' separator into ', ' so the values become comma-separated,
# then delete the remaining line breaks
comma_separated = ', '.join(page1.split('\n \n'))
page1 = comma_separated.replace('\n', '')
page1

'Fournisseurs, E-MAIL, Cyberdyne, contact@cyberdyne.fr, Slusho, contact@slusho.fr, Rekall, contact@rekall.fr'

In [31]:
# Trim leading and trailing whitespace
page1 = page1.lstrip().rstrip()
page1

'Fournisseurs, E-MAIL, Cyberdyne, contact@cyberdyne.fr, Slusho, contact@slusho.fr, Rekall, contact@rekall.fr'

In [32]:
# Split the comma-separated text into a list of values
value_separator = ", "
liste = page1.split(value_separator)
liste

['Fournisseurs',
 'E-MAIL',
 'Cyberdyne',
 'contact@cyberdyne.fr',
 'Slusho',
 'contact@slusho.fr',
 'Rekall',
 'contact@rekall.fr']

In [33]:
# Remove the two column labels at the start of the list, keeping only the values
header_count = 2
L = liste[header_count:]
L

['Cyberdyne',
 'contact@cyberdyne.fr',
 'Slusho',
 'contact@slusho.fr',
 'Rekall',
 'contact@rekall.fr']

In [34]:
# Automate the update: apply every (supplier, e-mail) pair parsed from the PDF
# to the merged table. L is a flat list alternating supplier name and e-mail.
column_name = "E-MAIL"

# Check the pairing before touching `concat`, so that a malformed list cannot
# leave the table half-updated (the index-based loop failed only when it
# reached the unpaired last item).
if len(L) % 2:
    raise IndexError(f"expected supplier/e-mail pairs, got {len(L)} items")

for supplier_name, supplier_email in zip(L[::2], L[1::2]):
    # rows belonging to this supplier
    boolean_condition = concat["Fournisseurs"] == supplier_name
    concat.loc[boolean_condition, column_name] = supplier_email


In [35]:
# Display the merged table to check the E-MAIL values written by the loop
concat

Unnamed: 0,Fournisseurs,n° facture,Montant,Date de facture,Date d'échéance,n° sem,Mis en paie.,E-MAIL
0,Capsule Corp,115968610,74.00,2021-03-29,2021-04-12,15,2021-12-26 19:33:52,contact@capsule-corp.fr
1,Cyberdyne,21040827,1008.00,2021-04-09,2021-04-12,15,2021-12-26 19:34:04,contact@cyberdyne.fr
2,Geugène Industrie,2103006,2842.94,2021-03-15,2021-04-14,15,2021-12-26 19:34:05,contact@geugene.fr
3,Luthor Corp,F1920074,1325.00,2021-01-15,2021-04-15,15,2021-12-26 19:34:15,contact@luther.fr
4,Luthor Corp,F1920110,444.00,2021-01-15,2021-04-15,15,2021-12-26 19:34:16,contact@luther.fr
...,...,...,...,...,...,...,...,...
1087,Luthor Corp,F1926614,22.00,2021-12-10,2022-03-10,10,1700-07-07 07:30:00,contact@luther.fr
1088,Luthor Corp,F1926615,92603.00,2021-12-10,2022-03-10,10,1700-07-07 07:30:00,contact@luther.fr
1089,Luthor Corp,F1926616,48.00,2021-12-10,2022-03-10,10,1700-07-07 07:30:00,contact@luther.fr
1090,Luthor Corp,F1926617,700.00,2021-12-10,2022-03-10,10,1700-07-07 07:30:00,contact@luther.fr


In [36]:
# Export the merged DataFrame to an Excel workbook
output_workbook = 'Data_with_Coordinates.xlsx'
concat.to_excel(output_workbook)
