In [None]:
# Merge all Excel files in the current directory into a single CSV file
import pandas as pd
import glob

# Sort the matches so the row order of the merged CSV does not depend on the
# order in which the operating system happens to list the directory.
excel_files = sorted(glob.glob("*.xlsx"))

print("Found files:", excel_files)

dfs = []

for file in excel_files:
    try:
        df = pd.read_excel(file, engine="openpyxl")
        # Remember which workbook each row came from.
        df["source_file"] = file
        dfs.append(df)
        print(f"Loaded: {file} ({df.shape[0]} rows)")
    except Exception as e:
        # Best effort: report the unreadable workbook and continue with the rest.
        print(f"Error loading {file}: {e}")

# pd.concat() fails with an opaque "No objects to concatenate" on an empty
# list; fail early with a message that names the actual problem instead.
if not dfs:
    raise ValueError(
        "No Excel files could be loaded from the current directory; nothing to merge."
    )

merged = pd.concat(dfs, ignore_index=True)

merged.to_csv("pharmacies_merged_raw_1.csv", index=False, encoding="utf-8")

print("Merged file saved as pharmacies_merged_raw_1.csv")
print("Total rows:", merged.shape[0])

Found files: ['Egyfinder pharmacy webscrape 2 .xlsx', 'Egyfinder pharmacy webscrape 3 .xlsx', 'Egyfinder pharmacy webscrape 4 .xlsx', 'Egyfinder pharmacy webscrape 5 .xlsx', 'Egyfinder pharmacy webscrape 6 .xlsx', 'Egyfinder pharmacy webscrape.xlsx']
Loaded: Egyfinder pharmacy webscrape 2 .xlsx (20 rows)
Loaded: Egyfinder pharmacy webscrape 3 .xlsx (20 rows)
Loaded: Egyfinder pharmacy webscrape 4 .xlsx (20 rows)
Loaded: Egyfinder pharmacy webscrape 5 .xlsx (20 rows)
Loaded: Egyfinder pharmacy webscrape 6 .xlsx (20 rows)
Loaded: Egyfinder pharmacy webscrape.xlsx (20 rows)
Merged file saved as pharmacies_merged_raw_1.csv
Total rows: 120


In [13]:
# Now we can read the merged CSV and start cleaning
import re

# Reload the merged export from disk and preview the first rows.
merged_csv_path = "pharmacies_merged_raw_1.csv"
df = pd.read_csv(merged_csv_path)
df.head()

Unnamed: 0,web_scraper_order,web_scraper_start_url,PharmacyName,Address,phone,Website,source_file
0,1772212139-1,https://egyfinder.net/categories/en/pharmacies...,Balbaa Pharmacies Miami Branch,189 khaled ibn el walid st. - near to el monta...,35576737.0,,Egyfinder pharmacy webscrape 2 .xlsx
1,1772212139-2,https://egyfinder.net/categories/en/pharmacies...,Dr. Osama El Tayeby Pharmacies Bakoos Branch,86 el fath st. -,16840.0,https://taypharmacies.com/offers/,Egyfinder pharmacy webscrape 2 .xlsx
2,1772212139-3,https://egyfinder.net/categories/en/pharmacies...,El Beisy Pharmacies Semouha Branch,"7 bahaa el din el ghatwary st., off fawzy moaz...",34040235.0,http://www.elbeisy.com/,Egyfinder pharmacy webscrape 2 .xlsx
3,1772212139-4,https://egyfinder.net/categories/en/pharmacies...,Dr. Osama El Tayeby Pharmacies Agami Branch,66 el bitash st. -,34391346.0,https://taypharmacies.com/offers/,Egyfinder pharmacy webscrape 2 .xlsx
4,1772212139-5,https://egyfinder.net/categories/en/pharmacies...,Ibrahim Wasef Pharmacy,107 abdel salam aref st. - beside faculty of f...,35831830.0,,Egyfinder pharmacy webscrape 2 .xlsx


In [14]:
# Keep only the columns used downstream and give the scraped fields short names.
kept_columns = ["PharmacyName", "Address", "phone", "source_file", "web_scraper_start_url"]
short_names = {"PharmacyName": "name", "Address": "address", "phone": "phone"}

# rename() returns a new frame, so no explicit copy or in-place mutation is needed.
df = df[kept_columns].rename(columns=short_names)

In [15]:
# Clean the "name" column: trim, collapse runs of whitespace, and title-case.
raw_name = df["name"]
df["name"] = (
    raw_name
    .astype(str)
    .str.strip()
    .str.replace(r"\s+", " ", regex=True)
    .str.title()
    # astype(str) turns a missing value into the text "nan" (title-cased to
    # "Nan"); put the missing marker back so it is not treated as a real name.
    .where(raw_name.notna())
)

In [16]:
# Clean the "address" column: trim and collapse runs of whitespace.
raw_address = df["address"]
df["address"] = (
    raw_address
    .astype(str)
    .str.strip()
    .str.replace(r"\s+", " ", regex=True)
    # astype(str) turns a missing value into the text "nan"; put the missing
    # marker back so it is not treated as a real address.
    .where(raw_address.notna())
)

In [22]:
# Clean the "phone" column
def clean_phone(p):
    """Normalize a raw phone value to digits with an optional leading "+".

    Parameters
    ----------
    p : object
        Raw phone value. Anything that is not a ``str`` yields ``None``.

    Returns
    -------
    str or None
        The value with every character other than digits and "+" removed.
        A leading "0" is replaced by "+20", and a leading "20" gets a "+"
        put in front of it. ``None`` is returned when nothing is left
        (for example for the text "nan").
    """
    if not isinstance(p, str):
        return None
    p = p.strip()
    # A phone column parsed as float and then cast to str looks like
    # "35576737.0". Deleting only the "." would glue that trailing zero onto
    # the number ("355767370"), so drop the ".0" artifact as a unit. The
    # full-string match leaves dot-separated numbers such as "03.555.0000"
    # to the generic clean-up below.
    float_repr = re.fullmatch(r"(\d+)\.0", p)
    if float_repr:
        p = float_repr.group(1)
    # Keep digits and "+" only (this also removes dots and inner spaces).
    p = re.sub(r"[^0-9+]", "", p)
    if p.startswith("0"):
        p = "+20" + p[1:]
    elif p.startswith("20"):
        p = "+" + p
    return p if p else None

# Run every phone value, rendered as text, through clean_phone.
phone_as_text = df["phone"].astype(str)
df["phone"] = phone_as_text.map(clean_phone)

In [23]:
# De-duplicate in two passes, keeping the first occurrence each time:
# first on (name, address), then on (address, phone).
df = (
    df
    .drop_duplicates(subset=["name", "address"])
    .drop_duplicates(subset=["address", "phone"])
)

In [25]:


# Load the normalized Alexandria dataset, drop the "district" column if it is
# present, write the result to a new CSV, and preview it.
# NOTE(review): no cell visible in this notebook writes the input file below;
# confirm where it is produced so the notebook still runs from a fresh kernel.
normalized_csv = "pharmacies_alexandria_normalized_fin.csv"
df = pd.read_csv(normalized_csv).drop(columns=["district"], errors="ignore")
df.to_csv("pharmacies_alexandria_no_district.csv", index=False)
df.head()


Unnamed: 0,name,address,phone,source_file,web_scraper_start_url
0,Balbaa Pharmacies Miami Branch,189 khaled ibn el walid st. - near to el monta...,355767370.0,Egyfinder pharmacy webscrape 2 .xlsx,https://egyfinder.net/categories/en/pharmacies...
1,Dr. Osama El Tayeby Pharmacies Bakoos Branch,86 el fath st. -,168400.0,Egyfinder pharmacy webscrape 2 .xlsx,https://egyfinder.net/categories/en/pharmacies...
2,El Beisy Pharmacies Semouha Branch,"7 bahaa el din el ghatwary st., off fawzy moaz...",340402350.0,Egyfinder pharmacy webscrape 2 .xlsx,https://egyfinder.net/categories/en/pharmacies...
3,Dr. Osama El Tayeby Pharmacies Agami Branch,66 el bitash st. -,343913460.0,Egyfinder pharmacy webscrape 2 .xlsx,https://egyfinder.net/categories/en/pharmacies...
4,Ibrahim Wasef Pharmacy,107 abdel salam aref st. - beside faculty of f...,358318300.0,Egyfinder pharmacy webscrape 2 .xlsx,https://egyfinder.net/categories/en/pharmacies...
