In [1]:
import camelot
import pandas as pd
import numpy as np
import re
import os

In [2]:
# Helper codes

def clean_country_name(name):
    name = str(name).strip()                   # remove leading/trailing spaces
    name = re.sub(r'\s+', ' ', name)           # collapse multiple spaces
    name = re.sub(r"\s*-\s*.*", "", name)      # remove hyphen + anything after
    name = re.sub(r"\s[A-Za-z]$", "", name)    # remove trailing single letter like " R"
    name = re.sub(r'[^A-Za-z\s]', '', name)    # remove non-letters
    name = re.sub(r'\s+', ' ', name)           # clean up spaces again if needed
    return name.title()                

# --- Constants ---
MONTH_ORDER = [
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December"
]

# --- Helpers ---
def get_months(df):
    """Return ordered list of existing month columns."""
    return [m for m in MONTH_ORDER if m in df.columns]


In [5]:
# Read tables from PDF
pdf_file = "/Users/kim/Desktop/repos/Philippines_Visitor/data/rawData/2025-JAN-NOV.pdf"

tables = camelot.read_pdf(pdf_file, pages='1', flavor="stream")

# Combine all extracted tables into one DataFrame
df1 = pd.concat([table.df for table in tables], ignore_index=True)

# Clean data 
df1 = df1.iloc[4:].reset_index(drop=True) # Drop top 4 rows (not needed for the table)

df1.columns = range(df1.shape[1])
df1


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,,GRAND TOTAL,627459,541008,487471,450493,443247,449811,508726,457438,368922,432938,467942,5235455,100.00%,5350857,-2.16%
1,,OVERSEAS FILIPINO,48678,35830,39254,47055,46810,49659,53702,40470,30861,36086,37179,465584,8.89%,432439,7.66%
2,,FOREIGN TOURIST,578781,505178,448217,403438,396437,400152,455024,416968,338061,396852,430763,4769871,91.11%,4918418,-3.02%
3,1.0,SOUTH KOREA,161920,132732,100410,73278,84588,87728,118356,118681,78009,90251,88155,1134108,21.66%,1435571,-21.00%
4,2.0,UNITED STATES OF AMERICA,110442,96660,78872,75202,81957,101063,87565,58597,54017,71370,79090,894835,17.09%,839635,6.57%
5,3.0,JAPAN,31340,43907,49899,31449,33164,27669,36598,55104,35446,29214,33004,406794,7.77%,352630,15.36%
6,4.0,AUSTRALIA,33958,22996,23597,31334,23134,20960,23029,17225,21748,25442,25469,268892,5.14%,231471,16.17%
7,5.0,CHINA,30924,22326,19417,19994,21837,19087,25557,23088,20510,25411,20188,248339,4.74%,297604,-16.55%
8,6.0,CANADA,30495,25985,21052,21762,20892,15415,20198,16701,14831,19830,23348,230509,4.40%,197375,16.79%
9,7.0,TAIWAN,24347,15923,14172,15154,14089,15674,19660,18251,12692,18130,14572,182664,3.49%,196034,-6.82%


In [18]:
# Read tables from PDF
tables = camelot.read_pdf(pdf_file, pages='2', flavor="stream")

#Remove the first 4 rows from each page first
cleaned_tables = []
for table in tables:
    df = table.df.iloc[4:].reset_index(drop=True)
    cleaned_tables.append(df)

# Combine all extracted tables into one DataFrame
df2 = pd.concat(cleaned_tables, ignore_index=True)
df2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,52,MYANMAR,316,385,312,334,321,326,303,311,294,397,398,3697,0.07%,8852,-58.24%
1,53,OMAN,351,275,259,310,311,307,490,363,320,334,349,3669,0.07%,3947,-7.04%
2,54,GREECE,464,406,390,255,279,211,161,242,185,279,637,3509,0.07%,3028,15.89%
3,55,MEXICO,329,233,315,349,308,299,279,252,274,350,406,3394,0.06%,3050,11.28%
4,56,CAMBODIA,188,369,198,283,284,286,275,319,427,347,386,3362,0.06%,3753,-10.42%
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103,101,ZIMBABWE,57,40,47,29,39,32,47,32,41,56,38,458,0.01%,445,2.92%
104,102,LEBANON,26,40,68,46,48,26,37,25,45,43,52,456,0.01%,532,-14.29%
105,103,ISLE OF MAN,53,37,41,45,28,26,55,34,27,63,31,440,0.01%,454,-3.08%
106,104,ECUADOR,35,29,30,32,52,25,27,38,43,68,53,432,0.01%,435,-0.69%


In [17]:
# Read tables from PDF
tables = camelot.read_pdf(pdf_file, pages='3', flavor="stream")

#Remove the first 4 rows from each page first
cleaned_tables = []
for table in tables:
    df = table.df.iloc[4:].reset_index(drop=True)
    cleaned_tables.append(df)

# Combine all extracted tables into one DataFrame
df22 = pd.concat(cleaned_tables, ignore_index=True)
df22 = df22.drop(columns=[15])
df22.columns = range(df22.shape[1])

df22

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,106,MAURITIUS,44,66,22,41,42,23,25,22,23,24,41,373,0.01%,403,-7.44%
1,107,ARMENIA,38,53,25,17,38,59,34,14,30,24,36,368,0.01%,344,6.98%
2,108,KYRGYZSTAN,65,36,19,22,31,29,35,26,29,34,36,362,0.01%,411,-11.92%
3,109,TANZANIA,26,21,32,19,23,31,39,23,24,55,53,346,0.01%,349,-0.86%
4,110,PANAMA,35,43,24,34,20,13,17,29,25,37,62,339,0.01%,290,16.90%
5,111,BHUTAN,55,14,35,11,23,37,24,18,41,37,40,335,0.01%,331,1.21%
6,112,ETHIOPIA,24,25,45,20,31,18,34,31,33,48,23,332,0.01%,292,13.70%
7,113,JERSEY,51,37,36,31,32,15,16,22,13,14,28,295,0.01%,298,-1.01%
8,114,TURKMENISTAN,41,27,27,34,18,16,22,21,18,30,23,277,0.01%,197,40.61%
9,115,GUATEMALA,17,13,33,29,23,32,22,19,28,19,33,268,0.01%,290,-7.59%


In [21]:
# Read tables from PDF (PAGE 4)
tables = camelot.read_pdf(pdf_file, pages='4', flavor="stream")

# Combine all extracted tables into one DataFrame
df3 = pd.concat([table.df for table in tables], ignore_index=True)

# Clean data 
df3 = df3.iloc[4:].reset_index(drop=True) # Drop top 4 rows (not needed for the table)
df3 = df3.drop(columns=[15])

df3.columns = range(df3.shape[1])
df3

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,156,ARUBA,2,3,6,13,14,4,24,4,16,5,4,95,0.00%,99,-4.04%
1,157,LIBYA,12,5,1,4,3,4,8,5,36,13,3,94,0.00%,78,20.51%
2,158,ALGERIA,8,14,7,8,6,6,13,11,4,6,9,92,0.00%,110,-16.36%
3,158,KIRIBATI,4,3,2,9,15,10,10,8,6,3,22,92,0.00%,80,15.00%
4,158,SEYCHELLES,2,5,9,13,15,23,10,1,4,5,5,92,0.00%,55,67.27%
5,159,BOLIVIA,13,5,10,17,8,10,3,7,9,5,4,91,0.00%,128,-28.91%
6,159,SENEGAL,9,10,3,12,14,5,10,7,7,7,7,91,0.00%,147,-38.10%
7,160,BOTSWANA,9,2,7,11,-,17,12,19,7,3,3,90,0.00%,69,30.43%
8,161,TONGA,3,2,8,7,13,17,8,1,9,6,13,87,0.00%,132,-34.09%
9,162,NICARAGUA,7,8,8,6,2,5,7,13,7,7,8,78,0.00%,112,-30.36%


In [31]:
# Read tables from PDF (PAGE 5)
tables = camelot.read_pdf(pdf_file, pages='5', flavor="stream")

# Combine all extracted tables into one DataFrame
df4= pd.concat([table.df for table in tables], ignore_index=True)

# Clean data 
df4 = df4.iloc[5:].reset_index(drop=True) # Drop top 4 rows (not needed for the table)
df4 = df4.drop(columns=[2,4,6,8,10,12,14,16,18,20,22,26,27])
df4 = df4.iloc[:27]
df4.columns = range(df4.shape[1])
df4

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,195,GRENADA,3.0,,3.0,3.0,,2.0,1.0,2.0,2.0,1.0,3.0,20,0.00%,20.0,0.00%
1,196,CENTRAL AFRICAN REPUBLIC,1.0,,1.0,,2.0,1.0,1.0,3.0,2.0,7.0,1.0,19,0.00%,18.0,5.56%
2,197,SAINT VINCENT AND THE GRENADINES,5.0,2.0,3.0,,,2.0,3.0,1.0,,,2.0,18,0.00%,23.0,-21.74%
3,197,SIERRA LEONE,2.0,1.0,,,2.0,2.0,1.0,4.0,3.0,1.0,2.0,18,0.00%,20.0,-10.00%
4,198,CABO VERDE,2.0,4.0,,2.0,3.0,2.0,,,,2.0,2.0,17,0.00%,14.0,21.43%
5,199,MAURITANIA,3.0,1.0,1.0,,3.0,1.0,2.0,3.0,,1.0,1.0,16,0.00%,16.0,0.00%
6,200,"BONAIRE, SINT EUSTATIUS AND SABA",2.0,3.0,,2.0,3.0,,3.0,,1.0,,,14,0.00%,13.0,7.69%
7,200,BURUNDI,,2.0,4.0,,1.0,2.0,,2.0,2.0,,1.0,14,0.00%,46.0,-69.57%
8,200,GUADELOUPE,1.0,1.0,1.0,1.0,3.0,,,3.0,1.0,2.0,1.0,14,0.00%,27.0,-48.15%
9,200,LESOTHO,7.0,4.0,2.0,1.0,,,,,,,,14,0.00%,20.0,-30.00%


In [35]:
combined_df = pd.concat([df1, df2, df22, df3, df4], ignore_index=True)

combined_df.columns = [
        "Rank",
        "Country",
        "January",
        "February",
        "March",
        "April",
        "May",
        "June",
        "July",
        "August",
        "September",
        "October",
        "November",
        "Total",
        "Percentage",
        "Previous Total",
        "Growth Rate"
        ]

combined_df = combined_df.drop(columns="Rank")

combined_df['Country'] = combined_df['Country'].apply(clean_country_name)

combined_df['Country'] = combined_df['Country'].replace("United States Of America", "Usa")
combined_df['Country'] = combined_df['Country'].replace("Russia", "Russian Federation")
combined_df['Country'] = combined_df['Country'].replace("Eswatini Fmr Swaziland", "Eswatini")
combined_df['Country'] = combined_df['Country'].replace("Macedonia The Former Yugoslav Republic Of", "Macedonia")
combined_df['Country'] = combined_df['Country'].replace("Saint Helena Ascension And Tristan Da Cunha", "Saint Helena")


combined_df = combined_df.replace([r"^\s*$", "-"], 0, regex=True).fillna(0)

# Take away the percent sign and assign to float
combined_df["Percentage"] = combined_df["Percentage"].str.replace('%', '')
combined_df["Percentage"] = combined_df["Percentage"].astype(float)

combined_df.to_csv("/Users/kim/Desktop/repos/Philippines_Visitor/data/cleanedData/2025cleanData.csv")

In [36]:
# Store Monthly Data
monthly_folder = "/Users/kim/Desktop/repos/Philippines_Visitor/data/monthly_visitors"

monthly_df = combined_df.iloc[3:].reset_index(drop=True) 

months = get_months(combined_df)

monthly_df = monthly_df[['Country'] + months]

mon_out_path = os.path.join(monthly_folder, f"2025_monthly.csv")
monthly_df.to_csv(mon_out_path, index=False)

print(f"Saved 2025 Monthly Visitors csv file")


Saved 2025 Monthly Visitors csv file


In [37]:
# Store Special Categories Data

cat_folder = "/Users/kim/Desktop/repos/Philippines_Visitor/data/special_category"

cat_df = combined_df.head(3)

cat_df = cat_df.iloc[:, :-4]

replacements = {
    "Grand Total": "Total",
    "Overseas Filipino": "Overseas Filipinos",
    "Total Foreign": "Foreign Tourists"
}
cat_df["Country"] = cat_df["Country"].replace(replacements)



cat_out_path = os.path.join(cat_folder, f"2025_category.csv")
cat_df.to_csv(cat_out_path, index=False)

print(f"Saved 2025 Category csv file")
cat_df

Saved 2025 Category csv file


Unnamed: 0,Country,January,February,March,April,May,June,July,August,September,October,November
0,Total,627459,541008,487471,450493,443247,449811,508726,457438,368922,432938,467942
1,Overseas Filipinos,48678,35830,39254,47055,46810,49659,53702,40470,30861,36086,37179
2,Foreign Tourist,578781,505178,448217,403438,396437,400152,455024,416968,338061,396852,430763


In [38]:
year_folder = "/Users/kim/Desktop/repos/Philippines_Visitor/data/yearly_total"

year_df = combined_df[['Country', 'Total', 'Percentage', 'Previous Total', 'Growth Rate']]
year_df = year_df.iloc[3:,:-2]

year_df.fillna(0, inplace=True)

year_df["Total"] = (
    year_df["Total"].astype(str)
    .str.replace(",", "", regex=False)
    .replace([r"^\s*$", "-"], 0, regex=True)
    .astype(float)
)


In [39]:
input_file_previous = "/Users/kim/Desktop/repos/Philippines_Visitor/data/yearly_total/2024_year.csv"

previous_df = pd.read_csv(input_file_previous)

previous_df = previous_df[['Country', 'Total']]

previous_df.rename(columns={'Total': 'Previous Total'}, inplace=True)
previous_df = previous_df.fillna(0)

previous_df["Previous Total"] = (
    previous_df["Previous Total"].astype(str)
    .str.replace(",", "", regex=False)
    .replace([r"^\s*$", "-"], 0, regex=True)
    .astype(float)
)

# Merge the two dataframe together
yearly_total = pd.merge(year_df,previous_df, on = "Country", how='left')
yearly_total = yearly_total.fillna(0)

# Make a column named Denominator where all 0 are replaced as 1 to be use for the Growth Rate Calculation
yearly_total['Denominator'] = yearly_total['Previous Total'].replace(0, 1)

# Calculate Growth Rate
yearly_total['Growth Rate'] = (yearly_total['Total'] - yearly_total['Previous Total']) / yearly_total['Denominator'] * 100

# Drop the Denominator column as it is not needed for the final dataframe
yearly_total = yearly_total.drop(columns='Denominator')

# Round to 2 decimals 
yearly_total['Growth Rate'] = yearly_total['Growth Rate'].round(2)

year_df.fillna(0, inplace=True)

year_path = os.path.join(year_folder, f"2025_year.csv")
yearly_total.to_csv(year_path, index=False)

print(f"Saved 2025 Year csv file")
yearly_total.info()

Saved 2025 Year csv file
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 296 entries, 0 to 295
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Country         296 non-null    object 
 1   Total           296 non-null    float64
 2   Percentage      296 non-null    float64
 3   Previous Total  296 non-null    float64
 4   Growth Rate     296 non-null    float64
dtypes: float64(4), object(1)
memory usage: 11.7+ KB
