In [2]:
import camelot
import pandas as pd
import numpy as np
import re
import os

In [3]:
# Helper functions

def clean_country_name(name):
    """Normalize a raw country label into a title-cased name.

    The input is converted with ``str()`` first, so non-string values are
    cleaned as their string form (e.g. a float NaN becomes ``"Nan"``).

    Cleaning steps, applied in this order:
      1. Trim surrounding whitespace and collapse internal runs of whitespace.
      2. Drop the first hyphen and everything after it.
      3. Drop a trailing single-letter token such as ``" R"``.
      4. Remove every character that is not an ASCII letter or whitespace.
      5. Collapse whitespace again and trim the ends.

    Args:
        name: Raw label; any object accepted by ``str()``.

    Returns:
        str: The cleaned label in title case, without leading or trailing
        whitespace.
    """
    name = str(name).strip()
    name = re.sub(r'\s+', ' ', name)
    # NOTE(review): this also truncates hyphenated names, e.g.
    # "GUINEA-BISSAU" becomes "Guinea" -- confirm that is intended.
    name = re.sub(r"\s*-\s*.*", "", name)
    name = re.sub(r"\s[A-Za-z]$", "", name)
    name = re.sub(r'[^A-Za-z\s]', '', name)
    # Removing non-letters can leave a dangling space at either end
    # (e.g. "CHINA 1" -> "CHINA "), so trim after the final collapse.
    name = re.sub(r'\s+', ' ', name).strip()
    return name.title()

# --- Constants ---
# Month names in calendar order; get_months() uses this to order columns.
MONTH_ORDER = [
    "January",
    "February",
    "March",
    "April",
    "May",
    "June",
    "July",
    "August",
    "September",
    "October",
    "November",
    "December",
]

# --- Helpers ---
def get_months(df):
    """Return the month names that are columns of ``df``, in calendar order.

    Args:
        df: Object exposing a ``columns`` attribute that supports ``in``.

    Returns:
        list[str]: Entries of ``MONTH_ORDER`` found in ``df.columns``,
        keeping the order of ``MONTH_ORDER``.
    """
    found = []
    for month in MONTH_ORDER:
        if month in df.columns:
            found.append(month)
    return found


In [4]:
# Parse page 1 of the source PDF with camelot's "stream" flavor.
pdf_file = "/Users/kim/Desktop/repos/Philippines_Visitor/data/rawData/2024-JAN-DEC.pdf"
tables = camelot.read_pdf(pdf_file, pages="1", flavor="stream")

# Stack every table found on the page into one frame, then discard the first
# five rows (positions 0-4; presumably header lines -- confirm against the
# PDF) and renumber the remaining rows from 0.
df1 = (
    pd.concat([tbl.df for tbl in tables], ignore_index=True)
    .iloc[5:]
    .reset_index(drop=True)
)


In [5]:
# Parse page 2 of the source PDF and stack its tables into one frame.
tables = camelot.read_pdf(pdf_file, pages="2", flavor="stream")
df2 = pd.concat([tbl.df for tbl in tables], ignore_index=True)

# Discard the first three rows and renumber the remaining rows from 0.
df2 = df2.iloc[3:].reset_index(drop=True)

# Manual repair of a single row (labelled VANUATU by the original author).
# The twelve hard-coded values are written into the even columns 2, 4, ..., 24
# and each odd neighbour 3, 5, ..., 25 is blanked.
# NOTE(review): presumably camelot misaligned this row -- confirm the values
# against the PDF.
row_idx = 57
values = [20, 25, 29, 23, 29, 29, 21, 19, 27, 23, 31, 19]

for position, val in enumerate(values):
    target_col = 2 + position * 2
    df2.iat[row_idx, target_col] = val
    df2.iat[row_idx, target_col + 1] = ""

# Turn empty strings into NaN so that columns with no data at all can be
# dropped, then renumber the surviving columns 0..n-1.
nan_value = float("NaN")
df2 = df2.replace("", nan_value).dropna(how="all", axis=1)
df2.columns = range(df2.shape[1])


  df2.replace("", nan_value, inplace=True)


In [6]:
# Parse page 3 of the source PDF and stack its tables into one frame,
# discarding the first three rows and renumbering from 0.
tables = camelot.read_pdf(pdf_file, pages="3", flavor="stream")
df3 = (
    pd.concat([tbl.df for tbl in tables], ignore_index=True)
    .iloc[3:]
    .reset_index(drop=True)
)

# Manual repair of the first row: the twelve hard-coded values are written
# into the even columns 2, 4, ..., 24 and each odd neighbour 3, 5, ..., 25 is
# blanked.
# NOTE(review): the original comment labels this row VANUATU, the same label
# used for the page-2 repair -- it looks copy-pasted; confirm which country
# row 0 holds.
row_idx = 0
values = [13, 41, 21, 10, 10, 16, 18, 28, 29, 20, 9, 40]

for position, val in enumerate(values):
    target_col = 2 + position * 2
    df3.iat[row_idx, target_col] = val
    df3.iat[row_idx, target_col + 1] = ""

# Turn empty strings into NaN so that columns with no data at all can be
# dropped, then renumber the surviving columns 0..n-1.
nan_value = float("NaN")
df3 = df3.replace("", nan_value).dropna(how="all", axis=1)
df3.columns = range(df3.shape[1])


  df3.replace("", nan_value, inplace=True)


In [7]:
# Parse page 4 of the source PDF and stack its tables into one frame.
tables = camelot.read_pdf(pdf_file, pages="4", flavor="stream")

# Discard the first three rows, renumber the rows, and remove the listed
# column positions (presumably empty spill-over columns from the stream
# parser -- confirm). Finally renumber the surviving columns 0..n-1.
df4 = (
    pd.concat([tbl.df for tbl in tables], ignore_index=True)
    .iloc[3:]
    .reset_index(drop=True)
    .drop(columns=[4, 6, 8, 13, 18, 21])
)
df4.columns = range(df4.shape[1])


In [8]:
# Parse page 5 of the source PDF and stack its tables into one frame.
tables = camelot.read_pdf(pdf_file, pages="5", flavor="stream")

# Discard the first three rows and renumber, cut the last seven rows
# (presumably footnotes -- confirm), and remove column positions 14 and 17.
df5 = (
    pd.concat([tbl.df for tbl in tables], ignore_index=True)
    .iloc[3:]
    .reset_index(drop=True)
    .iloc[:-7]
    .drop(columns=[14, 17])
)

# Renumber the surviving columns 0..n-1 and display the result.
df5.columns = range(df5.shape[1])
df5

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,210,COCOS (KEELING) ISLANDS,-,-,-,-,-,-,-,-,-,-,-,-,-,0.00%,6,-100.00%
1,210,SOUTH GEORGIA AND THE SOUTH SANDWICH ISLANDS,-,-,-,-,-,-,-,-,-,-,-,-,-,0.00%,2,-100.00%
2,210,WESTERN SAHARA,-,-,-,-,-,-,-,-,-,-,-,-,-,0.00%,1,-100.00%


In [9]:
# Stack the five per-page frames into one table. Each frame is expected to
# carry the same 18 positional columns; the column assignment below raises if
# the combined frame has a different column count.
combined_df = pd.concat([df1, df2, df3, df4, df5], ignore_index=True)

combined_df.columns = [
    "Rank",
    "Country",
    "January",
    "February",
    "March",
    "April",
    "May",
    "June",
    "July",
    "August",
    "September",
    "October",
    "November",
    "December",
    "Total",
    "Percentage",
    "Previous Total",
    "Growth Rate",
]

combined_df = combined_df.drop(columns="Rank")

# Normalize the raw labels, then map a few cleaned names to the preferred
# spelling (exact, whole-value matches only).
combined_df["Country"] = combined_df["Country"].apply(clean_country_name)
combined_df["Country"] = combined_df["Country"].replace({
    "United States Of America": "Usa",
    "Russia": "Russian Federation",
    "Eswatini Fmr Swaziland": "Eswatini",
    "Macedonia The Former Yugoslav Republic Of": "Macedonia",
    "Saint Helena Ascension And Tristan Da Cunha": "Saint Helena",
})

# Treat blank cells and lone "-" placeholders as 0. Both patterns are
# anchored: with a non-string replacement pandas swaps the whole cell wherever
# the regex is found, so an unanchored "-" would also zero negative values
# such as "-100.00%" in the Growth Rate column.
combined_df = combined_df.replace(
    [r"^\s*$", r"^\s*-\s*$"], 0, regex=True
).fillna(0)

# The row index is written as the first CSV column (no index=False here).
combined_df.to_csv("/Users/kim/Desktop/repos/Philippines_Visitor/data/cleanedData/2024cleanData.csv")


In [10]:
# Export the per-country monthly figures.
monthly_folder = "/Users/kim/Desktop/repos/Philippines_Visitor/data/monthly_visitors"

# Month columns present in the combined table, in calendar order.
months = get_months(combined_df)

# Skip the first three rows (the same rows exported separately as "special
# categories"), renumber the rows, and keep only Country plus the months.
monthly_df = combined_df.iloc[3:].reset_index(drop=True)[["Country"] + months]

mon_out_path = os.path.join(monthly_folder, "2024_monthly.csv")
monthly_df.to_csv(mon_out_path, index=False)

print("Saved 2024 Monthly Visitors csv file")

Saved 2024 Monthly Visitors csv file


In [13]:
# Export the special-category rows.
cat_folder = "/Users/kim/Desktop/repos/Philippines_Visitor/data/special_category"

# Take the first three rows of the combined table and leave out its last four
# columns, so only Country and the month columns remain.
cat_df = combined_df.head(3).iloc[:, :-4]

cat_out_path = os.path.join(cat_folder, "2024_category.csv")
cat_df.to_csv(cat_out_path, index=False)

print("Saved 2024 Category csv file")
cat_df

Saved 2024 Category csv file


Unnamed: 0,Country,January,February,March,April,May,June,July,August,September,October,November,December
0,Grand Total,574439,582332,505720,459453,447435,459362,525466,472482,411274,441059,471835,598493
1,Overseas Filipinos,32373,31115,30777,29937,45099,50725,57450,43945,34271,38042,38705,77944
2,Foreign Tourists,542066,551217,474943,429516,402336,408637,468016,428537,377003,403017,433130,520549


In [12]:
# Export the yearly summary columns for every row after the first three.
year_folder = "/Users/kim/Desktop/repos/Philippines_Visitor/data/yearly_total"

year_df = combined_df[
    ["Country", "Total", "Percentage", "Previous Total", "Growth Rate"]
].iloc[3:]


def _scrub(col):
    """Return ``col`` as strings without quotes, commas or percent signs."""
    text = col.astype(str)
    for junk in ('"', ',', '%'):
        text = text.str.replace(junk, '', regex=False)
    return text.str.strip()


# Every column is converted to cleaned text; only "Previous Total" is then
# converted to float, the other columns stay as strings.
year_df = year_df.apply(_scrub)
year_df = year_df.astype({"Previous Total": float})

year_path = os.path.join(year_folder, "2024_year.csv")
year_df.to_csv(year_path, index=False)

print("Saved 2024 Year csv file")

year_df.info()


Saved 2024 Year csv file
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 3 to 246
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Country         244 non-null    object 
 1   Total           244 non-null    object 
 2   Percentage      244 non-null    object 
 3   Previous Total  244 non-null    float64
 4   Growth Rate     244 non-null    object 
dtypes: float64(1), object(4)
memory usage: 9.7+ KB
