In [1]:
import camelot
import pandas as pd
import numpy as np
import re
import os

In [2]:
# Helper functions

def clean_country_name(name):
    """Normalise a raw country label into a title-cased, letters-only name.

    The input is converted with ``str()`` and stripped, then a fixed sequence
    of regex substitutions is applied in order:

    1. collapse every whitespace run into a single space;
    2. cut the text from the first hyphen (and any spaces before it) onwards;
    3. drop a trailing single ASCII letter preceded by whitespace (e.g. " r");
    4. delete every character that is not an ASCII letter or whitespace
       (accented letters, digits and punctuation are removed, not transliterated);
    5. collapse whitespace runs once more.

    The result is returned via ``str.title()``. No final strip is performed,
    so a trailing space can survive when step 4 removes a trailing token.
    """
    substitutions = (
        (r'\s+', ' '),           # collapse multiple spaces
        (r"\s*-\s*.*", ""),      # remove hyphen + anything after
        (r"\s[A-Za-z]$", ""),    # remove trailing single letter like " R"
        (r'[^A-Za-z\s]', ''),    # remove non-letters
        (r'\s+', ' '),           # tidy spaces left behind by the removals
    )

    cleaned = str(name).strip()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.title()

# --- Constants ---
# Calendar order of the month column names used by the helpers below.
MONTH_ORDER = [
    "January", "February", "March",
    "April", "May", "June",
    "July", "August", "September",
    "October", "November", "December",
]


# --- Helpers ---
def get_months(df):
    """List the month names from MONTH_ORDER that are columns of ``df``.

    The returned list follows calendar order (the order of MONTH_ORDER),
    regardless of how the columns are ordered in ``df``.
    """
    available = df.columns
    ordered = []
    for month in MONTH_ORDER:
        if month in available:
            ordered.append(month)
    return ordered


In [3]:
# Source PDF; the following cells read further pages from the same file.
pdf_file = "/Users/kim/Desktop/repos/Philippines_Visitor/data/rawData/2023-JAN-DEC.pdf"

# Page 1, extracted with camelot's "stream" flavor.
tables = camelot.read_pdf(pdf_file, pages='1', flavor='stream')

# Skip the first 5 extracted rows of every table
# (presumably title/header lines -- TODO confirm against the PDF).
cleaned_tables = [
    page_table.df.iloc[5:].reset_index(drop=True)
    for page_table in tables
]

df1 = pd.concat(cleaned_tables, ignore_index=True)

# In the first three rows, overwrite column 1 with column 2 before column 2
# is dropped (presumably the label landed one column to the right in those
# rows -- verify against the raw extract).
df1.iloc[:3, 1] = df1.iloc[:3, 2]
df1 = df1.drop(columns=[2])

# Renumber the remaining columns 0..n-1.
df1.columns = range(len(df1.columns))


In [4]:
# Pages 2-4, extracted with camelot's "stream" flavor.
tables = camelot.read_pdf(pdf_file, pages='2-4', flavor="stream")

# Skip the first 8 extracted rows of every table
# (presumably repeated page headers -- TODO confirm against the PDF).
cleaned_tables = [
    page_table.df.iloc[8:].reset_index(drop=True)
    for page_table in tables
]

# Stack the pages and discard the last column.
df2_3_4 = pd.concat(cleaned_tables, ignore_index=True).iloc[:, :-1]


In [5]:
# Page 5, extracted with camelot's "stream" flavor.
tables = camelot.read_pdf(pdf_file, pages='5', flavor="stream")

# Skip the first 4 extracted rows of every table
# (presumably header lines -- TODO confirm against the PDF).
cleaned_tables = [
    page_table.df.iloc[4:].reset_index(drop=True)
    for page_table in tables
]

# Stack the tables and remove the listed columns by their extracted positions
# (presumably spurious columns produced by the extraction -- verify if the
# PDF layout changes).
df5 = pd.concat(cleaned_tables, ignore_index=True).drop(
    columns=[2, 4, 7, 9, 11, 13, 15, 17, 20]
)

# Renumber the remaining columns 0..n-1.
df5.columns = range(len(df5.columns))


In [6]:
# Page 6, extracted with camelot's "stream" flavor.
tables = camelot.read_pdf(pdf_file, pages='6', flavor="stream")

# Skip the first 4 extracted rows of every table
# (presumably header lines -- TODO confirm against the PDF).
cleaned_tables = [
    page_table.df.iloc[4:].reset_index(drop=True)
    for page_table in tables
]

# Stack the tables, cut the last 8 rows (presumably footnotes/totals below the
# table -- TODO confirm) and remove the listed columns by extracted position.
df6 = (
    pd.concat(cleaned_tables, ignore_index=True)
    .iloc[:-8]
    .drop(columns=[3, 4, 6, 9, 13, 16, 19, 17, 20])
)

# Manual patch: write 1 into row 9 of the column still labelled 15 at this
# point, i.e. before renumbering (presumably a value the extraction missed --
# verify against the PDF).
df6.at[9, 15] = 1

# Renumber the remaining columns 0..n-1.
df6.columns = range(len(df6.columns))
df6


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,196,NORTH KOREA r,-,,,-,2.0,-,-,,-,-,,1,3,0.00%
1,197,MONTSERRAT,-,,,-,,-,-,2.0,-,-,,-,2,0.00%
2,197,SOUTH GEORGIA AND THE SOUTH SANDWICH ISLANDS,,,,,,,1,,-,-,1.0,-,2,0.00%
3,197,VATICAN CITY STATE,-,1.0,,-,1.0,-,-,,-,-,,-,2,0.00%
4,198,MACEDONIA,-,,,-,,-,-,,-,-,1.0,-,1,0.00%
5,198,MAYOTTE,-,,,-,,-,-,,-,-,,1,1,0.00%
6,198,SAINT BARTHÃ©LEMY,-,,,-,,-,-,,-,-,1.0,-,1,0.00%
7,198,SAINT PIERRE AND MIQUELON,,,,,,,,,1,,,-,1,0.00%
8,198,WALLIS AND FUTUNA,,,1.0,-,,-,-,,-,-,,-,1,0.00%
9,198,WESTERN SAHARA,-,,,-,,-,-,,1,-,,-,1,0.00%


In [7]:
# Stack the per-page extracts (page 1, pages 2-4, page 5, page 6) into one frame.
combined_df = pd.concat([df1, df2_3_4, df5, df6], ignore_index=True)

# Positional column names; this assignment raises if the stacked frame does not
# have exactly 16 columns. The month names come from MONTH_ORDER so they stay
# in sync with get_months().
combined_df.columns = ["Rank", "Country", *MONTH_ORDER, "Total", "Percentage"]
combined_df = combined_df.drop(columns="Rank")

combined_df['Country'] = combined_df['Country'].apply(clean_country_name)

# Exact-match aliases applied after cleaning. Several keys are the letters-only
# forms clean_country_name produces once accented characters are stripped
# (e.g. "Runion", "Curaao", "Land Islands").
COUNTRY_NAME_FIXES = {
    "United States Of America": "Usa",
    "Russia": "Russian Federation",
    "Ivory Coast": "Cote Divoire Ivory Coast",
    "Runion": "Reunion",
    "Eswatini Fmr Swaziland": "Eswatini",
    "Macedonia The Former Yugoslav Republic Of": "Macedonia",
    "Land Islands": "Aland Islands",
    "Curaao": "Curacao",
    "Saint Helena Ascension And Tristan Da Cunha": "Saint Helena",
    "Saint Barthlemy": "Saint Barthelemy",
}
combined_df['Country'] = combined_df['Country'].replace(COUNTRY_NAME_FIXES)

# Turn placeholder cells into 0. With regex=True and a non-string replacement,
# pandas replaces every cell in which the pattern is *found*, so a bare "-"
# pattern would zero any value that merely contains a hyphen. The pattern is
# therefore anchored to cells made up only of whitespace and/or hyphens
# (this includes the empty string). Missing values are then filled with 0.
combined_df = combined_df.replace(r"^[\s-]*$", 0, regex=True).fillna(0)

# NOTE(review): unlike the other exports in this notebook, this CSV is written
# with the DataFrame index as its first column -- confirm that is intended.
combined_df.to_csv("/Users/kim/Desktop/repos/Philippines_Visitor/data/cleanedData/2023cleanData.csv")


In [8]:
# Store Monthly Data
monthly_folder = "/Users/kim/Desktop/repos/Philippines_Visitor/data/monthly_visitors"

# Month columns present in the combined frame, in calendar order.
months = get_months(combined_df)

# Skip the first three rows (exported separately as the special categories)
# and keep only the country name plus the month columns.
monthly_df = (
    combined_df
    .iloc[3:]
    .reset_index(drop=True)
    .loc[:, ['Country', *months]]
)

mon_out_path = os.path.join(monthly_folder, "2023_monthly.csv")
monthly_df.to_csv(mon_out_path, index=False)

print("Saved 2023 Monthly Visitors csv file")


Saved 2023 Monthly Visitors csv file


In [9]:
# Store Special Categories Data

cat_folder = "/Users/kim/Desktop/repos/Philippines_Visitor/data/special_category"

# First three rows of the combined frame, without its last two columns
# (named "Total" and "Percentage" when the frame was built).
cat_df = combined_df.iloc[:3, :-2]

cat_out_path = os.path.join(cat_folder, "2023_category.csv")
cat_df.to_csv(cat_out_path, index=False)

print("Saved 2023 Category csv file")


Saved 2023 Category csv file


In [10]:
# Yearly view: country, yearly total and share, skipping the first three rows
# (exported separately as the special categories). The explicit .copy() makes
# year_df an independent frame, so the column assignments below do not write
# into a slice of combined_df (which triggers SettingWithCopyWarning).
year_df = combined_df[['Country', 'Total', 'Percentage']].iloc[3:].copy()

# Strip the percent sign and convert to float. The column is cast to str first
# because the placeholder clean-up earlier stores integer 0 in some cells, and
# the .str accessor returns NaN for non-string entries.
year_df["Percentage"] = (
    year_df["Percentage"].astype(str)
    .str.replace('%', '', regex=False)
    .astype(float)
)

# Remove thousands separators, map blank / hyphen placeholders to 0, then
# convert to float.
year_df["Total"] = (
    year_df["Total"].astype(str)
    .str.replace(",", "", regex=False)
    .replace([r"^\s*$", "-"], 0, regex=True)
    .astype(float)
)

year_df


Unnamed: 0,Country,Total,Percentage
3,South Korea,1450858.0,26.62
4,Usa,903299.0,16.57
5,Japan,305580.0,5.61
6,Australia,266551.0,4.89
7,China,263836.0,4.84
...,...,...,...
233,Mayotte,1.0,0.00
234,Saint Barthelemy,1.0,0.00
235,Saint Pierre And Miquelon,1.0,0.00
236,Wallis And Futuna,1.0,0.00


In [11]:
year_folder = "/Users/kim/Desktop/repos/Philippines_Visitor/data/yearly_total"

input_file_previous = "/Users/kim/Desktop/repos/Philippines_Visitor/data/yearly_total/2022_year.csv"

# Previous year's totals, renamed so they do not clash with this year's
# "Total" column after the merge. Built as one chain instead of renaming a
# column subset in place.
previous_df = (
    pd.read_csv(input_file_previous)[['Country', 'Total']]
    .rename(columns={'Total': 'Previous Total'})
    .fillna(0)
)

# Fill any remaining gaps in this year's frame *before* it is merged. A new
# frame is bound to the name instead of mutating the existing one in place.
year_df = year_df.fillna(0)

# Left merge keeps every country of the current year; countries missing from
# the previous year's file get a Previous Total of 0.
yearly_total = pd.merge(year_df, previous_df, on="Country", how='left').fillna(0)

# Growth rate in percent, rounded to 2 decimals. A previous total of 0 is
# replaced by 1 in the denominator only, to avoid dividing by zero; for those
# rows the rate equals Total * 100.
denominator = yearly_total['Previous Total'].replace(0, 1)
yearly_total['Growth Rate'] = (
    (yearly_total['Total'] - yearly_total['Previous Total']) / denominator * 100
).round(2)

# Save as a csv file
year_path = os.path.join(year_folder, "2023_year.csv")
yearly_total.to_csv(year_path, index=False)

print("Saved 2023 Year csv file")

yearly_total.info()

Saved 2023 Year csv file
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 239 entries, 0 to 238
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Country         239 non-null    object 
 1   Total           239 non-null    float64
 2   Percentage      239 non-null    float64
 3   Previous Total  239 non-null    float64
 4   Growth Rate     239 non-null    float64
dtypes: float64(4), object(1)
memory usage: 9.5+ KB
