In [1]:
import camelot
import pandas as pd
import numpy as np
import re
import os

In [2]:
# Helper codes

def clean_country_name(name):
    name = str(name).strip()              
    name = re.sub(r'\s+', ' ', name)     
    name = re.sub(r"\s*-\s*.*", "", name)
    name = re.sub(r"[^A-Za-z\s]", "", name)
    name = re.sub(r'[^A-Za-z\s]', '', name)  
    return name.title()                   

# --- Constants ---
MONTH_ORDER = [
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December"
]

# --- Helpers ---
def get_months(df):
    """Return ordered list of existing month columns."""
    return [m for m in MONTH_ORDER if m in df.columns]


In [3]:
exclude_words = [
    "A S I A",
    "ASEAN",
    "SUB-TOTAL",
    "EAST ASIA",
    "SOUTH ASIA",
    "MIDDLE EAST",
    "A M E R I C A",
    "NORTH AMERICA",
    "SOUTH AMERICA",
    "E U R O P E",
    "WESTERN EUROPE",
    "NORTHERN EUROPE",
    "SOUTHERN EUROPE",
    "EASTERN EUROPE",
    "EASTERN MEDITERRANEAN EUROPE",
    "EASTERN MEDITERRANEAN EUROPE**"
    "AUSTRALASIA/PACIFIC",
    "TOTAL (CIS & RUSSIA)"
    "A F R I C A	"
]

In [4]:
# Read tables from PDF (PAGE 1)
pdf_file = "/Users/kim/Desktop/repos/Philippines_Visitor/data/rawData/2019-JAN-DEC.pdf"
tables = camelot.read_pdf(pdf_file, pages='1', flavor="stream")

# Combine all extracted tables into one DataFrame
df1 = pd.concat([table.df for table in tables], ignore_index=True)

# Clean data 
df1 = df1.iloc[2:].reset_index(drop=True) # Drop top 4 rows (not needed for the table)
df1.columns = range(df1.shape[1])

df1.drop(
    df1[df1[0].str.upper().isin(exclude_words)].index,
    inplace=True
)

df1 = df1.iloc[:-1]
df1 = df1.reset_index(drop=True)


In [5]:
# Read tables from PDF (PAGE 2)
tables = camelot.read_pdf(pdf_file, pages='2', flavor="stream")

# Combine all extracted tables into one DataFrame
df2 = pd.concat([table.df for table in tables], ignore_index=True)

# Clean data 
df2 = df2.iloc[3:].reset_index(drop=True) # Drop top 4 rows (not needed for the table)
df2.columns = range(df2.shape[1])

df2.drop(
   df2[df2[0].str.upper().isin(exclude_words)].index,
    inplace=True
)

# df2 = df2.drop(columns=[4,5,7,15])
# df2.columns = range(df2.shape[1])
df2 = df2[df2[0] != 'EASTERN MEDITERRANEAN EUROPE**'].reset_index(drop=True)

df2 = df2.reset_index(drop=True)
df2 = df2.iloc[:-1]


# DROP TOTAL RUSSIAN FEDERATION 
df2 = df2.reset_index(drop=True)
df2 = df2.drop(26).reset_index(drop=True)
df2 = df2.drop(23).reset_index(drop=True)

df2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,ARGENTINA,571.0,400,371,358,303,178,156,177,197,253,349,330,3643,0.04,3468,5.05
1,BRAZIL,1047.0,909,1020,960,766,547,450,729,717,734,855,1301,10035,0.12,7364,36.27
2,COLOMBIA,298.0,253,292,302,268,241,256,250,289,256,381,300,3386,0.04,2389,41.73
3,PERU,144.0,125,120,134,150,110,99,180,129,139,153,177,1660,0.02,1038,59.92
4,VENEZUELA,59.0,57,65,67,52,40,43,68,59,57,70,65,702,0.01,423,65.96
5,ANDORRA,,1,2,-,2,2,-,3,1,-,3,11,25,0.0,273,-90.84
6,AUSTRIA,1761.0,1712,1390,1176,1019,875,1284,843,724,984,1130,1942,14840,0.18,14193,4.56
7,BELGIUM,1585.0,1513,1699,1967,1271,1523,2316,1175,1060,1320,1642,2085,19156,0.23,17285,10.82
8,FRANCE,9285.0,12934,10714,10380,5882,3765,5564,5638,3903,5640,6999,7873,88577,1.07,74400,19.06
9,GERMANY,11081.0,11569,11075,9485,6917,5195,6857,6512,5732,7698,8862,12773,103756,1.26,92098,12.66


In [6]:
# Read tables from PDF (PAGE 3)
tables = camelot.read_pdf(pdf_file, pages='3', flavor="stream")

# Combine all extracted tables into one DataFrame
df3 = pd.concat([table.df for table in tables], ignore_index=True)

# Clean data 
df3 = df3.iloc[3:].reset_index(drop=True) # Drop top 4 rows (not needed for the table)
df3.columns = range(df3.shape[1])

df3.drop(
   df3[df3[0].str.upper().isin(exclude_words)].index,
    inplace=True
)
df3 = df3.drop([6,10]).reset_index(drop=True)
df3 = df3.reset_index(drop=True)
df3 = df3.iloc[:-19]
df3

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,AUSTRALIA,29929,21571,21647,28683,22021,19163,21364,17347,21346,23096,23068,36935,286170,3.46,279828,2.27
1,GUAM,1806,1850,1801,1401,1680,2367,2235,1226,818,1540,1667,1444,19835,0.24,32357,-38.7
2,NAURU,1,6,33,2,11,8,2,5,4,3,17,1,93,0.0,38,144.74
3,NEW ZEALAND,3496,2300,2627,3770,3026,2782,3193,2488,2870,3189,3257,4874,37872,0.46,33341,13.59
4,PAPUA NEW GUINEA,908,529,677,647,707,875,785,690,829,945,685,551,8828,0.11,8481,4.09
5,NIGERIA,268,243,255,254,519,285,279,297,340,220,291,188,3439,0.04,2104,63.45
6,SOUTH AFRICA,705,697,710,765,552,561,571,597,585,768,652,1390,8553,0.1,7543,13.39
7,RESIDENCES,9055,9306,8615,7014,5648,5814,6214,5381,5532,6217,9155,8848,86799,1.05,71007,22.24
8,T O T A L,718118,762437,709399,653336,612861,638440,712285,699933,604552,634786,679273,763057,8188477,99.12,7096594,15.39
9,OVERSEAS FILIPINOS***,5272,4428,4910,9651,8858,5340,6772,2910,2001,3763,4790,13741,72436,0.88,71873,0.78


In [7]:
combined_df = pd.concat([df1, df2, df3], ignore_index=True)

combined_df.columns = [
        "Country",
        "January",
        "February",
        "March",
        "April",
        "May",
        "June",
        "July",
        "August",
        "September",
        "October",
        "November",
        "December",
        "Total",
        "Percentage",
        "Previous Total",
        "Growth Rate"
        ]

# Clean and convert column to float
combined_df['Total'] = (
    combined_df['Total']
    .astype(str)               # ensure everything is string
    .str.replace(',', '')      # remove commas
    .str.strip()               # remove leading/trailing spaces
    .replace({'': np.nan, '-': np.nan})  # replace empty strings and '-' with NaN
    .astype(float)             # convert to float
)
combined_df = combined_df.sort_values(by="Total", ascending=False).reset_index(drop=True)

# Copy the row you want to move
row_to_move = combined_df.iloc[16]

# Drop the original row
combined_df = combined_df.drop(16).reset_index(drop=True)

top = combined_df.iloc[:1]   
bottom = combined_df.iloc[1:] 

# Insert the row
combined_df = pd.concat([top, pd.DataFrame([row_to_move]), bottom]).reset_index(drop=True)

combined_df['Country'] = combined_df['Country'].apply(clean_country_name)
combined_df['Country'] = combined_df['Country'].replace('Independent States', 'Commonwealth of Independent States')
combined_df['Country'] = combined_df['Country'].replace('Hongkong', 'Hong Kong')
combined_df['Country'] = combined_df['Country'].replace('Korea', 'South Korea')

combined_df.to_csv("/Users/kim/Desktop/repos/Philippines_Visitor/data/cleanedData/2019cleanData.csv")

In [8]:
# Store Monthly Data
monthly_folder = "/Users/kim/Desktop/repos/Philippines_Visitor/data/monthly_visitors"

monthly_df = combined_df.iloc[2:].reset_index(drop=True) 
monthly_df['Country'] = monthly_df['Country'].apply(clean_country_name)

months = get_months(combined_df)

monthly_df = monthly_df[['Country'] + months]

mon_out_path = os.path.join(monthly_folder, f"2019_monthly.csv")
monthly_df.to_csv(mon_out_path, index=False)

print(f"Saved 2019 Monthly Visitors csv file")
monthly_df

Saved 2019 Monthly Visitors csv file


Unnamed: 0,Country,January,February,March,April,May,June,July,August,September,October,November,December
0,South Korea,179900,180890,158794,130707,138239,157495,167249,185334,152184,158380,176185,203965
1,China,131196,174175,158433,139177,130788,132792,171848,170903,150505,139707,126785,117000
2,Usa,109342,94632,89806,87710,90979,96735,93912,68418,61085,79716,82617,109488
3,Japan,47567,64365,65837,57724,46495,44919,55727,78699,56878,51414,53784,59379
4,Taiwan,21804,30648,25456,25441,25637,30546,32663,32676,27707,29642,23350,21703
...,...,...,...,...,...,...,...,...,...,...,...,...,...
63,Laos,72,61,108,91,90,102,144,113,131,160,280,102
64,Luxembourg,72,92,58,87,43,31,66,104,37,63,69,115
65,Venezuela,59,57,65,67,52,40,43,68,59,57,70,65
66,Nauru,1,6,33,2,11,8,2,5,4,3,17,1


In [9]:
# Store Special Categories Data

cat_folder = "/Users/kim/Desktop/repos/Philippines_Visitor/data/special_category"

cat_df = combined_df.head(2)
cat_df['Country'] = cat_df['Country'].replace("T O T A L", "TOTAL")
cat_df['Country'] = cat_df['Country'].replace("Overseas Filipinos***", "Overseas Filipinos")
cat_df["Country"] = cat_df["Country"].astype(str).str.strip().str.title()
cat_df["Country"] = cat_df["Country"].apply(clean_country_name)

cat_df = cat_df.iloc[:, :-4]

cat_out_path = os.path.join(cat_folder, f"2019_category.csv")
cat_df.to_csv(cat_out_path, index=False)

print(f"Saved 2019 Category csv file")


Saved 2019 Category csv file


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cat_df['Country'] = cat_df['Country'].replace("T O T A L", "TOTAL")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cat_df['Country'] = cat_df['Country'].replace("Overseas Filipinos***", "Overseas Filipinos")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cat_df["Country"] = cat_df["Country"].astype

In [10]:
year_folder = "/Users/kim/Desktop/repos/Philippines_Visitor/data/yearly_total"

year_df = combined_df[['Country', 'Total', 'Percentage', 'Previous Total', 'Growth Rate']]
year_df = year_df.iloc[2:]

year_df["Previous Total"] = (
    year_df["Previous Total"]
        .astype(str)
        .str.replace('"', '', regex=False)
        .str.replace(',', '', regex=False)   # REMOVE COMMAS
        .str.strip()
)

year_df["Previous Total"] = year_df["Previous Total"].astype(float)

year_df["Growth Rate"] = pd.to_numeric(year_df["Growth Rate"], errors = 'coerce')
year_df["Percentage"] = pd.to_numeric(year_df["Percentage"], errors = 'coerce')

year_path = os.path.join(year_folder, f"2019_year.csv")
year_df.to_csv(year_path, index=False)

print(f"Saved 2019 Year csv file")

Saved 2019 Year csv file
