In [1]:
import camelot
import pandas as pd
import numpy as np
import re
import os

In [2]:
# Helper codes

def clean_country_name(name):
    name = str(name).strip()              
    name = re.sub(r'\s+', ' ', name)     
    name = re.sub(r"\s*-\s*.*", "", name)
    name = re.sub(r"[^A-Za-z\s]", "", name)
    name = re.sub(r'[^A-Za-z\s]', '', name)  
    return name.title()                   

# --- Constants ---
MONTH_ORDER = [
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December"
]

# --- Helpers ---
def get_months(df):
    """Return ordered list of existing month columns."""
    return [m for m in MONTH_ORDER if m in df.columns]


In [3]:
exclude_words = [
    "A S I A",
    "ASEAN",
    "SUB-TOTAL",
    "EAST ASIA",
    "SOUTH ASIA",
    "MIDDLE EAST",
    "A M E R I C A",
    "NORTH AMERICA",
    "SOUTH AMERICA",
    "E U R O P E",
    "WESTERN EUROPE",
    "NORTHERN EUROPE",
    "SOUTHERN EUROPE",
    "EASTERN EUROPE",
    "EASTERN MEDITERRANEAN EUROPE",
    "AUSTRALASIA/PACIFIC",
    "TOTAL (CIS & RUSSIA)"
    "A F R I C A	",
]

In [4]:
# Read tables from PDF (PAGE 1)
pdf_file = "/Users/kim/Desktop/repos/Philippines_Visitor/data/rawData/2020-JAN-DEC.pdf"
tables = camelot.read_pdf(pdf_file, pages='1', flavor="stream")

# Combine all extracted tables into one DataFrame
df1 = pd.concat([table.df for table in tables], ignore_index=True)

# Clean data 
df1 = df1.iloc[2:].reset_index(drop=True) # Drop top 4 rows (not needed for the table)
df1.columns = range(df1.shape[1])

df1.drop(
    df1[df1[0].str.upper().isin(exclude_words)].index,
    inplace=True
)
df1 = df1.drop(columns=[4,6])
df1.columns = range(df1.shape[1])
df1 = df1.iloc[:-1]

df1 = df1.reset_index(drop=True)


In [5]:
# Read tables from PDF (PAGE 2)
tables = camelot.read_pdf(pdf_file, pages='2', flavor="stream")

# Combine all extracted tables into one DataFrame
df2 = pd.concat([table.df for table in tables], ignore_index=True)

# Clean data 
df2 = df2.iloc[3:].reset_index(drop=True) # Drop top 4 rows (not needed for the table)
df2.columns = range(df2.shape[1])

df2.drop(
   df2[df2[0].str.upper().isin(exclude_words)].index,
    inplace=True
)

df2 = df2.drop(columns=[4,5,7,15])
df2.columns = range(df2.shape[1])

df2 = df2.reset_index(drop=True)

# DROP TOTAL RUSSIAN FEDERATION 
df2 = df2.reset_index(drop=True)
df2 = df2.drop(26).reset_index(drop=True)
df2 = df2.drop(23).reset_index(drop=True)

df2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,ARGENTINA,711,426.0,223,3.0,-,2,-,11,11.0,12,13,20,1432,0.1,3643,-60.69
1,BRAZIL,1109,940.0,439,7.0,1,3,3,22,62.0,44,56,64,2750,0.19,10035,-72.6
2,COLOMBIA,334,234.0,93,2.0,-,-,2,11,9.0,10,14,13,722,0.05,3386,-78.68
3,PERU,160,123.0,49,1.0,-,-,2,4,1.0,4,2,4,350,0.02,1660,-78.92
4,VENEZUELA,80,41.0,26,,-,-,1,4,10.0,5,11,7,185,0.01,702,-73.65
5,ANDORRA,5,,3,,-,-,-,-,,-,-,-,8,0.0,25,-68.0
6,AUSTRIA,1682,1609.0,474,,2,-,8,11,16.0,16,21,36,3875,0.26,14840,-73.89
7,BELGIUM,1634,1308.0,542,5.0,-,18,24,21,37.0,44,52,71,3756,0.25,19156,-80.39
8,FRANCE,9405,10457.0,4068,15.0,2,20,32,95,85.0,119,81,151,24530,1.65,88577,-72.31
9,GERMANY,10272,10504.0,4248,27.0,36,21,96,71,101.0,144,144,229,25893,1.75,103756,-75.04


In [6]:
# Read tables from PDF (PAGE 3)
tables = camelot.read_pdf(pdf_file, pages='3', flavor="stream")

# Combine all extracted tables into one DataFrame
df3 = pd.concat([table.df for table in tables], ignore_index=True)

# Clean data 
df3 = df3.iloc[3:].reset_index(drop=True) # Drop top 4 rows (not needed for the table)
df3.columns = range(df3.shape[1])

df3.drop(
   df3[df3[0].str.upper().isin(exclude_words)].index,
    inplace=True
)

df3 = df3.drop([6,10]).reset_index(drop=True)
df3 = df3.drop(columns=[4])

df3.columns = range(df3.shape[1])

df3 = df3.iloc[:-16]
df3

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,AUSTRALIA,28884,19204.0,6553,8,4,9,43,69,102.0,122,163,169,55330,3.73,286170,-80.67
1,GUAM,1351,1041.0,383,-,-,2,9,3,5.0,30,12,46,2882,0.19,19835,-85.47
2,NAURU,2,,1,-,-,-,1,-,,-,1,-,5,0.0,93,-94.62
3,NEW ZEALAND,3854,2122.0,768,3,2,5,11,14,24.0,23,30,27,6883,0.46,37872,-81.83
4,PAPUA NEW GUINEA,792,339.0,198,2,-,10,8,24,11.0,4,19,27,1434,0.1,8828,-83.76
5,NIGERIA,416,379.0,209,1,-,-,1,7,2.0,8,10,9,1042,0.07,3439,-69.7
6,SOUTH AFRICA,1106,466.0,260,12,5,2,15,63,44.0,65,56,65,2159,0.15,8553,-74.76
7,RESIDENCES,10969,7598.0,3290,119,18,66,214,441,601.0,724,737,813,25590,1.73,86799,-70.52
8,T O T A L,782132,439852.0,127721,927,357,1186,3380,5364,6410.0,8304,9069,13753,1398455,94.33,8188477,-82.92
9,OVERSEAS FILIPINOS*,14032,22829.0,11604,21,4138,336,2138,12966,3637.0,1804,4535,6040,84080,5.67,72436,16.07


In [7]:
combined_df = pd.concat([df1, df2, df3], ignore_index=True)

combined_df.columns = [
        "Country",
        "January",
        "February",
        "March",
        "April",
        "May",
        "June",
        "July",
        "August",
        "September",
        "October",
        "November",
        "December",
        "Total",
        "Percentage",
        "Previous Total",
        "Growth Rate"
        ]

# Clean and convert column to float
combined_df['Total'] = (
    combined_df['Total']
    .astype(str)               # ensure everything is string
    .str.replace(',', '')      # remove commas
    .str.strip()               # remove leading/trailing spaces
    .replace({'': np.nan, '-': np.nan})  # replace empty strings and '-' with NaN
    .astype(float)             # convert to float
)
combined_df = combined_df.sort_values(by="Total", ascending=False).reset_index(drop=True)

# Copy the row you want to move
row_to_move = combined_df.iloc[5]

# Drop the original row
combined_df = combined_df.drop(5).reset_index(drop=True)

top = combined_df.iloc[:1]   
bottom = combined_df.iloc[1:] 

# Insert the row
combined_df = pd.concat([top, pd.DataFrame([row_to_move]), bottom]).reset_index(drop=True)

combined_df['Country'] = combined_df['Country'].apply(clean_country_name)
combined_df['Country'] = combined_df['Country'].replace('Independent States', 'Commonwealth of Independent States')
combined_df['Country'] = combined_df['Country'].replace('Hongkong', 'Hong Kong')
combined_df['Country'] = combined_df['Country'].replace('Korea', 'South Korea')

combined_df.to_csv("/Users/kim/Desktop/repos/Philippines_Visitor/data/cleanedData/2020cleanData.csv")

In [8]:
# Store Monthly Data
monthly_folder = "/Users/kim/Desktop/repos/Philippines_Visitor/data/monthly_visitors"

monthly_df = combined_df.iloc[2:].reset_index(drop=True) 
monthly_df['Country'] = monthly_df['Country'].apply(clean_country_name)

months = get_months(combined_df)

monthly_df = monthly_df[['Country'] + months]

mon_out_path = os.path.join(monthly_folder, f"2020_monthly.csv")
monthly_df.to_csv(mon_out_path, index=False)

print(f"Saved 2020 Monthly Visitors csv file")
monthly_df

Saved 2020 Monthly Visitors csv file


Unnamed: 0,Country,January,February,March,April,May,June,July,August,September,October,November,December
0,South Korea,206062,115943,14751,79,32,47,202,344,236,323,321,537
1,Usa,106444,72602,20656,110,56,415,993,954,1073,1843,2137,4533
2,China,153109,11313,4132,3,2,66,174,84,247,346,448,508
3,Japan,50667,60583,20424,68,28,60,446,532,611,874,1024,1347
4,Australia,28884,19204,6553,8,4,9,43,69,102,122,163,169
...,...,...,...,...,...,...,...,...,...,...,...,...,...
63,Laos,88,85,17,1,-,1,3,1,,-,2,5
64,Venezuela,80,41,26,,-,-,1,4,10,5,11,7
65,Luxembourg,77,55,15,,-,-,-,-,,3,2,4
66,Andorra,5,,3,,-,-,-,-,,-,-,-


In [11]:
# Store Special Categories Data

cat_folder = "/Users/kim/Desktop/repos/Philippines_Visitor/data/special_category"

cat_df = combined_df.head(2)
cat_df['Country'] = cat_df['Country'].replace("T O T A L", "TOTAL")
cat_df['Country'] = cat_df['Country'].replace("Overseas Filipinos***", "Overseas Filipinos")
cat_df["Country"] = cat_df["Country"].astype(str).str.strip().str.title()
cat_df["Country"] = cat_df["Country"].apply(clean_country_name)

cat_df = cat_df.iloc[:, :-4]

# cat_out_path = os.path.join(cat_folder, f"2020_category.csv")
# cat_df.to_csv(cat_out_path, index=False)
cat_df.info()
print(f"Saved 2020 Category csv file")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 13 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Country    2 non-null      object
 1   January    2 non-null      object
 2   February   2 non-null      object
 3   March      2 non-null      object
 4   April      2 non-null      object
 5   May        2 non-null      object
 6   June       2 non-null      object
 7   July       2 non-null      object
 8   August     2 non-null      object
 9   September  2 non-null      object
 10  October    2 non-null      object
 11  November   2 non-null      object
 12  December   2 non-null      object
dtypes: object(13)
memory usage: 336.0+ bytes
Saved 2020 Category csv file


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cat_df['Country'] = cat_df['Country'].replace("T O T A L", "TOTAL")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cat_df['Country'] = cat_df['Country'].replace("Overseas Filipinos***", "Overseas Filipinos")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cat_df["Country"] = cat_df["Country"].astype

In [10]:
year_folder = "/Users/kim/Desktop/repos/Philippines_Visitor/data/yearly_total"

year_df = combined_df[['Country', 'Total', 'Percentage', 'Previous Total', 'Growth Rate']]
year_df = year_df.iloc[2:]

year_df["Previous Total"] = (
    year_df["Previous Total"]
        .astype(str)
        .str.replace('"', '', regex=False)
        .str.replace(',', '', regex=False)   # REMOVE COMMAS
        .str.strip()
)

year_df["Previous Total"] = year_df["Previous Total"].astype(float)

year_df["Growth Rate"] = pd.to_numeric(year_df["Growth Rate"], errors = 'coerce')
year_df["Percentage"] = pd.to_numeric(year_df["Percentage"], errors = 'coerce')

year_path = os.path.join(year_folder, f"2020_year.csv")
year_df.to_csv(year_path, index=False)

print(f"Saved 2020 Year csv file")


Saved 2020 Year csv file
