In [2]:
import camelot
import pandas as pd
import numpy as np
import re
import os


In [3]:
# Helper codes

def clean_country_name(name):
    name = str(name).strip()              
    name = re.sub(r'\s+', ' ', name)     
    name = re.sub(r"\s*-\s*.*", "", name)
    name = re.sub(r"[^A-Za-z\s]", "", name)
    name = re.sub(r'[^A-Za-z\s]', '', name)  
    return name.title()                   

# --- Constants ---
MONTH_ORDER = [
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December"
]

# --- Helpers ---
def get_months(df):
    """Return ordered list of existing month columns."""
    return [m for m in MONTH_ORDER if m in df.columns]


In [4]:
exclude_words = [
    "A S I A",
    "ASEAN",
    "SUB-TOTAL",
    "EAST ASIA",
    "SOUTH ASIA",
    "MIDDLE EAST",
    "A M E R I C A",
    "NORTH AMERICA",
    "SOUTH AMERICA",
    "E U R O P E",
    "WESTERN EUROPE",
    "NORTHERN EUROPE",
    "SOUTHERN EUROPE",
    "EASTERN EUROPE",
    "EASTERN MEDITERRANEAN EUROPE",
    "AUSTRALASIA/PACIFIC",
    "TOTAL (CIS & RUSSIA)"
    "A F R I C A	"
]

In [5]:
# Read tables from PDF (PAGE 1)
pdf_file = "/Users/kim/Desktop/repos/Philippines_Visitor/data/rawData/2015-JAN-DEC.pdf"
tables = camelot.read_pdf(pdf_file, pages='1', flavor="stream")

# Combine all extracted tables into one DataFrame
df1 = pd.concat([table.df for table in tables], ignore_index=True)

# Clean data 
df1 = df1.iloc[2:].reset_index(drop=True) # Drop top 4 rows (not needed for the table)
df1.columns = range(df1.shape[1])

df1.drop(
    df1[df1[0].str.upper().isin(exclude_words)].index,
    inplace=True
)

df1 = df1.reset_index(drop=True).iloc[:-1]
df1

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,BRUNEI,645,709,840,676,1003,607,742,627,739,653,678,1096,9015,0.17,9677,-6.84
1,CAMBODIA,219,233,245,366,325,267,295,311,276,319,414,233,3503,0.07,3276,6.93
2,INDONESIA,3949,3615,4235,3705,4035,3772,4233,4528,3832,4238,4086,3950,48178,0.9,46757,3.04
3,LAOS,93,97,119,104,139,87,98,111,105,91,91,96,1231,0.02,1056,16.57
4,MALAYSIA,12256,11188,13892,13089,14690,12851,12981,13048,13189,11936,13544,13150,155814,2.91,139245,11.9
5,MYANMAR,593,579,682,565,662,649,622,435,515,586,621,524,7033,0.13,6633,6.03
6,SINGAPORE,12995,14279,16942,15816,17363,15422,14585,14515,14122,14366,14928,15843,181176,3.38,179099,1.16
7,THAILAND,3426,3759,4059,4429,3541,3422,3351,3569,3418,3961,3637,3466,44038,0.82,45943,-4.15
8,VIETNAM,2807,2811,2858,2761,2884,2607,2810,2189,2283,2618,2508,2443,31579,0.59,29800,5.97
9,CHINA,21164,41812,30067,33250,31537,32495,66689,65403,51378,42073,37329,37644,490841,9.16,394951,24.28


In [6]:
# Read tables from PDF (PAGE 2)
tables = camelot.read_pdf(pdf_file, pages='2', flavor="stream")

# Combine all extracted tables into one DataFrame
df2 = pd.concat([table.df for table in tables], ignore_index=True)

# Clean data 
df2 = df2.iloc[3:].reset_index(drop=True) # Drop top 4 rows (not needed for the table)
df2.columns = range(df2.shape[1])

df2.drop(
   df2[df2[0].str.upper().isin(exclude_words)].index,
    inplace=True
)

df2 = df2[df2[0] != 'EASTERN MEDITERRANEAN EUROPE**'].reset_index(drop=True)


# DROP TOTAL RUSSIAN FEDERATION 
df2 = df2.reset_index(drop=True)
df2 = df2.drop(26).reset_index(drop=True)
df2 = df2.drop(23).reset_index(drop=True)

In [7]:
# Read tables from PDF (PAGE 3)
tables = camelot.read_pdf(pdf_file, pages='3', flavor="stream")

# Combine all extracted tables into one DataFrame
df3 = pd.concat([table.df for table in tables], ignore_index=True)

# Clean data 
df3 = df3.iloc[2:].reset_index(drop=True) # Drop top 4 rows (not needed for the table)
df3.columns = range(df3.shape[1])

df3.drop(
   df3[df3[0].str.upper().isin(exclude_words)].index,
    inplace=True
)
df3 = df3.drop(index=7).reset_index(drop=True)
df3 = df3.reset_index(drop=True)

# DROP EMPTY ROW
df3 = df3.drop(7).reset_index(drop=True)

# DROP UNWANTED ROWS AT THE END 
df3 = df3.iloc[:-19]


In [8]:
combined_df = pd.concat([df1, df2, df3], ignore_index=True)

combined_df.columns = [
        "Country",
        "January",
        "February",
        "March",
        "April",
        "May",
        "June",
        "July",
        "August",
        "September",
        "October",
        "November",
        "December",
        "Total",
        "Percentage",
        "Previous Total",
        "Growth Rate"
        ]

# Clean and convert column to float
combined_df['Total'] = (
    combined_df['Total']
    .astype(str)               # ensure everything is string
    .str.replace(',', '')      # remove commas
    .str.strip()               # remove leading/trailing spaces
    .replace({'': np.nan, '-': np.nan})  # replace empty strings and '-' with NaN
    .astype(float)             # convert to float
)

combined_df = combined_df.sort_values(by="Total", ascending=False).reset_index(drop=True)

# Copy the row you want to move
row_to_move = combined_df.iloc[6]

# Drop the original row
combined_df = combined_df.drop(6).reset_index(drop=True)

top = combined_df.iloc[:1]   

bottom = combined_df.iloc[1:] 

# Insert the row
combined_df = pd.concat([top, pd.DataFrame([row_to_move]), bottom]).reset_index(drop=True)

combined_df['Country'] = combined_df['Country'].apply(clean_country_name)

# Update country names 
combined_df['Country'] = combined_df['Country'].replace('Independent States', 'Commonwealth of Independent States')

combined_df['Country'] = combined_df['Country'].replace('Hongkong', 'Hong Kong')

combined_df['Country'] = combined_df['Country'].replace('Korea', 'South Korea')

combined_df.to_csv("/Users/kim/Desktop/repos/Philippines_Visitor/data/cleanedData/2015cleanData.csv")

In [9]:
# Store Monthly Data
monthly_folder = "/Users/kim/Desktop/repos/Philippines_Visitor/data/monthly_visitors"

monthly_df = combined_df.iloc[2:].reset_index(drop=True) 
monthly_df['Country'] = monthly_df['Country'].apply(clean_country_name)

months = get_months(combined_df)

monthly_df = monthly_df[['Country'] + months]

mon_out_path = os.path.join(monthly_folder, f"2015_monthly.csv")

monthly_df.to_csv(mon_out_path, index=False)

print(f"Saved 2015 Monthly Visitors csv file")

Saved 2015 Monthly Visitors csv file


In [10]:
# Store Special Categories Data
cat_folder = "/Users/kim/Desktop/repos/Philippines_Visitor/data/special_category"

cat_df = combined_df.head(2)
cat_df['Country'] = cat_df['Country'].replace("T O T A L", "TOTAL")
cat_df['Country'] = cat_df['Country'].replace("Overseas Filipinos***", "Overseas Filipinos")
cat_df["Country"] = cat_df["Country"].astype(str).str.strip().str.title()
cat_df["Country"] = cat_df["Country"].apply(clean_country_name)

cat_df = cat_df.iloc[:, :-4]

cat_out_path = os.path.join(cat_folder, f"2015_category.csv")
cat_df.to_csv(cat_out_path, index=False)

print(f"Saved 2015 Category csv file")


Saved 2015 Category csv file


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cat_df['Country'] = cat_df['Country'].replace("T O T A L", "TOTAL")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cat_df['Country'] = cat_df['Country'].replace("Overseas Filipinos***", "Overseas Filipinos")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cat_df["Country"] = cat_df["Country"].astype

In [11]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69 entries, 0 to 68
Data columns (total 17 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Country         69 non-null     object 
 1   January         69 non-null     object 
 2   February        69 non-null     object 
 3   March           69 non-null     object 
 4   April           69 non-null     object 
 5   May             69 non-null     object 
 6   June            69 non-null     object 
 7   July            69 non-null     object 
 8   August          69 non-null     object 
 9   September       69 non-null     object 
 10  October         69 non-null     object 
 11  November        69 non-null     object 
 12  December        69 non-null     object 
 13  Total           69 non-null     float64
 14  Percentage      69 non-null     object 
 15  Previous Total  69 non-null     object 
 16  Growth Rate     69 non-null     object 
dtypes: float64(1), object(16)
memory usag

In [12]:
year_folder = "/Users/kim/Desktop/repos/Philippines_Visitor/data/yearly_total"

year_df = combined_df[['Country', 'Total', 'Percentage', 'Previous Total', 'Growth Rate']]

year_df = year_df.iloc[2:]

year_df["Percentage"] = pd.to_numeric(year_df["Percentage"], errors = 'coerce')

year_df["Growth Rate"] = pd.to_numeric(year_df["Growth Rate"], errors = 'coerce')

year_df["Previous Total"] = (
    year_df["Previous Total"]
        .astype(str)
        .str.replace('"', '', regex=False)
        .str.replace(',', '', regex=False)   # REMOVE COMMAS
        .str.strip()
)

year_df["Previous Total"] = year_df["Previous Total"].astype(float)

year_path = os.path.join(year_folder, f"2015_year.csv")

year_df.to_csv(year_path, index=False)

print(f"Saved 2015 Year csv file")

Saved 2015 Year csv file
