# US National Tourism Data Warehouse from Scratch

In [3]:
import pandas as pd
import numpy as np
import os
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T

### Dimension Tables:

    - dim_us_city
    - dim_city_temp
    - dim_airport
    - dim_country
    - dim_state

## 1. Dimension Table: `dim_us_city` 

In [14]:
# The demographics file is semicolon-separated and lives under ./datasets,
# resolved relative to the current working directory.
us_city_path = os.getcwd() + '/datasets/us-cities-demographics.csv'
df_city = pd.read_csv(us_city_path, sep=';')
df_city.head()

Unnamed: 0,City,State,Median Age,Male Population,Female Population,Total Population,Number of Veterans,Foreign-born,Average Household Size,State Code,Race,Count
0,Silver Spring,Maryland,33.8,40601.0,41862.0,82463,1562.0,30908.0,2.6,MD,Hispanic or Latino,25924
1,Quincy,Massachusetts,41.0,44129.0,49500.0,93629,4147.0,32935.0,2.39,MA,White,58723
2,Hoover,Alabama,38.5,38040.0,46799.0,84839,4819.0,8229.0,2.58,AL,Asian,4759
3,Rancho Cucamonga,California,34.5,88127.0,87105.0,175232,5821.0,33878.0,3.18,CA,Black or African-American,24437
4,Newark,New Jersey,34.6,138040.0,143873.0,281913,5829.0,86253.0,2.73,NJ,White,76402


For the purpose of this project we are going to select just a few columns and transform some others; e.g., we are going to group the data by city. There is no need to sum the male and female populations because the source already provides the total. We can corroborate this by choosing a random city and comparing it against https://suburbanstats.org/

Some columns will be discarded (race, number of veterans, average household size, etc.) because they don't fit the purpose of the project.

Columns of `us_city`: 
* city (Name of the city)
* male_population (Group by city)
* female_population (Group by city)
* total_population (male + female population)
* state_prefix

In [15]:
# Keep only the identifying and population columns. Rows that differed only in
# the discarded columns become identical here, so drop_duplicates collapses them.
population_columns = ['City', 'State', 'Male Population', 'Female Population', 'Total Population']
df_city = df_city.loc[:, population_columns].drop_duplicates()
df_city.head()

Unnamed: 0,City,State,Male Population,Female Population,Total Population
0,Silver Spring,Maryland,40601.0,41862.0,82463
1,Quincy,Massachusetts,44129.0,49500.0,93629
2,Hoover,Alabama,38040.0,46799.0,84839
3,Rancho Cucamonga,California,88127.0,87105.0,175232
4,Newark,New Jersey,138040.0,143873.0,281913


### Data Cleaning for `us_city` table

Checking for null values in every column

In [16]:
# Per-column flag: True when the column contains at least one missing value
df_city.isna().any()

City                 False
State                False
Male Population       True
Female Population     True
Total Population     False
dtype: bool

The columns `Male Population` and `Female Population` contain null values. Let's look at the other values in those rows.

In [17]:
# Boolean mask over rows: True where any column of the row is missing
rows_with_nulls = df_city.isnull().any(axis=1)
nan_city = df_city[rows_with_nulls]
nan_city.head()

Unnamed: 0,City,State,Male Population,Female Population,Total Population
333,The Villages,Florida,,,72590


We have just one row with null values. Let's replace those NaN values using `fillna`.

In [18]:
# Replace missing male/female counts with 0. 'Total Population' is left as
# reported, so for these rows male + female will not add up to the total.
df_city = df_city.fillna({'Male Population': 0, 'Female Population': 0})
df_city.loc[333]

City                 The Villages
State                     Florida
Male Population                 0
Female Population               0
Total Population            72590
Name: 333, dtype: object

In [20]:
# Re-check after filling: every column should now report False
df_city.isna().any()

City                 False
State                False
Male Population      False
Female Population    False
Total Population     False
dtype: bool

Add State Prefix. This column will be useful to match every table of our schema.

In [21]:
from datasets.data import Data

def state_prefix(name, states=None):
    """Return the key of the states mapping whose value equals ``name``.

    Args:
        name: Full state name to look up (the value side of the mapping).
        states: Optional mapping of state prefix -> state name. When omitted,
            ``Data.states`` is used.

    Returns:
        The first key whose value equals ``name``, or ``None`` when no entry
        matches (instead of letting ``StopIteration`` escape), so unmatched
        rows can be detected afterwards with a null check.
    """
    if states is None:
        states = Data.states
    return next((key for key, value in states.items() if value == name), None)
   
# Derive the prefix for every row from its full state name
df_city['state_prefix'] = df_city['State'].apply(state_prefix)
df_city.head()

Unnamed: 0,City,State,Male Population,Female Population,Total Population,state_prefix
0,Silver Spring,Maryland,40601.0,41862.0,82463,MD
1,Quincy,Massachusetts,44129.0,49500.0,93629,MA
2,Hoover,Alabama,38040.0,46799.0,84839,AL
3,Rancho Cucamonga,California,88127.0,87105.0,175232,CA
4,Newark,New Jersey,138040.0,143873.0,281913,NJ


In [22]:
# Rows whose state name could not be mapped to a prefix (expected: none)
df_city[df_city['state_prefix'].isna()]

Unnamed: 0,City,State,Male Population,Female Population,Total Population,state_prefix


Create a list of (state prefix, city) pairs, used later to add the state column to `dim_city_temp`.

In [23]:
# Unique (state_prefix, City) combinations as a list of two-element lists:
# element 0 is the state prefix, element 1 is the city name.
state_city_pairs = df_city[['state_prefix', 'City']].drop_duplicates()
state_city = state_city_pairs.values.tolist()

## 2. Dimension Table: `dim_city_temp` 

Let's create a Spark session, then clean and transform the dataset; after that, we can write the DataFrame to Parquet files. 
Our main dataset (Immigration) only has data for the United States, which is why we select only the US city columns from this dataset (the original CSV is pivoted, with one column per city). 

In [4]:
# Create the Spark session for this notebook, or reuse one that already exists
spark = (
    SparkSession.builder
    .appName('national_tourism')
    .getOrCreate()
)

Create a list of cities of US based on the dictionary of our dataset

In [5]:
# City attributes list every city in the weather dataset with its country;
# keep the names of the ones located in the United States.
city_attributes_path = os.getcwd() + "/datasets/historical-hourly-weather-data/city_attributes.csv"
df_cities = pd.read_csv(city_attributes_path)
is_us_city = df_cities['Country'] == "United States"
us_cities = df_cities.loc[is_us_city, 'City'].tolist()

Spark Dataframe for dim table, select column based on US city list.

In [6]:
# Location of the hourly temperature CSV (one column per city)
path_temperature = os.getcwd() + "/datasets/historical-hourly-weather-data/temperature.csv"
# Read with a header row; no schema or inferSchema option is supplied
df = spark.read.csv(path_temperature, header=True)
# NOTE: this rebinds `df_city`, the name that held the pandas demographics
# frame in section 1; `state_city` must already have been built from it.
df_city = df.select('datetime', *us_cities)
df_city.limit(5).toPandas()

Unnamed: 0,datetime,Portland,San Francisco,Seattle,Los Angeles,San Diego,Las Vegas,Phoenix,Albuquerque,Denver,...,Indianapolis,Atlanta,Detroit,Jacksonville,Charlotte,Miami,Pittsburgh,Philadelphia,New York,Boston
0,2012-10-01 12:00:00,,,,,,,,,,...,,,,,,,,,,
1,2012-10-01 13:00:00,282.08,289.48,281.8,291.87,291.53,293.41,296.6,285.12,284.61,...,283.85,294.03,284.03,298.17,288.65,299.72,281.0,285.63,288.22,287.17
2,2012-10-01 14:00:00,282.083251974,289.474992813,281.797216632,291.868185522,291.533500952,293.403141271,296.608508543,285.154558187,284.607305531,...,283.889393939,294.03534141,284.069789234,298.205229759,288.650172214,299.732517698,281.024767377,285.663207797,288.24767617,287.186092094
3,2012-10-01 15:00:00,282.091866475,289.460618112,281.789832606,291.862844459,291.543355079,293.392177052,296.631487354,285.233951595,284.5999178,...,283.941919192,294.049702185,284.173964682,298.299595186,288.650581705,299.76657946,281.088318736,285.756824139,288.326939663,287.23167159
4,2012-10-01 16:00:00,282.100480976,289.446243412,281.78244858,291.857503395,291.553209206,293.381212832,296.654466164,285.313345004,284.59253007,...,283.994444444,294.064062959,284.278140131,298.393960613,288.650991196,299.800641223,281.151870096,285.85044048,288.406203155,287.277251086


Unpivot Dataframe

In [9]:
# Strip spaces from the column names so every city can be referenced as a
# bare identifier inside the stack() expression below.
df_newcolumn = df_city.toDF(*[column.replace(" ", "") for column in df_city.columns])
# Unpivot: one output row per (Datetime, City). The stack() expression is built
# from the actual city columns, so the pair count and the city list always
# agree with the selected columns instead of being hard-coded.
city_columns = [column for column in df_newcolumn.columns if column.lower() != "datetime"]
stack_pairs = ", ".join("'{0}', {0}".format(column) for column in city_columns)
stack_statement = "stack({}, {}) as (City, Temp)".format(len(city_columns), stack_pairs)
# Rows without a reading for a city are dropped
df_weather = df_newcolumn.selectExpr("Datetime", stack_statement).where("Temp is not null")
df_weather.limit(5).toPandas()

Unnamed: 0,Datetime,City,Temp
0,2012-10-01 13:00:00,Portland,282.08
1,2012-10-01 13:00:00,SanFrancisco,289.48
2,2012-10-01 13:00:00,Seattle,281.8
3,2012-10-01 13:00:00,LosAngeles,291.87
4,2012-10-01 13:00:00,SanDiego,291.53


Change date format (YYYY-MM-DD) and order by datetime, city

In [10]:
from dateutil.parser import parse

# Reduce the timestamp string to a calendar date with Spark's built-in to_date
# instead of a Python UDF: the conversion stays inside the JVM (no per-row
# Python round trip) and a null input yields null rather than raising.
df_weatherDate = (
    df_weather
    .withColumn("Datetime", F.to_date(df_weather.Datetime))
    .orderBy("Datetime", "City")
)
df_weatherDate.limit(5).toPandas()

Unnamed: 0,Datetime,City,Temp
0,2012-10-01,Albuquerque,285.154558187
1,2012-10-01,Albuquerque,285.63091864
2,2012-10-01,Albuquerque,285.392738413
3,2012-10-01,Albuquerque,285.313345004
4,2012-10-01,Albuquerque,285.472131822


Avg temperature group by date (YYYY-MM-DD) and City

In [11]:
# One row per (date, city) holding the mean of that day's readings; the
# aggregate column is named "avg(Temp)".
df_avg_weather = (
    df_weatherDate
    .groupBy("Datetime", "City")
    .agg(F.avg("Temp"))
)
df_avg_weather.limit(5).toPandas()

Unnamed: 0,Datetime,City,avg(Temp)
0,2012-10-01,Albuquerque,285.476208
1,2012-10-01,Atlanta,294.093604
2,2012-10-01,Boston,287.371091
3,2012-10-01,Charlotte,288.651832
4,2012-10-01,Chicago,284.552669


Return name of cities to normal (spaces between words)

In [40]:
from pyspark.sql.functions import udf

# Space-stripped city name -> original city name (with spaces)
replace_cities = {
    'SanFrancisco': 'San Francisco',
    'LosAngeles': 'Los Angeles',
    'SanDiego': 'San Diego',
    'LasVegas': 'Las Vegas',
    'SanAntonio': 'San Antonio',
    'KansasCity': 'Kansas City',
    'SaintLouis': 'Saint Louis',
    'NewYork': 'New York',
}


@udf
def replace_city(name):
    """Return the spaced form of ``name``, or ``name`` itself when it has no entry."""
    return replace_cities.get(name, name)


df_weatherReplace = df_avg_weather.withColumn('City', replace_city(df_avg_weather.City))
df_weatherReplace.limit(5).toPandas()

Unnamed: 0,Datetime,City,avg(Temp)
0,2012-10-01,Albuquerque,285.476208
1,2012-10-01,Atlanta,294.093604
2,2012-10-01,Boston,287.371091
3,2012-10-01,Charlotte,288.651832
4,2012-10-01,Chicago,284.552669


In [41]:
# Build the city -> state prefix lookup once, instead of scanning the whole
# state_city list for every row inside the UDF. Each state_city entry is
# [state_prefix, city]; setdefault keeps the FIRST prefix seen for a city name,
# which is what a first-match scan over state_city returns.
# Caution: a city name that exists in several states resolves to whichever
# state appears first in state_city.
city_to_state = {}
for pair in state_city:
    city_to_state.setdefault(pair[1], pair[0])


@udf
def state(string):
    """Return the state prefix recorded for city ``string``, or None if there is no match."""
    return city_to_state.get(string)


df_weatherState = df_weatherReplace.withColumn('State', state(df_weatherReplace.City))
df_weatherState.limit(5).toPandas()

Unnamed: 0,Datetime,City,avg(Temp),State
0,2012-10-01,Albuquerque,285.476208,NM
1,2012-10-01,Atlanta,294.093604,GA
2,2012-10-01,Boston,287.371091,MA
3,2012-10-01,Charlotte,288.651832,NC
4,2012-10-01,Chicago,284.552669,IL


Write Parquet files 

In [43]:
# Write the daily averages partitioned by state. The save mode is set to
# "overwrite" so re-running the notebook replaces the previous output instead
# of failing because weather.parquet already exists.
(
    df_weatherState
    .withColumnRenamed("avg(Temp)", "Temp")
    .write
    .mode("overwrite")
    .partitionBy("State")
    .parquet("weather.parquet")
)

## 3. Dimension Table: `dim_airport` 

In [32]:
# Airport codes CSV, read with a header row (this rebinds `df`)
airport_codes_path = os.getcwd() + "/datasets/airport-codes_csv.csv"
df = spark.read.csv(airport_codes_path, header=True)
df.limit(5).toPandas()

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates
0,00A,heliport,Total Rf Heliport,11,,US,US-PA,Bensalem,00A,,00A,"-74.93360137939453, 40.07080078125"
1,00AA,small_airport,Aero B Ranch Airport,3435,,US,US-KS,Leoti,00AA,,00AA,"-101.473911, 38.704022"
2,00AK,small_airport,Lowell Field,450,,US,US-AK,Anchor Point,00AK,,00AK,"-151.695999146, 59.94919968"
3,00AL,small_airport,Epps Airpark,820,,US,US-AL,Harvest,00AL,,00AL,"-86.77030181884766, 34.86479949951172"
4,00AR,closed,Newport Hospital & Clinic Heliport,237,,US,US-AR,Newport,,,,"-91.254898, 35.6087"


As our main dataset is related to US immigration, the table `dim_airport` will only contain US airports. 
This DataFrame will be filtered by `iso_country = 'US'`, and rows without a municipality are dropped.

Columns for `dim_airport`: 
- ident
- type
- name
- iso_country
- state (iso_region column)
- municipality

In [33]:
# Keep US airports that have a municipality, projected onto the dimension columns
airport_columns = ['ident', 'type', 'name', 'iso_country', 'iso_region', 'municipality']
us_airports = df.where('iso_country = "US" and municipality is not null')
df_airport = us_airports.select(*airport_columns)
df_airport.limit(5).toPandas()

Unnamed: 0,ident,type,name,iso_country,iso_region,municipality
0,00A,heliport,Total Rf Heliport,US,US-PA,Bensalem
1,00AA,small_airport,Aero B Ranch Airport,US,US-KS,Leoti
2,00AK,small_airport,Lowell Field,US,US-AK,Anchor Point
3,00AL,small_airport,Epps Airpark,US,US-AL,Harvest
4,00AR,closed,Newport Hospital & Clinic Heliport,US,US-AR,Newport


Create a new column for state using iso_region

In [35]:
# Drop the first three characters of iso_region (the "US-" country prefix in
# the rows kept above) and expose the remainder as `state`. The built-in
# substring is used instead of a Python UDF: it is null-safe (a null
# iso_region stays null rather than raising) and avoids per-row Python calls.
df_airport_state = (
    df_airport
    .withColumn('iso_region', F.expr("substring(iso_region, 4)"))
    .withColumnRenamed('iso_region', 'state')
)
df_airport_state.limit(5).toPandas()

Unnamed: 0,ident,type,name,iso_country,state,municipality
0,00A,heliport,Total Rf Heliport,US,PA,Bensalem
1,00AA,small_airport,Aero B Ranch Airport,US,KS,Leoti
2,00AK,small_airport,Lowell Field,US,AK,Anchor Point
3,00AL,small_airport,Epps Airpark,US,AL,Harvest
4,00AR,closed,Newport Hospital & Clinic Heliport,US,AR,Newport


In [40]:
# Spot check: distinct municipality values for Clayton, NY
clayton_ny = df_airport_state.where('state = "NY" and municipality = "Clayton"')
clayton_ny.select('municipality').distinct().toPandas()

Unnamed: 0,municipality
0,Clayton



## 4. Dimension Table: `dim_country` 

This table will be referenced by the `Citizen` and `Resident` columns of the immigration fact table.

All data came from `I94_SAS_Labels_Descriptions.SAS` file

Columns: 
- id_country
- country


In [5]:
from datasets.data import Data

# One row per entry of Data.countries: key -> id_country, value -> country
columns = ['id_country', 'country']
country_rows = list(Data.countries.items())
df_country = pd.DataFrame(country_rows, columns=columns)
df_country.head()

Unnamed: 0,id_country,country
0,582,MEXICO
1,236,AFGHANISTAN
2,101,ALBANIA
3,316,ALGERIA
4,102,ANDORRA


## 5. Dimension Table: `dim_state` 

Columns: 
- id_state
- state_prefix
- state_name


In [20]:
# One row per entry of Data.states, numbered from 1 in iteration order:
# (id_state, key -> state_prefix, value -> state_name)
columns = ['id_state', 'state_prefix', 'state_name']
state_rows = [
    (position,) + tuple(item)
    for position, item in enumerate(Data.states.items(), start=1)
]
df_state = pd.DataFrame(state_rows, columns=columns)
df_state.head()

Unnamed: 0,id_state,state_prefix,state_name
0,1,AK,Alaska
1,2,AL,Alabama
2,3,AR,Arkansas
3,4,AS,American Samoa
4,5,AZ,Arizona


In [91]:
from datasets.data import Data

# Reverse lookup: print every prefix in Data.states whose name equals state_name
state_name = "Alabama"

matching_prefixes = [prefix for prefix, name in Data.states.items() if name == state_name]
print(*matching_prefixes)

AL
