# US National Tourism Data Warehouse from Scratch

In [1]:
import pandas as pd
import numpy as np
import os
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T

### Dimension Tables:

    - dim_us_city
    - dim_city_temperature
    - dim_airport
    - dim_country
    - dim_state

## 1. Dimension Table: `dim_us_city` 

In [106]:
# Load the US cities demographics extract. The file is ';'-separated and is
# resolved relative to the current working directory.
us_city_path = os.getcwd() + '/datasets/us-cities-demographics.csv'
df_city = pd.read_csv(us_city_path, sep=';')
df_city.head()

Unnamed: 0,City,State,Median Age,Male Population,Female Population,Total Population,Number of Veterans,Foreign-born,Average Household Size,State Code,Race,Count
0,Silver Spring,Maryland,33.8,40601.0,41862.0,82463,1562.0,30908.0,2.6,MD,Hispanic or Latino,25924
1,Quincy,Massachusetts,41.0,44129.0,49500.0,93629,4147.0,32935.0,2.39,MA,White,58723
2,Hoover,Alabama,38.5,38040.0,46799.0,84839,4819.0,8229.0,2.58,AL,Asian,4759
3,Rancho Cucamonga,California,34.5,88127.0,87105.0,175232,5821.0,33878.0,3.18,CA,Black or African-American,24437
4,Newark,New Jersey,34.6,138040.0,143873.0,281913,5829.0,86253.0,2.73,NJ,White,76402


For the purpose of this project we're going to select just a few columns and transform some others, e.g. we are going to group the data by city. There's no need to sum the male and female populations because the source already did that for us. We can corroborate this by choosing a random city and comparing it with this link: https://suburbanstats.org/

Some columns are going to be discarded, like race, number of veterans, average household size, etc., because they don't fit the purpose of the project.

Columns of `us_city`: 
* city (Name of the city)
* male_population (Group by city)
* female_population (Group by city)
* total_population (male + female population)
* state_prefix

In [108]:
# Keep only the city-level columns. Dropping 'Race'/'Count' leaves repeated
# rows per city, which drop_duplicates() collapses.
city_columns = ['City', 'State', 'Male Population', 'Female Population', 'Total Population']
df_city = df_city.loc[:, city_columns].drop_duplicates()
df_city.head()

Unnamed: 0,City,State,Male Population,Female Population,Total Population
0,Silver Spring,Maryland,40601.0,41862.0,82463
1,Quincy,Massachusetts,44129.0,49500.0,93629
2,Hoover,Alabama,38040.0,46799.0,84839
3,Rancho Cucamonga,California,88127.0,87105.0,175232
4,Newark,New Jersey,138040.0,143873.0,281913


### Data Cleaning for `us_city` table

Checking for null values in every column

In [109]:
# Per-column flag: True when the column holds at least one missing value.
df_city.isna().any()

City                 False
State                False
Male Population       True
Female Population     True
Total Population     False
dtype: bool

We have columns `Male Population` and `Female Population` with null values. Let's see what are the other values for those rows.

In [110]:
# Keep only the rows in which at least one column is missing.
has_missing = df_city.isna().any(axis=1)
nan_city = df_city[has_missing]
nan_city.head()

Unnamed: 0,City,State,Male Population,Female Population,Total Population
333,The Villages,Florida,,,72590


We have just one row with null values. Let's replace those NaN values using `fillna`.

In [111]:
# Replace the missing male/female counts with 0 and rebind the result.
# NOTE(review): after this fill, row 333 has 0 + 0 while its
# 'Total Population' stays 72590 -- confirm that is acceptable downstream.
df_city = df_city.fillna({'Male Population': 0, 'Female Population': 0})
df_city.loc[333]

City                 The Villages
State                     Florida
Male Population                 0
Female Population               0
Total Population            72590
Name: 333, dtype: object

In [112]:
# Re-run the per-column missing-value check after the fill.
df_city.isna().any()

City                 False
State                False
Male Population      False
Female Population    False
Total Population     False
dtype: bool

Add State Prefix. This column will be useful to match every table of our schema.

In [113]:
from datasets.data import Data

def state_prefix(name):
    """Return the state prefix whose full state name equals ``name``.

    Scans ``Data.states`` (a mapping of prefix -> state name) and returns the
    first key whose value equals ``name``.

    Args:
        name: Full state name, e.g. ``"Alabama"``.

    Returns:
        The matching prefix, or ``None`` when no state has that name. Returning
        ``None`` (instead of letting ``next`` raise ``StopIteration``) lets
        unmatched rows surface in a later ``isnull()`` check on the column.
    """
    matches = (prefix for prefix, state_name in Data.states.items() if state_name == name)
    return next(matches, None)
   
# Derive the prefix column from the full state name, one lookup per row.
df_city['state_prefix'] = df_city['State'].apply(state_prefix)
df_city.head()

Unnamed: 0,City,State,Male Population,Female Population,Total Population,state_prefix
0,Silver Spring,Maryland,40601.0,41862.0,82463,MD
1,Quincy,Massachusetts,44129.0,49500.0,93629,MA
2,Hoover,Alabama,38040.0,46799.0,84839,AL
3,Rancho Cucamonga,California,88127.0,87105.0,175232,CA
4,Newark,New Jersey,138040.0,143873.0,281913,NJ


In [115]:
# Rows whose state name could not be mapped to a prefix (expected: none).
missing_prefix = df_city['state_prefix'].isna()
df_city[missing_prefix]

Unnamed: 0,City,State,Male Population,Female Population,Total Population,state_prefix


Create a list of (state prefix, city) pairs, used to add the state column to `dim_city_temperature`.

In [131]:
# Unique [state_prefix, City] pairs as a plain Python list of lists.
state_city = (
    df_city.loc[:, ['state_prefix', 'City']]
    .drop_duplicates()
    .values
    .tolist()
)

## 2. Dimension Table: `dim_city_temperature` 

Let's create a Spark Session and write the data in parquet files filtered by Country. 
For our main dataset (Immigration) we have only data for United States that's why we're going to filter this dataset by country. 

In [31]:
# Create (or reuse) the Spark session used by the rest of the notebook.

spark = SparkSession.builder\
    .appName('national_tourism')\
    .getOrCreate()

# One-time extract: keep only the United States rows of the global temperature
# CSV and store them as parquet partitioned by City. The guard makes the
# notebook runnable from a clean checkout while skipping the expensive rebuild
# when the extract already exists.
# NOTE(review): os.path.exists assumes 'weather.parquet' resolves on the local
# filesystem (relative to the working directory) -- confirm for non-local Spark setups.
if not os.path.exists('weather.parquet'):
    df_spark = spark.read.format('csv').option('header', 'true').load(os.getcwd() + '/datasets/GlobalLandTemperaturesByCity.csv')
    df_spark.filter("Country = 'United States'").write.partitionBy('City').parquet('weather.parquet')


In [132]:
# Load the weather extract from its parquet directory and preview a few rows.

df = spark.read.format('parquet').load('weather.parquet')
df.limit(5).toPandas()

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,Country,Latitude,Longitude,City
0,1758-03-01,6.422999999999999,3.742,United States,37.78N,93.56W,Springfield
1,1758-04-01,12.14,5.432,United States,37.78N,93.56W,Springfield
2,1758-05-01,16.997999999999998,3.76,United States,37.78N,93.56W,Springfield
3,1758-06-01,22.852,3.519,United States,37.78N,93.56W,Springfield
4,1758-07-01,25.195,4.888,United States,37.78N,93.56W,Springfield


Search for null values in AverageTemperature

In [133]:
# Number of rows that have no temperature reading.
df.where('AverageTemperature is null').count()

25765

What's the max date for those values? We need data from 2016 so we can delete other null values.

In [140]:
%%time
# Variant 1: latest/earliest `dt` among rows without a temperature, using the
# dict form of agg(). `dt` is compared as stored in the parquet file.
null_temperature_rows = df.filter('AverageTemperature is null')
max_date = null_temperature_rows.agg({'dt': 'max'}).collect()[0][0]
min_date = null_temperature_rows.agg({'dt': 'min'}).collect()[0][0]

print(f"Max date for null values: {max_date} \nMin date for null values {min_date}")

Max date for null values: 2013-09-01 
Min date for null values 1743-12-01
CPU times: user 9.53 ms, sys: 4.3 ms, total: 13.8 ms
Wall time: 1.15 s


In [134]:
%%time
# Variant 2: same two values, computed with select() + F.max / F.min and
# read back from the single result row by its alias.
null_temperature_rows = df.filter('AverageTemperature is null')
max_row = null_temperature_rows.select(F.max('dt').alias("MAX")).limit(1).collect()[0]
min_row = null_temperature_rows.select(F.min('dt').alias('MIN')).limit(1).collect()[0]
max_date = max_row["MAX"]
min_date = min_row["MIN"]

print(f"Max date for null values: {max_date} \nMin date for null values {min_date}")

Max date for null values: 2013-09-01 
Min date for null values 1743-12-01
CPU times: user 13 ms, sys: 8.11 ms, total: 21.2 ms
Wall time: 2.44 s


We need to keep in mind that we're working with Big Data. That's why it is always good to try alternatives and measure how much time and CPU each method consumes. In the two cells above, the first one (the `agg` version) finished in less wall time and used less CPU in the runs shown.

Now, it's time to delete those null values. 

In [135]:
# Drop every row that has a null in any column (not only AverageTemperature),
# then confirm that no null temperatures remain.
df = df.na.drop(how='any')
df.where('AverageTemperature is null').count()

0

Change `dt` type column to `DateType`

In [136]:
from datetime import datetime
# Convert the 'yyyy-MM-dd' strings in `dt` to DateType with Spark's built-in
# to_date instead of a Python UDF: the parsing runs inside the engine (no
# per-row Python round trip) and null inputs stay null rather than failing
# the job.
df_date = df.withColumn("dt", F.to_date(df.dt, "yyyy-MM-dd"))
df_date.limit(5).toPandas()

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,Country,Latitude,Longitude,City
0,1758-03-01,6.422999999999999,3.742,United States,37.78N,93.56W,Springfield
1,1758-04-01,12.14,5.432,United States,37.78N,93.56W,Springfield
2,1758-05-01,16.997999999999998,3.76,United States,37.78N,93.56W,Springfield
3,1758-06-01,22.852,3.519,United States,37.78N,93.56W,Springfield
4,1758-07-01,25.195,4.888,United States,37.78N,93.56W,Springfield


In [137]:
# Keep only the columns needed for the temperature dimension.
weather_columns = ['dt', 'AverageTemperature', 'Country', 'City']
df_dim_date = df_date.select(*weather_columns)
df_dim_date.limit(5).toPandas()

Unnamed: 0,dt,AverageTemperature,Country,City
0,1758-03-01,6.422999999999999,United States,Springfield
1,1758-04-01,12.14,United States,Springfield
2,1758-05-01,16.997999999999998,United States,Springfield
3,1758-06-01,22.852,United States,Springfield
4,1758-07-01,25.195,United States,Springfield


Add state column into DataFrame

In [145]:
from pyspark.sql.functions import udf

# City -> state prefix lookup built once from `state_city`. setdefault keeps the
# first pair listed for each city, i.e. the same pick a first-match scan of the
# list would make, without rescanning the list for every row.
_state_by_city = {}
for _prefix, _city in state_city:
    _state_by_city.setdefault(_city, _prefix)


@udf
def state(string):
    """Return the state prefix recorded for city name ``string``.

    Args:
        string: City name taken from the weather data.

    Returns:
        The prefix of the first ``state_city`` pair whose city equals
        ``string``, or ``None`` when the city is not listed. Returning
        ``None`` avoids the ``StopIteration`` an exhausted ``next()`` would
        raise inside the UDF for unknown cities.

    NOTE(review): the lookup uses the city name only. If the same city name
    occurs in more than one state, the first listed pair wins -- verify the
    result against Latitude/Longitude where that matters.
    """
    return _state_by_city.get(string)
    
# Attach the looked-up state prefix as a new 'State' column and preview it.
state_column = state(df_dim_date['City'])
df_weather = df_dim_date.withColumn('State', state_column)
df_weather.limit(10).toPandas()



Unnamed: 0,dt,AverageTemperature,Country,City,State
0,1758-03-01,6.422999999999999,United States,Springfield,IL
1,1758-04-01,12.14,United States,Springfield,IL
2,1758-05-01,16.997999999999998,United States,Springfield,IL
3,1758-06-01,22.852,United States,Springfield,IL
4,1758-07-01,25.195,United States,Springfield,IL
5,1758-08-01,22.704,United States,Springfield,IL
6,1758-09-01,18.44,United States,Springfield,IL
7,1758-11-01,6.992000000000001,United States,Springfield,IL
8,1758-12-01,0.417,United States,Springfield,IL
9,1759-01-01,1.217,United States,Springfield,IL


## 3. Dimension Table: `dim_airport` 

In [32]:
# Read the airport codes CSV (first line is the header) from the datasets folder.
airport_csv_path = os.getcwd() + "/datasets/airport-codes_csv.csv"
df = spark.read.option("header", "true").csv(airport_csv_path)
df.limit(5).toPandas()

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates
0,00A,heliport,Total Rf Heliport,11,,US,US-PA,Bensalem,00A,,00A,"-74.93360137939453, 40.07080078125"
1,00AA,small_airport,Aero B Ranch Airport,3435,,US,US-KS,Leoti,00AA,,00AA,"-101.473911, 38.704022"
2,00AK,small_airport,Lowell Field,450,,US,US-AK,Anchor Point,00AK,,00AK,"-151.695999146, 59.94919968"
3,00AL,small_airport,Epps Airpark,820,,US,US-AL,Harvest,00AL,,00AL,"-86.77030181884766, 34.86479949951172"
4,00AR,closed,Newport Hospital & Clinic Heliport,237,,US,US-AR,Newport,,,,"-91.254898, 35.6087"


As our main dataset is related to US immigration, the table `dim_airport` will only contain US airports. 
This dataframe will be filtered by `iso_country = 'US'`.

Columns for `dim_airport`: 
- ident
- type
- name
- iso_country
- state (iso_region column)
- municipality

In [33]:
# US airports that have a municipality, restricted to the dimension's columns.
airport_columns = ['ident', 'type', 'name', 'iso_country', 'iso_region', 'municipality']
us_airports = df.filter('iso_country = "US" and municipality is not null')
df_airport = us_airports.select(*airport_columns)
df_airport.limit(5).toPandas()

Unnamed: 0,ident,type,name,iso_country,iso_region,municipality
0,00A,heliport,Total Rf Heliport,US,US-PA,Bensalem
1,00AA,small_airport,Aero B Ranch Airport,US,US-KS,Leoti
2,00AK,small_airport,Lowell Field,US,US-AK,Anchor Point
3,00AL,small_airport,Epps Airpark,US,US-AL,Harvest
4,00AR,closed,Newport Hospital & Clinic Heliport,US,US-AR,Newport


Create a new column for state using iso_region

In [35]:
# Drop the first three characters of iso_region (e.g. 'US-PA' -> 'PA') and
# expose the result as `state`. The native SQL substring (1-based, so position 4)
# replaces the Python UDF: it runs inside the engine and yields null for a null
# iso_region instead of failing. The column is rewritten in place and then
# renamed so the column order stays the same.
df_airport_state = (
    df_airport
    .withColumn('iso_region', F.expr('substring(iso_region, 4)'))
    .withColumnRenamed('iso_region', 'state')
)
df_airport_state.limit(5).toPandas()

Unnamed: 0,ident,type,name,iso_country,state,municipality
0,00A,heliport,Total Rf Heliport,US,PA,Bensalem
1,00AA,small_airport,Aero B Ranch Airport,US,KS,Leoti
2,00AK,small_airport,Lowell Field,US,AK,Anchor Point
3,00AL,small_airport,Epps Airpark,US,AL,Harvest
4,00AR,closed,Newport Hospital & Clinic Heliport,US,AR,Newport


In [40]:
# Spot check: distinct municipality values for airports in Clayton, NY.
clayton_ny = df_airport_state.filter('state = "NY" and municipality = "Clayton"')
clayton_ny.select('municipality').distinct().toPandas()

Unnamed: 0,municipality
0,Clayton



## 4. Dimension Table: `dim_country` 

This table will be the relationship for `Citizen` and `Resident` columns in the fact table immigration.

All data came from `I94_SAS_Labels_Descriptions.SAS` file

Columns: 
- id_country
- country


In [5]:
from datasets.data import Data

# One row per entry of Data.countries: key -> id_country, value -> country.
columns = ['id_country', 'country']
country_rows = list(Data.countries.items())
df_country = pd.DataFrame(country_rows, columns=columns)
df_country.head()

Unnamed: 0,id_country,country
0,582,MEXICO
1,236,AFGHANISTAN
2,101,ALBANIA
3,316,ALGERIA
4,102,ANDORRA


## 5. Dimension Table: `dim_state` 

Columns: 
- id_state
- state_prefix
- state_name


In [20]:
# One row per entry of Data.states, with a running id that starts at 1.
columns = ['id_state', 'state_prefix', 'state_name']
state_rows = [
    (position, prefix, name)
    for position, (prefix, name) in enumerate(Data.states.items(), start=1)
]
df_state = pd.DataFrame(state_rows, columns=columns)
df_state.head()

Unnamed: 0,id_state,state_prefix,state_name
0,1,AK,Alaska
1,2,AL,Alabama
2,3,AR,Arkansas
3,4,AS,American Samoa
4,5,AZ,Arizona


In [91]:
from datasets.data import Data

# Print every prefix in Data.states whose state name equals `state_name`.
state_name = "Alabama"

matching_prefixes = [prefix for prefix, name in Data.states.items() if name == state_name]
print(*matching_prefixes)

AL
