In [6]:
import pandas as pd
import datetime
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col, round
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format,dayofweek
from pyspark.sql.functions import expr
from pyspark.sql.functions import unix_timestamp,from_unixtime
from pyspark.sql.types import StructType,StructField,StringType,IntegerType,DoubleType,DecimalType,TimestampType,ArrayType

In [2]:
# Create (or reuse) a Hive-enabled Spark session, requesting the
# spark-sas7bdat package via spark.jars.packages.
spark = (
    SparkSession.builder
    .config("spark.jars.packages", "saurfang:spark-sas7bdat:2.0.0-s_2.11")
    .enableHiveSupport()
    .getOrCreate()
)


In [8]:
# --- Input data locations -------------------------------------------------
airport_file = "../data/airport-codes_csv.csv"
city_file = "../data/us-cities-demographics.csv"
temperature_file = "/data2/GlobalLandTemperaturesByCity.csv"
immigration_dir = "/data/18-83510-I94-Data-2016/"
immigration_dimensions = "../data/I94_SAS_Labels_Descriptions.SAS"
immigration_sample = "../data/immigration_data_sample.csv"

# Misspelled legacy names kept as aliases so any cell that still refers to
# them keeps working; prefer `city_file` / `immigration_dimensions`.
city_fiele = city_file
immigration_dimesions = immigration_dimensions

# --- Lookup tables written under ../output --------------------------------
i94addrl_file = "../output/i94addrl.csv"
i94cntyl_file = "../output/i94cntyl.csv"
i94model_file = "../output/i94model.csv"
i94prtl_file = "../output/i94prtl.csv"
i94visa_file = "../output/i94visa.csv"

* The `sas_data` parquet looks fine, except that many fields are stored as double instead of integer.

In [3]:
# Load the immigration records from the parquet directory under ../output.
df_spark = spark.read.format("parquet").load("../output/sas_data")

In [4]:
# Quick look at the immigration frame: one sample row, the row count, and
# the schema.
df_spark.show(1)
# Only a cell's last expression is auto-displayed, so the count has to be
# printed explicitly; otherwise the Spark job runs and its result is lost.
print(df_spark.count())
df_spark.printSchema()

+---------+------+------+------+------+-------+-------+-------+-------+-------+------+-------+-----+--------+--------+-----+-------+-------+-------+-------+-------+--------+------+------+-------+--------------+-----+--------+
|    cicid| i94yr|i94mon|i94cit|i94res|i94port|arrdate|i94mode|i94addr|depdate|i94bir|i94visa|count|dtadfile|visapost|occup|entdepa|entdepd|entdepu|matflag|biryear| dtaddto|gender|insnum|airline|        admnum|fltno|visatype|
+---------+------+------+------+------+-------+-------+-------+-------+-------+------+-------+-----+--------+--------+-----+-------+-------+-------+-------+-------+--------+------+------+-------+--------------+-----+--------+
|5748517.0|2016.0|   4.0| 245.0| 438.0|    LOS|20574.0|    1.0|     CA|20582.0|  40.0|    1.0|  1.0|20160430|     SYD| null|      G|      O|   null|      M| 1976.0|10292016|     F|  null|     QF|9.495387003E10|00011|      B1|
+---------+------+------+------+------+-------+-------+-------+-------+-------+------+-------+--

In [9]:
# Number of distinct cicid values in the loaded data (about 3 million).
(
    df_spark
    .select("cicid")
    .distinct()
    .count()
)

3096313

In [10]:
# Row counts per port of entry, largest first; NYC receives the most people.
(
    df_spark
    .groupby("i94port")
    .count()
    .orderBy(col("count").desc())
    .show()
)

+-------+------+
|i94port| count|
+-------+------+
|    NYC|485916|
|    MIA|343941|
|    LOS|310163|
|    SFR|152586|
|    ORL|149195|
|    HHW|142720|
|    NEW|136122|
|    CHI|130564|
|    HOU|101481|
|    FTL| 95977|
|    ATL| 92579|
|    LVG| 89280|
|    AGA| 80919|
|    WAS| 74835|
|    DAL| 71809|
|    BOS| 57354|
|    SEA| 47719|
|    PHO| 38890|
|    DET| 37832|
|    TAM| 25632|
+-------+------+
only showing top 20 rows



In [11]:
# Row counts per visa type, largest first.
(
    df_spark
    .groupby("visatype")
    .count()
    .orderBy(col("count").desc())
    .show()
)

+--------+-------+
|visatype|  count|
+--------+-------+
|      WT|1309059|
|      B2|1117897|
|      WB| 282983|
|      B1| 212410|
|     GMT|  89133|
|      F1|  39016|
|      E2|  19383|
|      CP|  14758|
|      E1|   3743|
|       I|   3176|
|      F2|   2984|
|      M1|   1317|
|      I1|    234|
|     GMB|    150|
|      M2|     49|
|     SBP|     11|
|     CPL|     10|
+--------+-------+



In [12]:
# Row counts per i94mode, largest first. Most people enter the U.S. by air
# (presumably mode 1.0 -- confirm against the I94 label descriptions).
(
    df_spark
    .groupby("i94mode")
    .count()
    .orderBy(col("count").desc())
    .show()
)

+-------+-------+
|i94mode|  count|
+-------+-------+
|    1.0|2994505|
|    3.0|  66660|
|    2.0|  26349|
|    9.0|   8560|
|   null|    239|
+-------+-------+



In [13]:
# Row counts per visapost value, largest first.
(
    df_spark
    .groupby("visapost")
    .count()
    .orderBy(col("count").desc())
    .show()
)

+--------+-------+
|visapost|  count|
+--------+-------+
|    null|1881250|
|     MEX|  84720|
|     SPL|  65678|
|     BNS|  62032|
|     GUZ|  48298|
|     BGT|  46074|
|     CRS|  37137|
|     BEJ|  36703|
|     SHG|  35507|
|     GDL|  30970|
|     RDJ|  29943|
|     TLV|  28903|
|     BMB|  28108|
|     MDR|  26497|
|     GYQ|  26231|
|     SDO|  20924|
|     MNL|  19513|
|     MTR|  18105|
|     LMA|  17479|
|     SNJ|  16717|
+--------+-------+
only showing top 20 rows



In [14]:
# Row counts per gender value (including null), largest first.
(
    df_spark
    .groupby("gender")
    .count()
    .orderBy(col("count").desc())
    .show()
)

+------+-------+
|gender|  count|
+------+-------+
|     M|1377224|
|     F|1302743|
|  null| 414269|
|     X|   1610|
|     U|    467|
+------+-------+



In [15]:
# Row counts per birth year in ascending year order; show up to 100 rows.
(
    df_spark
    .groupby("biryear")
    .count()
    .orderBy("biryear")
    .show(100)
)

+-------+-----+
|biryear|count|
+-------+-----+
|   null|  802|
| 1902.0|    1|
| 1905.0|    1|
| 1906.0|    1|
| 1907.0|    2|
| 1908.0|    2|
| 1909.0|    1|
| 1911.0|    2|
| 1913.0|    1|
| 1914.0|    4|
| 1915.0|    2|
| 1916.0|   24|
| 1917.0|   19|
| 1918.0|   26|
| 1919.0|   52|
| 1920.0|   46|
| 1921.0|   88|
| 1922.0|  104|
| 1923.0|  185|
| 1924.0|  241|
| 1925.0|  319|
| 1926.0|  463|
| 1927.0|  638|
| 1928.0|  884|
| 1929.0| 1204|
| 1930.0| 1594|
| 1931.0| 1999|
| 1932.0| 2500|
| 1933.0| 2965|
| 1934.0| 3784|
| 1935.0| 4629|
| 1936.0| 5635|
| 1937.0| 6813|
| 1938.0| 8019|
| 1939.0| 9340|
| 1940.0|10897|
| 1941.0|12305|
| 1942.0|14198|
| 1943.0|16238|
| 1944.0|18559|
| 1945.0|19988|
| 1946.0|24891|
| 1947.0|28451|
| 1948.0|29576|
| 1949.0|32063|
| 1950.0|33667|
| 1951.0|34141|
| 1952.0|37002|
| 1953.0|38333|
| 1954.0|41352|
| 1955.0|43914|
| 1956.0|45950|
| 1957.0|44921|
| 1958.0|45853|
| 1959.0|47674|
| 1960.0|49722|
| 1961.0|50288|
| 1962.0|53865|
| 1963.0|55673|
| 1964.0

In [16]:
# Row counts per airline, largest first: show the top 10, then display the
# number of distinct airline values as the cell result.
df_airline = (
    df_spark
    .groupby("airline")
    .count()
    .orderBy(col("count").desc())
)
df_airline.show(10)
df_airline.count()

+-------+------+
|airline| count|
+-------+------+
|     AA|310091|
|     UA|264271|
|     DL|252526|
|     BA|190997|
|     LH|120556|
|     VS|113384|
|   null| 83627|
|     AF| 81113|
|     KE| 71047|
|     JL| 69075|
+-------+------+
only showing top 10 rows



535

In [7]:

# Row counts per visapost value, largest first: show the top 10, then
# display the number of distinct visapost values as the cell result.
# Stored under its own name so this cell does not overwrite `df_airline`,
# which holds the per-airline counts.
df_visapost = (
    df_spark
    .groupby("visapost")
    .count()
    .orderBy(col("count"), ascending=False)
)
df_visapost.show(10)
df_visapost.count()

+--------+-------+
|visapost|  count|
+--------+-------+
|    null|1881250|
|     MEX|  84720|
|     SPL|  65678|
|     BNS|  62032|
|     GUZ|  48298|
|     BGT|  46074|
|     CRS|  37137|
|     BEJ|  36703|
|     SHG|  35507|
|     GDL|  30970|
+--------+-------+
only showing top 10 rows



531

* The latest records in the temperature data are from 2013, but the immigration data we want to analyze is from 2016, so the temperature data is not useful for this analysis.

In [11]:
# Explicit schema for the temperature CSV so `date` is parsed as a timestamp
# and the measurements as doubles; latitude/longitude stay as strings
# (values look like "57.05N").
schema = StructType([
    StructField('date', TimestampType(), True),
    StructField('temperature', DoubleType(), True),
    StructField('uncertainty', DoubleType(), True),
    StructField('city', StringType(), True),
    StructField('country', StringType(), True),
    StructField('latitude', StringType(), True),
    StructField('longitude', StringType(), True),
])

# Read the CSV (first line is a header) using the schema above.
df_temperature_raw = spark.read.option("header", True).csv(temperature_file, schema=schema)
df_temperature_raw.show(5)
# Only a cell's last expression is auto-displayed, so print the row count
# explicitly instead of discarding the result of a full scan.
print(df_temperature_raw.count())

# Number of records per calendar year, most recent year first, to see how
# far the data reaches.
df = (
    df_temperature_raw
    .withColumn("year", year("date"))
    .groupBy("year")
    .count()
    .orderBy("year", ascending=False)
)
df.show(20)
df.printSchema()


+-------------------+-----------+------------------+-----+-------+--------+---------+
|               date|temperature|       uncertainty| city|country|latitude|longitude|
+-------------------+-----------+------------------+-----+-------+--------+---------+
|1743-11-01 00:00:00|      6.068|1.7369999999999999|Århus|Denmark|  57.05N|   10.33E|
|1743-12-01 00:00:00|       null|              null|Århus|Denmark|  57.05N|   10.33E|
|1744-01-01 00:00:00|       null|              null|Århus|Denmark|  57.05N|   10.33E|
|1744-02-01 00:00:00|       null|              null|Århus|Denmark|  57.05N|   10.33E|
|1744-03-01 00:00:00|       null|              null|Århus|Denmark|  57.05N|   10.33E|
+-------------------+-----------+------------------+-----+-------+--------+---------+
only showing top 5 rows

+----+-----+
|year|count|
+----+-----+
|2013|31590|
|2012|42120|
|2011|42120|
|2010|42120|
|2009|42120|
|2008|42120|
|2007|42120|
|2006|42120|
|2005|42120|
|2004|42120|
|2003|42120|
|2002|42120|
|2001|