In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder\
    .getOrCreate()

In [63]:
airport = spark.read.parquet("airport")

AnalysisException: 'Unable to infer schema for Parquet. It must be specified manually.;'

In [5]:
imgr = spark.read.parquet("immigration")

In [35]:
airport.dropna(how="any", subset=["ident","iso_region"])
airport.count()

55075

In [36]:
imgr.dropna(how="any", subset=["addr"])
imgr.count()

3096313

In [8]:
airport.printSchema()

root
 |-- ident: string (nullable = true)
 |-- type: string (nullable = true)
 |-- name: string (nullable = true)
 |-- iso_country: string (nullable = true)
 |-- iso_region: string (nullable = true)
 |-- continent: string (nullable = true)



In [9]:
imgr.printSchema()

root
 |-- id: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- res: integer (nullable = true)
 |-- port: string (nullable = true)
 |-- addr: string (nullable = true)
 |-- birth: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- visa: string (nullable = true)
 |-- cit: integer (nullable = true)



In [11]:
airport.select("iso_region").distinct().count()

1045

In [12]:
airport.show(5)

+-----+-------------+--------------------+-----------+----------+---------+
|ident|         type|                name|iso_country|iso_region|continent|
+-----+-------------+--------------------+-----------+----------+---------+
| 01TA|     heliport|Thirty Thirty Mat...|         US|        TX|       NA|
|  05S|small_airport|   Vernonia Airfield|         US|        OR|       NA|
| 09ME|small_airport|Perrotti Skyranch...|         US|        ME|       NA|
| 09NY|     heliport|Spring Lake Fire ...|         US|        NY|       NA|
| 0II9|small_airport|     Winters Airport|         US|        IN|       NA|
+-----+-------------+--------------------+-----------+----------+---------+
only showing top 5 rows



In [65]:
from pyspark.sql.functions import *
airport = airport.withColumn("ident", regexp_replace("ident", "0", ""))

In [46]:
imgr.select("port").orderBy("port").show(20)

+----+
|port|
+----+
| 5KE|
| 5KE|
| 5KE|
| 5T6|
| 5T6|
| 5T6|
| 5T6|
| ABG|
| ABG|
| ABG|
| ABG|
| ABG|
| ABG|
| ABG|
| ABG|
| ABG|
| ABG|
| ABG|
| ABQ|
| ABQ|
+----+
only showing top 20 rows



In [13]:
imgr.show(5)

+-------+----+-----+---+----+----+-----+------+----+---+
|     id|year|month|res|port|addr|birth|gender|visa|cit|
+-------+----+-----+---+----+----+-----+------+----+---+
|4284274|2016|    4|102| NYC|  NY| 1968|     M|  WB|135|
|  27540|2016|    4|103| LOS|   0| 1981|     M|  B2|135|
|4284275|2016|    4|104| DEN|  CO| 1951|     0|  WB|135|
|  27541|2016|    4|104| CLT|  FL| 1949|     F|  WT|135|
|4284276|2016|    4|104| NYC|  NY| 1943|     F|  WT|135|
+-------+----+-----+---+----+----+-----+------+----+---+
only showing top 5 rows



In [14]:
imgr.select("addr").distinct().count()

458

In [28]:
not_known_cities = (imgr.select("addr").distinct()).subtract(airport.select("iso_region").distinct())

In [31]:
not_known_cities.groupby("addr").count().orderBy("count", ascending=False).show(20)

+----+-----+
|addr|count|
+----+-----+
|  .N|    1|
|  RG|    1|
|  YH|    1|
|  RF|    1|
|  FT|    1|
|  FI|    1|
|  IC|    1|
|  PU|    1|
|  EA|    1|
|  UA|    1|
|  YN|    1|
|  OI|    1|
|  MX|    1|
|  HW|    1|
|  JF|    1|
|  QL|    1|
|  EE|    1|
|  VG|    1|
|  ZN|    1|
|  S6|    1|
+----+-----+
only showing top 20 rows



In [20]:
city = spark.read.parquet("city")

In [21]:
city.show(5)

+----------------+-------------+----------+
|            City|        State|State_Code|
+----------------+-------------+----------+
|   Silver Spring|     Maryland|        MD|
|          Quincy|Massachusetts|        MA|
|          Hoover|      Alabama|        AL|
|Rancho Cucamonga|   California|        CA|
|          Newark|   New Jersey|        NJ|
+----------------+-------------+----------+
only showing top 5 rows



In [24]:
(city.select("State_Code").distinct()).exceptAll(airport.select("iso_region").distinct())

DataFrame[State_Code: string]

In [25]:
city.select("State_Code").distinct().count()

49

In [32]:
city.count()

2891

In [70]:
airport = df_airport

In [71]:
airport.createOrReplaceTempView("airport")
#imgr.createOrReplaceTempView("imgr")

In [42]:
spark.sql("""
    SELECT count(*)
    FROM airport
    UNION
    SELECT count(*)
    FROM imgr
""").show()

+--------+
|count(1)|
+--------+
|   55075|
| 3096313|
+--------+



In [76]:
airport.select("*").where(airport.ident == "5KE").show()

+-----+-------------+--------------------+---------+-----------+----------+
|ident|         type|                name|continent|iso_country|iso_region|
+-----+-------------+--------------------+---------+-----------+----------+
|  5KE|seaplane_base|Ketchikan Harbor ...|       NA|         US|        AK|
+-----+-------------+--------------------+---------+-----------+----------+



In [77]:
imgr.select("*").where(imgr.port == "5KE").show()

+-------+----+-----+---+----+----+-----+------+----+---+
|     id|year|month|res|port|addr|birth|gender|visa|cit|
+-------+----+-----+---+----+----+-----+------+----+---+
|5706064|2016|    4|135| 5KE|  AK| 1965|     M|  B2|135|
|  10721|2016|    4|111| 5KE|  AK| 1950|     F|  B2|111|
|  10722|2016|    4|111| 5KE|  AK| 1953|     M|  B2|111|
+-------+----+-----+---+----+----+-----+------+----+---+



In [108]:
imgr.select("addr").groupby("addr").count().orderBy("count", ascending=False).show(10)

+----+------+
|addr| count|
+----+------+
|  FL|621701|
|  NY|553677|
|  CA|470386|
|  HI|168764|
|   0|152398|
|  TX|134321|
|  NV|114609|
|  GU| 94107|
|  IL| 82126|
|  NJ| 76531|
+----+------+
only showing top 10 rows



In [118]:
import pandas as pd

In [141]:
f = open("country_code.txt", "r")
lines = f.readlines()
df = pd.DataFrame((line.strip().split('=') for line in lines), columns=["code", "name"]).astype({"code": "int32"})
df[["name"]] = df.name.map(lambda x: x.strip().strip("'"))
df_country = spark.createDataFrame(df)
df_country.write.parquet("country", mode="overwrite")

     code                                               name
0     582  MEXICO Air Sea, and Not Reported (I-94, no lan...
1     236                                        AFGHANISTAN
2     101                                            ALBANIA
3     316                                            ALGERIA
4     102                                            ANDORRA
5     324                                             ANGOLA
6     529                                           ANGUILLA
7     518                                    ANTIGUA-BARBUDA
8     687                                         ARGENTINA 
9     151                                            ARMENIA
10    532                                              ARUBA
11    438                                          AUSTRALIA
12    103                                            AUSTRIA
13    152                                         AZERBAIJAN
14    512                                            BAHAMAS
15    298               

In [142]:
df_country.write.parquet("country", mode="overwrite")

In [143]:
df_country.createOrReplaceTempView("country")

In [156]:
df_q1 = spark.sql("""
    SELECT visa, name, count(*) count
    FROM imgr
    JOIN country
    ON res=code
    GROUP BY visa, name
    UNION
    SELECT visa, '-total-', count(*) count
    FROM imgr
    JOIN country
    ON res=code
    GROUP BY visa
    ORDER BY visa, count DESC
    LIMIT 20
""")

In [157]:
df_q1.write.parquet("country_from_by_visa", mode="overwrite")

In [322]:
imgr.select("visa").dropDuplicates().show(10)

Py4JJavaError: An error occurred while calling o4853.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 944.0 failed 1 times, most recent failure: Lost task 0.0 in stage 944.0 (TID 60734, localhost, executor driver): java.io.FileNotFoundException: File file:/home/workspace/immigration/cit=135/part-00000-30214137-f510-4b7d-be0a-ab212b51fa4d.c000.snappy.parquet does not exist
It is possible the underlying files have been updated. You can explicitly invalidate the cache in Spark by running 'REFRESH TABLE tableName' command in SQL or by recreating the Dataset/DataFrame involved.
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:127)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:177)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:101)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.scan_nextBatch_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.agg_doAggregateWithKeys_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
	at org.apache.spark.scheduler.Task.run(Task.scala:121)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1889)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1877)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1876)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1876)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2110)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2059)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2048)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:365)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3383)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2544)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2544)
	at org.apache.spark.sql.Dataset$$anonfun$53.apply(Dataset.scala:3364)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3363)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2544)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2758)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:254)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:291)
	at sun.reflect.GeneratedMethodAccessor69.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.io.FileNotFoundException: File file:/home/workspace/immigration/cit=135/part-00000-30214137-f510-4b7d-be0a-ab212b51fa4d.c000.snappy.parquet does not exist
It is possible the underlying files have been updated. You can explicitly invalidate the cache in Spark by running 'REFRESH TABLE tableName' command in SQL or by recreating the Dataset/DataFrame involved.
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:127)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:177)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:101)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.scan_nextBatch_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.agg_doAggregateWithKeys_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
	at org.apache.spark.scheduler.Task.run(Task.scala:121)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


In [158]:
city.createOrReplaceTempView("city")

In [208]:
df_q2 = spark.sql("""
    SELECT 
        case when year-birth < 20 then 'under 20'
        when year-birth >=20 and year-birth <=25 then '20-25'
        when year-birth >=26 and year-birth <=40 then '26-40'
        when year-birth >=41 and year-birth <=60 then '41-60'
        when year-birth > 60 then 'above 60' end as Group,
        State, count(*) count, age.Median_age
    FROM imgr
    JOIN (SELECT State, State_Code, cast(avg(Median_age) as int) Median_age
            FROM city
            JOIN age
            ON city.city=age.city
            GROUP BY State, State_Code) age
    ON addr=state_code
    GROUP BY Group, State, Median_age
    ORDER BY Group, count DESC
""")

In [214]:
df_q2.write.parquet("age_by_state_and_immigrants_age", mode="overwrite")

In [167]:
age = spark.read.parquet("age")
age.createOrReplaceTempView("age")

In [253]:
df_q3 = spark.sql("""
    SELECT name as Country_from, t1.State as State_to, count(*) count, t2.Race, t2.Count as Race_Count
    FROM imgr
    JOIN (SELECT DISTINCT State, State_Code FROM city) t1
    ON addr=state_code
    JOIN country
    ON res=code
    JOIN (SELECT city.State, Race, ROUND(100*sum(Count)/t3.total, 2) Count
        FROM race
        JOIN city
        on race.city=city.city
        JOIN (SELECT State, sum(Count) total
        FROM race
        JOIN city
        on race.city=city.city
        GROUP BY State) t3
    ON city.State = t3.State
    GROUP BY city.State, Race, t3.total) t2
    ON t1.State=t2.State
    GROUP BY name, t1.State, t2.Race, t2.Count
    ORDER BY name, count DESC, Race_count DESC
    LIMIT 10
""")
df_q3.write.parquet("race_by_state")

In [229]:
race.createOrReplaceTempView("race")

In [266]:
spark.sql("""
    SELECT city.state, city.state_code, sum(foreign.Foreign_Born) Foreign_Born, sum(population.total) total
            FROM foreign
            JOIN city
            ON city.city=foreign.city
            JOIN population
            ON city.city=population.city
            GROUP BY city.state, city.state_code
""").show()

+--------------+----------+------------+--------+
|         state|state_code|Foreign_Born|   total|
+--------------+----------+------------+--------+
|   Mississippi|        MS|       10844|  547454|
|          Utah|        UT|      132819| 1050591|
|  South Dakota|        SD|       15309|  245098|
|      Kentucky|        KY|       66488|  929877|
|    California|        CA|     8203246|28009912|
|      Nebraska|        NE|       71221|  721233|
| New Hampshire|        NH|       27199|  198198|
|      Delaware|        DE|       21474|  375808|
|     Minnesota|        MN|      342522| 2613437|
|North Carolina|        NC|      677624| 5836143|
|        Nevada|        NV|      481337| 2240744|
|    Washington|        WA|      440962| 2500107|
|     Louisiana|        LA|      100209| 1570596|
|         Idaho|        ID|       28126|  398883|
|    New Mexico|        NM|       89112|  839042|
|         Maine|        ME|      190540| 1398118|
|     Tennessee|        TN|      187080| 2561162|


In [262]:
population.createOrReplaceTempView("population")

In [269]:
df_q4 = spark.sql("""
    SELECT t1.State, round(100*Foreign_Born/total, 2) Foreign_Born, count(*) as count
    FROM (SELECT city.state, city.state_code, sum(foreign.Foreign_Born) Foreign_Born, sum(population.total) total
            FROM foreign
            JOIN city
            ON city.city=foreign.city
            JOIN population
            ON city.city=population.city
            GROUP BY city.state, city.state_code) t1
    JOIN imgr
    ON imgr.addr=t1.State_Code
    GROUP BY t1.State, Foreign_Born, total
    ORDER BY count DESC
""")
df_q4.write.parquet("foreign_born_by_state")

In [271]:
imgr.printSchema()
city.printSchema()
age.printSchema()
race.printSchema()
population.printSchema()
foreign.printSchema()
df_country.printSchema()

root
 |-- id: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- res: integer (nullable = true)
 |-- port: string (nullable = true)
 |-- addr: string (nullable = true)
 |-- birth: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- visa: string (nullable = true)
 |-- cit: integer (nullable = true)

root
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- State_Code: string (nullable = true)

root
 |-- City: string (nullable = true)
 |-- Median_age: double (nullable = true)

root
 |-- City: string (nullable = true)
 |-- Race: string (nullable = true)
 |-- Count: integer (nullable = true)

root
 |-- City: string (nullable = true)
 |-- Male: integer (nullable = true)
 |-- Female: integer (nullable = true)
 |-- total: integer (nullable = true)

root
 |-- City: string (nullable = true)
 |-- Foreign_Born: integer (nullable = true)

root
 |-- code: long (nullable = true)
 |-- name: string (nullab

In [275]:
df_q1.printSchema()
df_q1.limit(5).toPandas()

root
 |-- visa: string (nullable = true)
 |-- name: string (nullable = true)
 |-- count: long (nullable = false)



Unnamed: 0,visa,name,count
0,B1,-total-,212410
1,B1,"CHINA, PRC",32284
2,B1,"MEXICO Air Sea, and Not Reported (I-94, no lan...",31031
3,B1,INDIA,22485
4,B1,BRAZIL,14044


In [274]:
df_q2.printSchema()
df_q2.limit(5).toPandas()

root
 |-- Group: string (nullable = true)
 |-- State: string (nullable = true)
 |-- count: long (nullable = false)
 |-- Median_age: integer (nullable = true)



Unnamed: 0,Group,State,count,Median_age
0,20-25,New York,41284,35
1,20-25,Florida,33831,39
2,20-25,California,33033,36
3,20-25,Hawaii,8225,41
4,20-25,Texas,6191,33


In [276]:
df_q3.printSchema()
df_q3.limit(5).toPandas()

root
 |-- Country_from: string (nullable = true)
 |-- State_to: string (nullable = true)
 |-- count: long (nullable = false)
 |-- Race: string (nullable = true)
 |-- Race_Count: double (nullable = true)



Unnamed: 0,Country_from,State_to,count,Race,Race_Count
0,AFGHANISTAN,California,34,White,47.48
1,AFGHANISTAN,California,34,Hispanic or Latino,30.73
2,AFGHANISTAN,California,34,Asian,13.83
3,AFGHANISTAN,California,34,Black or African-American,6.7
4,AFGHANISTAN,California,34,American Indian and Alaska Native,1.26


In [278]:
df_q4.printSchema()
df_q4.limit(5).toPandas()

root
 |-- State: string (nullable = true)
 |-- Foreign_Born: double (nullable = true)
 |-- count: long (nullable = false)



Unnamed: 0,State,Foreign_Born,count
0,Florida,22.85,621701
1,New York,33.41,553677
2,California,29.29,470386
3,Hawaii,28.72,168764
4,Texas,20.79,134321


In [281]:
imgr.select("id").groupby("id").count().filter(col("count") > 1).count()

0

In [285]:
age.select("City").groupby("City").count().filter(col("count") > 1).show()

+------------+-----+
|        City|count|
+------------+-----+
| Springfield|    3|
|Fayetteville|    2|
|       Allen|    2|
|      Albany|    2|
|    Portland|    2|
|      Aurora|    2|
|    Columbus|    2|
|    Pasadena|    2|
|    Glendale|    2|
|    Lakewood|    2|
|  Wilmington|    2|
| Bloomington|    3|
| Westminster|    2|
|   Lafayette|    2|
|   Arlington|    2|
|     Jackson|    2|
|   Rochester|    2|
| Kansas City|    2|
|      Peoria|    2|
|Jacksonville|    2|
+------------+-----+
only showing top 20 rows



In [292]:
max_age = age.select("City", "Median_age").groupby("City").agg(max("Median_age"))

In [293]:
max_age.select("City").groupby("City").count().filter(col("count") > 1).count()

0

In [294]:
population.select("City").groupby("City").count().filter(col("count") > 1).count()

26

In [295]:
population.select("City").groupby("City").count().filter(col("count") > 1).show()

+------------+-----+
|        City|count|
+------------+-----+
| Springfield|    3|
|Fayetteville|    2|
|       Allen|    2|
|    Portland|    2|
|      Albany|    2|
|      Aurora|    2|
|    Columbus|    2|
|    Pasadena|    2|
|    Glendale|    2|
|    Lakewood|    2|
|  Wilmington|    2|
| Westminster|    2|
| Bloomington|    3|
|   Lafayette|    2|
|   Arlington|    2|
|     Jackson|    2|
|   Rochester|    2|
| Kansas City|    2|
|      Peoria|    2|
|Jacksonville|    2|
+------------+-----+
only showing top 20 rows



In [312]:
population.select("*").where(col("City") == "Fayetteville").show()

+------------+------+------+------+
|        City|  Male|Female| total|
+------------+------+------+------+
|Fayetteville| 41959| 40873| 82832|
|Fayetteville|101051|100914|201965|
+------------+------+------+------+



In [298]:
max_population = population.groupby("City").agg(max("Male"), max("Female"), max("total"))
max_population.select("City").groupby("City").count().filter(col("count") > 1).count()

0

In [301]:
df_country.select("Code").groupby("Code").count().filter(col("count") > 1).count()

0

In [307]:
foreign.select("City").groupby("City").count().filter(col("count") > 1).count()

26

In [308]:
race.select("City").groupby("City").count().filter(col("count") > 1).count()

565

In [309]:
foreign.select("*").where(col("City") == "Springfield").show()

+-----------+------------+
|       City|Foreign_Born|
+-----------+------------+
|Springfield|        4264|
|Springfield|        7765|
|Springfield|       16226|
+-----------+------------+



In [313]:
race.select("*").where(col("City") == "Fayetteville").orderBy("Race").show()

+------------+--------------------+------+
|        City|                Race| Count|
+------------+--------------------+------+
|Fayetteville|American Indian a...|  6603|
|Fayetteville|American Indian a...|  2058|
|Fayetteville|               Asian|  8949|
|Fayetteville|               Asian|  4707|
|Fayetteville|Black or African-...|  6927|
|Fayetteville|Black or African-...| 90625|
|Fayetteville|  Hispanic or Latino|  5535|
|Fayetteville|  Hispanic or Latino| 25080|
|Fayetteville|               White|102075|
|Fayetteville|               White| 68830|
+------------+--------------------+------+



In [317]:
min_race = race.groupby("City", "Race").agg(min("Count"))
min_race.select("City", "Race").groupby("City", "Race").count().filter(col("count") > 1).count()

0

In [319]:
min_race.select("*").where(col("City") == "Springfield").show(5)

+-----------+--------------------+----------+
|       City|                Race|min(Count)|
+-----------+--------------------+----------+
|Springfield|               White|     90935|
|Springfield|               Asian|      3871|
|Springfield|Black or African-...|     10026|
|Springfield|American Indian a...|      1602|
|Springfield|  Hispanic or Latino|      2738|
+-----------+--------------------+----------+

