In [3]:
from pyspark.sql import SparkSession
spark = (SparkSession
         .builder
         .appName('SparkSql')
         .getOrCreate())

In [65]:
file =  '../Rural_Atlas_Update20/People.csv'
# columns = !powershell cat {file} -First 1
# for line in columns[0].split(','):
#     print(line)

In [39]:
people_df = spark.read.csv(file, inferSchema=True, header=True)
#people_df.show(1)
pop_df_2010 = people_df.select(
    'state', 'county','TotalPopEst2010','WhiteNonHispanicPct2010','HispanicPct2010', 'AsianNonHispanicPct2010')
pop_df_2010.show(5)

+-----+-------------+---------------+-----------------------+---------------+-----------------------+
|state|       county|TotalPopEst2010|WhiteNonHispanicPct2010|HispanicPct2010|AsianNonHispanicPct2010|
+-----+-------------+---------------+-----------------------+---------------+-----------------------+
|   US|United States|      309338421|                  63.75|          16.35|                   4.69|
|   AL|      Alabama|        4785448|                  67.04|           3.88|                   1.11|
|   AL|      Autauga|          54754|                  77.25|            2.4|                   0.86|
|   AL|      Baldwin|         183111|                   83.5|           4.38|                   0.74|
|   AL|      Barbour|          27330|                  46.75|           5.05|                   0.39|
+-----+-------------+---------------+-----------------------+---------------+-----------------------+
only showing top 5 rows



# SaveAsTable and load in sql 

In [41]:
pop_df_2010. write.saveAsTable('2010population', mode='overwrite')

In [43]:
sql_df = spark.sql('select * from 2010population')
sql_df.show(5)

+-----+-------------+---------------+-----------------------+---------------+-----------------------+
|state|       county|TotalPopEst2010|WhiteNonHispanicPct2010|HispanicPct2010|AsianNonHispanicPct2010|
+-----+-------------+---------------+-----------------------+---------------+-----------------------+
|   US|United States|      309338421|                  63.75|          16.35|                   4.69|
|   AL|      Alabama|        4785448|                  67.04|           3.88|                   1.11|
|   AL|      Autauga|          54754|                  77.25|            2.4|                   0.86|
|   AL|      Baldwin|         183111|                   83.5|           4.38|                   0.74|
|   AL|      Barbour|          27330|                  46.75|           5.05|                   0.39|
+-----+-------------+---------------+-----------------------+---------------+-----------------------+
only showing top 5 rows



# Temporary View

In [44]:
pop_df_2010.createOrReplaceTempView('pop2010_df_temp_view')

### FInd counties with more hispanic people than white

In [47]:
spark.sql(' SELECT state, \
          county, \
          WhiteNonHispanicPct2010,  \
          HispanicPct2010 ,\
          HispanicPct2010/TotalPopEst2010 AS hispanic_ratio \
          FROM pop2010_df_temp_view \
          WHERE HispanicPct2010>WhiteNonHispanicPct2010 \
          ORDER BY hispanic_ratio DESC').show(10)

+-----+---------+-----------------------+---------------+--------------------+
|state|   county|WhiteNonHispanicPct2010|HispanicPct2010|      hispanic_ratio|
+-----+---------+-----------------------+---------------+--------------------+
|   TX|   Kenedy|                  20.67|          76.68| 0.18388489208633096|
|   PR|  Culebra|                    7.7|          91.75| 0.05041208791208791|
|   TX|Culberson|                  21.02|          76.19|0.031693011647254576|
|   TX|  Edwards|                   47.3|           51.3|0.025662831415707854|
|   TX| Hudspeth|                  18.07|          79.63|0.022934907834101383|
|   CO| Costilla|                  30.82|          66.03|0.018710682913006517|
|   TX|   Reagan|                   36.2|          60.91|0.018165821652251713|
|   TX| Jim Hogg|                    6.3|          92.58|0.017507564296520423|
|   TX| Crockett|                  35.28|          63.24| 0.01708727370980816|
|   NM|Guadalupe|                  16.07|          7

### Find states with more hispanic people than white in 2010

#### pyspark

In [68]:
from pyspark.sql.functions import *

In [79]:
(sql_df.select('state', 'county', 'TotalPopEst2010',
              (col('TotalPopEst2010')*col('HispanicPct2010')/100).alias('TotalHispanic') ,
             (col('TotalPopEst2010')*col('WhiteNonHispanicPct2010')/100).alias('TotalWhiteNonHispanic'))
 .groupBy('state').agg(
     sum('TotalPopEst2010').alias('TotalPopEst2010'),
     sum('TotalHispanic').alias('TotalHispanic'),
     sum('TotalWhiteNonHispanic').alias('TotalWhiteNonHispanic'),
     )
 .select('state','TotalPopEst2010',
         (col('TotalHispanic')/col('TotalPopEst2010')*100).alias('HispanicPct'),
        (col('TotalWhiteNonHispanic')/col('TotalPopEst2010')*100).alias('WhiteNonHispanicPct'))
 .where(col('HispanicPct')> col('WhiteNonHispanicPct'))
 .sort('HispanicPct',ascending=False)
).show(5)

+-----+---------------+-----------------+-------------------+
|state|TotalPopEst2010|      HispanicPct|WhiteNonHispanicPct|
+-----+---------------+-----------------+-------------------+
|   PR|        7443050|98.99912663088385|   0.72163327399386|
|   NM|        4129176|46.29742528775718|  40.49270082941487|
+-----+---------------+-----------------+-------------------+



### SQL