In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.functions import col
from graphframes import GraphFrame

spark = SparkSession.builder.appName("GraphFlightsAnalysis") \
.config("spark.jars.packages", "graphframes:graphframes:0.8.1-spark2.4-s_2.11") \
.getOrCreate()


In [2]:
df_business = spark.read.option('multiline','true').option("quote", "\"").option('escape','\"')\
.option('ignoreLeadingWhiteSpace', 'true').option('header', True).option('escapeQuotes', 'true')\
.parquet('gs://msca-bdp-student-gcs/GroupProject_Gr7/yelp_dataset/cleansed_data/business.snappy.parquet') 

In [3]:
df_user = spark.read.option('multiline','true').option("quote", "\"").option('escape','\"')\
.option('ignoreLeadingWhiteSpace', 'true').option('header', True).option('escapeQuotes', 'true')\
.parquet('gs://msca-bdp-student-gcs/GroupProject_Gr7/yelp_dataset/cleansed_data/user.snappy.parquet') 

In [4]:
df_review = spark.read.option('multiline','true').option("quote", "\"").option('escape','\"')\
.option('ignoreLeadingWhiteSpace', 'true').option('header', True).option('escapeQuotes', 'true')\
.parquet('gs://msca-bdp-student-gcs/GroupProject_Gr7/yelp_dataset/cleansed_data/review.snappy.parquet') 

In [5]:
restaurant_vertices = df_business.selectExpr('business_id as id', 'b_name', 'b_review_count', 'b_stars', 'b_city', 'null as u_average_stars', 'null as u_useful')
user_vertices = df_user.selectExpr('user_id as id', 'null as b_name', 'null as b_review_count', 'null as b_stars' ,'null as b_city', 'u_average_stars', 'u_useful')
review_edges = df_review.selectExpr('user_id as src', 'business_id as dst', 'r_stars as stars')

In [18]:
restaurant_vertices.show()

[Stage 4:>                                                          (0 + 1) / 1]

+--------------------+--------------------+-------------+
|                  id|              b_name|       b_city|
+--------------------+--------------------+-------------+
|MTSW4McQd7CbVtyjq...|  St Honore Pastries| Philadelphia|
|CF33F8-E6oudUQ46H...|      Sonic Drive-In| Ashland City|
|bBDDEgkFA1Otx9Lfe...|      Sonic Drive-In|    Nashville|
|eEOYSgkmpB90uNA7l...|Vietnamese Food T...|    Tampa Bay|
|il_Ro8jwPlHresjw9...|             Denny's| Indianapolis|
|MUTTqe8uqyMdBl186...|            Tuna Bar| Philadelphia|
|ROeacJQwBeh05Rqg7...|                 BAP| Philadelphia|
|kfNv-JZpuN6TVNSO6...|     Hibachi Express| Indianapolis|
|9OG5YkX1g2GReZM0A...|Romano's Macaroni...|         Reno|
|sqSqqLy0sN8n2IZrA...|      Domino's Pizza|  White House|
|kV_Q1oqis8Qli8dUo...|       Ardmore Pizza|      Ardmore|
|aPNXGTDkf-4bjhyMB...|          Craft Hall| Philadelphia|
|ljxNT9p0y7YMPx0fc...|Tony's Restaurant...|        Alton|
|ABxoFuzZy5mqQ8C5F...|        Core de Roma|  Bala Cynwyd|
|seKihQKpGGnCe

                                                                                

In [19]:
user_vertices.show()



+--------------------+---------------+--------+
|                  id|u_average_stars|u_useful|
+--------------------+---------------+--------+
|--Kwhcbkh7jxkhVVQ...|           3.62|      57|
|--RJK834fiQXm21Vp...|            2.5|       0|
|--UhENQdbuWEh0mU5...|            5.0|       0|
|--bAnPT8W3L01Rg17...|           2.33|       5|
|--cjT1ICjm_ajiwSK...|           3.71|       4|
|--mm7mLpnDS3mNlwu...|            5.0|       2|
|--z9XJZF0T2r7aIsZ...|            4.5|       5|
|-0EzgKMI9ZakqLiWR...|            5.0|       1|
|-1-5YlK9t1Jgp6TbW...|            5.0|       0|
|-1BSu2dt_rOAqllw9...|           4.08|       7|
|-1LmqpHMfQdTe6blE...|           4.58|       8|
|-1OkmOVvocJCWqKKI...|            2.0|       0|
|-1ba3W4bYAMEod5Gf...|           2.25|       8|
|-1uUM_1PrjcHU6DO4...|           4.14|       9|
|-1xqkq7DTbX5Sft3m...|            5.0|       0|
|-2G_a0eur5RTmI-vc...|           3.75|       4|
|-2IwE8_2pTsIKlEjE...|            5.0|       0|
|-2dusr5LyDu6dEeas...|            4.0|  

                                                                                

In [20]:
review_edges.show()

[Stage 7:>                                                          (0 + 1) / 1]

+--------------------+--------------------+-----+
|                 str|                 dst|stars|
+--------------------+--------------------+-----+
|mh_-eMZ6K5RLWhZyI...|XQfwVwDr-v0ZS3_Cb...|  3.0|
|8g_iMtfSiwikVnbP2...|YjUWPpI6HXG530lwP...|  3.0|
|_7bHUi9Uuf5__HHc_...|kxX2SOes4o-D3ZQBk...|  5.0|
|59MxRhNVhU9MYndMk...|gebiRewfieSdtt17P...|  3.0|
|ZbqSHbgCjzVAqaa7N...|EQ-TZ2eeD_E0BHuvo...|  4.0|
|9OAtfnWag-ajVxRbU...|lj-E32x9_FA7GmUrB...|  4.0|
|4Uh27DgGzsp6PqrH9...|otQS34_MymijPTdNB...|  4.0|
|1C2lxzUo1Hyye4RFI...|BVndHaLihEYbr76Z0...|  5.0|
|Dd1jQj7S-BFGqRbAp...|YtSqYv1Q_pOltsVPS...|  5.0|
|j2wlzrntrbKwyOcOi...|rBdG_23USc7DletfZ...|  4.0|
|IQsF3Rc6IgCzjVV9D...|eFvzHawVJofxSnD7T...|  5.0|
|WBpQDAZymU0dhIqXc...|ut6fi2W2YaipNOqvi...|  3.0|
|OhECKhQEexFypOMY6...|vC2qm1y3Au5czBtbh...|  4.0|
|zoBajEyVA0z4IjbFs...|c-IgS6Pk6vMyax7Rb...|  4.0|
|clWLI5OZP2ad25ugM...|x4XdNhp0Xn8lOivzc...|  5.0|
|xVKE_HJ2pwUtTdLbL...|S2Ho8yLxhKAa26pBA...|  3.0|
|mNsVyC9tQVYtzLOCb...|MWmXGQ98KbRo3vsS5...|  5.0|


                                                                                

In [None]:
g = GraphFrame(restaurant_vertices.union(user_vertices), review_edges)

In [31]:
## Show vertices
g.vertices.show()

+--------------------+--------------------+-------------+---------------+--------+
|                  id|              b_name|       b_city|u_average_stars|u_useful|
+--------------------+--------------------+-------------+---------------+--------+
|MTSW4McQd7CbVtyjq...|  St Honore Pastries| Philadelphia|           null|    null|
|CF33F8-E6oudUQ46H...|      Sonic Drive-In| Ashland City|           null|    null|
|bBDDEgkFA1Otx9Lfe...|      Sonic Drive-In|    Nashville|           null|    null|
|eEOYSgkmpB90uNA7l...|Vietnamese Food T...|    Tampa Bay|           null|    null|
|il_Ro8jwPlHresjw9...|             Denny's| Indianapolis|           null|    null|
|MUTTqe8uqyMdBl186...|            Tuna Bar| Philadelphia|           null|    null|
|ROeacJQwBeh05Rqg7...|                 BAP| Philadelphia|           null|    null|
|kfNv-JZpuN6TVNSO6...|     Hibachi Express| Indianapolis|           null|    null|
|9OG5YkX1g2GReZM0A...|Romano's Macaroni...|         Reno|           null|    null|
|sqS

In [32]:
## Check the number of edges
g.edges.show()

+--------------------+--------------------+-----+
|                 src|                 dst|stars|
+--------------------+--------------------+-----+
|mh_-eMZ6K5RLWhZyI...|XQfwVwDr-v0ZS3_Cb...|  3.0|
|8g_iMtfSiwikVnbP2...|YjUWPpI6HXG530lwP...|  3.0|
|_7bHUi9Uuf5__HHc_...|kxX2SOes4o-D3ZQBk...|  5.0|
|59MxRhNVhU9MYndMk...|gebiRewfieSdtt17P...|  3.0|
|ZbqSHbgCjzVAqaa7N...|EQ-TZ2eeD_E0BHuvo...|  4.0|
|9OAtfnWag-ajVxRbU...|lj-E32x9_FA7GmUrB...|  4.0|
|4Uh27DgGzsp6PqrH9...|otQS34_MymijPTdNB...|  4.0|
|1C2lxzUo1Hyye4RFI...|BVndHaLihEYbr76Z0...|  5.0|
|Dd1jQj7S-BFGqRbAp...|YtSqYv1Q_pOltsVPS...|  5.0|
|j2wlzrntrbKwyOcOi...|rBdG_23USc7DletfZ...|  4.0|
|IQsF3Rc6IgCzjVV9D...|eFvzHawVJofxSnD7T...|  5.0|
|WBpQDAZymU0dhIqXc...|ut6fi2W2YaipNOqvi...|  3.0|
|OhECKhQEexFypOMY6...|vC2qm1y3Au5czBtbh...|  4.0|
|zoBajEyVA0z4IjbFs...|c-IgS6Pk6vMyax7Rb...|  4.0|
|clWLI5OZP2ad25ugM...|x4XdNhp0Xn8lOivzc...|  5.0|
|xVKE_HJ2pwUtTdLbL...|S2Ho8yLxhKAa26pBA...|  3.0|
|mNsVyC9tQVYtzLOCb...|MWmXGQ98KbRo3vsS5...|  5.0|


In [33]:
## Check the number of edges of each vertex
g.degrees.show()



+--------------------+------+
|                  id|degree|
+--------------------+------+
|dMrmnopCJ0xxEuG7W...|    31|
|4P_GCHFMV0mF2SqEW...|     1|
|EMtkhNu-zv7QxokNR...|     2|
|IeSz60ozr1yAVIH8C...|   135|
|9ZSDZGpjz5vhMJ1ck...|     7|
|D3TkaW_qYXRW8lh-U...|     2|
|xHISiVzIR4Bb1YP_S...|    10|
|Zsdee1KEpMwjg5cqx...|    43|
|B9qhGcNrsl81DarqY...|     2|
|eTj0BVNVQD32vNeCz...|     5|
|TuRy46Cyb7MWjV7VM...|    77|
|kALjfwXfePTGEpai5...|     2|
|RlfR0xQJwDpb5qlv0...|    13|
|2y_CdkxEOJEJGyJAp...|   111|
|xW2A0MciHB0pLB4RH...|    11|
|pPDfJ70OUSAqLII7t...|     8|
|KfE7RNRxtK2DWQki_...|     8|
|LnJSsNVZkStgtj86f...|   204|
|RZ-FNTXvqHKngyLGD...|    34|
|p9hNfx2OhQV_b_L6a...|    23|
+--------------------+------+
only showing top 20 rows



                                                                                

In [34]:
#The incoming degree of the vertices
g.inDegrees.show()



+--------------------+--------+
|                  id|inDegree|
+--------------------+--------+
|TuRy46Cyb7MWjV7VM...|      77|
|2y_CdkxEOJEJGyJAp...|     111|
|LnJSsNVZkStgtj86f...|     204|
|RZ-FNTXvqHKngyLGD...|      34|
|lpbt16sSm4BTcfeq4...|      57|
|I0053JmJ5DEFUWSJ8...|      91|
|ctOOp80WBFPj3wPZy...|      15|
|Y1ZdPCfQ4ndVAmj9l...|      39|
|adATTqggIQX5xxLDI...|      25|
|KBvdN8Apn4DIxuNW3...|      43|
|dE_MaaYrXBAEebtH2...|      13|
|IiatvDg0R1qwPtKPm...|      31|
|vQ5Qa6kvlXiUHT60G...|      84|
|knQ4vIgx-r85kjlWV...|       5|
|nBr6NgqcZz1GhtAng...|      12|
|1dSKEitDDgIkaApe6...|      18|
|kPG6r0h73sPgXBei0...|      90|
|lwdkX7KcibM4mDqpD...|     159|
|YPnksHT2DQA0AhcxP...|      64|
|WDXC3W5lDt2pK2qvw...|     108|
+--------------------+--------+
only showing top 20 rows



                                                                                

In [35]:
#The outgoing degree of the vertices
g.outDegrees.show()



+--------------------+---------+
|                  id|outDegree|
+--------------------+---------+
|3j50d_OtZ5gCTW4uZ...|      109|
|2YjvTiz-CVf4YgQ94...|       23|
|JgMESEXRM4TjGfMWJ...|        1|
|P00DXaSyHhZwe7X_P...|        2|
|7oDpnSUm8RD3m0jwL...|        5|
|mPXK2ZF-2TS8rZRZm...|       33|
|J9YptlPIBvWsH8h3L...|        5|
|417svAEVHreK6c3SK...|      120|
|nuxIF7lJ6IbS9A2mZ...|       23|
|S-gn07vdS1TqsJXWE...|       51|
|nrBnY5Ii7hS2npGSu...|        3|
|neiCw8YK29krBOlSY...|        2|
|FlXBpK_YZxLo27jcM...|      383|
|EgB7lUWHO5kh9oOix...|        1|
|39Xfi7aBX-CQlseGB...|        4|
|4o4VLyI0rumJ5mdt4...|        7|
|FYdno_pQaIhow_H_9...|       22|
|uRWXK5JxpqEBGBX9r...|        1|
|1UlV70-ktCX5qaN7l...|       19|
|u7G3Y9N6LiPG4GmJO...|        8|
+--------------------+---------+
only showing top 20 rows



                                                                                

In [36]:
## Display all connected components
sc = spark.sparkContext
sc.setCheckpointDir('graphframes_cps')
g.connectedComponents().show()

23/05/17 19:46:32 WARN org.apache.spark.SparkContext: Spark is not running in local mode, therefore the checkpoint directory must not be on the local filesystem. Directory 'graphframes_cps' appears to be on the local filesystem.
[Stage 239:>                                                        (0 + 1) / 1]

+--------------------+--------------------+------+---------------+--------+---------+
|                  id|              b_name|b_city|u_average_stars|u_useful|component|
+--------------------+--------------------+------+---------------+--------+---------+
|--RJK834fiQXm21Vp...|                null|  null|            2.5|       0|        0|
|--UhENQdbuWEh0mU5...|                null|  null|            5.0|       0|        0|
|-0EzgKMI9ZakqLiWR...|                null|  null|            5.0|       1|        0|
|-0iIxySkp97WNlwK6...|Truckee Bagel Com...|  Reno|           null|    null|        0|
|-1-5YlK9t1Jgp6TbW...|                null|  null|            5.0|       0|        0|
|-1BSu2dt_rOAqllw9...|                null|  null|           4.08|       7|        0|
|-1ba3W4bYAMEod5Gf...|                null|  null|           2.25|       8|        0|
|-2G_a0eur5RTmI-vc...|                null|  null|           3.75|       4|        0|
|-2IwE8_2pTsIKlEjE...|                null|  null|    

                                                                                

In [37]:
result = g.stronglyConnectedComponents(maxIter=10)
result.select("id", "component").show()

[Stage 558:>                                                        (0 + 1) / 1]]

+--------------------+---------+
|                  id|component|
+--------------------+---------+
|--RJK834fiQXm21Vp...|        0|
|--UhENQdbuWEh0mU5...|        1|
|-0EzgKMI9ZakqLiWR...|        2|
|-0iIxySkp97WNlwK6...|        3|
|-1-5YlK9t1Jgp6TbW...|        4|
|-1BSu2dt_rOAqllw9...|        5|
|-1ba3W4bYAMEod5Gf...|        6|
|-2G_a0eur5RTmI-vc...|        7|
|-2IwE8_2pTsIKlEjE...|        8|
|-3i9bhfvrM3F1wsC9...|        9|
|-43vn4eU9nQbfn4WC...|       10|
|-4eoHNVezSKo9FkrA...|       11|
|-55DgUo52I3zW9Rxk...|       12|
|-7mhLPkdpCQnHBY1r...|       13|
|-9da1xk7zgnnfO1uT...|       14|
|-AKbx6qFJRw0NEcPi...|       15|
|-ARMMpHhZysOGatwe...|       16|
|-Auhegi8KEKW7iR8I...|       17|
|-BEzEb0hVCpq8VJtk...|       18|
|-BIsieBb59f9OBiwf...|       19|
+--------------------+---------+
only showing top 20 rows



                                                                                

In [43]:
pr = g.pageRank(resetProbability=0.15, tol=0.01)## look at the pagerank score for every vertex
pr.vertices.show()

## look at the weight of every edge
pr.edges.show()

                                                                                

+--------------------+--------------------+--------------+-------+------+---------------+--------+------------------+
|                  id|              b_name|b_review_count|b_stars|b_city|u_average_stars|u_useful|          pagerank|
+--------------------+--------------------+--------------+-------+------+---------------+--------+------------------+
|--RJK834fiQXm21Vp...|                null|          null|   null|  null|            2.5|       0|0.5435268678014964|
|--UhENQdbuWEh0mU5...|                null|          null|   null|  null|            5.0|       0|0.5435268678014964|
|-0EzgKMI9ZakqLiWR...|                null|          null|   null|  null|            5.0|       1|0.5435268678014964|
|-0iIxySkp97WNlwK6...|Truckee Bagel Com...|           219|    3.5|  Reno|           null|    null|   37.903650007922|
|-1-5YlK9t1Jgp6TbW...|                null|          null|   null|  null|            5.0|       0|0.5435268678014964|
|-1BSu2dt_rOAqllw9...|                null|          nul



+--------------------+--------------------+-----+-------------------+
|                 src|                 dst|stars|             weight|
+--------------------+--------------------+-----+-------------------+
|-EBZLf7mtJ5MrSvyo...|qXm174C45c8H2Hmxv...|  5.0| 0.3333333333333333|
|-JCk4-TOd9rS3K0jm...|m2gjRN7GHMpdwUkvq...|  3.0|0.06666666666666667|
|-LvyI2zkcNW2A2WDU...|dsfRniRgfbDjC8os8...|  3.0|                0.5|
|-LvyI2zkcNW2A2WDU...|bm-Hrhg0smyGTsSNi...|  1.0|                0.5|
|-PIfi6GsmtslX0Eii...|HTaTuu_35PfpJMKcd...|  1.0|                1.0|
|-delfbMaca8j-Gr1T...|YjQyrXNL5HcFMY6lE...|  5.0| 0.1111111111111111|
|-f7Krw5O07Q77rqhC...|ocdP7OwgygPKPMSqr...|  1.0|                1.0|
|-hnBzgVoRoqLrGVSx...|Hr6aM0s-woIJaW6DJ...|  3.0|            0.03125|
|-sFQA3NMLrO2pb8V_...|ujUb29BOYcUyhMcnq...|  1.0|0.08333333333333333|
|-vJHBYA-VwBQWifih...|_Dr8Bnt8us2qyPKay...|  2.0|0.06666666666666667|
|00Mw0vmBn0EBmEhsy...|uGBwBVsKi1HdO2WXh...|  1.0|                1.0|
|068sIsLsmjtZwfplN..

                                                                                

In [52]:
pr.vertices.filter(col('b_city')=='Nashville').filter(col('b_stars') > 4.5).orderBy('pagerank', ascending=[0])\
.select('id','b_name','b_review_count','pagerank').show(truncate=60)



+----------------------+-----------------------------------------+--------------+------------------+
|                    id|                                   b_name|b_review_count|          pagerank|
+----------------------+-----------------------------------------+--------------+------------------+
|xBPg6gzunjpEUT6FiRrpog|                            Big Al's Deli|           390| 96.62053718894008|
|3NmMVlz6sFsBJsVaPtm12A|                      Tasty And Delicious|           260| 66.88235758022854|
|bZcW_CPll-WaAgciSLPyrA|                  D'Andrews Bakery & Cafe|           212|49.108006933876915|
|ez3QHFQDCbdhzBtU58dbKg|Edessa Restaurant Kurdish Turkish Cuisine|           198| 44.47642390321755|
|PNby7mawC0ecfg-uEp0OVQ|                           Semper Sliders|           152|42.827860620522316|
|wU2yYCoGq1iK2ILZ0dPMzw|                           Veggie Village|           112| 28.60883597544329|
|B9MAfqhVw1AxNzwu2ELPtQ|                                 The Horn|           127| 27.098421

                                                                                

In [6]:
pr.coalesce(1).write.save("gs://msca-bdp-student-gcs/GroupProject_Gr7/yelp_dataset/model/Graph", format = "parquet", overwrite=True)