In [1]:
import os

from pyspark.sql.types import ArrayType, StructType, StructField, IntegerType
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from itertools import combinations

# Run PySpark's interactive-shell bootstrap script inside this kernel; it
# prints the Spark banner and leaves a ready-made session bound to `spark`.
shell_bootstrap = os.path.join(os.environ["SPARK_HOME"], 'python/pyspark/shell.py')
execfile(shell_bootstrap)

# Location of the parquet graph sample that the analysis reads.
graphPath = "/data/graphDFSample"

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.1.1
      /_/

Using Python version 2.7.12 (default, Nov 19 2016 06:48:10)
SparkSession available as 'spark'.


In [2]:
# Obtain a Hive-enabled session for the notebook.
# The master URL has to be spelled exactly `local[N]` (no whitespace): Spark
# cannot parse a value such as "local [2]" when it has to start a new context.
# NOTE(review): a session already exists once shell.py has been executed, so
# getOrCreate() presumably returns that one and the master setting only
# matters when this cell runs against a fresh JVM -- confirm.
sparkSession = SparkSession.builder.enableHiveSupport().master("local[2]").getOrCreate()

In [3]:
def create_pairs(l):
    """Return all 2-element combinations of ``l`` as a list of tuples.

    The input is sorted first, so within each tuple the smaller value comes
    first and the tuples themselves are produced in ascending order.
    Inputs with fewer than two elements yield an empty list.
    """
    ordered = sorted(l)
    return [
        (first, second)
        for position, first in enumerate(ordered)
        for second in ordered[position + 1:]
    ]

# Struct type for a single pair: two integer fields, both declared
# non-nullable (third StructField argument is False).
pair_schema = StructType(
    list(
        StructField(field_name, IntegerType(), False)
        for field_name in ("user1", "user2")
    )
)

# Expose create_pairs to Spark SQL; each call returns an array of pair structs.
udf_create_pairs = udf(create_pairs, returnType=ArrayType(pair_schema))

In [4]:
# Stage 1: flatten every user's `friends` array, then gather, per friend,
# the list of users that reference it. Friends referenced by a single user
# cannot produce a pair, so they are filtered out.
users_per_friend = (
    sparkSession.read.parquet(graphPath)
    .withColumn("friend", explode('friends'))
    .groupBy("friend")
    .agg(collect_list("user").alias("users"))
    .withColumn("users_size", size("users"))
    .filter(col("users_size") > 1)
)

# Stage 2: expand each user list into its sorted pairs and count how many
# friends each pair appears under; the pair struct is flattened to
# user1/user2 columns.
pair_counts = (
    users_per_friend
    .withColumn("pairs", udf_create_pairs('users'))
    .withColumn("pair", explode('pairs'))
    .groupBy("pair")
    .agg(count("pair").alias("pair_count"))
    .select(col("pair_count"), "pair.*")
)

# Stage 3: order by count, then user1, then user2 (all descending), keep the
# first 49 rows and bring them to the driver as a list of Rows.
results = (
    pair_counts
    .orderBy(desc("pair_count"), desc("user1"), desc("user2"))
    .limit(49)
    .collect()
)

In [5]:
# Print one "pair_count user1 user2 " line per collected row.
for val in results:
    # A single parenthesised argument produces the same output under the
    # Python 2 print statement and the Python 3 print function.
    print('%s %s %s ' % val)

3206 27967558 42973992 
3130 20158643 42973992 
3066 22582764 42973992 
3044 21864412 51640390 
3021 17139850 51640390 
3010 14985079 51640390 
2970 17139850 21864412 
2913 20158643 27967558 
2903 22280814 51151280 
2870 23848749 51640390 
2855 20158643 22582764 
2849 20158643 44996025 
2846 22280814 42973992 
2784 21864412 23848749 
2779 31964081 51640390 
2776 39205988 51640390 
2754 17139850 23848749 
2749 22582764 27967558 
2728 50561859 51640390 
2724 15485897 51640390 
2700 28135661 42973992 
2655 22280814 27967558 
2653 42973992 43548989 
2639 26755857 51640390 
2621 14635589 51640390 
2608 15485897 17139850 
2606 17139850 26755857 
2601 21864412 39205988 
2600 8406745 51640390 
2599 37735419 51640390 
2597 20158643 28135661 
2585 40003405 42973992 
2585 21864412 31964081 
2581 27967558 43548989 
2579 23848749 31964081 
2578 27967558 28135661 
2578 15485897 21864412 
2577 42973992 64755069 
2574 51151280 57077210 
2573 20158643 43548989 
2566 21864412 26755857 
2564 22280814 647