In [1]:
import os
import sys
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 30 pyspark-shell'
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))


In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

conf = SparkConf()
conf.set("spark.app.name", "natasha pritykovskaya Link Prediction app") 

spark = SparkSession.builder.config(conf=conf).appName("natasha pritykovskaya Spark Dataframe app").getOrCreate()

In [3]:
spark

In [4]:
graphPath = "/lectures/lecture04/trainGraph"
usersToPredictPath = "/lectures/lecture04/prediction.csv"

In [5]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

schema = StructType(fields=[
    StructField("user", IntegerType()),
    StructField("friendsString", StringType())
])

data = spark.read.format("csv") \
        .schema(schema) \
        .option("delimiter", "\t") \
        .load(graphPath) 

In [6]:
data.show(5)

+----+--------------------+
|user|       friendsString|
+----+--------------------+
|1654|{(14082,0),(27448...|
|3030|{(3050,0),(5466,0...|
|4150|{(4698,0),(48091,...|
|5750|{(1600,1024),(426...|
|5942|{(11729,0),(53391...|
+----+--------------------+
only showing top 5 rows



In [8]:
data.rdd.getNumPartitions()

18

In [9]:
from pyspark.sql.functions import col, explode, collect_list, sort_array, size, split, lit


In [18]:
from pyspark.sql.functions import pandas_udf

def cutStartEndBrackets(series):
    return series.str[2:-2]

cutStartEndBracketsUDF = pandas_udf(cutStartEndBrackets, StringType())

userFriend = \
    data.select(col("user"), split(cutStartEndBracketsUDF(col("friendsString")), "\),\(").alias("friendsMasks"))\
    .withColumn("friendMask", explode('friendsMasks'))\
    .withColumn("friend", split(col("friendMask"), ",")[0])\
    .select(col("user"), col("friend").cast("integer"))

<img src="pics/step1.png" width=700/>

In [19]:
usersWithCommonFriend = userFriend\
    .groupBy("friend")\
    .agg(collect_list("user").alias("usersWithCommonFriend")) \
    .select("usersWithCommonFriend")\
    .where(size(col("usersWithCommonFriend")) >= 2)\
    .select(sort_array("usersWithCommonFriend").alias("sortedUsersWithCommonFriend"))\
    .drop("usersWithCommonFriend")

<img src="pics/step2.png" width=700/>
<img src="pics/step3.png" width=700/>


In [20]:
usersWithCommonFriend.show()

+---------------------------+
|sortedUsersWithCommonFriend|
+---------------------------+
|          [131318, 2038934]|
|       [11979968, 11979968]|
|       [4471643, 4864911...|
|           [39812, 3996243]|
|       [34660, 960407, 1...|
|         [215426, 10989590]|
|       [294833, 3541819,...|
|           [585940, 595464]|
|         [7941296, 7941296]|
|       [213875, 3310959,...|
|       [1406846, 2044978...|
|       [1208384, 1208384...|
|        [2625623, 16545603]|
|        [9787095, 11510201]|
|       [1093525, 1543499...|
|       [10599936, 10599936]|
|       [5021715, 5262480...|
|        [1651903, 10655397]|
|       [1919, 582013, 35...|
|         [3243642, 5845843]|
+---------------------------+
only showing top 20 rows



In [21]:
schema_users_to_pred = StructType(fields=[
    StructField("user", IntegerType()),
])

usersToPredict = spark.read.format("csv") \
    .schema(schema) \
    .load(usersToPredictPath) \
    .select(col("user").cast("integer")) \
    .rdd.map(lambda t : t.user).collect()

usersToPredictBC = spark.sparkContext.broadcast(set(usersToPredict))

In [36]:
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType

def pairsWithCommonFriend(usersWithCommonFriend):
    pairs = []
    for user1Index in range(0, len(usersWithCommonFriend)):
        for user2Index in range(user1Index + 1, len(usersWithCommonFriend)):
            if user1Index != user2Index:
                if (usersWithCommonFriend[user1Index] in usersToPredictBC.value or \
                usersWithCommonFriend[user2Index] in usersToPredictBC.value):
                    pairs.append((usersWithCommonFriend[user1Index], usersWithCommonFriend[user2Index]))
    return pairs

schema = ArrayType(ArrayType(IntegerType()))

pairsWithCommonFriendUdf = udf(pairsWithCommonFriend, schema)

<img src="pics/step4_2.png" width=700/>

In [37]:
from pyspark.sql.functions import pandas_udf
import pandas as pd

def pairsWithCommonFriend(series):
    pairs_lists = []
    for usersWithCommonFriend in series:
        pairs = []
        for user1Index in range(0, len(usersWithCommonFriend)):
            for user2Index in range(user1Index + 1, len(usersWithCommonFriend)):
                if user1Index != user2Index:
                    if usersWithCommonFriend[user1Index] in usersToPredictBC.value or \
                    usersWithCommonFriend[user2Index] in usersToPredictBC.value:
                        pairs.append((usersWithCommonFriend[user1Index], usersWithCommonFriend[user2Index]))
        pairs_lists.append(pairs)
    return pd.Series(pairs_lists)
        
pairsWithCommonFriendUdf = pandas_udf(pairsWithCommonFriend, schema)

commonFriendsCounts = usersWithCommonFriend\
            .select(pairsWithCommonFriendUdf("sortedUsersWithCommonFriend").alias("pairsWithCommonFriend"))\
            .where(size(col("pairsWithCommonFriend")) > 0)    

commonFriendsCounts\
    .withColumn("pairWithCommonFriend", explode("pairsWithCommonFriend"))\
    .drop(col("pairsWithCommonFriend"))\
    .groupBy(col("pairWithCommonFriend"))\
    .count()\
    .show(20)

+--------------------+-----+
|pairWithCommonFriend|count|
+--------------------+-----+
|  [1674438, 3595421]|    2|
|   [364107, 2001627]|    1|
|  [1041597, 1601542]|   18|
|  [1987589, 2390769]|    2|
| [2001627, 13719295]|    1|
| [5673646, 13719295]|    1|
|[14510074, 16471947]|    3|
| [9244264, 10688913]|    1|
|  [2912872, 7445742]|  236|
| [2894369, 11791325]|   22|
|[11791325, 13363661]|   12|
|  [3056954, 4206583]|   14|
|  [3279070, 9582976]|  120|
| [4483293, 12881491]|    5|
|[10063577, 13662697]|    4|
|[11906845, 14021773]|   16|
|    [18300, 1188271]|   14|
|  [2428246, 2944744]|    2|
|   [998048, 3392160]|   48|
| [3121147, 16614662]|    9|
+--------------------+-----+
only showing top 20 rows



In [None]:
def pairsWithCommonFriend(series):
    pairs_lists = []
    for usersWithCommonFriend in series:
        pairs = []
        for user1Index in range(0, len(usersWithCommonFriend)):
            for user2Index in range(user1Index + 1, len(usersWithCommonFriend)):
                if user1Index != user2Index:
                    pairs.append((usersWithCommonFriend[user1Index], usersWithCommonFriend[user2Index]))
        pairs_lists.append(pairs)
    return pd.Series(pairs_lists)
         
pairsWithCommonFriendUdf = pandas_udf(pairsWithCommonFriend, schema_pandas)

commonFriendsCounts = usersWithCommonFriend\
            .select(pairsWithCommonFriendUdf("sortedUsersWithCommonFriend").alias("pairsWithCommonFriend"))\
            .where(size(col("pairsWithCommonFriend")) > 0)    

commonFriendsCounts\
    .withColumn("pairWithCommonFriend", explode("pairsWithCommonFriend"))\
    .drop(col("pairsWithCommonFriend"))\
    .groupBy(col("pairWithCommonFriend"))\
    .count()\
    .show(20)

In [47]:
from functools import partial

def pairsWithCommonFriendUpgraded(series, modulo):
    pairs_lists = []

    for usersWithCommonFriend in series:
        pairs = []
        for user1Index in range(0, len(usersWithCommonFriend)):
             for user2Index in range(user1Index + 1, len(usersWithCommonFriend)):
                    if user1Index != user2Index and user1Index % 13 == modulo:
                        pairs.append((usersWithCommonFriend[user1Index], usersWithCommonFriend[user2Index]))
        pairs_lists.append(pairs)
    return pd.Series(pairs_lists)


for i in range(13):
    pairsWithCommonFriendUdfUpgraded = pandas_udf(partial(pairsWithCommonFriendUpgraded, modulo=i), schema)

    commonFriendsCounts = usersWithCommonFriend\
            .select(pairsWithCommonFriendUdfUpgraded("sortedUsersWithCommonFriend").alias("pairsWithCommonFriend"))\
            .where(size(col("pairsWithCommonFriend")) > 0)\
            .write.parquet("pairs/" + str(i), mode = "overwrite")

In [48]:
spark.read.parquet("pairs/*").count()

16229065

In [49]:
spark.read.parquet("pairs/0").count()

4565054

In [27]:
spark.read.parquet("pairs/*")\
    .withColumn("pairWithCommonFriend", explode("pairsWithCommonFriend"))\
    .drop(col("pairsWithCommonFriend"))\
    .groupBy(col("pairWithCommonFriend"))\
    .count()\
    .show(5)

+--------------------+-----+
|pairWithCommonFriend|count|
+--------------------+-----+
| [1481111, 15756137]|    1|
| [8039479, 10812715]|    4|
| [9102327, 16272182]|    1|
|   [709520, 1563367]|    2|
|    [200736, 228410]|    2|
+--------------------+-----+
only showing top 5 rows



In [None]:
spark.stop()