In [1]:
# !pip install pyspark
# !pip install -U -q PyDrive
# !apt install openjdk-8-jdk-headless -qq

In [2]:
# import os
# os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Spark Library
import pyspark
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf

In [4]:
import re
import sys
import itertools as it

In [5]:
"""
Initializing Spark and preparation for data analysis
"""

# Build the Spark entry points used by the rest of the notebook.
# getOrCreate() returns the already-running SparkContext when there is one,
# so re-running this cell in a live kernel does not fail with
# "Cannot run multiple SparkContexts at once".
conf = SparkConf()
sc = pyspark.SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.getOrCreate()



In [6]:
# # Google drive mount
# from google.colab import drive
# drive.mount('/content/drive')

In [7]:
# Read the input file as an RDD with one element per text line.
friendLines = sc.textFile('./soc-LiveJournal1Adj.txt')
# Change this path to wherever your copy of the file lives.

In [8]:
# Split every input line on whitespace into its fields.
user_friends = friendLines.map(lambda line: line.split())

In [9]:
# RDD of the first field of every line (the user id, still a string).
user = user_friends.map(lambda x: x[0])
# Bring the ids to the driver as a real list of ints. On Python 3 a bare
# map(int, ...) is a lazy one-shot iterator: it is exhausted after a single
# pass and supports neither len() nor indexing.
Users = [int(u) for u in user.collect()]

In [10]:
# Keep only the rows that split into exactly two fields (user id + friend list).
user_w_friends = user_friends.filter(lambda fields: len(fields) == 2)

In [11]:
# (user id, list of friend ids) per user.
# The friend ids must be a real list, not a Python 3 map() iterator: this RDD
# is later collected into a dict and probed with `in`, and a membership test
# on an iterator consumes it, so every later lookup for that user would miss.
# The ids are ordered as strings before conversion (same order as before);
# the order only has to be consistent across users so that a given pair of
# friends always yields the same (a, b) key.
user_friendList = user_w_friends.map(
    lambda p: (int(p[0]), [int(f) for f in sorted(p[1].split(','))])
)

In [12]:
def friend_pairs_of_user(user_friendslist):
    """
    Build one ((friend_a, friend_b), 1) record per pair of a user's friends.

    Args:
        user_friendslist: a (user, friends) pair; only the friends element
            (index 1) is used.

    Returns:
        A list with one (pair, 1) tuple for every 2-combination of the
        friends, in the order produced by itertools.combinations.
    """
    friends = user_friendslist[1]
    pairs = []
    for friend_pair in it.combinations(friends, 2):
        pairs.append((friend_pair, 1))
    return pairs

In [13]:
# Flatten every user's friend pairs into a single RDD of ((friend_a, friend_b), 1) records.
pairs_from_commonFriend = user_friendList.flatMap(friend_pairs_of_user)

In [14]:
# Transformation: add up the 1s emitted for each friend pair, giving the number
# of users who have both members of the pair in their friend list.
# Background on reduceByKey: https://seamless.tistory.com/101 (source: tistory blog, "reduceByKey")
pairs_CntOf_commonFriends = pairs_from_commonFriend.reduceByKey(
    lambda left, right: left + right
)

In [15]:
user2friendsMap = user_friendList.collectAsMap()
# Collect the (user id, friends) pairs to the driver as a plain Python dict keyed by user id.

In [16]:
# Drop pairs that are already friends: keep ((a, b), cnt) only when b is
# absent from a's entry in user2friendsMap.
pairs_hasCommon_yetFriends = pairs_CntOf_commonFriends.filter(
    lambda pair_cnt: pair_cnt[0][1] not in user2friendsMap[pair_cnt[0][0]]
)

In [17]:
# Emit every surviving pair in both directions, as
# (user, {common_friend_count: [candidate]}), so each member of the pair
# gets the other one as a recommendation candidate.
user_recommendList_byShareCnt = pairs_hasCommon_yetFriends.flatMap(
    lambda pC: [
        (pC[0][0], {pC[1]: [pC[0][1]]}),
        (pC[0][1], {pC[1]: [pC[0][0]]}),
    ]
)

In [18]:
def mergeByShareCnt(d1, d2):
    """
    Fold the entries of d2 into d1 and return d1.

    For a key present in both dicts, d2's value is appended to d1's value
    with ``+=``; a key only present in d2 is copied over by reference.
    d1 is modified in place.
    """
    for share_cnt, candidates in d2.items():
        if share_cnt not in d1:
            # New key: adopt d2's value object as-is.
            d1[share_cnt] = candidates
            continue
        d1[share_cnt] += candidates
    return d1

In [19]:
# Combine all per-pair dicts that belong to the same user into one dict via mergeByShareCnt.
recommendList_byShareCnt = user_recommendList_byShareCnt.reduceByKey(mergeByShareCnt)

In [20]:
def sortRecommendsByCnt(recommendTuple):
    """
    Order one user's candidates by count, highest count first.

    Args:
        recommendTuple: (user, {count: [candidate, ...]}).

    Returns:
        (user, [(count, sorted candidates), ...]) with the counts in
        descending order and each candidate list sorted ascending.
    """
    owner, by_count = recommendTuple[0], recommendTuple[1]
    ranked = [
        (count, sorted(by_count[count]))
        for count in sorted(by_count, reverse=True)
    ]
    return (owner, ranked)

In [21]:
# Turn each user's {count: candidates} dict into a list ordered by descending count.
recommends = recommendList_byShareCnt.map(sortRecommendsByCnt)

In [22]:
def recList(pair):
    """
    Flatten a user's grouped candidates into one flat list.

    Args:
        pair: (person, [(count, [candidate, ...]), ...]).

    Returns:
        (person, candidates) where candidates is the concatenation of the
        inner lists, keeping the order of the groups and of each group.
    """
    person, groups = pair[0], pair[1]
    flattened = [candidate for group in groups for candidate in group[1]]
    return (person, flattened)

In [23]:
# Flatten each user's grouped candidates into a single ordered list.
recommendList = recommends.map(recList)

In [24]:
# Collect the (user, candidate list) pairs to the driver as a dict for lookup by user.
recommendMap = recommendList.collectAsMap()

In [25]:
def pMap2file(pMap, pList, N, fname):
    """
    Write the first N values of selected keys to a text file, one key per line.

    For every key in ``pList`` (in order) one line is written:
    ``<key><TAB><v1>,<v2>,...`` with at most ``N`` values when the key is in
    ``pMap``, or just ``<key>`` when it is not.

    Args:
        pMap: mapping from key to a sequence of values.
        pList: iterable of keys to write out.
        N: maximum number of values written per key.
        fname: path of the output file; it is created or truncated.
    """
    # The context manager closes the file even if a write raises.
    with open(fname, 'w') as fp:
        for k in pList:
            if k in pMap:
                # Slicing already stops at the end of a shorter sequence.
                top_values = pMap[k][:N]
                fp.write(str(k) + "\t" + ','.join(str(e) for e in top_values) + "\n")
            else:
                # Terminate the line so the key does not fuse with the next record.
                fp.write(str(k) + "\n")

In [26]:
# Users whose recommendations are written to the output file.
uList = [924, 8941, 8942, 9019, 9020, 9021, 9022, 9990, 9992, 9993]
# Write at most 10 recommendations per listed user to 'select_out.txt'.
pMap2file(recommendMap, uList, 10, 'select_out.txt')

# Stop the SparkContext; the results needed above were already collected into recommendMap.
sc.stop()