In [1]:
# Display the SparkContext. `sc` is not created anywhere in this notebook, so
# it is presumably injected by the PySpark launcher/kernel -- TODO confirm.
sc

<pyspark.context.SparkContext at 0x109b89a50>

In [2]:
import itertools
from math import sqrt
from pyspark import SQLContext
from pyspark.sql.types import *
from pyspark.sql.functions import unix_timestamp, to_date, col, count, \
                                    to_utc_timestamp, desc, datediff, lit, round
    
from pyspark.sql.functions import substring, to_date, length, udf, log, exp, avg
import pyspark.sql.functions as func
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
import random
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

In [3]:
# Machine learning imports
from pyspark.mllib.clustering import KMeans, KMeansModel
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.feature import StandardScaler
from pyspark.mllib.stat import Statistics
from pyspark.mllib.evaluation import RegressionMetrics

from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler, PolynomialExpansion, VectorIndexer
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression, DecisionTreeRegressor, RandomForestRegressor
from pyspark.ml.linalg import DenseVector, SparseVector, Vectors, Vector
from pyspark.ml.evaluation import RegressionEvaluator

In [4]:
# Load the raw friend-network file as an RDD of text lines. Each line is later
# treated as one comma-separated row: the user first, then that user's friends.
input_file = sc.textFile('friendsnetwork.csv')

In [5]:
def _parse_friend_line(line):
    """Parse one CSV row into ``(user, [friend, ...])``.

    The first field is the user and the remaining fields are that user's
    friends. Every field is stripped of surrounding whitespace and empty
    friend fields are dropped, so trailing separators (``"a,b, "``), doubled
    separators (``"a,,b"``) and padded names never turn into a bogus ``''``
    friend.

    Args:
        line: One line of text from the input file.

    Returns:
        ``(user, friends)`` with ``friends`` a list of non-empty names, or
        ``None`` when the row has no user (blank line or empty first field).
    """
    fields = [field.strip() for field in line.split(',')]
    user = fields[0]
    if not user:
        return None
    return (user, [friend for friend in fields[1:] if friend])


# Rows without a user carry no information and would otherwise surface as an
# empty-named user downstream, so they are filtered out here.
friends_list = input_file.map(_parse_friend_line) \
                         .filter(lambda parsed: parsed is not None)

In [6]:
def computePairsToRecommend(p_friendList):
    """Emit every flagged name pair implied by one user's friend list.

    Args:
        p_friendList: ``(user, friends)`` where ``friends`` is an iterable of
            names. Duplicate names are ignored.

    Returns:
        A list of ``((name_a, name_b), flag)`` tuples in which every pair is
        stored in canonical order (``name_a <= name_b``) so that the same two
        people always produce the same key:

        * flag ``0`` for each (user, friend) pair, i.e. an existing friendship;
        * flag ``1`` for each pair of two distinct friends of the user, i.e.
          one common friend shared by that pair.

        The list order is deterministic: flag-0 pairs first, then flag-1
        pairs, each following the sorted order of the friend names.
    """
    user = p_friendList[0]
    # Sorting the de-duplicated names makes the output reproducible (set
    # iteration order is not) and guarantees that combinations() below already
    # yields every pair as (smaller, larger), so no per-pair swap is needed.
    user_friends = sorted(set(p_friendList[1]))

    # Existing friendships between the user and each friend.
    l_retList = [((friend, user), 0) if friend < user else ((user, friend), 0)
                 for friend in user_friends]

    # Every two friends of this user have the user as a common friend.
    l_retList.extend((pair, 1) for pair in itertools.combinations(user_friends, 2))
    return l_retList

In [7]:
# Expand every (user, friends) row into its flagged name pairs and flatten the
# per-row lists into a single RDD of ((name_a, name_b), flag) records.
friend_pairs = friends_list.flatMap(computePairsToRecommend)
# Mark the RDD for caching; as the last expression this also displays it.
friend_pairs.cache()

PythonRDD[2] at RDD at PythonRDD.scala:48

In [8]:
# Collect every flag that was emitted for the same canonical pair.
flags_by_pair = friend_pairs.groupByKey()

# A 0 flag marks an existing friendship, so a pair stays a candidate only when
# none of its flags is 0.
candidate_pairs = flags_by_pair.filter(
    lambda entry: all(flag != 0 for flag in entry[1]))

# Every remaining flag is a 1 contributed by one shared friend, so the sum is
# the number of common friends for the pair.
recommendation_list = candidate_pairs.map(
    lambda entry: (entry[0], sum(entry[1])))
recommendation_list.cache()

PythonRDD[7] at RDD at PythonRDD.scala:48

In [13]:
# Show the ten candidate pairs with the most common friends, highest first.
(recommendation_list
 .sortBy(lambda pair_score: pair_score[1], ascending=False)
 .take(10))

[((u'Julian Woods', u'Melinda Weeks'), 3),
 ((u'Benjamin Hensley', u'Lewis Currin'), 3),
 ((u'Bob Fischer', u'Stephanie Hawkins'), 3),
 ((u'Jay Wang', u'Marion Puckett'), 3),
 ((u'Jamie Gallagher', u'Theodore McKay'), 3),
 ((u'Audrey Field', u'Audrey Lanier'), 2),
 ((u'Jessica Hester', u'Suzanne Bowden'), 2),
 ((u'Kristine Dougherty', u'Terry Barton'), 2),
 ((u'Curtis Proctor', u'Gordon Berman'), 2),
 ((u'Anna Sawyer', u'Brent Fletcher'), 2)]

In [10]:
# Upper bound on the number of recommendations kept per user.
max_friends = 5


def _both_directions(rec):
    """Turn one ((a, b), count) record into a record keyed by each person."""
    (first, second), common = rec
    return [(first, (second, common)), (second, (first, common))]


def _strongest_first(candidates):
    """Return at most ``max_friends`` (name, count) entries, highest count first."""
    ranked = list(candidates)
    # list.sort is stable, so entries with equal counts keep their incoming order.
    ranked.sort(key=lambda candidate: candidate[1], reverse=True)
    return ranked[:max_friends]


# A candidate pair is a recommendation for both people involved.
final_list = recommendation_list.flatMap(_both_directions)
# Gather each user's candidates and keep only the strongest few.
output = final_list.groupByKey().mapValues(_strongest_first)

In [12]:
# Peek at five users together with their ranked (candidate, common-friend
# count) recommendation lists.
output.take(5)

[(u'Bruce Wiley',
  [(u'Patsy Parrott', 2),
   (u'Russell Dalton', 2),
   (u'Steven Ballard', 2),
   (u'Carl Nixon', 1),
   (u'Sheryl Love', 1)]),
 (u'Sherry Bray',
  [(u'Suzanne Gould', 1),
   (u'Peter Hughes', 1),
   (u'Henry Dickson', 1),
   (u'Lester Rich', 1),
   (u'Sandra Grossman', 1)]),
 (u'Helen Jordan',
  [(u'Beth Gonzalez', 1),
   (u'Carolyn McConnell', 1),
   (u'Bernard Bowers', 1),
   (u'Gilbert Berger', 1),
   (u'Claude Nixon', 1)]),
 (u'Jan Brown',
  [(u'Patsy Parrott', 1),
   (u'Alice Eaton', 1),
   (u'Sherri Pate', 1),
   (u'Bruce Wiley', 1),
   (u'Joanne Katz', 1)]),
 (u'Cynthia Mack',
  [(u'Florence McFarland', 1),
   (u'Melinda Proctor', 1),
   (u'Steven Boyd', 1),
   (u'Megan McPherson', 1),
   (u'Catherine Garcia', 1)])]