In [None]:
from pyspark import SparkContext, SparkConf

# Spark application settings: the app name shown in the cluster UI and YARN as the master.
conf = SparkConf().setAppName("NBA Shot Logs").setMaster("yarn")

# getOrCreate reuses a context that is already running, so re-executing this
# cell on a live kernel does not fail with "Cannot run multiple SparkContexts".
# Note: when a context already exists, `conf` is ignored and the existing one is returned.
sc = SparkContext.getOrCreate(conf=conf)

# Lazily read the shot log; each RDD element is one raw text line of the file.
# NOTE(review): if the file starts with a header row, the numeric parsing in the
# later cells will raise on it — confirm and filter it out here if so.
data = sc.textFile("/input/shot_logs.csv")


#### 1. For each player, who is the “most unwanted defender”?

In [None]:
def to_pair_record(line):
    """Convert one raw CSV line into ((player, defender), (made, 1)).

    The line is split once and the three fields are picked by position.
    NOTE(review): a plain split(",") does not honour quoted fields, and the
    zones cell of this notebook parses column 16 as a float and reads the
    made-shot flag from column 19 rather than 18 — confirm these indices
    against the actual file layout.
    """
    fields = line.split(",")
    player, defender, made = fields[20], fields[16], int(fields[18])
    return ((player, defender), (made, 1))


# ((player, defender), (made, 1)) for every shot.
player_pairs = data.map(to_pair_record)

# ((player, defender), (total made, total attempts)).
player_stats = player_pairs.reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1]))

# ((player, defender), hit rate); float() keeps this a true division on Python 2 as well.
player_hit_rate = player_stats.mapValues(lambda s: float(s[0]) / s[1])

# For each player keep the defender with the lowest hit rate. Reducing on
# (rate, defender) tuples avoids materialising and sorting every group, and
# breaks ties deterministically by defender value.
most_unwanted_defenders = (
    player_hit_rate
    .map(lambda kv: (kv[0][0], (kv[1], kv[0][1])))
    .reduceByKey(min)
    .mapValues(lambda lowest: lowest[1])
)

for player, defender in most_unwanted_defenders.collect():
    print("Player {}: most unwanted defender is {}".format(player, defender))


#### 2. Classify each player’s records into 4 comfortable zones

In [None]:
NUM_ZONES = 4


def parse_shot(line):
    """Convert one raw CSV line into (player, (shot_dist, defender_dist, shot_clock, made, 1)).

    NOTE(review): a plain split(",") does not honour quoted fields, and the
    "most unwanted defender" cell reads column 16 as the defender and column 18
    as the made-shot flag, while this cell parses column 16 as a float and takes
    the flag from column 19 — confirm the indices against the actual file layout.
    """
    fields = line.split(",")
    return (
        fields[20],
        (float(fields[11]), float(fields[16]), float(fields[8]), int(fields[19]), 1),
    )


def assign_zones(buckets):
    """Rank one player's buckets by hit rate (best first) and label them with zones 1..NUM_ZONES.

    `buckets` is an iterable of (shot_dist, defender_dist, shot_clock, hit_rate)
    tuples. The bucket at rank i out of n gets zone NUM_ZONES * i // n + 1,
    i.e. zone k holds the ranks i with i < k * n / NUM_ZONES. Integer arithmetic
    keeps the boundaries identical on Python 2 and Python 3.
    Returns a list of (zone, bucket) pairs.
    """
    ranked = sorted(buckets, key=lambda bucket: bucket[3], reverse=True)
    total = len(ranked)
    return [(NUM_ZONES * rank // total + 1, bucket) for rank, bucket in enumerate(ranked)]


# (player, (shot_dist, defender_dist, shot_clock, made, 1)) for every shot.
player_zones = data.map(parse_shot)

# Bucket shots by the integer part of the three measurements and add up
# (made, attempts) per (player, bucket). The previous groupBy version summed
# whole tuples (a TypeError) and never produced an attempt count.
player_zone_stats = (
    player_zones
    .map(lambda rec: (
        (rec[0], int(rec[1][0]), int(rec[1][1]), int(rec[1][2])),
        (rec[1][3], rec[1][4]),
    ))
    .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1]))
)

# ((player, dist, defender_dist, shot_clock), hit rate). This RDD was referenced
# but never defined before.
player_zone_hit_rate = player_zone_stats.mapValues(lambda s: float(s[0]) / s[1])

# classifies each player's records into 4 comfortable zones based on shot distance, closest defender distance, and shot clock time
# Elements are (player, (zone, (dist, defender_dist, shot_clock, hit_rate))).
# Cached because the loop below runs one Spark job per player over this RDD.
player_zones_classified = (
    player_zone_hit_rate
    .map(lambda kv: (kv[0][0], (kv[0][1], kv[0][2], kv[0][3], kv[1])))
    .groupByKey()
    .flatMapValues(assign_zones)
    .cache()
)

players = ['James Harden', 'Chris Paul', 'Stephen Curry', 'Lebron James']
for player in players:
    # (zone, unweighted mean of the bucket hit rates in that zone) for this player.
    # `name=player` binds the current value so the filter cannot see a later one.
    player_zone_hit_rates = (
        player_zones_classified
        .filter(lambda kv, name=player: kv[0] == name)
        .map(lambda kv: (kv[1][0], (kv[1][1][3], 1)))
        .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1]))
        .mapValues(lambda s: s[0] / s[1])
    )
    # At most NUM_ZONES rows, so collecting is cheap and lets us handle a player
    # with no matching rows instead of crashing in reduce() on an empty RDD.
    zone_rates = player_zone_hit_rates.collect()
    if not zone_rates:
        print("Player {}: no shot records found".format(player))
        continue
    # NOTE(review): zones are rank quartiles by hit rate, so zone 1 has the
    # highest mean by construction; consider reporting the buckets in it instead.
    best_zone = max(zone_rates, key=lambda zone_rate: zone_rate[1])
    print("Player {}: best zone is {}".format(player, best_zone))