In [1]:
import matplotlib
# Force matplotlib to not use any Xwindows backend.
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import json
import os
import sys
import re
import numpy as np
import time
import datetime
 
# Path for spark source folder
os.environ['SPARK_HOME'] = "/usr/local/spark"

# Append pyspark to Python Path
sys.path.append("/usr/local/spark/python")

from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql import SQLContext
# Load in the testing code and check to see if your answer is correct
# If incorrect it will report back '1 test failed' for each failed test
# Make sure to rerun any cell you change before trying the test again
from test_helper import Test
from pyspark.sql import SQLContext
from pyspark.sql import HiveContext
# sc = SparkContext() # not needed in IPython notebook.
sqlContext = SQLContext(sc)
hiveContext = HiveContext(sc)

# Append afinn to Python Path and import afinn.  Used for pulling data from percentiles.
sys.path.append("/usr/local/lib/python2.7/dist-packages/afinn")
from afinn import Afinn

# Stuff for logistic regression
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.linalg import SparseVector
from pyspark.sql.types import StructField, BooleanType, StringType, LongType, StructType
sys.path.append("/usr/local/lib/python2.7/dist-packages")
from tdigest import TDigest
from numpy.random import random
from operator import add

# 0.0 Read data

Read using json schema.  If you don't use schema on read, Spark reads ENTIRE FILE to infer schema BEFORE actually reading in data.

In [2]:
# Define json schema to speed up reading json files in S3

fields = [StructField("archived", BooleanType(), True),
        StructField("author", StringType(), True),
        StructField("author_flair_css_class", StringType(), True),
        StructField("body", StringType(), True),
        StructField("controversiality", LongType(), True),
        StructField("created_utc", StringType(), True),
        StructField("distinguished", StringType(), True),
        StructField("downs", LongType(), True),
        StructField("edited", StringType(), True),
        StructField("gilded", LongType(), True),
        StructField("id", StringType(), True),
        StructField("link_id", StringType(), True),
        StructField("name", StringType(), True),
        StructField("parent_id", StringType(), True),
        StructField("retrieved_on", LongType(), True),
        StructField("score", LongType(), True),
        StructField("score_hidden", BooleanType(), True),
        StructField("subreddit", StringType(), True),
        StructField("subreddit_id", StringType(), True),
        StructField("ups", LongType(), True)]

Try reading a SINGLE FILE of size 4.1 GB.

In [3]:
df = hiveContext.read.json("s3n://reddit-comments/2011/RC_2011-03", StructType(fields))
# df.take(5)

In [4]:
subreddits = [u'leagueoflegends', u'GirlGamers', u'pics', u'politics']

<b>Refactor later to filter based on subreddits list.  You can't do "in list" with dataframe SQL but I think you can do it with vanilla RDD. 

In [5]:
# Filter down to subreddits of interest

df2 = (df.filter(  (df.subreddit == u'leagueoflegends') 
                 | (df.subreddit == u'GirlGamers')
                 | (df.subreddit == u'pics')
                 | (df.subreddit == u'politics') )                   
           .persist(StorageLevel.MEMORY_AND_DISK_SER)
           )
# df2.take(5)

<b>Comment out check for records if you know the subreddits are there.

In [8]:
isInData = {key: True for key in subreddits}

def is_there(df,srList):
    return {key : True if df2.filter((df.subreddit == key)).take(1) else False for key in srList }

isInData = is_there(df2, subreddits)
print isInData

<b>DO NOT CUT MAY NEED THIS LATER:  code below finds out how many records there are for each subreddit.

subreddits = [u'leagueoflegends', u'GirlGamers', u'pics', u'politics']

def create_counts(df,srList):
    return {key : df2.filter((df.subreddit == key)).count() for key in srList}

srCounts = create_counts(df2, subreddits)
print srCounts

<b>Move rRDD creation later: it is used first when creating rRDDExtreme with only top/bottom 3%.

In [55]:
# Put Dataframe into vanilla RDD

rRDD = (df2.map(lambda r: (r.id, (r.body, int(r.created_utc), r.link_id, r.parent_id, int(r.score), r.subreddit, r.subreddit_id)))
          .persist(StorageLevel.MEMORY_AND_DISK_SER)
       )
# rRDD.take(5)

# 2.0 Filter to include only extreme up and down votes (top 3% of subreddit)

Filter to retain only records that are top or bottom 3% in comment score (upvotes-downvotes) of their subreddit.  Reduces dataset for all subsequent processing.

ALTERNATIVE calculation of 3 and 97 percentiles using SQL and HiveQL percentile estimate.

In [9]:
def createsrDict(df, srList, isIn):
    hiveContext.registerDataFrameAsTable(df, "rcomments")
    srDigest = {}
    for key in srList:
        if isIn[key]:  # if the subreddit is in the input data set 
            # not sure if percentile() [integers] or percentile_approx() [double] runs faster.
            SQL = "select percentile(score, array(0.03,0.97)) from rcomments where subreddit=="+"'" + key + "'"
            srDigest[key] = hiveContext.sql(SQL).collect()[0][0]
    return srDigest

subredditDigest = createsrDict(df2, subreddits, isInData) 
print subredditDigest

{u'politics': [-3.0, 17.0], u'pics': [-2.0, 28.0], u'leagueoflegends': [-1.0, 12.0], u'GirlGamers': [0.0, 14.019999999999982]}


In [10]:
srDigestR = {key : (round(subredditDigest[key][0]), 
                    round(subredditDigest[key][1]) ) 
             for key in subredditDigest.keys()}
print srDigestR

{u'politics': (-3.0, 17.0), u'GirlGamers': (0.0, 14.0), u'leagueoflegends': (-1.0, 12.0), u'pics': (-2.0, 28.0)}


Alyssa's values for politics were (-6.0, 24.0) rounded for April and May 2015.  

In [11]:
rRDDExtreme = rRDD.filter(lambda (k,v): v[4] < srDigestR[v[5]][0] or v[4] > srDigestR[v[5]][1])

# 1.0 Find minimum comment timestamp for each post

For each post link_id, find minimum created_utc timestamp [can't trust data is ordered by time] and store in key-value pair (pair RDD) {link_id: min_created_utc},  (Plan B:  set up API to get timestamp for all posts in Reddit)

Fin min time comment using HiveQL

In [56]:
hiveContext.registerDataFrameAsTable(df2, "rcomments")
minTimeDF = hiveContext.sql("select link_id, min(created_utc) as min_utc from rcomments group by link_id")
minTimeRDD = minTimeDF.map(lambda r: (r.link_id, r.min_utc)).persist(StorageLevel.MEMORY_AND_DISK_SER)
# minTimeRDD.count()

48740

# 3.0  Calculate timeSince

Calculate time since post was created based on created_utc and min_created_utc from pair RDD.  In Alyssa's IPython notebook this is called timeSince.  In her R code it's called recency.  

For this I need a left outer join.  For each element (k, v) in self, the resulting RDD will either contain all pairs (k, (v, w)) for w in other, or the pair (k, (v, None)) if no elements in other have key k.  I need to know if there are records in my comment dataset for which there is no "minimum time", as this would indicate a processing error.

Map RDD to get post link_id as key, then join with minTimeRDD.

<b>CAN I USE DATAFRAMES FOR ALL THIS? GETTING TIRED OF REMEMBERING WHAT v[4] IS.  </b>

The only data I need for regression are:  C(subreddit) + timeSince + commentLength + posNegDiff.  Need to keep only comment id (key), score, subreddit, timeSince, comment text from this step.

Format of output RDD is (id,(body,timeSince,score,subreddit))

In [13]:
rRDDXts = (rRDDExtreme.map(lambda (k,v):  (v[2],(k,v[0],v[1],v[2],v[3],v[4],v[5],v[6])))  # pull link_id as key
                      .leftOuterJoin(minTimeRDD) # join on link_id (post)
                      .map(lambda (link_id,(x,min_utc)):  (x[0], (x[1],x[2]-min_utc,x[5],x[6])))
                      .persist(StorageLevel.MEMORY_AND_DISK_SER)
          )

rRDDXts.take(10)

# 4.0  Calculate commentLength

Clean comment body and calculate commentLength.

R gsub:
gsub(pattern, replacement, x, ignore.case = FALSE, perl = FALSE, fixed = FALSE, useBytes = FALSE)

Python re:
re.sub(pattern, repl, string, count=0, flags=0).  Return the string obtained by replacing the leftmost non-overlapping occurrences of pattern in string by the replacement repl.

<B>NOTE:  ALYSSA REMOVED QUOTED COMMENTS.  I REMOVING THEM ALSO BUT IN MY FIRST EXAMPLE I FOUND A "MADE UP" QUOTE THAT ISN'T REALLY QUOTING SOMEONE ELSE'S POST.  

In [14]:
def cleanup(body):

	# Recode HTML codes
	body = re.sub("&gt;", ">", body)
	body = re.sub("&lt;", "<", body)
	body = re.sub("&amp;", "&", body)
	body = re.sub("&nbsp;", " ", body)

	# Remove deleted
	body = re.sub("^[deleted]$", "", body)

	# Remove URL
	body = re.sub("http[[:alnum:][:punct:]]*", " ", body) # url

	# Remove /r/subreddit, /u/user
	body = re.sub("/r/[[:alnum:]]+|/u/[[:alnum:]]+", " ", body)

	# Remove quoted comments
	body = re.sub("(>.*?\\n\\n)+", " ", body)

	# Remove control characters (\n, \b)
	body = re.sub("[[:cntrl:]]", " ", body)

	# Remove single quotation marks (contractions)
	body = re.sub("'", "", body)

	# Remove punctuation
	body = re.sub("[[:punct:]]", " ", body)

	# Replace multiple spaces with single space
	body = re.sub("\\s+", " ", body) # Multiple spaces
	# body = re.sub("^\\s+", "", body) # Space at the start of the string
	# body = re.sub("+\\s$", "", body) # Space at the end of the string
	body = body.strip()

	# Lower case
	body = body.lower()

	# Return comment length (number of words) and body (cleaned up text)
	return body

clbody = cleanup(u"Basically, the hospital's position amounts to:\n\n&gt; If she can't hold her roofies she deserves to be a**f****d and denied medical care and collection of evidence!\n\nNot the *most* progressive attitude...")

Current format of RDD:  (id,(body,timeSince,score,subreddit))
Format of rRDDXtscl:  (id,(commentLength,body,timeSince,score,subreddit))

In [15]:
rRDDXtscl = (rRDDXts.map(lambda (id,(body,timeSince,score,subreddit)): (id,(cleanup(body),timeSince,score,subreddit)))
                    .map(lambda (id,(body,timeSince,score,subreddit)): (id,(len(body.split()),body,timeSince,score,subreddit)))
             ).persist(StorageLevel.MEMORY_AND_DISK_SER)

# 5.0 (Filter out exclusions if necessary; skip for now)

Filter out exclusions.  Further reduces dataset.

# 6.0 Run sentiment analysis and calculate posNegDiff

Use AFINN model to do sentiment analysis.

Finn Årup Nielsen, "A new ANEW: evaluation of a word list for sentiment analysis in microblogs" , Proceedings of the ESWC2011 Workshop on 'Making Sense of Microposts': Big things come in small packages 718 in CEUR Workshop Proceedings: 93-98. 2011 May. Matthew Rowe, Milan Stankovic, Aba-Sah Dadzie, Mariann Hardey (editors)

<B>I'M WONDERING IF CREATING AN AFINN OBJECT EACH TIME IS TAKING TOO LONG.  MIGHT WANT TO REFACTOR THIS TO A PYTHON FUNCTION WITH A DICT LOOKUP AND USE A BROADCAST VARIABLE.  

In [16]:
def sentiment(body):
    afinn = Afinn()
    return afinn.score(body)

sentiment("This is utterly excellent!")

In [17]:
rRDDtscls = (rRDDXtscl.map(lambda (id,(commentLength,body,timeSince,score,subreddit)):  
                        (id,(commentLength,sentiment(body),timeSince,score,subreddit)))
                      .persist(StorageLevel.MEMORY_AND_DISK_SER)
             )

rRDDtscls.take(5)

rRDDtscls.count()

# 7.0 Set up logistic regression inputs with OHE features for categorical variable subredddit

Calculate label from score using srDigestR and create rawData RDD in proper format:  (label, non-categorical variables, categorical variable)

In [18]:
def label(score, subreddit, percentMap):
    if score <= percentMap[subreddit][0]: return 0
    else: return 1

Format of rRDDtscls:  (id,(commentLength,posNegDiff,timeSince,score,subreddit)))

Format of rawData is a tuple:  (label, (0,commentLength), (1,posNegDiff), (2,timeSince), subreddit))

def tupToString(tup):
    return ','.join([str(item) for item in tup])

In [19]:
rawData = (rRDDtscls.map(lambda (id,(commentLength,posNegDiff,timeSince,score,subreddit)):  
                    (label(score,subreddit,srDigestR), (0,commentLength), (1,posNegDiff), (2,timeSince), subreddit))
                    .persist(StorageLevel.MEMORY_AND_DISK_SER)
          )

type(rawData)

rawData.take(5)

rawData.count()

Split data into training, validation and test. Takes about 1-2 minutes for ~5 MB data set, greatly reduced from 85 MB.

In [20]:
weights = [.8, .1, .1]
seed = 42
# Use randomSplit with weights and seed
rawTrainData, rawValData, rawTestData = rawData.randomSplit(weights, seed)

# Cache the data
rawTrainData.persist(StorageLevel.MEMORY_AND_DISK_SER)
rawValData.persist(StorageLevel.MEMORY_AND_DISK_SER)
rawTestData.persist(StorageLevel.MEMORY_AND_DISK_SER)

# These counts are expensive:  ~1 hour for 10 GB input data.
# nAll = rawData.count()
# nTrain = rawTrainData.count()
# nVal = rawValData.count()
# nTest = rawTestData.count()
# print nTrain, nVal, nTest, nTrain + nVal + nTest, nAll

# print rawData.take(1)

PythonRDD[68] at RDD at PythonRDD.scala:43

Create one hot encoding dictionary.

I DON'T NEED ML LAB 4 createOneHotDict BECAUSE I ALREADY HAVE A LIST OF SUBREDDITS IN srDigestR.  Just need to pull keys out of this dict to create my OHEdict. NOTE that my OHEdict is already shifted by 3 to avoid collision with non-categorical variables.

<B> NEED TO FIX OHEdict:  should only pull categories out of TRAINING set, NOT entire set.  Otherwise it's cheating.

OHEdict = {(0, v): k+3 for (k, v) in enumerate(srDigestR)}

In [21]:
def createOHEMap(sr):
    if sr == u'leagueoflegends': return (3,1)
    elif sr == u'pics' : return (4,1)
    elif sr == u'politics' : return (5,1)
    else: return (5,0)

OHEdict[(0,'politics')]

Create OHETrainData.

In [22]:
def createLabeledPoint(point,numFeats):
    label = point[0]
    feats = point[1:]
    sv = SparseVector(numFeats, feats)
    # print sv
    return LabeledPoint(label, sv)

len(OHEdict)+3

createLabeledPoint((1, (0, 36), (1, -11.0), (2, 811), (25, 1)),   len(OHEdict+3  )

In [23]:
# numFeats = len(OHEdict)+3
numFeats = 6
OHETrainData = (rawTrainData.map(lambda (label, t1, t2, t3, sr):
                                       (label, t1, t2, t3, createOHEMap(sr) ))
                            .map(lambda point:  createLabeledPoint(point, numFeats))
                            .persist(StorageLevel.MEMORY_AND_DISK_SER)
                )


OHETrainData.persist(StorageLevel.MEMORY_AND_DISK_SER)

OHETrainData.take(10)

(Create OHEValData and OHETestData; skip for now)

OHEValData = (rawValData.map(lambda (label, t1, t2, t3, sr):
                                       (label, t1, t2, t3, (OHEdict[(0,sr)], 1) ))
                            .map(lambda point:  createLabeledPoint(point, numFeats))
                )

OHETestData = (rawTestData.map(lambda (label, t1, t2, t3, sr):
                                       (label, t1, t2, t3, (OHEdict[(0,sr)], 1) ))
                            .map(lambda point:  createLabeledPoint(point, numFeats))
                )

# 8.0 Run logistic regression

Set up hyperparameters

In [34]:
# fixed hyperparameters
numIters = 500
stepSize = 1.
regParam = 1e-6
regType = 'l2'
includeIntercept = True

Run logistic regression.

<b> LATER, ADD VALIDATION STEP:  ITERATE OVER HYPERPARAMETERS TO FIND BEST COMBINATION.

<b>STOP!!!  Before running LR, fix minTimeRDD with .min()

In [35]:
model0 = LogisticRegressionWithSGD.train(OHETrainData, iterations = numIters, step = stepSize,
                                        regParam = regParam, regType = regType,
                                        intercept = includeIntercept)
# sortedWeights = sorted(model0.weights)
# print sortedWeights[:5], model0.intercept
print model0.weights, model0.intercept

[139.105918341,3.74795117483,120.985595236,-0.00558833628498,2.31164589822,0.93166900632] 4.23741570639


print sortedWeights, model0.intercept

# 9.0 Evaluate results using test set

Calculate accuracy = number of correctly classified examples / total number of examples.

In [43]:
for key in srDigestR.keys():
    print key, df2.filter(df2['subreddit'] == key).count()

politics 316521
GirlGamers 467
leagueoflegends 30034
pics 581447


In [41]:
model0Predictions = OHETrainData.map(lambda point: model0.predict(point.features))
print model0Predictions.take(1000)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [42]:
print OHETrainData.map(lambda point: point.label).take(1000)

[0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0,

In [36]:
model0TotalCorrect = OHETrainData.map(lambda point:  1 if model0.predict(point.features) == point.label else 0).sum()
print model0TotalCorrect

21795


In [37]:
OHETrainDataCount = OHETrainData.count()
print OHETrainDataCount

41571


In [38]:
model0Accuracy = float(model0TotalCorrect) / float(OHETrainData.count())
print model0Accuracy

0.524283755503


<b> TO DO:  Calculate ROC AUC (receiver operating characteristic - area under curve).  

<b>TO DO:  calculate confusion matrix.

In [None]:
trainingPredictionsAuto=(OHETrainData
                     .map(lambda lpoint: model0.predict(lpoint.features))
                     )

In [None]:
print trainingPredictionsAuto.take(150)

In [None]:
numNotNinesAuto = trainingPredictionsAuto.filter(lambda P: 1-P > 0.001).count() # how many predictions are not ~= 1

In [None]:
print numNotNinesAuto

In [None]:
print trainingPredictionsAuto.count()

Train model with default values.  Any better?

In [None]:
modelDefault = LogisticRegressionWithSGD.train(OHETrainData, iterations = numIters,
                                        intercept = includeIntercept)
sortedWeights = sorted(modelDefault.weights)
# print sortedWeights[:5], model0.intercept

In [None]:
print sortedWeights

In [None]:
print modelDefault.weights

In [None]:
trainingPredDefault=(OHETrainData
                     .map(lambda lpoint: modelDefault.predict(lpoint.features))
                     )

In [None]:
print trainingPredDefault.take(150)

In [None]:
print trainingPredDefault.filter(lambda P: 1-P > 0.001).count()

ROC plot

def bucketFeatByCount(featCount):
    """Bucket the counts by powers of two."""
    for i in range(11):
        size = 2 ** i
        if featCount <= size:
            return size
    return -1

featCounts = (OHETrainData
              .flatMap(lambda lp: lp.features.indices)
              .map(lambda x: (x, 1))
              .reduceByKey(lambda x, y: x + y))
featCountsBuckets = (featCounts
                     .map(lambda x: (bucketFeatByCount(x[1]), 1))
                     .filter(lambda (k, v): k != -1)
                     .reduceByKey(lambda x, y: x + y)
                     .collect())

x, y = zip(*featCountsBuckets)
x, y = np.log(x), np.log(y)

def preparePlot(xticks, yticks, figsize=(10.5, 6), hideLabels=False, gridColor='#999999',
                gridWidth=1.0):
    """Template for generating the plot layout."""
    plt.close()
    fig, ax = plt.subplots(figsize=figsize, facecolor='white', edgecolor='white')
    ax.axes.tick_params(labelcolor='#999999', labelsize='10')
    for axis, ticks in [(ax.get_xaxis(), xticks), (ax.get_yaxis(), yticks)]:
        axis.set_ticks_position('none')
        axis.set_ticks(ticks)
        axis.label.set_color('#999999')
        if hideLabels: axis.set_ticklabels([])
    plt.grid(color=gridColor, linewidth=gridWidth, linestyle='-')
    map(lambda position: ax.spines[position].set_visible(False), ['bottom', 'top', 'left', 'right'])
    return fig, ax

labelsAndScores = OHEValData.map(lambda lp:
                                            (lp.label, getP(lp.features, model0.weights, model0.intercept)))
labelsAndWeights = labelsAndScores.collect()
labelsAndWeights.sort(key=lambda (k, v): v, reverse=True)
labelsByWeight = np.array([k for (k, v) in labelsAndWeights])

length = labelsByWeight.size
truePositives = labelsByWeight.cumsum()
numPositive = truePositives[-1]
falsePositives = np.arange(1.0, length + 1, 1.) - truePositives

truePositiveRate = truePositives / numPositive
falsePositiveRate = falsePositives / (length - numPositive)

# Generate layout and plot data
fig, ax = preparePlot(np.arange(0., 1.1, 0.1), np.arange(0., 1.1, 0.1))
ax.set_xlim(-.05, 1.05), ax.set_ylim(-.05, 1.05)
ax.set_ylabel('True Positive Rate (Sensitivity)')
ax.set_xlabel('False Positive Rate (1 - Specificity)')
plt.plot(falsePositiveRate, truePositiveRate, color='#8cbfd0', linestyle='-', linewidth=3.)
plt.plot((0., 1.), (0., 1.), linestyle='--', color='#d6ebf2', linewidth=2.)  # Baseline model
pass