In [99]:
import matplotlib.pyplot as plt
import json
import os
import sys
import re
 
# Path for spark source folder
os.environ['SPARK_HOME'] = "/Users/jonneff/spark-1.4.0-bin-hadoop2.6"

# Append pyspark to Python Path
sys.path.append("/Users/jonneff/spark-1.4.0-bin-hadoop2.6/python")

from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql import SQLContext
# Load in the testing code and check to see if your answer is correct
# If incorrect it will report back '1 test failed' for each failed test
# Make sure to rerun any cell you change before trying the test again
from test_helper import Test
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

# Append afinn to Python Path and import afinn.  Used for pulling data from percentiles.
sys.path.append("/Users/jonneff/anaconda/lib/python2.7/site-packages/")
from afinn import Afinn

# Stuff for logistic regression
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.linalg import SparseVector

In [2]:
from tdigest import TDigest
from numpy.random import random
from operator import add

# 0.0 Read data

Read in data.  For now, on local filesystem.  

<B>ADD CODE TO READ FROM S3.  

In [3]:
df = sqlContext.read.json("/Users/jonneff/Desktop/DE/RC_2007-10")

In [4]:
df.count()

150429

In [5]:
df.dtypes

[('archived', 'boolean'),
 ('author', 'string'),
 ('author_flair_css_class', 'string'),
 ('author_flair_text', 'string'),
 ('body', 'string'),
 ('controversiality', 'bigint'),
 ('created_utc', 'string'),
 ('distinguished', 'string'),
 ('downs', 'bigint'),
 ('edited', 'string'),
 ('gilded', 'bigint'),
 ('id', 'string'),
 ('link_id', 'string'),
 ('name', 'string'),
 ('parent_id', 'string'),
 ('retrieved_on', 'bigint'),
 ('score', 'bigint'),
 ('score_hidden', 'boolean'),
 ('subreddit', 'string'),
 ('subreddit_id', 'string'),
 ('ups', 'bigint')]

In [6]:
df.printSchema()

root
 |-- archived: boolean (nullable = true)
 |-- author: string (nullable = true)
 |-- author_flair_css_class: string (nullable = true)
 |-- author_flair_text: string (nullable = true)
 |-- body: string (nullable = true)
 |-- controversiality: long (nullable = true)
 |-- created_utc: string (nullable = true)
 |-- distinguished: string (nullable = true)
 |-- downs: long (nullable = true)
 |-- edited: string (nullable = true)
 |-- gilded: long (nullable = true)
 |-- id: string (nullable = true)
 |-- link_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- parent_id: string (nullable = true)
 |-- retrieved_on: long (nullable = true)
 |-- score: long (nullable = true)
 |-- score_hidden: boolean (nullable = true)
 |-- subreddit: string (nullable = true)
 |-- subreddit_id: string (nullable = true)
 |-- ups: long (nullable = true)



In [7]:
df.head(1)

[Row(archived=True, author=u'bostich', author_flair_css_class=None, author_flair_text=None, body=u'test', controversiality=0, created_utc=u'1192450635', distinguished=None, downs=0, edited=u'false', gilded=0, id=u'c0299an', link_id=u't3_5yba3', name=u't1_c0299an', parent_id=u't3_5yba3', retrieved_on=1427426409, score=1, score_hidden=False, subreddit=u'reddit.com', subreddit_id=u't5_6', ups=1)]

In [8]:
df.take(1)

[Row(archived=True, author=u'bostich', author_flair_css_class=None, author_flair_text=None, body=u'test', controversiality=0, created_utc=u'1192450635', distinguished=None, downs=0, edited=u'false', gilded=0, id=u'c0299an', link_id=u't3_5yba3', name=u't1_c0299an', parent_id=u't3_5yba3', retrieved_on=1427426409, score=1, score_hidden=False, subreddit=u'reddit.com', subreddit_id=u't5_6', ups=1)]

# 1.0 Find minimum comment timestamp for each post

For each post link_id, find minimum created_utc timestamp [can't trust data is ordered by time] and store in key-value pair (pair RDD) {link_id: min_created_utc},  (Plan B:  set up API to get timestamp for all posts in Reddit)

In [9]:
redditRDD = df.rdd

In [10]:
redditRDD.take(1)

[Row(archived=True, author=u'bostich', author_flair_css_class=None, author_flair_text=None, body=u'test', controversiality=0, created_utc=u'1192450635', distinguished=None, downs=0, edited=u'false', gilded=0, id=u'c0299an', link_id=u't3_5yba3', name=u't1_c0299an', parent_id=u't3_5yba3', retrieved_on=1427426409, score=1, score_hidden=False, subreddit=u'reddit.com', subreddit_id=u't5_6', ups=1)]

<B>I'M KEEPING TOO MUCH DATA.  AT THIS STEP, AND SUBSEQUENT STEPS, THROW OUT DATA I'M NOT USING.

In [11]:
rRDD = df.map(lambda r: (r.id, (r.body, int(r.created_utc), r.link_id, r.parent_id, int(r.score), r.subreddit, r.subreddit_id)))

In [12]:
rRDD.take(1)

[(u'c0299an',
  (u'test', 1192450635, u't3_5yba3', u't3_5yba3', 1, u'reddit.com', u't5_6'))]

Sort to find minimum timestamp created_utc for each post link_id

In [13]:
rRDD.count()

150429

In [14]:
minTimeRDD = (rRDD.map(lambda (k, v): (v[2],v[1])) # maps to (link_id, created_utc)
                  .reduceByKey(lambda a, b:  a if a < b else b)
              )

In [15]:
minTimeRDD.take(1)

[(u't3_5yxsg', 1193154263)]

In [16]:
minTimeRDD.count()

24370

# 2.0 Filter to include only extreme up and down votes (top 3% of subreddit)

Filter to retain only records that are top or bottom 3% in comment score (upvotes-downvotes) of their subreddit.  Reduces dataset for all subsequent processing.

Find 3% and 97% percentiles for each subreddit.  Use T-digest data structure for highly accurate approximate percentiles: 
http://dataorigami.net/blogs/napkin-folding/19055451-percentile-and-quantile-estimation-of-big-data-the-t-digest

In [17]:
def digest1(value):
    digest = TDigest()
    digest.update(value)
    return digest

Need to get at scores for each subreddit, THEN map using digest.


<b>REFACTOR AND MAKE THIS RUN FASTER.  I'M CALLING DIGEST1 TOO MANY TIMES.</b>

In [18]:
subredditDigestRDD = (rRDD.map(lambda (k, v):  (v[5], v[4]))
                        .map(lambda (k, v): (k, digest1(v)))  
                        .reduceByKey(lambda a, b:  a + b)
                   )

In [19]:
type(subredditDigestRDD)

pyspark.rdd.PipelinedRDD

Creating digest dictionary takes a few minutes.  

In [20]:
subredditDigest = subredditDigestRDD.collectAsMap()

In [21]:
type(subredditDigest['politics'])

tdigest.tdigest.TDigest

In [22]:
subredditDigest['politics'].percentile(3)

-5.3696052631578945

In [23]:
subredditDigest['politics'].percentile(97)

16.797647058823536

Alyssa's values for politics were (-6.0, 24.0) rounded.  

Set limits for what data to include by percentile.  Use only data less than lowPercentile or greater than highPercentile.

In [46]:
lowPercentile, highPercentile = 3, 97

In [47]:
srDigestR = {key : (round(subredditDigest[key].percentile(lowPercentile)), 
                    round(subredditDigest[key].percentile(highPercentile))) 
             for key in subredditDigest.keys()}

In [48]:
srDigestR['politics']

(-5.0, 17.0)

In [49]:
len(srDigestR)

32

In [50]:
print srDigestR

{u'eo': (1.0, 1.0), u'arxiv': (1.0, 1.0), u'zh': (1.0, 1.0), u'features': (-1.0, 3.0), u'request': (0.0, 1.0), u'it': (1.0, 1.0), u'sv': (1.0, 1.0), u'gadgets': (-0.0, 2.0), u'nsfw': (-3.0, 11.0), u'politics': (-5.0, 17.0), u'id': (1.0, 2.0), u'es': (1.0, 1.0), u'ru': (0.0, 1.0), u'netsec': (0.0, 2.0), u'ads': (0.0, 4.0), u'entertainment': (-1.0, 2.0), u'tr': (0.0, 1.0), u'sports': (-2.0, 3.0), u'freeculture': (1.0, 1.0), u'gaming': (-0.0, 5.0), u'fr': (1.0, 1.0), u'business': (-0.0, 2.0), u'reddit.com': (-3.0, 23.0), u'de': (0.0, 4.0), u'lipstick.com': (2.0, 2.0), u'ja': (0.0, 2.0), u'science': (-4.0, 20.0), u'joel': (1.0, 1.0), u'no': (1.0, 1.0), u'programming': (-4.0, 23.0), u'bugs': (1.0, 3.0), u'sl': (1.0, 1.0)}


Filter records to retain only top and bottom 3% of comment scores.

<b>NOTE:  NEED TO BROADCAST srDigestR WHEN I MOVE TO CLUSTER.

In [28]:
rRDDExtreme = rRDD.filter(lambda (k,v): v[4] < srDigestR[v[5]][0] or v[4] > srDigestR[v[5]][1])

In [29]:
rRDDExtreme.count()

8760

# 3.0  Calculate timeSince

Calculate time since post was created based on created_utc and min_created_utc from pair RDD.  In Alyssa's IPython notebook this is called timeSince.  In her R code it's called recency.  

For this I need a left outer join.  For each element (k, v) in self, the resulting RDD will either contain all pairs (k, (v, w)) for w in other, or the pair (k, (v, None)) if no elements in other have key k.  I need to know if there are records in my comment dataset for which there is no "minimum time", as this would indicate a processing error.

Map RDD to get post link_id as key, then join with minTimeRDD.

<b>CAN I USE DATAFRAMES FOR ALL THIS? GETTING TIRED OF REMEMBERING WHAT v[4] IS.  </b>

The only data I need for regression are:  C(subreddit) + timeSince + commentLength + posNegDiff.  Need to keep only comment id (key), score, subreddit, timeSince, comment text from this step.

Format of output RDD is (id,(body,timeSince,score,subreddit))

In [30]:
rRDDXts = (rRDDExtreme.map(lambda (k,v):  (v[2],(k,v[0],v[1],v[2],v[3],v[4],v[5],v[6])))  # pull link_id as key
                      .leftOuterJoin(minTimeRDD) # join on link_id (post)
                      .map(lambda (link_id,(x,min_utc)):  (x[0], (x[1],x[2]-min_utc,x[5],x[6])))
                      .cache()
          )

In [31]:
rRDDXts.take(10)

[(u'c02biau', (u'[deleted]', 0, -10, u'reddit.com')),
 (u'c029tih',
  (u"Basically, the hospital's position amounts to:\n\n&gt; If she can't hold her roofies she deserves to be assfucked and denied medical care and collection of evidence!\n\nNot the *most* progressive attitude...",
   0,
   70,
   u'reddit.com')),
 (u'c029tkl',
  (u'I guess I kind of understand how a drunk person cannot "consent" to a rape kit because of the type of tests performed. But jesus, that\'s when most rape cases happen -- especially date rape! Sad.',
   811,
   36,
   u'reddit.com')),
 (u'c029tl9',
  (u'"an aftereffect of date rape drug?" No.\r\n\r\nThe Reddit headline is not the place for original research.',
   1151,
   -26,
   u'reddit.com')),
 (u'c029tmd',
  (u"What the hell are you trying to say? Sure the 'after' was an unnecessary addition to the 'effect', as intoxication is actually the *primary* effect of date rape drugs, but a minor semantics error isn't worth a vitriolic comment.",
   1705,
   27,
 

# 4.0  Calculate commentLength

Clean comment body and calculate commentLength.

R gsub:
gsub(pattern, replacement, x, ignore.case = FALSE, perl = FALSE, fixed = FALSE, useBytes = FALSE)

Python re:
re.sub(pattern, repl, string, count=0, flags=0).  Return the string obtained by replacing the leftmost non-overlapping occurrences of pattern in string by the replacement repl.

<B>NOTE:  ALYSSA REMOVED QUOTED COMMENTS.  I REMOVING THEM ALSO BUT IN MY FIRST EXAMPLE I FOUND A "MADE UP" QUOTE THAT ISN'T REALLY QUOTING SOMEONE ELSE'S POST.  

In [32]:
def cleanup(body):

	# Recode HTML codes
	body = re.sub("&gt;", ">", body)
	body = re.sub("&lt;", "<", body)
	body = re.sub("&amp;", "&", body)
	body = re.sub("&nbsp;", " ", body)

	# Remove deleted
	body = re.sub("^[deleted]$", "", body)

	# Remove URL
	body = re.sub("http[[:alnum:][:punct:]]*", " ", body) # url

	# Remove /r/subreddit, /u/user
	body = re.sub("/r/[[:alnum:]]+|/u/[[:alnum:]]+", " ", body)

	# Remove quoted comments
	body = re.sub("(>.*?\\n\\n)+", " ", body)

	# Remove control characters (\n, \b)
	body = re.sub("[[:cntrl:]]", " ", body)

	# Remove single quotation marks (contractions)
	body = re.sub("'", "", body)

	# Remove punctuation
	body = re.sub("[[:punct:]]", " ", body)

	# Replace multiple spaces with single space
	body = re.sub("\\s+", " ", body) # Multiple spaces
	# body = re.sub("^\\s+", "", body) # Space at the start of the string
	# body = re.sub("+\\s$", "", body) # Space at the end of the string
	body = body.strip()

	# Lower case
	body = body.lower()

	# Return comment length (number of words) and body (cleaned up text)
	return body

In [33]:
clbody = cleanup(u"Basically, the hospital's position amounts to:\n\n&gt; If she can't hold her roofies she deserves to be assfucked and denied medical care and collection of evidence!\n\nNot the *most* progressive attitude...")

In [34]:
print len(clbody.split())

11


In [35]:
print clbody

basically, the hospitals position amounts to: not the *most* progressive attitude...


Current format of RDD:  (id,(body,timeSince,score,subreddit))
Format of rRDDXtscl:  (id,(commentLength,body,timeSince,score,subreddit))

In [36]:
rRDDXtscl = (rRDDXts.map(lambda (id,(body,timeSince,score,subreddit)): (id,(cleanup(body),timeSince,score,subreddit)))
                    .map(lambda (id,(body,timeSince,score,subreddit)): (id,(len(body.split()),body,timeSince,score,subreddit)))
             )

# 5.0 (Filter out exclusions if necessary; skip for now)

Filter out exclusions.  Further reduces dataset.

# 6.0 Run sentiment analysis and calculate posNegDiff

Use AFINN model to do sentiment analysis.

Finn Årup Nielsen, "A new ANEW: evaluation of a word list for sentiment analysis in microblogs" , Proceedings of the ESWC2011 Workshop on 'Making Sense of Microposts': Big things come in small packages 718 in CEUR Workshop Proceedings: 93-98. 2011 May. Matthew Rowe, Milan Stankovic, Aba-Sah Dadzie, Mariann Hardey (editors)

<B>I'M WONDERING IF CREATING AN AFINN OBJECT EACH TIME IS TAKING TOO LONG.  MIGHT WANT TO REFACTOR THIS TO A PYTHON FUNCTION WITH A DICT LOOKUP AND USE A BROADCAST VARIABLE.  

In [37]:
def sentiment(body):
    afinn = Afinn()
    return afinn.score(body)

In [38]:
sentiment("This is utterly excellent!")

3.0

In [40]:
rRDDtscls = (rRDDXtscl.map(lambda (id,(commentLength,body,timeSince,score,subreddit)):  
                        (id,(commentLength,sentiment(body),timeSince,score,subreddit)))
                      .cache()
             )

In [41]:
rRDDtscls.take(5)

[(u'c02biau', (1, 0.0, 0, -10, u'reddit.com')),
 (u'c029tih', (11, 0.0, 0, 70, u'reddit.com')),
 (u'c029tkl', (36, -11.0, 811, 36, u'reddit.com')),
 (u'c029tl9', (17, -5.0, 1151, -26, u'reddit.com')),
 (u'c029tmd', (39, -11.0, 1705, 27, u'reddit.com'))]

In [57]:
rRDDtscls.count()

8760

# 7.0 Set up logistic regression inputs with OHE features for categorical variable subredddit

Calculate label from score using srDigestR and create rawData RDD in proper format:  (label, non-categorical variables, categorical variable)

In [51]:
def label(score, subreddit, percentMap):
    if score <= percentMap[subreddit][0]: return 0
    else: return 1

Format of rRDDtscls:  (id,(commentLength,posNegDiff,timeSince,score,subreddit)))

Format of rawData is a tuple:  (label, (0,commentLength), (1,posNegDiff), (2,timeSince), subreddit))

def tupToString(tup):
    return ','.join([str(item) for item in tup])

In [120]:
rawData = (rRDDtscls.map(lambda (id,(commentLength,posNegDiff,timeSince,score,subreddit)):  
                    (label(score,subreddit,srDigestR), (0,commentLength), (1,posNegDiff), (2,timeSince), subreddit))
                    .cache()
          )

In [121]:
type(rawData)

pyspark.rdd.PipelinedRDD

In [122]:
rawData.take(5)

[(0, (0, 1), (1, 0.0), (2, 0), u'reddit.com'),
 (1, (0, 11), (1, 0.0), (2, 0), u'reddit.com'),
 (1, (0, 36), (1, -11.0), (2, 811), u'reddit.com'),
 (0, (0, 17), (1, -5.0), (2, 1151), u'reddit.com'),
 (1, (0, 39), (1, -11.0), (2, 1705), u'reddit.com')]

In [123]:
rawData.count()

8760

Split data into training, validation and test. 

In [124]:
weights = [.8, .1, .1]
seed = 42
# Use randomSplit with weights and seed
rawTrainData, rawValData, rawTestData = rawData.randomSplit(weights, seed)

# Cache the data
rawTrainData.cache()
rawValData.cache()
rawTestData.cache()

nAll = rawData.count()
nTrain = rawTrainData.count()
nVal = rawValData.count()
nTest = rawTestData.count()
print nTrain, nVal, nTest, nTrain + nVal + nTest, nAll

# print rawData.take(1)

7035 882 843 8760 8760


Create one hot encoding dictionary.

I DON'T NEED ML LAB 4 createOneHotDict BECAUSE I ALREADY HAVE A LIST OF SUBREDDITS IN srDigestR.  Just need to pull keys out of this dict to create my OHEdict. NOTE that my OHEdict is already shifted by 3 to avoid collision with non-categorical variables.

In [116]:
OHEdict = {(0, v): k+3 for (k, v) in enumerate(srDigestR)}

In [117]:
print OHEdict

{(0, u'sports'): 20, (0, u'nsfw'): 11, (0, u'sv'): 9, (0, u'gadgets'): 10, (0, u'no'): 31, (0, u'science'): 29, (0, u'programming'): 32, (0, u'gaming'): 22, (0, u'tr'): 19, (0, u'business'): 24, (0, u'ads'): 17, (0, u'joel'): 30, (0, u'zh'): 5, (0, u'id'): 13, (0, u'fr'): 23, (0, u'freeculture'): 21, (0, u'ru'): 15, (0, u'lipstick.com'): 27, (0, u'politics'): 12, (0, u'bugs'): 33, (0, u'reddit.com'): 25, (0, u'arxiv'): 4, (0, u'features'): 6, (0, u'es'): 14, (0, u'netsec'): 16, (0, u'it'): 8, (0, u'entertainment'): 18, (0, u'request'): 7, (0, u'de'): 26, (0, u'ja'): 28, (0, u'eo'): 3, (0, u'sl'): 34}


In [130]:
OHEdict[(0,'politics')]

12

Create OHETrainData.

In [132]:
OHETrainData = rawTrainData.map(lambda (label, t1, t2, t3, sr):
                                       (label, t1, t2, t3, (OHEdict[(0,sr)], 1) )
                                )

In [133]:
OHETrainData.take(10)

[(0, (0, 1), (1, 0.0), (2, 0), (25, 1)),
 (1, (0, 11), (1, 0.0), (2, 0), (25, 1)),
 (1, (0, 36), (1, -11.0), (2, 811), (25, 1)),
 (0, (0, 17), (1, -5.0), (2, 1151), (25, 1)),
 (1, (0, 39), (1, -11.0), (2, 1705), (25, 1)),
 (0, (0, 112), (1, -11.0), (2, 4964), (25, 1)),
 (1, (0, 78), (1, -3.0), (2, 5959), (25, 1)),
 (0, (0, 98), (1, -5.0), (2, 11762), (25, 1)),
 (0, (0, 66), (1, -8.0), (2, 15666), (25, 1)),
 (0, (0, 9), (1, 0.0), (2, 17470), (25, 1))]

(Create OHEValData and OHETestData; skip for now)

# 8.0 Run logistic regression

# ALYSSA'S IPYTHON NOTEBOOK BELOW

In [2]:
DATABASE = 'reddit'

In [3]:
engine = create_engine("mysql+pymysql://root@localhost/" + str(DATABASE))
con = engine.connect()
dataid = 1022

In [4]:
redditDB = con.execute("SELECT * FROM comments6")

In [5]:
reddit = pd.DataFrame(redditDB.fetchall())
reddit.columns = redditDB.keys()

In [6]:
reddit.head()

Unnamed: 0,commentAuthor,commentCreated,commentID,commentLink,commentScore,postID,subreddit,postAuthor,postBody,postCreated,...,NegRaw,vNegRaw,vPos,Pos,Neg,vNeg,posNegRatio,posNegDiff,commentBody2,commentLengthSW
0,Ajunadeeps,1432770000,crmzgnx,http://www.reddit.com/r/leagueoflegends/commen...,586,37ihzo,leagueoflegends,DBlackjack21,http://gfycat.com/HopefulGleefulHoiho,1432770000,...,0,0,,,,,0.0,0.6,Best backdoor since gay porn.,5
1,DBlackjack21,1432780000,crn46mg,http://www.reddit.com/r/leagueoflegends/commen...,441,37ihzo,leagueoflegends,DBlackjack21,http://gfycat.com/HopefulGleefulHoiho,1432770000,...,0,0,,,,,0.0,0.2,[Objection!](http://gfycat.com/OptimisticSkinn...,5
2,ThePowerOfAura,1432780000,crn58hk,http://www.reddit.com/r/leagueoflegends/commen...,325,37ihzo,leagueoflegends,DBlackjack21,http://gfycat.com/HopefulGleefulHoiho,1432770000,...,0,0,,,,,0.0,0.166667,"Dude and I thought Sion was easy to play, clea...",18
3,SkyhuntL,1432790000,crn9iqq,http://www.reddit.com/r/leagueoflegends/commen...,125,37ihzo,leagueoflegends,DBlackjack21,http://gfycat.com/HopefulGleefulHoiho,1432770000,...,0,0,,,,,,0.0,"Or a driving license, that is.",7
4,offlightsedge,1432790000,crnc5oz,http://www.reddit.com/r/leagueoflegends/commen...,33,37ihzo,leagueoflegends,DBlackjack21,http://gfycat.com/HopefulGleefulHoiho,1432770000,...,0,0,,,,,0.0,0.333333,Preparing for a wide turn like that looks like...,12


In [7]:
reddit['timeSince'].describe()

count     253519.000000
mean       32795.809746
std        57652.954846
min            0.000000
25%         9216.500000
50%        24702.000000
75%        43076.000000
max      3279309.000000
Name: timeSince, dtype: float64

In [8]:
reddit['commentScore'].describe()

count    287829.000000
mean          6.800795
std          34.962045
min        -416.000000
25%           0.000000
50%           1.000000
75%           4.000000
max        2143.000000
Name: commentScore, dtype: float64

In [9]:
redditOrig = reddit

In [10]:
redditA = reddit[['commentID', 'subreddit', 'commentScore', 'timeSince', 'commentLength', 'posNegDiff']]

In [11]:
def perc(subreddit):
    return (round(np.percentile(reddit[reddit['subreddit'] == subreddit]['commentScore'], 3, axis = 0)), round(np.percentile(reddit[reddit['subreddit'] == subreddit]['commentScore'], 97, axis = 0)))

In [12]:
LoLper = perc('leagueoflegends')
GGper = perc('GirlGamers')
picsper = perc('pics')
polper = perc('politics')

In [13]:
print(LoLper)
print(GGper)
print(picsper)
print(polper)

(-7.0, 66.0)
(0.0, 26.0)
(-5.0, 60.0)
(-6.0, 24.0)


In [14]:
def score(subreddit, per):
    data = redditA[redditA['subreddit'] == subreddit]
    data['scoreC'] = np.nan
    data['scoreC'][data['commentScore'] <= per[0]] = 0
    data['scoreC'][data['commentScore'] >= per[1]] = 1
    return data

In [15]:
LoL = score('leagueoflegends', LoLper)
GG = score('GirlGamers', GGper)
pic = score('pics', picsper)
pol = score('politics', polper)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the the ca

In [16]:
print LoL['scoreC'].describe()
print GG['scoreC'].describe()
print pic['scoreC'].describe()
print pol['scoreC'].describe()

count    6256.000000
mean        0.468191
std         0.499027
min         0.000000
25%         0.000000
50%         0.000000
75%         1.000000
max         1.000000
Name: scoreC, dtype: float64
count    499.000000
mean       0.386774
std        0.487500
min        0.000000
25%        0.000000
50%        0.000000
75%        1.000000
max        1.000000
Name: scoreC, dtype: float64
count    5535.000000
mean        0.487986
std         0.499901
min         0.000000
25%         0.000000
50%         0.000000
75%         1.000000
max         1.000000
Name: scoreC, dtype: float64
count    3763.000000
mean        0.484985
std         0.499841
min         0.000000
25%         0.000000
50%         0.000000
75%         1.000000
max         1.000000
Name: scoreC, dtype: float64


In [17]:
redditB = pd.concat([LoL, GG, pic, pol])

In [18]:
redditB.groupby('scoreC').mean()

Unnamed: 0_level_0,commentScore,timeSince,commentLength,posNegDiff
scoreC,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,-16.081737,20443.331973,30.225986,-0.016371
1,142.337997,10461.994676,27.414966,0.005304


In [19]:
y, X = dmatrices('scoreC ~ C(subreddit) + timeSince + commentLength + posNegDiff', redditB, return_type = "dataframe")
print X.columns

Index([u'Intercept', u'C(subreddit)[T.leagueoflegends]', u'C(subreddit)[T.pics]', u'C(subreddit)[T.politics]', u'timeSince', u'commentLength', u'posNegDiff'], dtype='object')


In [20]:
X = X.rename(columns = {'C(subreddit)[T.leagueoflegends]': 'subLoL', 
                        'C(subreddit)[T.pics]': 'subPics', 
                        'C(subreddit)[T.politics]': 'subPol'})

In [21]:
y = np.ravel(y)

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

In [23]:
model = LogisticRegression()
model = model.fit(X_train, y_train)

In [24]:
f = open('model', 'w')
cPickle.dump(model, f)
f.close()

In [25]:
pd.DataFrame(zip(X.columns, np.transpose(model.coef_)))

Unnamed: 0,0,1
0,Intercept,[0.410245670233]
1,subLoL,[0.0583930466606]
2,subPics,[0.244273666339]
3,subPol,[0.100347697994]
4,timeSince,[-7.30151157008e-05]
5,commentLength,[-0.000133795596042]
6,posNegDiff,[0.0322472638634]


In [26]:
predicted = model.predict(X_test)
print predicted

[ 1.  1.  1. ...,  0.  0.  1.]


In [27]:
probs = model.predict_proba(X_test)
print probs

[[ 0.26103584  0.73896416]
 [ 0.45698479  0.54301521]
 [ 0.49617659  0.50382341]
 ..., 
 [ 0.62978508  0.37021492]
 [ 0.79438198  0.20561802]
 [ 0.27561929  0.72438071]]


In [28]:
print metrics.accuracy_score(y_test, predicted)
print metrics.roc_auc_score(y_test, probs[:, 1])

0.646433632499
0.709111871693


In [29]:
print metrics.confusion_matrix(y_test, predicted)
print metrics.classification_report(y_test, predicted)

[[1400  806]
 [ 691 1337]]
             precision    recall  f1-score   support

        0.0       0.67      0.63      0.65      2206
        1.0       0.62      0.66      0.64      2028

avg / total       0.65      0.65      0.65      4234



In [30]:
model.predict_proba(np.array([1, 0, 0, 0, 2*60*60, 30, .001]))

array([[ 0.42780781,  0.57219219]])

In [31]:
model.predict_proba(np.array([1, 0, 0, 0, 10*60*60, 10, -.1]))

array([[ 0.85967768,  0.14032232]])

In [32]:
model.predict_proba(np.array([0, 0, 1, 0, 1*60*60, 10, 2]))

array([[ 0.38821503,  0.61178497]])