In [54]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import pyspark
import numpy as np
import string
import re
from pyspark.sql.functions import isnan, when, count, col

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import CountVectorizer, IDF
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType
from pyspark.sql import Row
import pickle
# Set of the ASCII punctuation characters from the standard `string` module.
PUNCTUATION = set(string.punctuation)
# Set of NLTK's English stop words (the NLTK 'stopwords' corpus must be available).
STOPWORDS = set(stopwords.words('english'))

In [2]:
# Read the JSON-lines training file from S3 into a Spark DataFrame.
tldr_path = 's3://aws-logs-816063959671-us-east-1/data/tldr-training-data.jsonl'
df = spark.read.json(tldr_path)

In [3]:
# Mark the full DataFrame for caching so repeated actions below can reuse it.
df.cache()

DataFrame[author: string, body: string, content: string, content_len: bigint, id: string, normalizedBody: string, subreddit: string, subreddit_id: string, summary: string, summary_len: bigint, title: string]

In [12]:
# Total number of rows in the full dataset.
df.count()

3084410

In [11]:
# Show the column names and types Spark inferred from the JSON.
df.printSchema()

root
 |-- author: string (nullable = true)
 |-- body: string (nullable = true)
 |-- content: string (nullable = true)
 |-- content_len: long (nullable = true)
 |-- id: string (nullable = true)
 |-- normalizedBody: string (nullable = true)
 |-- subreddit: string (nullable = true)
 |-- subreddit_id: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- summary_len: long (nullable = true)
 |-- title: string (nullable = true)



# Create a subset for testing

In [24]:
# Draw a ~10% sample (without replacement) of the full data for faster iteration.
# The fixed seed keeps the sample, and every count derived from it, reproducible
# across kernel restarts; without it each run would pick different rows.
subset = df.sample(withReplacement=False, fraction=0.1, seed=42)
subset.cache()
# Trigger an action so the sample is materialised, and display its size.
subset.count()

307772

# Investigating Columns

In [93]:
# Peek at the id, subreddit, title and the three body-like text columns.
subset.select(
    ['body', 'content', 'id', 'normalizedBody', 'subreddit', 'title']
).show(5)

+--------------------+--------------------+-------+--------------------+---------------+--------------------+
|                body|             content|     id|      normalizedBody|      subreddit|               title|
+--------------------+--------------------+-------+--------------------+---------------+--------------------+
|This is back when...|This is back when...|c6cklgh|This is back when...|      AskReddit|                null|
|There's no defini...|There's no defini...|c6czxmv|There's no defini...|            tf2|                null|
|If this Plan B is...|If this Plan B is...|c6de3nf|If this Plan B is...|TwoXChromosomes|                null|
|It doesn't sound ...|It doesn't sound ...|c6elk21|It doesn't sound ...|  YouShouldKnow|                null|
|"Going for the fr...|Going for the fre...|c6gyhwn|"Going for the fr...|      askseddit|Getting mixed mes...|
+--------------------+--------------------+-------+--------------------+---------------+--------------------+
only showi

In [91]:
# Per-column count of NaN entries only (null values are not matched by isnan).
subset.select(
    *(count(when(isnan(name), name)).alias(name) for name in subset.columns)
).show()

+------+----+-------+-----------+---+--------------+---------+------------+-------+-----------+-----+
|author|body|content|content_len| id|normalizedBody|subreddit|subreddit_id|summary|summary_len|title|
+------+----+-------+-----------+---+--------------+---------+------------+-------+-----------+-----+
|     0|   0|      0|          0|  0|             0|        0|           0|      0|          0|    0|
+------+----+-------+-----------+---+--------------+---------+------------+-------+-----------+-----+



In [92]:
# Per-column count of entries that are either NaN or null.
subset.select(
    *(
        count(when(isnan(name) | col(name).isNull(), name)).alias(name)
        for name in subset.columns
    )
).show()

+------+----+-------+-----------+---+--------------+---------+------------+-------+-----------+------+
|author|body|content|content_len| id|normalizedBody|subreddit|subreddit_id|summary|summary_len| title|
+------+----+-------+-----------+---+--------------+---------+------------+-------+-----------+------+
|     0|   0|      0|          0|  0|             0|       14|          14|      0|          0|187800|
+------+----+-------+-----------+---+--------------+---------+------------+-------+-----------+------+



In [98]:
# Share of sampled rows whose title is null, rounded to a whole percentage.
untitled_rows = subset.where(subset["title"].isNull()).count()
untitled_pct = round(untitled_rows / subset.count() * 100)
print('{}% of the subset is missing reddit post titles'.format(untitled_pct))

61% of the subset is missing reddit post titles


## 14 entries have null subreddits. Investigate further

#### Also, explore the difference between body, content, and normalized body

I am going to look at each entry where subreddit is null to make sure nothing is wrong with it, and also use these entries to understand the differences between the textual features.

In [141]:
# Collect the rows with no subreddit into a local pandas DataFrame for inspection.
subreddit_null = subset.where(subset.subreddit.isNull()).toPandas()

In [143]:
# Print the four text fields of every null-subreddit row for manual review.
# The row count comes from the frame itself rather than a hard-coded 14: the
# subset is a random sample, so the number of null-subreddit rows can vary.
separator = '-'*60
text_fields = (
    ('Body:', 'body'),
    ('Content:', 'content'),
    ('Normalized Body:', 'normalizedBody'),
    ('Summary:', 'summary'),
)
for i in range(len(subreddit_null)):
    print('index: ' + str(i))
    for label, column in text_fields:
        print(label)
        print(subreddit_null.loc[i, column])
        print(separator)
    # A second rule marks the end of one entry.
    print(separator)

index: 0
Body:
#**How To Enter**

1. Visit [

2. Create an eBay account (or just login)
3. Using *buy now* items, build your dream eBay collection, but keep it under $1,000 or it won't count.
4. Then come back to this thread, and submit your [2] best collections as top level comments. Submissions that are not top level comments will not count. If you're submitting two collection, please put them in two different comments.
5. The MFA community (r/MaleFashionAdvice) will judge your submission by voting on your comment.
6. We will choose [3] winners based on the number of upvotes using the top sort.

-------------

**Contest ends on July 20^th, 2014 and winners will be announced on July 31^st, 2014. We will direct message the winners via their reddit username. See the complete rules here: - 

---------------------

#**Prizes**

##1^st Place - wins their entire wardrobe up to $1,000.
##2^nd Place - wins $500 towards their dream wardrobe.
##3^rd Place - wins $250 towards their dream wardrob

These all happen to be spam posts, so for cleaning, all rows where subreddit is null should be dropped. 

In terms of what these textual features are:
* __body__ appears to be the original post, with the tl;dr (aka 'too long; didn't read', reddit's term for a summary)
* __content__ is a cleaned version of __body__, without the tl;dr
* __normalizedBody__ is a cleaned version of __body__ (with the tl;dr)
* __summary__ is simply the tl;dr

An approach for EDA should be to confirm that the content length is the same as the length of normalizedBody without the tl;dr/summary. Summaries with fewer than 2-3 words should be reviewed and potentially dropped, since I don't want to summarize entries with only 2-3 words.

# Exploratory Data Analysis

### Exploring subreddits

In [47]:
# Compare the number of distinct subreddit values with the total row count.
subreddit_column = df.select("subreddit")
nunique_subreddit = subreddit_column.distinct().count()
total_len_subreddit = subreddit_column.count()
subreddit_share_pct = round(nunique_subreddit/total_len_subreddit*100,2)
print('There are {} different subreddits, which is roughly {}% of the dataframe.'.format(
        nunique_subreddit, subreddit_share_pct))

There are 29740 different subreddits, which is roughly 0.96% of the dataframe.


In [144]:
# Re-check, column by column, how many entries are NaN or null before cleaning.
subset.select(
    *(
        count(when(isnan(name) | col(name).isNull(), name)).alias(name)
        for name in subset.columns
    )
).show()

+------+----+-------+-----------+---+--------------+---------+------------+-------+-----------+------+
|author|body|content|content_len| id|normalizedBody|subreddit|subreddit_id|summary|summary_len| title|
+------+----+-------+-----------+---+--------------+---------+------------+-------+-----------+------+
|     0|   0|      0|          0|  0|             0|       14|          14|      0|          0|187800|
+------+----+-------+-----------+---+--------------+---------+------------+-------+-----------+------+



There are 14 null subreddits (which were identified as spam above), which should be dropped

In [152]:
# Filters out all rows where subreddit is null
subset = subset.filter(df.subreddit.isNotNull())

### Exploring summary length

In [154]:
# Summary statistics (count, mean, stddev, min, max) for the summary_len column.
subset.describe('summary_len').show()

+-------+------------------+
|summary|       summary_len|
+-------+------------------+
|  count|            307758|
|   mean| 25.83364201742928|
| stddev|24.568501131492884|
|    min|                 1|
|    max|               377|
+-------+------------------+



Some of these have a min of 1 word, which is far too short (even 2-5 words may be too short), while some have 377 words, which is too long.

### Examining summaries with fewer than 5 words

In [185]:
# Count very short summaries. Each Spark count runs once and is reused, so the
# "< 5 words" action is not executed twice.
one_word_count = subset.filter(subset.summary_len == 1).count()
under_five_count = subset.filter(subset.summary_len < 5).count()
print('{} summaries with 1 word'.format(one_word_count))
print('{} summaries with less than 5 words'.format(under_five_count))
print('{}% of all entries have less than 5 words'.format(
    round(under_five_count/subset.count()*100,2)))

1849 summaries with 1 word
15819 summaries with less than 5 words
5.14% of all entries have less than 5 words


In [162]:
# Show a few of the summaries whose summary_len is exactly 1.
subset.where(subset.summary_len == 1).select('summary').show(5)

+----------+
|   summary|
+----------+
|    Skype.|
|Roguelikes|
|Childbirth|
|       No.|
|        no|
+----------+
only showing top 5 rows



In [174]:
# Show a few untruncated summaries whose summary_len is exactly 4.
subset.where(subset.summary_len == 4).select('summary').show(5, truncate=False)

+---------------------------+
|summary                    |
+---------------------------+
|PoV =/= Major Character.   |
|OP did say enough.         |
|coyotes are too forgiving. |
|Max is getting scapegoated.|
|Chased by crazy hoodlums   |
+---------------------------+
only showing top 5 rows



Length of 4 is getting better, but still too short

In [176]:
# Show untruncated summaries whose summary_len is exactly 5 (default row limit).
subset.where(subset.summary_len == 5).select('summary').show(truncate=False)

+--------------------------------------------+
|summary                                     |
+--------------------------------------------+
|I like Wellington a lot                     |
|it's a generally sketchy place.             |
|Knowing where ones towel is                 |
|The end justify the means.                  |
|I'm good at knowing time.                   |
|don't sweat the small stuff.                |
|Chicks dig scars. And beards.               |
|Doctors are fucking stupid sometimes.       |
|I can't last hit QQ                         |
|Defenseless man beat completely senseless.  |
|You are what you eat.                       |
|Stang289GT is credit to team.               |
|Harmful? Maybe.  Beneficial? Definitely not.|
|fuck it everyone is right                   |
|Try getting a scout/assassin follower.      |
|you're not your fucking khakis              |
|Ferraris uses fuel, not gasoline            |
|The "right" reason is subjective.           |
|I paid $20 f

A length of 5 words is a good minimum cutoff for cleaning, especially since summaries shorter than that make up only about 5% of our data.

### Examining summaries with too many words

In [187]:
# How many summaries have the maximum length (377) reported by describe() above.
subset.where(subset.summary_len == 377).select('summary').count()

1

In [188]:
# Fetch the first summary with summary_len 377 to read it in full.
subset.where(subset.summary_len == 377).select('summary').first()

Row(summary="In general  any tank which may feel high expected for me  will feel normal for an average player  and low for a bad player.   There are a few exceptions      Foch 155   The only reason this is still high is because so many of the games played over the existence of this tank were before the great nerfing     VK 72.01K   This is higher than the E 100 in spite of being blatantly worse.     STB 1   Higher than just about any med  sure the STB is pretty great  but I wouldn't rate it as the best med.     215b   One of the best tanks in the game  has lower expected than the E 100  this is in a large part due to how the E 100 is idiot proof  and thus friendier on noobs  while the 215b is not.     FV4202   Why isn't this the lowest in tier  instead it's almost the same as the 215b  which is about as good as the 4202 is bad.     E5 expected is still way too low  and it will go up  even if the tank is nerfed.     113 is higher than the maus. Maus is pretty decent  113 is certainly no