# Interactive Analysis
## Imports & Constants

In [1]:
import numpy as np
import pandas as pd
from pandas.core import datetools
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
from scipy import stats
from textstat.textstat import textstat
import pprint, math, datetime, pandas.tseries

from tools.utilities import compose, identity
from tools.grouping import question_type, has_negation, concrete_score, concrete_score_avg, is_viral

%matplotlib inline
%run -i tools/datajson.py
# Ensures higher quality graphs!
%config InlineBackend.figure_format = 'retina'

  This is separate from the ipykernel package so we can avoid doing imports until


In [2]:
# Date-stamped base names of the input files
NAMES = [
    '2017-10-02', '2017-10-06', '2017-10-16',
    '2017-10-25', '2017-11-01', '2017-11-13',
    '2017-11-18', '2017-11-25', '2017-12-06',
    '2017-12-16', '2017-12-25', '2018-01-04',
    '2018-01-17', '2018-01-24', '2018-02-01',
]
# Relative path of each input file, derived from its base name
CURRENTS = list(map(lambda stem: '../private/output/' + stem + '.txt', NAMES))

# Label lists used when grouping posts
SUBJECTS = [
    'Mathematics', 'Biology', 'Economics', 'Culture', 'Chemistry',
    'Physics', 'Engineering', 'Technology', 'Repost', 'Other',
]
Q_TYPES = ['How', 'Why', 'What', 'When']
NEGATION = ['Positive', 'Negative']

##  Load
The concreteness of a question, as computed by the function `concrete_score`, is the sum of the concreteness scores of each of its words, where a score is available in the database. The `concrete_score_avg` function controls for the length of the question by averaging over the number of words that were counted.

In [3]:
# Parse every input file into a single collection of posts
collected_posts = RedditDataJSON.from_filenames(CURRENTS)

# Title -> integer score lookup (a repeated title keeps the score seen last)
raw_scores = dict(
    (post['title'], int(post['score']))
    for post in collected_posts.posts
)
# Integer score of every post, in collection order
score_distribution = list(
    map(lambda post: int(post['score']), collected_posts.posts)
)

# Display the size of the collection as the cell output
collected_posts.size

JSONDecodeError: Expecting ',' delimiter: line 8937 column 2 (char 468020)

## Simple Graphs
A series of examples which highlight the graphs that can be produced using the `RedditDataJSON` class.

In [4]:
# Score distributions
stdev = np.std(score_distribution)
mean = np.mean(score_distribution)
print(stats.describe(score_distribution))

# Histogram of the post scores, labelled so the figure can be read on its own
plt.hist(score_distribution)
plt.title('Distribution of post scores')
plt.xlabel('Score')
plt.ylabel('Number of posts')
plt.show()

# Threshold: a post counts as viral when its score exceeds the mean score
# by more than half a standard deviation.
VIRAL_THRESHOLD = mean + 0.5 * stdev

NameError: name 'score_distribution' is not defined

In [5]:
# Simple scatter of the 'title' and 'score' fields; presumably the syllable
# counter is applied to the title and compose(math.log, int) to the score
# (confirm against RedditDataJSON.plot_post_scatter).
out = collected_posts.plot_post_scatter(
    'title', 'score', textstat.syllable_count, compose(math.log, int))
pprint.pprint(out)

# Compare integer scores across question types
out = collected_posts.compare_groups(
    Q_TYPES, question_type, 'title', 'score', int)
pprint.pprint(out)

# Compare integer scores between the negation groups
out = collected_posts.compare_groups(
    NEGATION, has_negation, 'title', 'score', int)
pprint.pprint(out)

# Counts per category of the 'link_flair_text' field
out = collected_posts.categorical_counts('link_flair_text', identity)
pprint.pprint(out)

# Percentile comparison (0.2) on integer score, using title concreteness
out = collected_posts.post_perc_groups(
    0.2, 'score', int, 'title', concrete_score)
pprint.pprint(out)

# Same percentile comparison, using title syllable count
out = collected_posts.post_perc_groups(
    0.2, 'score', int, 'title', textstat.syllable_count)
pprint.pprint(out)

# Negation comparison on a binary outcome: is_viral() of the integer score
# against VIRAL_THRESHOLD
out = collected_posts.compare_groups(
    NEGATION, has_negation, 'title', 'score',
    lambda raw: is_viral(int(raw), VIRAL_THRESHOLD))
pprint.pprint(out)

NameError: name 'collected_posts' is not defined

## Randomness Testing 

In [6]:
# Utilities

def unix_to_week_time(time, tz=None):
    """Return the number of whole seconds elapsed since the start of the week.

    The week starts on Monday at 00:00:00 (``weekday() == 0``).

    Args:
        time: Unix timestamp in seconds (int or float). Fractional seconds
            are dropped.
        tz: Optional ``datetime.tzinfo`` used to interpret the timestamp.
            ``None`` (the default) keeps the previous behaviour of using the
            machine's local time zone; pass ``datetime.timezone.utc`` to get
            a result that does not depend on where the notebook is run.

    Returns:
        An int in the range ``[0, 7 * 86400)``.
    """
    moment = datetime.datetime.fromtimestamp(time, tz)
    return (moment.weekday() * 86400
            + moment.hour * 3600
            + moment.minute * 60
            + moment.second)

def unix_to_day_time(time, tz=None):
    """Return the number of whole seconds elapsed since midnight.

    Args:
        time: Unix timestamp in seconds (int or float). Fractional seconds
            are dropped.
        tz: Optional ``datetime.tzinfo`` used to interpret the timestamp.
            ``None`` (the default) keeps the previous behaviour of using the
            machine's local time zone; pass ``datetime.timezone.utc`` to get
            a result that does not depend on where the notebook is run.

    Returns:
        An int in the range ``[0, 86400)``.
    """
    moment = datetime.datetime.fromtimestamp(time, tz)
    return moment.hour * 3600 + moment.minute * 60 + moment.second


def unix_to_hour_time(time, tz=None):
    """Return the number of whole seconds elapsed since the top of the hour.

    Args:
        time: Unix timestamp in seconds (int or float). Fractional seconds
            are dropped.
        tz: Optional ``datetime.tzinfo`` used to interpret the timestamp.
            ``None`` (the default) keeps the previous behaviour of using the
            machine's local time zone. The choice matters for zones whose
            UTC offset is not a whole number of hours.

    Returns:
        An int in the range ``[0, 3600)``.
    """
    moment = datetime.datetime.fromtimestamp(time, tz)
    return moment.minute * 60 + moment.second

In [7]:
# Count the posts above the viral cut-off without building a throwaway list
number = sum(1 for score in score_distribution if score > VIRAL_THRESHOLD)
print(number, 'posts classified as viral with score greater than', VIRAL_THRESHOLD)

# Creation timestamp and 0/1 viral indicator for every post, in the same order
times = [float(post['created_utc']) for post in collected_posts.posts]
viral = [int(int(post['score']) > VIRAL_THRESHOLD) for post in collected_posts.posts]


# Scatter of the viral indicator against posting time within the week.
# NOTE: no logistic regression is fitted here; this only plots the data
# such a model would use.
plt.scatter(list(map(unix_to_week_time, times)), viral)
# Alternative x-axes (update the x label below if one of these is used):
# plt.scatter(list(map(unix_to_hour_time, times)), viral)
# plt.scatter(list(map(unix_to_day_time, times)), viral)
plt.title('Viral posts by time of week')
plt.xlabel('Seconds since start of week (Monday 00:00)')
plt.ylabel('Viral (1) / not viral (0)')
plt.show()

NameError: name 'score_distribution' is not defined

# Export Template

In [8]:
EXPORT_LOC = '../private/output/export.csv'

# (title, score) pairs ranked from the highest score to the lowest
export_pairs = sorted(raw_scores.items(), key=lambda pair: pair[1], reverse=True)

# Name the columns at construction so the labels always match the data.
# The previous version reordered the positional columns to (score, title)
# and then labelled them ['title', 'score'], which swapped the CSV headers.
export_data = pd.DataFrame(export_pairs, columns=['title', 'score'])

# Keep the exported column order (score first, then title), now with
# truthful headers.
shuffled = export_data[['score', 'title']]
shuffled.to_csv(EXPORT_LOC)

NameError: name 'raw_scores' is not defined