# Hacker News Data Processing

In [2]:
#!pip install textblob
from textblob import TextBlob
import bokeh
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import html 
import dask
import re
import dask.dataframe as dd
from tqdm import tqdm_pandas
from tqdm import tqdm_notebook as tqdm
import json

def save_df(df):
    """Write ``df`` to the notebook's Feather checkpoint file and report it."""
    checkpoint_path = 'data/df_save.feather'
    df.to_feather(checkpoint_path)
    print('Dataframe Saved')
    
def load_df():
    """Load the dataframe checkpoint stored at ``data/df_save.feather``.

    Returns:
        The dataframe read from the Feather file.  The earlier version read
        the file but discarded the result, so callers always got ``None``.
    """
    df = pd.read_feather('data/df_save.feather')
    print('Dataframe Loaded')
    return df
    
tqdm_pandas(tqdm())

#!pip install line_profiler
# Load the profiler into your Jupyter notebook
%load_ext line_profiler

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




## This time I ran the query in Google BigQuery and copied the files to Google Cloud Storage

In [None]:
# dask kept giving me errors, so every shard is read with plain pandas.
shard_frames = []
for shard_no in range(10):
    shard_path = 'data/hn_main_query/hacker_news_full_comments%d.csv' % shard_no
    shard_frames.append(pd.read_csv(shard_path, engine='python'))
    if shard_no < 9:
        # Progress marker after every shard except the last one.
        print("dataframe%d" % (shard_no + 1))
d0, d1, d2, d3, d4, d5, d6, d7, d8, d9 = shard_frames

In [None]:
df = pd.concat([d0, d1, d2, d3, d4, d5, d6, d7, d8, d9])

### Inspect the shape of the combined pandas CSV import and verify that all rows are present (15,825,859).

In [None]:
df.shape

In [None]:
df.head(10)

### Save query results to CSV.

In [None]:
%%time
df.to_csv('data/hn_commentors_all_new.csv')

### Read CSV back into new Dataframe.

In [None]:
%%time
ds2 = pd.read_csv('data/hn_commentors_all_new.csv')

### Inspect and verify that all rows are present.

In [None]:
print(ds2.shape)
display(ds2.head(3))

### Remove all rows with a NaN `commentor` or `text` from the Dataframe.

In [None]:
nans = ds2.text.isna().sum()
print('This many nans:', nans)
ds2 = ds2.dropna(subset=['commentor', 'text'])
print('New Shape after nan removal:', ds2.shape)

In [None]:
nans = ds2.parent_type.isna().sum()
print('This many parent_type nans:', nans)
nans = ds2.story_title.isna().sum()
print('This many story_title nans:', nans)

### Remove the `Unnamed: 0` column and fill in empty titles for replies to other comments.

In [None]:
%%time
ds2 = ds2.loc[:, ~ds2.columns.str.match('Unnamed')]
ds2['story_title'] = ds2.story_title.fillna('Another Comment')

In [None]:
nans = ds2.story_title.isna().sum()
print('This many story_title nans:', nans)
display(ds2.head(3))

## Sentiment Analysis and Text Cleaning

### Define utility functions

In [None]:
def encode_decode(text):
    """
    Utility function to clean text by turning HTML character references
    (e.g. ``&amp;`` or ``&#x27;``) back into the characters they stand for.
    """
    return html.unescape(text)

def noHTML(text):
    """
    Utility function to clean text by swapping every HTML tag
    (anything between ``<`` and the next ``>`` on the same line)
    for a single space.
    """
    return re.sub('<.*?>', ' ', text)

def noURLS(text):
    """
    Utility function to clean text by deleting links: every run of
    non-whitespace characters that starts with ``http`` is removed.
    """
    link_pattern = re.compile(r"http\S+")
    return link_pattern.sub("", text)


def get_sentiment(text):
    """
    Utility function to score the sentiment of the passed text
    using textblob's sentiment method. Returns a
    ``(polarity, subjectivity)`` tuple.

    The polarity score is a float within the range [-1.0, 1.0]
    where a negative value indicates negative text
    and a positive value indicates positive text.

    The subjectivity is a float within the range [0.0, 1.0]
    where 0.0 is very objective and 1.0 is very subjective.
    """
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity, sentiment.subjectivity

### Apply text cleaning to comment texts and create new column in Dataframe

In [None]:
tqdm_pandas(tqdm())

In [None]:
ds2['cleaned_comment'] = ds2.text.progress_apply(lambda x: noURLS(noHTML(encode_decode(x))))
ds2['cleaned_title'] = ds2.story_title.progress_apply(lambda x: noURLS(noHTML(encode_decode(x))))

### Apply sentiment analysis (TextBlob polarity and subjectivity) to each cleaned Comment text.

In [None]:
ds2['comment_sentiment'] = ds2.cleaned_comment.progress_apply(get_sentiment)

In [None]:
ds3 = ds2.loc[:, ~ds2.columns.str.match('Unnamed')]
ds3.head()

### Save to CSV 

In [None]:
ds3.to_csv('data/hn_all_w_sentiment_cleaned.csv',index=False)

In [None]:
print(ds3.shape)
#display(ds3.head(1))

In [None]:
dsx = ds3.drop(columns=['text', 'story_title'])

In [None]:
sdf = pd.DataFrame(dsx['comment_sentiment'].tolist(), index=dsx.index)
sdf.columns = ['polarity', 'subjectivity']
sdf.head()

In [None]:
dsx["comment_polarity"] = sdf.polarity
dsx["comment_subjectivity"] = sdf.subjectivity
dsx = dsx.drop(columns=['comment_sentiment'])
dsx.head(1)

In [None]:
%%time
dsx.to_csv('data/hn_all_w_sentiment_cleaned_inplace.csv',index=False)

## Load cleaned / analyzed data back into dataframe from CSV - X

In [3]:
%%time
# IMPORT FROM CSV's
ds4 = pd.read_csv('data/hn_all_w_sentiment_cleaned_inplace.csv')
print(ds4.shape)

(15397309, 19)
CPU times: user 1min 41s, sys: 12.2 s, total: 1min 53s
Wall time: 1min 42s


In [4]:
ds4 = ds4.loc[:, ~ds4.columns.str.match('Unnamed')]

In [5]:
nans = ds4.ranking.isna().sum()
print('This many nans:', nans)

This many nans: 15397309


### Oops, looks like ranking wasn't actually on that BigQuery table even though the field is there. I'll need to pull it in and merge it here by commentid from the comments table. 

After a bit of investigation I found that the table `bigquery-public-data.hacker_news.full_201510` does contain comment ranking type entries, but the `bigquery-public-data.hacker_news.full` (the one that is continuously updated) does not. 

For the sake of having data to do deeper analysis I'm going to add in the comment_ranking data as a column eventually, but not calculate any summary stats off it for the API. 

In [6]:
comment_ranking_df = pd.read_csv("data/hn_comment_ranking_query.csv")
comment_ranking_df = comment_ranking_df[['id','ranking']].copy()
# The former bare `comment_ranking_df.set_index('id')` call was dropped: its
# result was never assigned, so it only built and threw away a copy.  `id`
# must stay a regular column because the later merge joins on right_on='id'.
comment_ranking_df.head(3)

Unnamed: 0,id,ranking
0,9997338,0
1,9997580,0
2,9998036,0


### Add in the missing ranking data

In [96]:
%%time
ds5 = ds4.drop(columns=['ranking'])
ds5 = ds5.merge(comment_ranking_df, how='left', left_on='commentid', right_on='id')

CPU times: user 54.7 s, sys: 9.66 s, total: 1min 4s
Wall time: 33.7 s


In [97]:
nans = ds5.ranking.isna().sum()
print('This many nans:', nans)
print(ds5.columns)

This many nans: 7226283
Index(['commentor', 'comment_time', 'commentid', 'parentid', 'comment_deleted',
       'comment_dead', 'author', 'score', 'story_time', 'parent_type',
       'parents_parent', 'parent_deleted', 'parent_dead', 'num_children',
       'cleaned_comment', 'cleaned_title', 'comment_polarity',
       'comment_subjectivity', 'id', 'ranking'],
      dtype='object')


## Aggregate commentors' sentiment statistics and make final dataframe.

#### Rename columns so API (JSON) is easier to read. 

In [98]:
%%time
ds5 = ds5.rename(columns={'author': 'parent_author', 'cleaned_title': 'parent_title','score': 'parent_score', 'story_time': 'parent_time', 'ranking': 'comment_rank','commentid':'comment_id','parentid':'parent_id'})
ds5 = ds5.drop(columns=['id'])

CPU times: user 37.9 s, sys: 13.8 s, total: 51.6 s
Wall time: 22.5 s


In [99]:
ds5.columns

Index(['commentor', 'comment_time', 'comment_id', 'parent_id',
       'comment_deleted', 'comment_dead', 'parent_author', 'parent_score',
       'parent_time', 'parent_type', 'parents_parent', 'parent_deleted',
       'parent_dead', 'num_children', 'cleaned_comment', 'parent_title',
       'comment_polarity', 'comment_subjectivity', 'comment_rank'],
      dtype='object')

### Rescale comment subjectivity to the range `-1` (fully subjective) to `1` (fully objective). Create booleans for the +/- classes.

In [100]:
%%time
def SentimentHelpers(df):
    """
    Rescale subjectivity and add boolean sentiment flags to ``df`` in place.

    ``comment_subjectivity`` is expected on TextBlob's scale (0.0 = very
    objective, 1.0 = very subjective) and is mapped to ``1 - 2 * s`` so that
    -1.0 is fully subjective and 1.0 is fully objective.  The previous
    expression evaluated to ``2 * s - 1``, which flipped that axis and made
    ``is_subjective`` flag the objective comments.

    Columns written:
        comment_subjectivity: rescaled value.
        is_subjective: True where the rescaled subjectivity is below 0.
        is_negative: True where ``comment_polarity`` is below 0.

    Nothing is returned.  The rescale is not idempotent: calling this twice
    on the same frame rescales the column twice.
    """
    df['comment_subjectivity'] = df['comment_subjectivity'].multiply(-2).add(1)
    # Vectorised comparisons replace the per-row lambdas (each took ~49% of
    # the profiled run time); NaN compares as False in both forms.
    df['is_subjective'] = df['comment_subjectivity'] < 0
    df['is_negative'] = df['comment_polarity'] < 0
    print("Sentiment helpers created...")


%lprun -f SentimentHelpers z = SentimentHelpers(ds5)


"""
Total time: 13.7587 s
File: <ipython-input-29-425f0a3a30a2>
Function: SentimentHelpers at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
     1                                           def SentimentHelpers(df):
     2         1     183019.0 183019.0      1.3      df['comment_subjectivity'] = df['comment_subjectivity'].multiply(-2).add(1)
     3         1    6802012.0 6802012.0     49.4      df['is_subjective'] = df['comment_subjectivity'].map(lambda x: True if (x < 0) else False)
     4         1    6773480.0 6773480.0     49.2      df['is_negative'] = df['comment_polarity'].map(lambda x: True if (x < 0) else False)
     5         1        159.0    159.0      0.0      print ("Sentiment helpers created...")
""";


Sentiment helpers created...
CPU times: user 24.5 s, sys: 1.93 s, total: 26.4 s
Wall time: 13.6 s


In [101]:
display(ds5.shape)
display(ds5.iloc[0:2, -4:])

(15397309, 21)

Unnamed: 0,comment_subjectivity,comment_rank,is_subjective,is_negative
0,-1.0,,True,False
1,0.279167,1.0,False,False


### Create quadrant column for categorical class for use in Groupby function.

In [102]:
%%time

def DetermineQuadrant(df):
    """
    Accepts DF with ``comment_polarity`` and ``comment_subjectivity`` columns.
    Returns a new DF with an added ``quadrant`` column.

    The label is ``<polarity>_<basis>``: polarity is ``neg`` when
    ``comment_polarity < 0`` and ``pos`` otherwise; basis is ``sub`` when
    ``comment_subjectivity < 0`` and ``obj`` otherwise (NaN therefore falls
    into ``pos`` / ``obj``).

    The labels are computed with vectorised ``np.where`` calls, and the
    caller's frame is no longer modified: the earlier version wrote temporary
    ``polarity`` and ``basis`` columns into the input and only dropped them
    from the returned copy.
    """
    is_neg = (df['comment_polarity'] < 0).to_numpy()
    is_sub = (df['comment_subjectivity'] < 0).to_numpy()
    quadrant = np.where(
        is_neg,
        np.where(is_sub, 'neg_sub', 'neg_obj'),
        np.where(is_sub, 'pos_sub', 'pos_obj'),
    )
    return df.assign(quadrant=quadrant)

%lprun -f DetermineQuadrant ds5 = DetermineQuadrant(ds5)

"""
Total time: 58.586 s
File: <ipython-input-70-74685f1c6fc3>
Function: DetermineQuadrant at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
     1                                           def DetermineQuadrant(df):
     6         1    7027143.0 7027143.0     12.0      df['polarity'] = df['comment_polarity'].map(lambda x: 'neg' if (x < 0) else 'pos')
     7         1    6990340.0 6990340.0     11.9      df['basis'] = df['comment_subjectivity'].map(lambda x: 'sub' if (x < 0) else 'obj')
     8         1   27863275.0 27863275.0     47.6      df = df.assign(quadrant=[str(x) + '_' + str(y) for x, y in zip(df['polarity'], df['basis'])])
     9         1   16705239.0 16705239.0     28.5      df = df.drop(columns=['polarity','basis'])
    10         1         18.0     18.0      0.0      return df
"""

# This was helpful: https://stackoverflow.com/questions/11858472/string-concatenation-of-two-pandas-columns

CPU times: user 1min 3s, sys: 10.7 s, total: 1min 14s
Wall time: 58.7 s


In [103]:
display(ds5.shape)
display(ds5.iloc[0:2, -4:])

(15397309, 22)

Unnamed: 0,comment_rank,is_subjective,is_negative,quadrant
0,,True,False,pos_sub
1,1.0,False,False,pos_obj


### Send every row of these columns into a Json string. 

In [104]:
%%time

def createCommentJSONrecords(df):
    """
    Serialises the selected dataframe columns to a JSON string of row records,
    parses that string back into a list holding one dict per row, and stores
    the list in a new ``comment_JSON`` column of the passed dataframe.
    """
    record_columns = [
        'commentor', 'comment_time', 'comment_polarity', 'comment_subjectivity',
        'is_subjective', 'is_negative', 'quadrant', 'parent_type',
        'parent_author', 'parent_title', 'cleaned_comment', 'comment_rank',
        'comment_id', 'parent_id',
    ]
    records_json = df[record_columns].to_json(orient='records')
    df['comment_JSON'] = json.loads(records_json)
    print( "JSON Uploaded")

%lprun -f createCommentJSONrecords z = createCommentJSONrecords(ds5)

"""
CREATING JSON RECORDS
Successful Run on df.shape = (15397309, 23)
Total time: 134.944 s
File: <ipython-input-23-68de41c26213>
Function: createCommentJSONrecords at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
     1                                           def createCommentJSONrecords(df):                                              
     7         1          3.0      3.0      0.0      saved = (df[['commentor','comment_time','comment_polarity','comment_subjectivity', 
     8         1          1.0      1.0      0.0                   'is_subjective', 'is_negative','parent_type', 'parent_author','parent_title',
     9         1   51285889.0 51285889.0    38.0                  'cleaned_comment','ranking','commentid','parentid']].to_json(orient='records'))
    10         1   82673003.0 82673003.0    61.3     decoded = json.JSONDecoder().decode(saved)
    11         1     985481.0 985481.0      0.7      df['comment_JSON'] = decoded
    12         1          2.0      2.0      0.0      return "JSON Uploaded"

""";


JSON Uploaded
CPU times: user 2min 6s, sys: 19.1 s, total: 2min 25s
Wall time: 2min 25s


In [105]:
display(ds5.shape)
display(ds5.iloc[0:2, -4:])

(15397309, 23)

Unnamed: 0,is_subjective,is_negative,quadrant,comment_JSON
0,True,False,pos_sub,"{'commentor': 'Twisell', 'comment_time': 15489..."
1,False,False,pos_obj,"{'commentor': 'camus2', 'comment_time': 139645..."


### I have 4 variables I want for plotting in JS:
* 1. Polarity - Between -1 and 1. 
* 2. Subjectivity - Between -1 and 1. 
* 3. Time - UNIX time for the comment.
* 4. Comment ID. 

I'll filter a DataFrame then use pandas.DataFrame.to_numpy()
That will turn them into a numpy array. Like this:
`array([[1. , 3. ], [2. , 4.5]])`
Then I'll use `numpy.ndarray.tolist()` to change the array to a list of lists. 
Last, I'll create a new column and place my np array in it.
That will make it so I can group them all later. 


In [115]:
%%time

def createPolaritySubjectivtyPairs(df):
    """
    Pairs Polarity / Subjectivity points for plotting by row.
    Pairs UNIX Epoch time / comment_id points for plotting by row.
    Combines both pairs into one list for each row.

    Returns a new dataframe with an added ``polr_subj_time_id`` column whose
    cells look like ``[[polarity, subjectivity], [comment_time, comment_id]]``.

    The intermediate pairs are kept in local lists.  The earlier version
    stored them as ``polr_subj`` / ``time_id`` columns on the caller's frame
    and only dropped them from the returned copy, so the input was left with
    three extra columns.
    """
    polr_subj = df[['comment_polarity','comment_subjectivity']].to_numpy().tolist()
    time_id = df[['comment_time','comment_id']].to_numpy().tolist()
    combined = [[ps_pair, ti_pair] for ps_pair, ti_pair in zip(polr_subj, time_id)]
    result = df.assign(polr_subj_time_id=combined)
    print("Plot Pairs Created")
    return result

%lprun -f createPolaritySubjectivtyPairs ds5 = createPolaritySubjectivtyPairs(ds5)


"""
Total time: 78.0186 s
File: <ipython-input-101-5f45d885fea7>
Function: createPolaritySubjectivtyPairs at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
     1                                           def createPolaritySubjectivtyPairs(df):
     7         1   12855352.0 12855352.0     16.5      df["polr_subj"] = df[['comment_polarity','comment_subjectivity']].to_numpy().tolist()
     8         1   17617648.0 17617648.0     22.6      df["time_id"] = df[['comment_time','commentid']].to_numpy().tolist()
     9         1   29883020.0 29883020.0     38.3      df['polr_subj_time_id'] = df[['polr_subj','time_id']].to_numpy(dtype='object').tolist()
    10         1   17662446.0 17662446.0     22.6      df = df.drop(columns = ["polr_subj", "time_id"])
    11         1        135.0    135.0      0.0      print("Plot Pairs Created")
""";

Plot Pairs Created
CPU times: user 1min 12s, sys: 10.6 s, total: 1min 23s
Wall time: 1min 22s


In [114]:
display(ds5.shape)
display(ds5.iloc[0:2, -4:])

(15397309, 23)

Unnamed: 0,is_subjective,is_negative,quadrant,comment_JSON
0,True,False,pos_sub,"{'commentor': 'Twisell', 'comment_time': 15489..."
1,False,False,pos_obj,"{'commentor': 'camus2', 'comment_time': 139645..."


# TO DO: 

* https://chrisalbon.com/python/data_wrangling/pandas_apply_operations_to_groups/
* https://stackoverflow.com/questions/22219004/grouping-rows-in-list-in-pandas-groupby

In [None]:
#----- Commentors Summary Table ----#
# Stats for Commenting
#-- 
# Stats for Polarity
# Stats for Subjectivity

# Stats for Polarity (Grouped By Is_Polar?)
# Stats for Subjectivity (Grouped By Is_Subjective?)
# Stats for Polarity (Grouped by Quadrant)

# List of top 10 Saltiest Comments
# List of top 10 Happy Comments
# Rank????? :) 

#----- Separate Table ------ #
# Top 1000 saltiest comments.
# Top 1000 best comments.
# Top 1000 Salty by Rank
# Top 1000 Best by Rank


# Mostest 


### Work in Progress: 

In [None]:
polarity = ds5['comment_polarity'].groupby(ds5['commentor']).mean()

In [None]:
df['comment_polarity'].groupby([df['commentor'], df['is_positive']]).describe().unstack()

In [None]:
# NOTE(review): `ds5_test` is not defined in the visible notebook; presumably a
# sample of `ds5` - confirm before running.
the_groups = ds5_test.groupby('commentor').groups

# Fixed: the mapping is bound to `the_groups`; `groups` was never defined.
pd.DataFrame(the_groups.items())

In [None]:
test = ds5.groupby('commentor').groups

In [None]:
# To group them 
def listEachPolaritySubjectivityPairs(df):
    """
    Group a two-column frame by its ``a`` column.

    The frame is sorted by ``a``, then the second column is cut at the first
    position of every distinct key.  Returns a frame with the unique keys in
    ``a`` and the matching list of second-column values in ``b``.
    """
    sorted_pairs = df.sort_values('a').to_numpy()
    group_keys, group_values = sorted_pairs.T
    unique_keys, first_positions = np.unique(group_keys, return_index=True)
    chunks = np.split(group_values, first_positions[1:])
    grouped_values = [list(chunk) for chunk in chunks]
    return pd.DataFrame({'a': unique_keys, 'b': grouped_values})


In [None]:
def get_polarity_stats(df):
    """
    Summarise ``comment_polarity`` per commentor.

    Returns one row per commentor: a ``commentor`` column followed by the
    ``describe()`` statistics, with the quartiles renamed to Q1 / Median / Q3
    and every statistic prefixed with ``p_all_``.

    Fixes relative to the earlier draft:
      * ``Series.groupby(..., as_index=False)`` raises TypeError, so the
        grouping is done on the frame and the index is reset at the end.
      * the quartile labels are columns of the describe() table, so they are
        renamed with ``columns=`` (``axis='index'`` renamed nothing).
      * the prefix is ``p_all_`` (polarity); ``b_all_`` is the prefix the
        notebook uses for the subjectivity ("basis") statistics.
    """
    commentor_table = (
        df.groupby('commentor')['comment_polarity']
        .describe()
        .rename(columns={'25%': 'Q1', '50%': 'Median', '75%': 'Q3'})
        .add_prefix('p_all_')
        .reset_index()
    )
    return commentor_table

%lprun -f get_polarity_stats z = get_polarity_stats(ds5_test) 

#### Learning to sort the hard way

In [None]:
%time
# Group by commentor to split into multiple tables
df_smallA.groupby("commentor").comment_polarity.describe()

In [None]:
def commentGetter(subdf): ### NOPE
    """
    Sort the group by polarity then subjectivity and return the sorted copy
    with two extra columns: the JSON records of the first ten and of the last
    ten ``commentid`` values (the same string repeated on every row).
    """
    ordered = subdf.sort_values(by=['comment_polarity', 'comment_subjectivity'])
    id_frame = ordered[["commentid"]]
    ordered['salty_comments_ids'] = id_frame[0:10].to_json(orient='records')
    ordered['sweet_comments_ids'] = id_frame.tail(10).to_json(orient='records')
    return ordered

def simpleSort(subdf): ### WINNNER
    """
    Return ``subdf`` sorted by ascending ``comment_polarity``.

    ``subdf`` is now an explicit parameter: the earlier signature took no
    arguments while the body read ``subdf``, so every call failed.
    """
    return subdf.sort_values(by=['comment_polarity'])

def noSortGetter(subdf): ### ALMOST, STILL SLOW.
    """
    Add, in place, the JSON records of the first ten and of the last ten
    ``commentid`` values of ``subdf`` (rows are taken in their current order)
    and return the same frame.
    """
    id_frame = subdf[["commentid"]]
    leading_json = id_frame[0:10].to_json(orient='records')
    trailing_json = id_frame.tail(10).to_json(orient='records')
    subdf['salty_comments_ids'] = leading_json
    subdf['sweet_comments_ids'] = trailing_json
    return subdf


In [None]:
# Group (all) and Grab by commentor w/o Sort

def noSortGetter(subdf): ### ALMOST, STILL SLOW. ~117 it/s
    """
    Write two columns onto ``subdf`` in place and return it: the JSON records
    of its first ten ``commentid`` rows and of its last ten, each string
    repeated on every row.  No sorting happens here.
    """
    ids_only = subdf[["commentid"]]
    json_by_column = {
        'salty_comments_ids': ids_only[0:10].to_json(orient='records'),
        'sweet_comments_ids': ids_only.tail(10).to_json(orient='records'),
    }
    for column_name, payload in json_by_column.items():
        subdf[column_name] = payload
    return subdf

%lprun -f noSortGetter z = df_smallA[['commentor','comment_polarity','comment_subjectivity','commentid']].groupby("commentor").progress_apply(noSortGetter)


### Grouping presorted data, then applying a function to take the head or tail of it.
```python
Timer unit: 1e-06 s 

Total time: 276.015 s for 100,000 rows. Not very good. 
File: <ipython-input-76-c004a304a3b0>
Function: noSortGetter at line 3

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
     3                                           def noSortGetter(subdf): ### ALMOST, STILL SLOW.
     4     35469  124971308.0   3523.4     45.3      subdf['salty_comments_ids'] = subdf[["commentid"]][0:10].to_json(orient='records')
     5     35469  150983242.0   4256.8     54.7      subdf['sweet_comments_ids'] = subdf[["commentid"]].tail(10).to_json(orient='records')
     6     35469      60223.0      1.7      0.0      return subdf
```

### Just remember though, I started with a for loop.... around ~10 it/s...

```python
# Create a list of the commentor's saltiest comments. 
outdf['salty_comments'] = subdf[['commentor','comment_time','comment_polarity',
                        'ranking','cleaned_comment','cleaned_title',
                        'comment_subjectivity']][0:9].to_json(orient='records')

Timer unit: 1e-06 s 

Total time: 92.7974 s for 1000 rows.
File: <ipython-input-89-b65c8bda2927>
Function: loopSentimentAggegator at line 3

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
     3                                           def loopSentimentAggegator(i):
     4                                               # Select `subdf` for the selected commentor and sort
     5      1000    1451170.0   1451.2      1.6      subdf = df_small_loop[df_small_loop['commentor'].values == i]
     6      1000    3935143.0   3935.1      4.2      subdf = subdf.sort_values(by=['comment_polarity', 'comment_subjectivity'])
     7                                           
     8                                               # Initialize processing df `outdf`
     9      1000       2812.0      2.8      0.0      commentor = i
    10      1000    1259788.0   1259.8      1.4      outdf = pd.DataFrame([{ 'commentor': commentor }])
    11                                               
    12                                               # Comments: qty (int), first (unix time), last (unix time)
    13      1000    1124149.0   1124.1      1.2      outdf["comments_qty"] = len(subdf.index)
    14      1000    1521732.0   1521.7      1.6      outdf["comments_first"] = subdf.comment_time.max()
    15      1000    1395105.0   1395.1      1.5      outdf["comments_last"] = subdf.comment_time.min()

```




### The key is to Sort the data, then run the groupby operation.
#### Lesson Learned here: Never try to sort in a loop if you can avoid it.

In [None]:
def simpleSort(subdf):
    """Return a copy of ``subdf`` ordered by polarity, then subjectivity (ascending)."""
    sort_keys = ['comment_polarity','comment_subjectivity']
    return subdf.sort_values(by=sort_keys)

%lprun -f simpleSort z = simpleSort(ds5)

"""
Runs beautifully! 
Timer unit: 1e-06 s

Total time: 46.4984 s
File: <ipython-input-114-b21908e70c09>
Function: simpleSort at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
     1                                           def simpleSort(subdf):
     2         1   46498373.0 46498373.0    100.0      subdf = subdf.sort_values(by=['comment_polarity','comment_subjectivity'])
     3         1          2.0      2.0      0.0      return subdf

"""

First run
```python
%lprun output - simple sort 500,000 rows. 
Timer unit: 1e-06 s

Total time: 0.031263 s
File: <ipython-input-64-3e739ef6d38a>
Function: simpleSort at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
     1                                           def simpleSort(subdf):
     2         1      31261.0  31261.0    100.0      subdf = subdf.sort_values(by=['comment_polarity','comment_subjectivity'])
     3         1          2.0      2.0      0.0      return subdf
     
```

Second run
```python
Second run lprun output - simple sort. 1,000,000 rows.
Timer unit: 1e-06 s

Total time: 0.469898 s
File: <ipython-input-69-3e739ef6d38a>
Function: simpleSort at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
     1                                           def simpleSort(subdf):
     2         1     469896.0 469896.0    100.0      subdf = subdf.sort_values(by=['comment_polarity','comment_subjectivity'])
     3         1          2.0      2.0      0.0      return subdf

```

Final sorting run was last.
``` python
Final Run - Less than 1 second. 15,397,309 rows
Timer unit: 1e-06 s

Total time: 0.470333 s
File: <ipython-input-73-3e739ef6d38a>
Function: simpleSort at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
     1                                           def simpleSort(subdf):
     2         1     470331.0 470331.0    100.0      subdf = subdf.sort_values(by=['comment_polarity','comment_subjectivity'])
     3         1          2.0      2.0      0.0      return subdf
```

In [None]:
grouped_df = df.groupby(df.L)
grouped_df.groups.series

In [None]:
pd.Series({x : y.b.tolist() for x , y in df.groupby('a')})

In [None]:
# `shape` is a tuple attribute, not a method; calling it raised TypeError.
df_smallA.shape

In [None]:
def grabTen(subdf):
    """
    Take the first ten ``commentid`` values of every commentor, serialise the
    whole selection to one JSON string and store that string in a
    ``comment_ids`` column of ``subdf`` (in place).  Returns ``subdf``.
    """
    first_ids = subdf.groupby('commentor')['commentid'].head(10)
    subdf['comment_ids'] = first_ids.to_json()
    return subdf

%lprun -f grabTen z = grabTen(df_smallC)

In [None]:
def grabAllPoints(subdf):
    # Collect up to 10000 polarity/subjectivity points per commentor for
    # plotting; the whole selection is stored as one JSON string per row.
    point_columns = ['comment_polarity','comment_subjectivity']
    capped_points = subdf.groupby('commentor')[point_columns].head(10000)
    subdf["polsub_points"] = capped_points.to_json()
    return subdf

%lprun -f grabAllPoints z = grabAllPoints(df_smallC)


In [None]:
def grabAllPoints(subdf):
    """
    Attach every commentor's list of ``[polarity, subjectivity]`` points.

    Each row of ``subdf`` receives, in a new ``polsub_points`` column, the
    list of points of all comments written by that row's commentor.  The
    frame is modified in place and returned.

    The earlier draft did not parse (``lambda x)`` has no body) and selected
    the columns with a tuple.  NOTE(review): the intended per-group function
    was never written down; collecting the raw points is an assumption based
    on the "data points for plotting" comment - confirm.
    """
    point_columns = ['comment_polarity', 'comment_subjectivity']
    points_by_commentor = {
        commentor: group[point_columns].to_numpy().tolist()
        for commentor, group in subdf.groupby('commentor')
    }
    # Map through the commentor column so the lists line up with the rows;
    # a groupby result is indexed by commentor, not by row label.
    subdf["polsub_points"] = subdf['commentor'].map(points_by_commentor)
    return subdf

%lprun -f grabAllPoints z = grabAllPoints(df_smallC)


In [None]:
def sentiment_coord (df):
    """
    Serialise the whole frame to one JSON string and store that string in a
    ``polsub_points`` column of ``df`` (in place).  Returns ``df``.
    """
    serialized = df.to_json()
    df["polsub_points"] = serialized
    return df

def grabAllPoints(df):
    """
    Attach every commentor's list of polarity scores to each of their rows.

    Adds a ``polarity_points`` column to ``df`` in place and returns ``df``.

    The grouped lists are indexed by commentor, so they are mapped back
    through the ``commentor`` column.  Assigning the grouped series directly
    (as before) aligned on the frame's row index and left NaN wherever the
    row labels were not commentor names.
    """
    polarity_lists = df.groupby('commentor')['comment_polarity'].apply(list)
    df['polarity_points'] = df['commentor'].map(polarity_lists)
    return df

%lprun -f grabAllPoints y = grabAllPoints(df_smallC)

# Previous Code:

In [None]:
df_small_loop = ds5[0:10000]

def _describe_stats(series, prefix):
    """Return selected ``describe()`` statistics of ``series`` as a prefixed dict."""
    stats = series.describe().rename({'25%': 'Q1', '50%': 'median', '75%': 'Q3'})
    wanted = stats.loc[['mean', 'std', 'min', 'max', 'median', 'Q1', 'Q3']]
    return wanted.add_prefix(prefix).to_dict()


def _count_mean_sum(series, prefix):
    """Return count, mean and sum of ``series`` keyed ``<prefix>count`` etc."""
    return {
        prefix + 'count': series.count(),
        prefix + 'mean': series.mean(),
        prefix + 'sum': series.sum(),
    }


def loopSentimentAggegator(i):
    """
    Build the one-row summary dataframe for commentor ``i``.

    Reads the module-level ``df_small_loop`` frame, selects the rows whose
    ``commentor`` equals ``i`` and sorts them by polarity, then subjectivity.

    Output columns, in order:
      * ``commentor``, ``comments_qty``, ``comments_first`` (earliest
        ``comment_time``), ``comments_last`` (latest ``comment_time``)
      * ``p_all_*``: mean/std/min/max/median/Q1/Q3/sum of polarity,
        ``p_neg_*`` / ``p_pos_*``: count/mean/sum where polarity < 0 / >= 0
      * ``b_all_*``, ``b_neg_*``, ``b_pos_*``: the same for subjectivity
        ("basis"; negative = subjective, positive = objective)
      * ``upvotes_count`` / ``upvotes_max`` / ``upvotes_mean`` /
        ``upvotes_sum`` computed from ``comment_rank``
      * ``polsub_points``: JSON of all polarity/subjectivity points
      * ``salty_comments``: JSON records of the ten lowest-polarity comments

    Fixes relative to the earlier version:
      * the median was renamed to ``Median`` but selected as ``median``, so
        the ``.loc`` lookup could not find it; the output column keeps the
        name ``*_all_median``.
      * ``comments_first`` / ``comments_last`` were swapped (max / min).
      * the ranking column is read as ``comment_rank``, the name it has
        after the notebook's rename step; ``ranking`` no longer exists.
      * ``salty_comments`` holds ten comments; ``[0:9]`` returned nine.

    NOTE: the per-quadrant (polarity x basis) statistics that were sketched
    inside a disabled string block are not computed here; that sketch also
    reused ``q2_filter`` for quadrant 1 and would need fixing before use.
    """
    # Select `subdf` for the selected commentor and sort
    subdf = df_small_loop[df_small_loop['commentor'].values == i]
    subdf = subdf.sort_values(by=['comment_polarity', 'comment_subjectivity'])

    polarity = subdf['comment_polarity']
    basis = subdf['comment_subjectivity']
    rank = subdf['comment_rank']

    # Comments: qty (int), first (unix time), last (unix time)
    record = {
        'commentor': i,
        'comments_qty': len(subdf.index),
        'comments_first': subdf.comment_time.min(),
        'comments_last': subdf.comment_time.max(),
    }

    # POLARITY: "Emotional sentiment" - is the comment positive or negative?
    record.update(_describe_stats(polarity, 'p_all_'))
    record['p_all_sum'] = polarity.sum()
    record.update(_count_mean_sum(polarity[polarity < 0.0], 'p_neg_'))
    record.update(_count_mean_sum(polarity[polarity >= 0.0], 'p_pos_'))

    # BASIS: "Subjectivity" - is the written perspective subjective
    # (negative values) or objective (positive values)?
    record.update(_describe_stats(basis, 'b_all_'))
    record['b_all_sum'] = basis.sum()
    record.update(_count_mean_sum(basis[basis < 0.0], 'b_neg_'))
    record.update(_count_mean_sum(basis[basis >= 0.0], 'b_pos_'))

    # Upvote Ranking Metrics -  Mean & Count & Sum
    rank_stats = rank.describe().loc[['count', 'max', 'mean']]
    record.update(rank_stats.add_prefix('upvotes_').to_dict())
    record['upvotes_sum'] = rank.sum()

    # All of Commentor's Polarity / Subjectivity points for plotting.
    record['polsub_points'] = subdf[['comment_polarity', 'comment_subjectivity']].to_json()

    # The commentor's ten saltiest comments (rows are sorted ascending by
    # polarity, so the head holds the most negative ones).
    salty_columns = ['commentor', 'comment_time', 'comment_polarity',
                     'comment_rank', 'cleaned_comment', 'parent_title',
                     'comment_subjectivity']
    record['salty_comments'] = subdf[salty_columns].head(10).to_json(orient='records')

    return pd.DataFrame([record])


### Test function with single commentor to ensure output is good. 

In [None]:
%time
commentorList = ds5.commentor.unique().tolist()
print("There are this many unique commentors:", len(commentorList))
c_list = pd.DataFrame(commentorList)
c_list.columns = ['commentor']
display(c_list.head())

In [None]:
%lprun -f loopSentimentAggegator testingDF = loopSentimentAggegator('eli')
display(testingDF.head())

### Don't apply home-brewed per-commentor aggregation functions to the entire dataframe.

In [None]:
results1 = []

%lprun -f loopSentimentAggegator c_list[0:1000].commentor.progress_apply(lambda x: results1.append(loopSentimentAggegator(x)))

final_testing= pd.concat(results1)
final_testing.head(3)

In [None]:
%%timeit
d.groupby(
 'x'
 ).head(
 K
 ).reset_index(drop=True)

### Concatenate aggregation outputs (list of dfs) into a single final dataframe.

In [None]:
# Fixed: the per-commentor frames are collected in `results1`; `results` was
# never defined.  Every partial frame has the single row label 0, so the
# index is rebuilt to give each commentor a distinct row number.
finalTableResults = pd.concat(results1, ignore_index=True)
print(finalTableResults.shape)
display(finalTableResults.head())

### Save final results to CSV.

In [None]:
finalTableResults.to_csv('data/hn_commentor_data.csv',index=False)

### Save final results to AVRO (just to be safe) :). 

In [None]:
pdx.to_avro('data/hn_commentor_data.avro', finalTableResults)

### Check lengths & tail to make sure it looks right. 

In [None]:
# Fixed: `read_csv` lives on the pandas module (`pd`), not on a dataframe.
csvsaved = pd.read_csv('data/hn_commentor_data.csv')
# NOTE(review): `pdx` is never imported in the visible notebook (presumably
# `import pandavro as pdx`) - confirm before running.
avrosaved = pdx.read_avro('data/hn_commentor_data.avro')

In [None]:
print('Saved csv shape:', csvsaved.shape)
print('Saved avro shape:', avrosaved.shape)

In [None]:
display(avrosaved.tail(3))
display(csvsaved.tail(3))

# SUCCESS!  Now just need to get it into a AWS RDS PostgreSQL instance. : )

#### I feel like this should have worked... but it didn't. Any insights as to why it failed? 
``` python
from dask.distributed import Client, LocalCluster
client = Client()  # This is actually the following two commands
cluster = LocalCluster()

%%time
# Trying this with dask
ds = dd.from_pandas(ds2.cleaned_comment, npartitions=1000)
res = ds.apply(lambda x: get_sentiment(x), meta={'z':'str'})
res.compute(scheduler='threads', num_workers=8)
```

### Bonus Material: The Graveyard - Ideas that didn't work.

* This didn't work because df.to_sql() is sloooooooow. Just send the CSV straight to PostgreSQL. 

``` python
def verify_output(pgres_engine, table_name):
    """Print up to ten rows of ``table_name`` as a quick sanity check.

    pgres_engine: object whose ``execute(sql)`` result supports ``fetchall()``
        (a SQLAlchemy engine in this notebook).
    table_name: spliced directly into the SQL text, so it must come from
        trusted code, never from user input.
    """
    sql = 'SELECT * FROM ' + table_name + ' LIMIT 10;'
    fetched = pgres_engine.execute(sql).fetchall()
    for record in fetched:
        print(record)

def run_conversion(pgres_engine):
    """Load the commentor AVRO file and write it to PostgreSQL via ``to_sql``.

    pgres_engine: SQLAlchemy engine used both for the write and for the
        follow-up ``verify_output`` check.

    The target table is replaced if it already exists.
    """
    df = pdx.read_avro('data/hn_commentors_db.avro')
    schema_name = 'lambdaRPG'
    # The original referenced `table_name` without ever defining it (only an
    # unused `tables` list existed), which raises NameError.
    table_name = 'commentor_data'
    # NOTE(review): chunksize=10 writes the frame ten rows at a time; that is
    # probably a large part of why this approach was so slow -- confirm before
    # reviving it.
    df.to_sql(table_name,
              if_exists='replace',
              con=pgres_engine,
              schema=schema_name,
              chunksize=10)
    verify_output(pgres_engine, table_name)

def runARVOtoSQL():
    """Read DB credentials from ``aws.pwd`` and run the AVRO -> PostgreSQL load.

    ``aws.pwd`` is read line by line: line 1 is the database name, line 2 the
    user, line 3 the host and line 4 the password. Any further lines are
    ignored, and missing lines are treated as empty strings.
    """
    # `with` closes the file even if reading fails (the original never closed it).
    with open('aws.pwd', 'r') as file:
        fields = [line.replace('\n', '') for line in file][:4]
    # Pad to four entries so a short file gives empty strings. The original
    # initialised `password` but read `passw`, so a file with fewer than four
    # lines raised NameError.
    fields += [''] * (4 - len(fields))
    dbname, user, host, passw = fields
    pgres_str = 'postgresql+psycopg2://'+user+':'+passw+'@'+host+'/'+dbname
    # NOTE(review): create_engine is not imported in the visible cells --
    # presumably sqlalchemy.create_engine; confirm.
    pgres_engine = create_engine(pgres_str)
    run_conversion(pgres_engine)
    print('Conversion successful.....')

```

* This didn't work because I need to learn more dask. 
``` python
ds3['comment_sentiment_dask'] = ds3['cleaned_comment'].apply(lambda x: get_sentiment(x)).compute(scheduler='threads')
```



* This also didn't work. Same reason.
``` python
dsr3 = ds2
dsr3['cleaned_comment'] = dsr3.text.apply(lambda x: noURLS(noHTML(encode_decode(x)))).compute()
```

* This dask layout worked for a few parts but didn't want to thread. And it kept failing because of a deep error. 

``` python
dsr3 = dd.from_pandas(ds2, npartitions=2000)

finalDF = dsr2
def fin (daskDataframe):
    # Adds two columns derived from `text` to the frame that is passed in:
    #   comment_sentiment - get_sentiment() of the cleaned text
    #   cleaned_comment   - text after encode_decode -> noHTML -> noURLS
    # The cleaning chain is evaluated twice per row, once for each column.
    daskDataframe['comment_sentiment'] = daskDataframe.text.apply(lambda x: get_sentiment(noURLS(noHTML(encode_decode(x)))))
    daskDataframe['cleaned_comment'] = daskDataframe.text.apply(lambda x: noURLS(noHTML(encode_decode(x))))
    # NOTE(review): this returns the module-level `finalDF`, not the
    # `daskDataframe` parameter. The two are the same object only when the
    # caller passes the frame that `finalDF` was bound to.
    return finalDF

with ProgressBar():
    res = fin(dsr2).compute()
```

* This is helpful. 

``` console
Where to find the Dask distributed Bokeh dashboard on AWS.

The URL for accessing the Dask dashboard will be:
https://myinstance.notebook.us-east-1.sagemaker.aws/proxy/8787/
```

# Thanks for reading! 


In [None]:
# LOTS OF ROOM FOR IMPROVEMENT I GUESS.

# Builds `results`: one aggregated DataFrame per commentor, which the
# pd.concat(results) cell earlier in the notebook stacks into the final table.
# Only the first 100 commentors are processed here (slice [0:100]); tqdm wraps
# the list to show a progress bar.
#len(commentorList)+1
results = []
for j in tqdm(commentorList[0:100]):
    newDF = loopSentimentAggegator(j)
    results.append(newDF)