In [0]:
import os
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'jsmazorra/JSON API Authorization/'

In [0]:
from google.cloud import bigquery
import time
import pandas as pd
import numpy as np

In [0]:
# Setup client
bq_client = bigquery.Client()

In [0]:
# GETS SQL QUERY FOR 100 USERS (14,000-14,000) FROM SORTED TABLE OF MOST COMMENTS
# WHERE TOTAL COMMENTS IS SLIGHTLY ABOVE 100

QUERY = '''
        SELECT E.*, C.comments_count
        FROM `bigquery-public-data.hacker_news.full` as E
        JOIN(
            SELECT *
            FROM(
                SELECT *
                FROM(
                    SELECT  `bigquery-public-data.hacker_news.full`.by, COUNT(*) as comments_count
                    FROM `bigquery-public-data.hacker_news.full`
                    GROUP BY `bigquery-public-data.hacker_news.full`.by
                )  
                ORDER BY comments_count DESC
                LIMIT 9500
            )
            ORDER BY comments_count 
            LIMIT 100            
        ) C 
        ON E.by = C.by
        '''

In [44]:
import time
start = time.time()
df = bq_client.query(QUERY).to_dataframe()
end = time.time()
print(end - start)

31.51998257637024


In [45]:
df.shape

(44699, 15)

In [46]:
users = df['by'].unique()
len(users)

100

In [47]:
df.head()

Unnamed: 0,title,url,text,dead,by,score,time,timestamp,type,id,parent,descendants,ranking,deleted,comments_count
0,,,"This guy is doing good work, and I fear for hi...",,b6,,1367666824,2013-05-04 11:27:04+00:00,comment,5654717,5654665.0,,,,446
1,,,I think I&#x27;ve shown your original contenti...,,b6,,1513192846,2017-12-13 19:20:46+00:00,comment,15916726,15916055.0,,,,446
2,,,"&gt; Your end goal might be laudable, but the ...",,b6,,1454803736,2016-02-07 00:08:56+00:00,comment,11050754,11050665.0,,,,446
3,,,&gt; How do you move to Nebraska when you have...,,b6,,1513245886,2017-12-14 10:04:46+00:00,comment,15921141,15920864.0,,,,446
4,,,Is inciting people to violence really anything...,,b6,,1515934575,2018-01-14 12:56:15+00:00,comment,16144063,16144045.0,,,,446


In [0]:
# Drop unecessary columns
df_drop = df.drop(columns= ['score','title','url','deleted','dead','descendants','ranking'])

# Keep only comments in the dataframe
df_drop = df_drop[df_drop['type']=='comment']

# Drop empty comments 
df_drop = df_drop[(df_drop['text'] != '')&(df_drop['text'] != np.nan)]

# Fix weird text encodings
import html
df_drop['text']=df_drop['text'].apply(str)
df_drop['text'] = df_drop['text'].apply(lambda x: html.unescape(x))

# Remove html tags from string
import re
def remove_html_tags(text):
   clean = re.compile('<.*?>')
   return re.sub(clean, '', text)
df_drop['text'] = df_drop['text'].apply(lambda x: remove_html_tags(x))

# Convert unix time to datetime object with date
from datetime import datetime
df_drop['time']=pd.to_datetime(df_drop['time'],unit='s') # for accuracy secs

In [49]:
# Check only analyzing comments
df_drop['type'].value_counts()

comment    39064
Name: type, dtype: int64

In [50]:
users = df_drop['by'].unique()
print(len(users))

99


In [51]:
df_drop.head()

Unnamed: 0,text,by,time,timestamp,type,id,parent,comments_count
0,"This guy is doing good work, and I fear for hi...",b6,2013-05-04 11:27:04,2013-05-04 11:27:04+00:00,comment,5654717,5654665.0,446
1,I think I've shown your original contention is...,b6,2017-12-13 19:20:46,2017-12-13 19:20:46+00:00,comment,15916726,15916055.0,446
2,"> Your end goal might be laudable, but the pat...",b6,2016-02-07 00:08:56,2016-02-07 00:08:56+00:00,comment,11050754,11050665.0,446
3,> How do you move to Nebraska when you have $1...,b6,2017-12-14 10:04:46,2017-12-14 10:04:46+00:00,comment,15921141,15920864.0,446
4,Is inciting people to violence really anything...,b6,2018-01-14 12:56:15,2018-01-14 12:56:15+00:00,comment,16144063,16144045.0,446


In [19]:
pip install vaderSentiment

Collecting vaderSentiment
[?25l  Downloading https://files.pythonhosted.org/packages/76/fc/310e16254683c1ed35eeb97386986d6c00bc29df17ce280aed64d55537e9/vaderSentiment-3.3.2-py2.py3-none-any.whl (125kB)
[K     |██▋                             | 10kB 15.4MB/s eta 0:00:01[K     |█████▏                          | 20kB 1.7MB/s eta 0:00:01[K     |███████▉                        | 30kB 2.3MB/s eta 0:00:01[K     |██████████▍                     | 40kB 2.6MB/s eta 0:00:01[K     |█████████████                   | 51kB 2.0MB/s eta 0:00:01[K     |███████████████▋                | 61kB 2.3MB/s eta 0:00:01[K     |██████████████████▏             | 71kB 2.5MB/s eta 0:00:01[K     |████████████████████▉           | 81kB 2.7MB/s eta 0:00:01[K     |███████████████████████▍        | 92kB 2.9MB/s eta 0:00:01[K     |██████████████████████████      | 102kB 2.8MB/s eta 0:00:01[K     |████████████████████████████▋   | 112kB 2.8MB/s eta 0:00:01[K     |███████████████████████████████▏| 12

In [0]:
# Populate sentiment analysis columns in dataframe
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

In [0]:
df_drop['neg']=np.zeros(df_drop.shape[0])
df_drop['pos']=np.zeros(df_drop.shape[0])
df_drop['neu']=np.zeros(df_drop.shape[0])

In [54]:
now=time.time()
df_drop['neg'] = df_drop['text'].apply(lambda x: sia.polarity_scores(x)['neg'])
df_drop['pos'] = df_drop['text'].apply(lambda x: sia.polarity_scores(x)['pos'])
df_drop['neu'] = df_drop['text'].apply(lambda x: sia.polarity_scores(x)['neu'])
print('sentiment cal took:',round(time.time()-now,2),'s')

sentiment cal took: 80.61 s


In [55]:
df_drop.head()

Unnamed: 0,text,by,time,timestamp,type,id,parent,comments_count,neg,pos,neu
0,"This guy is doing good work, and I fear for hi...",b6,2013-05-04 11:27:04,2013-05-04 11:27:04+00:00,comment,5654717,5654665.0,446,0.179,0.318,0.503
1,I think I've shown your original contention is...,b6,2017-12-13 19:20:46,2017-12-13 19:20:46+00:00,comment,15916726,15916055.0,446,0.113,0.084,0.803
2,"> Your end goal might be laudable, but the pat...",b6,2016-02-07 00:08:56,2016-02-07 00:08:56+00:00,comment,11050754,11050665.0,446,0.0,0.156,0.844
3,> How do you move to Nebraska when you have $1...,b6,2017-12-14 10:04:46,2017-12-14 10:04:46+00:00,comment,15921141,15920864.0,446,0.128,0.144,0.727
4,Is inciting people to violence really anything...,b6,2018-01-14 12:56:15,2018-01-14 12:56:15+00:00,comment,16144063,16144045.0,446,0.162,0.107,0.731


In [56]:
df_final = pd.DataFrame(data=None, columns=df_drop.columns)
df_user_most_pos = pd.DataFrame(data=None, columns=df_drop.columns)
df_user_most_neg = pd.DataFrame(data=None, columns=df_drop.columns)

for user in users:
    # Create a dataframe of only one user
    df_user = df_drop[df_drop['by'] == user]
    
    # Remove pure sentiments (removes pure one-word comments)
    df_user_unpure = df_user[(df_user['neg'] !=1) & (df_user['pos'] !=1)]
    
    # Submit first 100 comments of the unpure sentiments to final dataframe
    df_final = df_final.append(df_user_unpure[:100])
    
    # Submit max unpure neg/pos comment per user into dataframe
    df_temp = df_final[df_final['by'] == user]
    ix_neg = df_temp['neg'].idxmax(axis=0)
    df_user_most_neg = df_user_most_neg.append(df_temp.loc[ix_neg])
    ix_pos = df_temp['pos'].idxmax(axis=0)
    df_user_most_pos = df_user_most_pos.append(df_temp.loc[ix_pos])
    
df_final.shape

(9679, 11)

In [57]:
users = df_drop['by'].unique()
len(users)

99

In [58]:
df_final.head(10)

Unnamed: 0,text,by,time,timestamp,type,id,parent,comments_count,neg,pos,neu
0,"This guy is doing good work, and I fear for hi...",b6,2013-05-04 11:27:04,2013-05-04 11:27:04+00:00,comment,5654717,5654665.0,446,0.179,0.318,0.503
1,I think I've shown your original contention is...,b6,2017-12-13 19:20:46,2017-12-13 19:20:46+00:00,comment,15916726,15916055.0,446,0.113,0.084,0.803
2,"> Your end goal might be laudable, but the pat...",b6,2016-02-07 00:08:56,2016-02-07 00:08:56+00:00,comment,11050754,11050665.0,446,0.0,0.156,0.844
3,> How do you move to Nebraska when you have $1...,b6,2017-12-14 10:04:46,2017-12-14 10:04:46+00:00,comment,15921141,15920864.0,446,0.128,0.144,0.727
4,Is inciting people to violence really anything...,b6,2018-01-14 12:56:15,2018-01-14 12:56:15+00:00,comment,16144063,16144045.0,446,0.162,0.107,0.731
5,"Well said. I'm an American in China, and I fin...",b6,2013-03-22 11:33:31,2013-03-22 11:33:31+00:00,comment,5422320,5422252.0,446,0.094,0.136,0.77
6,Awesome! This is excellent.hlint really helped...,b6,2015-01-17 11:23:04,2015-01-17 11:23:04+00:00,comment,8904104,8903990.0,446,0.079,0.225,0.696
7,Is the article a bit strange? They say Liu Zhi...,b6,2017-01-16 12:25:38,2017-01-16 12:25:38+00:00,comment,13409589,13409082.0,446,0.065,0.09,0.845
8,I don't think the problem is that we're too st...,b6,2017-12-20 15:23:05,2017-12-20 15:23:05+00:00,comment,15970282,15970162.0,446,0.161,0.11,0.729
9,I don't know. How do I find out for sure?,b6,2018-07-11 09:32:53,2018-07-11 09:32:53+00:00,comment,17505582,17505509.0,446,0.0,0.204,0.796


In [59]:
# Random comments from the 100 users
df_random100 = df_final.sample(100)
df_random100.head(10)

Unnamed: 0,text,by,time,timestamp,type,id,parent,comments_count,neg,pos,neu
44266,Yep. I was stuck in that rut for years. But th...,tenaciousDaniel,2020-03-08 21:59:49,2020-03-08 21:59:49+00:00,comment,22520906,22517879.0,447,0.04,0.158,0.802
34448,"I like the name, this sounds similar to (or in...",mellosouls,2019-07-28 10:31:07,2019-07-28 10:31:07+00:00,comment,20546798,20546356.0,447,0.0,0.067,0.933
41597,> I may have a 7/11 near me which doesn’t even...,2bitencryption,2018-07-14 18:06:29,2018-07-14 18:06:29+00:00,comment,17531487,17531267.0,448,0.148,0.095,0.757
25048,"Basically, yes. Classical stoicism applied to ...",secstate,2017-03-27 02:59:34,2017-03-27 02:59:34+00:00,comment,13964803,13962282.0,449,0.07,0.12,0.81
30442,Wasn't this raised a year or so ago? Vehement ...,socceroos,2015-07-08 06:32:02,2015-07-08 06:32:02+00:00,comment,9850112,9849942.0,447,0.174,0.0,0.826
4561,600usd is a bit steep- no disrespect to the de...,fxfan,2019-01-21 05:19:25,2019-01-21 05:19:25+00:00,comment,18957843,18957716.0,446,0.0,0.206,0.794
34499,Transcript:\nhttps://www.thisamericanlife.org/...,mellosouls,2019-12-29 03:43:10,2019-12-29 03:43:10+00:00,comment,21903808,21903801.0,447,0.0,0.0,1.0
19224,Going to shamelessly post a Wireshark tutorial...,rosstex,2018-06-19 14:20:22,2018-06-19 14:20:22+00:00,comment,17346886,17344342.0,445,0.057,0.225,0.718
41213,> (not that the evidence is terribly substanti...,alex_hitchins,2015-02-05 13:25:22,2015-02-05 13:25:22+00:00,comment,9003080,9003010.0,448,0.207,0.0,0.793
34040,"If people are really worried, they shouldn't b...",jzelinskie,2013-09-10 07:12:49,2013-09-10 07:12:49+00:00,comment,6358630,6358550.0,448,0.081,0.078,0.842


In [60]:
# Most negative comments
df_neg100 = df_final.sort_values(['neg'],ascending=False)[:100]
df_neg100.head(20)

Unnamed: 0,text,by,time,timestamp,type,id,parent,comments_count,neg,pos,neu
24655,Bad: http://www.arngren.net,roschdal,2017-03-06 22:10:05,2017-03-06 22:10:05+00:00,comment,13806503,13798579.0,446,0.778,0.0,0.222
36238,April Fools?,spencerfry,2011-04-01 03:36:46,2011-04-01 03:36:46+00:00,comment,2394824,2392525.0,447,0.762,0.0,0.238
15265,Unidentified fakes,bobsil1,2019-07-05 17:11:44,2019-07-05 17:11:44+00:00,comment,20364087,20362745.0,448,0.737,0.0,0.263
24640,Boycotting Cisco.,roschdal,2011-07-21 22:09:54,2011-07-21 22:09:54+00:00,comment,2791514,2789540.0,446,0.73,0.0,0.27
16160,Why are gun companies evil?,drharby,2018-03-20 17:53:05,2018-03-20 17:53:05+00:00,comment,16630891,16630718.0,448,0.694,0.0,0.306
19319,https://puu.sh/rikNL/c0ecb8bcff.pngoutlook unc...,rosstex,2016-09-21 00:04:26,2016-09-21 00:04:26+00:00,comment,12544412,12541081.0,445,0.667,0.0,0.333
30435,"Yikes, how horrifying",socceroos,2019-04-27 00:00:07,2019-04-27 00:00:07+00:00,comment,19762794,19762159.0,447,0.649,0.0,0.351
21993,Very sad indeed.,istvan__,2015-06-03 18:43:12,2015-06-03 18:43:12+00:00,comment,9654848,9654710.0,445,0.629,0.0,0.371
30402,Australia's darkest hour.,socceroos,2014-11-10 02:51:26,2014-11-10 02:51:26+00:00,comment,8581821,8580642.0,447,0.615,0.0,0.385
43436,Bad joke...,mbenjaminsmith,2009-08-05 11:16:09,2009-08-05 11:16:09+00:00,comment,743176,714923.0,447,0.614,0.386,0.0


In [63]:
# Most positive comments
df_pos100 = df_final.sort_values(['pos'],ascending=False)[:100]
df_pos100.head(20)

Unnamed: 0,text,by,time,timestamp,type,id,parent,comments_count,neg,pos,neu
6288,Thanks :) I agree ;),kentf,2010-07-14 22:10:25,2010-07-14 22:10:25+00:00,comment,1516124,1515953.0,446,0.0,0.912,0.088
6451,haha wow... so true,kentf,2009-08-18 18:56:14,2009-08-18 18:56:14+00:00,comment,771174,771122.0,446,0.0,0.908,0.092
6270,"Beautiful, thanks for sharing.",kentf,2014-10-24 12:50:20,2014-10-24 12:50:20+00:00,comment,8503377,8503324.0,446,0.0,0.906,0.094
25998,glad to help! :),zbruhnke,2014-02-16 05:16:36,2014-02-16 05:16:36+00:00,comment,7246736,7246313.0,445,0.0,0.9,0.1
39882,"brilliant, love it.",dalek2point3,2014-03-25 18:27:38,2014-03-25 18:27:38+00:00,comment,7467781,7467527.0,448,0.0,0.889,0.111
24677,Thanks! Very useful.,roschdal,2017-01-26 16:59:32,2017-01-26 16:59:32+00:00,comment,13493022,13492707.0,446,0.0,0.865,0.135
43360,That's pretty cool...,mbenjaminsmith,2009-08-05 11:15:10,2009-08-05 11:15:10+00:00,comment,743175,741062.0,447,0.0,0.846,0.154
15246,Congrats and best of luck!,bobsil1,2012-04-16 23:25:51,2012-04-16 23:25:51+00:00,comment,3850056,3850043.0,448,0.0,0.845,0.155
9908,Interesting = value,jpcx01,2009-07-17 00:54:50,2009-07-17 00:54:50+00:00,comment,709092,709071.0,445,0.0,0.836,0.164
6273,Brian FTW!,kentf,2014-10-05 16:27:37,2014-10-05 16:27:37+00:00,comment,8412650,8411638.0,446,0.0,0.801,0.199


In [64]:
# Most single negative comment for each user
df_user_most_neg.head(10)

Unnamed: 0,text,by,time,timestamp,type,id,parent,comments_count,neg,pos,neu
47,The Chinese government would no more commit wi...,b6,2013-12-16 03:04:19,2013-12-16 03:04:19+00:00,comment,6912524,6912240.0,446,0.478,0.0,0.522
505,"Thanks, wiki continues to provide detailed inf...",brg,2011-03-12 10:45:05,2011-03-12 10:45:05+00:00,comment,2316575,2316556.0,448,0.307,0.116,0.578
907,But I doubt the vast majority of Farmville and...,hop,2009-11-01 16:45:03,2009-11-01 16:45:03+00:00,comment,915213,914778.0,447,0.404,0.0,0.596
1381,Are assault rifles licensed separately?,3825,2012-12-25 00:55:12,2012-12-25 00:55:12+00:00,comment,4964668,4964599.0,447,0.487,0.0,0.513
1883,dupe : https://news.ycombinator.com/item?id=12...,alva,2016-10-27 16:39:07,2016-10-27 16:39:07+00:00,comment,12806657,12806533.0,445,0.556,0.0,0.444
2270,I think one thing to consider is the attacker'...,Dwolb,2015-07-24 17:55:13,2015-07-24 17:55:13+00:00,comment,9943874,9943751.0,449,0.31,0.0,0.69
2788,This is clearly insane.,anm89,2019-11-27 04:48:58,2019-11-27 04:48:58+00:00,comment,21645330,21642022.0,446,0.365,0.365,0.27
3140,Indeed. Assholes we need. But we got the freed...,cjsuk,2017-11-26 12:51:36,2017-11-26 12:51:36+00:00,comment,15780684,15780604.0,446,0.478,0.0,0.522
3652,> Basically any system which prosecutes crime ...,davej,2015-03-04 01:08:05,2015-03-04 01:08:05+00:00,comment,9142063,9126680.0,446,0.378,0.03,0.592
4156,No pun intended?,fezzl,2011-01-17 10:18:51,2011-01-17 10:18:51+00:00,comment,2111764,2108699.0,447,0.524,0.0,0.476


In [65]:
# Most single positive comment for each user
df_user_most_pos.head(10)

Unnamed: 0,text,by,time,timestamp,type,id,parent,comments_count,neg,pos,neu
36,Please be very careful. <3.,b6,2013-02-19 04:37:11,2013-02-19 04:37:11+00:00,comment,5242639,5242598.0,446,0.0,0.583,0.417
472,"If you enjoyed those lectures, the EconTalk co...",brg,2015-02-19 17:11:29,2015-02-19 17:11:29+00:00,comment,9075212,9074009.0,448,0.0,0.345,0.655
902,"Thanks for posting this, support has been over...",hop,2011-12-14 08:18:32,2011-12-14 08:18:32+00:00,comment,3351088,3351070.0,447,0.0,0.501,0.499
1375,I would not worry too much about people miscon...,3825,2013-04-15 12:18:15,2013-04-15 12:18:15+00:00,comment,5551012,5551000.0,447,0.0,0.373,0.627
1849,Fantastic project. Well done.,alva,2018-04-04 15:54:00,2018-04-04 15:54:00+00:00,comment,16755904,16746035.0,445,0.0,0.74,0.26
2298,Cool that you agree. Are your projects structu...,Dwolb,2014-04-21 01:32:25,2014-04-21 01:32:25+00:00,comment,7619250,7618439.0,449,0.0,0.407,0.593
2765,This is an excellent analogy.,anm89,2019-04-05 22:45:13,2019-04-05 22:45:13+00:00,comment,19587732,19587497.0,446,0.0,0.481,0.519
3202,Good work - thank you :),cjsuk,2017-10-09 16:53:32,2017-10-09 16:53:32+00:00,comment,15435459,15434977.0,446,0.0,0.737,0.263
3671,"""Excellence Has Become a Habit""… nice snippet ...",davej,2011-01-18 23:18:06,2011-01-18 23:18:06+00:00,comment,2117849,2117541.0,446,0.0,0.463,0.537
4159,Facebook certainly.,fezzl,2010-12-30 08:58:22,2010-12-30 08:58:22+00:00,comment,2051763,2032443.0,447,0.0,0.706,0.294
