In [None]:
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction import stop_words
import nltk
from nltk.tokenize import RegexpTokenizer 
from nltk.stem import WordNetLemmatizer 
from nltk import pos_tag, word_tokenize
from sklearn.naive_bayes import MultinomialNB
import requests
import datetime, time
import h5py

import re 
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression, LassoCV, RidgeCV
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix

%matplotlib inline

pd.options.display.max_columns = 90
pd.options.display.max_rows = 200

## Defining Project's Exclusion Rules, Stop Words and Explanation

This section works through step by step all sub-sets of Stop Words to exclude during pre-processing of both subreddit data.

Words deemed unimportant to the meaning of posts or to differentiation between the two subreddits will be filtered out during pre-processing. Regex will also be used during final dataframe creation to filter out other stop words. General rules and systemization for stop words is notated below: 

1. All words with Word Vectorize counts less than 5 will be dropped. This is before lemmatization, due to the processing time constraints for the long tail of words and given the size of the data sets. After dropping these extremely rare words, the entire corpus will be lemmatized and recombined. Given words might have 2, or maybe 3 or 4 forms of the same root word so the absolute upper desired threshold is really 16 occurences, 4 occurences in a max of 4 different forms should be dropped. TF-IDF Vectorize will be tested, theorized to be most useful for proper nouns, as well as part of speech analysis using word_tokenize.

2. All numbers are dropped. Some like dates have good meaning but not in terms of differentiating between most all subreddit topics.
3. All special HTML character entities are dropped (ex: &nbsp, &amp etc), non-alphabet characters are dropped and all punctuation is dropped. Before processing this, all apostrophized/contraction words are dropped, since these basically all fall in the connecting word category.
4. All single letter words are dropped, many will be created from dropping punctuation
5. All two letter words are dropped. Almost all are connecting words which fall into or are functionally similar to the English Stop Word set. Many in the project data may be abbreviations with some value in keeping, but without surrounding word context it seems impossible to derive their meaning. The ones in the dictionary are below, few are colloquially meaningful and none seem of any use in classifying non-niche reddit topics (excluding maybe music):

| Two Letter Words in English Dictionary                       |
| ------------------------------------------------------------ |
| AA, AB, AD, AE, AG, AH, AI, AL, AM, AN, AR, AS, AT, AW, AX, AY |
| BA, BE, BI, BO, BY, CH, DA, DE, DI, DO                       |
| EA, ED, EE, EF, EH, EL, EM, EN, ER, ES, ET, EW, EX           |
| FA, FE, FY, GI, GO, GU, HA, HE, HI, HM, HO                   |
| ID, IF, IN, IO, IS, IT, JA, JO                               |
| KA, KI, KO, KY, LA, LI, LO                                   |
| MA, ME, MI, MM, MO, MU, MY, NA, NE, NO, NU, NY               |
| OB, OD, OE, OF, OH, OI,OK, OM, ON, OO, OP, OR, OS, OU, OW, OX, OY |
| PA, PE, PI, PO, QI                                           |
| RE, SH, SI, SO, ST, TA, TE, TI, TO                           |
| UG, UH, UM, UN, UP, UR, US, UT                               |
| WE, WO, XI, XU, YA, YE, YO, YU, ZA                           |

6. All lemmatized roots of words of the SKLearn English Stop Word set will be dropped.
7. All post URLs will be dropped. Initially, hope was to keep keywords but it is too hard to differentiate keywords from all others in url and there's too much junk and also undescriptive names. Only the main post attribute url will be split, due to a significant group of posts only having a title and a link to a main information webpage.
7. 

In [4]:
from nltk.stem import WordNetLemmatizer 
from sklearn.feature_extraction import stop_words


lem = WordNetLemmatizer()
excludedWords = set([lem.lemmatize(word) for word in stop_words.ENGLISH_STOP_WORDS])

# Using set functionality in case add same word twice to excludedWords and adds runtime efficiency
myStopWords = {'com','deleted'}
# redd probably comes from reddit truncated during processing, wouldn't differentiate subreddits
# Actually, won't stop word redd since one subreddit might refer to reddit more often

# Could drop rare frequency un-interpretable non-words
# possible: ezbatteriesreconditioning (15) seems like ad/spam, ambrosusamb (15) but could be users
# ~200 lowest frequency of 15 words, looks like that min_df cut out most complete non-words

excludedWords = excludedWords.union(myStopWords)

In [None]:
def printname(name) :
   print(name)

def list_all_hdf5(file) :
    with h5py.File(file, 'r') as f:
       f.visit(printname)

In [7]:
def decodeText(matchobj) :  # excludes all 2 letter 'words' and url specific keywords
    if type(matchobj) == str : text = matchobj
    else : text = matchobj.group(0)
    text = re.sub(r'\d+\w+\d+|\d+',' ',text)
    match = re.match(r'\W(\w{1,2})\W', text)
    if match : return ' '
    if re.match(r"http", text) :
        urltypetext = r'https{0,1}://|www\.|comments|reddit|\.com/r|\.com|/CryptoCurrency|/environment|html'
        punct = r'|-|\.|\$|@|\?|\%|\:|\;|\(|\)|\=|\[|\]|\{|\}|,'
        return re.sub(urltypetext + punct, ' ', text)        
    return text

In [175]:
testurl = 'https://www.reddit.com/r/CryptoCurrency/comments/7menr1/myriad_seems_to_be_forming_an_inverse_head_and/'
testurl = 'https://blog.zerion.io/defi-and-trading-assets-decentralized-exchanges-26543eda7c](https://blog.zerion.io/defi-and-trading-assets-decentralized-exchanges-26543eda7c)'
decodeText(testurl)

' //blog zerion io/defi and trading assets decentralized exchanges  c]( //blog zerion io/defi and trading assets decentralized exchanges  c)'

In [176]:
def splitText(text) :
    text = re.sub(r'\W\w{1,2}\W|http\S+',decodeText, text)
    midtext = re.sub(r'_|\W\w{1,2}\W|\d+|&\w+|\#\w+|\w+\'\w',' ', text)  
    # filters out "'s" and other apostrophe words, 1st pass of less than 3 char words
    wordlist = re.findall(r'(\w{3,})', midtext)  
    # findAll drops all words that are 2 or fewer characters as well as all punctuation
    return wordlist

In [60]:
testtext = '''A big obstacle for decentralization of trade marketplaces is how to deal with order 
books. They are lists where traders post buy and sell requests.\r\n\r\nIndividual users and especially 
market makers\u200a—\u200atraders who bring liquidity to markets and put buy and sell orders for many 
different assets or tokens\u200a—\u200aadd or remove orders constantly. The sheer amount of trading 
orders is too much for an on-chain solution to handle.\r\n\r\nThe first intermediary solution is 
Bancor’s smart token, which operates the exchange itself.\r\n\r\n&gt;*Bancor not only eliminates the 
third-party broker, but also the exchange trading\xa0partner.*\r\n\r\nTo exchange a token, a user 
only needs to interact with a smart contract that holds tokens in reserve, and so trades do not need 
to rely on the presence of other offers.\r\n\r\nHowever, the Bancor protocol encodes and influences 
the price discovery algorithmically within its own tokens, which limits liquidity and market 
efficiency.  \r\n[https://blog.zerion.io/defi-and-trading-assets-decentralized-exchanges-26543eda7c]
(https://blog.zerion.io/defi-and-trading-assets-decentralized-exchanges-26543eda7c)'''

In [55]:
testtext2 = """[Learn about NIX lpos](https://medium.com/@nixplatform/nixs-leasing-proof-of-stake-consensus-837fe083de4f)

&amp;#x200B;

Leasing proof of stake is now active on the NIX Platform network. With LPoS you will be able to lease your coins to merchants or other third parties so they can stake on your behalf and claim a percentage reward you allow in the initial contract. For example, party A leases 500 nix to party B allowing a 5% fee reward. Party B can now stake those 500 coins, when party B hits a stake, they are allowed to take 5% of the stake reward leaving party A to now own 500 + stake reward - 5% of stake reward. Party B can continue to stake those coins as long as the leasing contract is active. Party A can void that contract whenever he or she wants.  And to be clear, Party A can make a leasing contract that takes 0% fee, this can be used as a simple cold staking script if he or she wants to cold stake via their own computer. 

So this is fully trustless cold staking. You can keep your coins locked away on soon to be ledger and/or trezor while leasing them to yourself or any merchant with 0 risk. You control the leasing contract and can stop it at anytime. Whoever you lease the coins too can only stake them. You coins are stored offline safely, while also allowing you to earn stake rewards by leasing them in a fully trustless manner. 
"""

In [61]:
testtext3 = """ Today we get underway with arguably one of the most prominent and respected events in
the crypto space — **Token Summit**, this time from Silicon Valley.  Bluenote is on hand to present 
our vision of a game-changing solution for combating climate change with blockchain — and we’ll be 
updating this post with content, pictures and quotes from throughout the day!  Don’t miss out!  
&amp;#x200B;  *Processing img 24b4biwbav421...*     Just about to get started!  1. Question from 
William Mougayar to Fred Ehrsam (Coinbase) “What gives a token long-term value” **Answer: “It goes 
back to the team — can they execute…and are they executing already?”** 2. A great crowd in UCSF!  
&amp;#x200B;  *Processing img 8gvsw0zdav421...*   3. Fireside chat with William Mougayar and Oliver 
Bussmann — Crypto Valley meets Silicon Valley   &amp;#x200B;  *Processing img ox7ianmfav421...*   
4. A brief intro to Bluenote — Michiel Frackers showed off the Bluenote protocol on the main stage.   
&amp;#x200B;  *Processing img zluxym6hav421...*     Thanks for following along with us during this 
exciting day!  *Follow us on* [*Twitter*](https://twitter.com/bluenote_world)*,* [*LinkedIn*]
(https://www.linkedin.com/company/11350676/admin/updates/) *and* [*Facebook*](https://www.facebook.com
/bluenote.world/)*!*  Learn more:  [https://bluenote.world/](https://bluenote.world/) 
"""

In [178]:
splitText(testtext3)

 Today get underway with arguably one the most prominent and respected events the crypto space — **Token Summit**, this time from Silicon Valley.  Bluenote hand present our vision game-changing solution for combating climate change with blockchain — and updating this post with content, pictures and quotes from throughout the day!  Don miss out!   ; ;  *Processing img  b biwbav ...*     Just about get started!   . Question from William Mougayar Fred Ehrsam (Coinbase) “What gives token long-term value” **Answer:  goes back the team — can they execute…and are they executing already?”**  . great crowd UCSF!   ; ;  *Processing img  gvsw zdav ...*    . Fireside chat with William Mougayar and Oliver Bussmann — Crypto Valley meets Silicon Valley    ; ;  *Processing img ox ianmfav ...*    . brief intro Bluenote — Michiel Frackers showed off the Bluenote protocol the main stage.    ; ;  *Processing img zluxym hav ...*     Thanks for following along with during this exciting day!  *Follow  [*Twit

['Today',
 'get',
 'underway',
 'with',
 'arguably',
 'one',
 'the',
 'most',
 'prominent',
 'and',
 'respected',
 'events',
 'the',
 'crypto',
 'space',
 'Token',
 'Summit',
 'this',
 'time',
 'from',
 'Silicon',
 'Valley',
 'Bluenote',
 'hand',
 'present',
 'our',
 'vision',
 'game',
 'changing',
 'solution',
 'for',
 'combating',
 'climate',
 'change',
 'with',
 'blockchain',
 'and',
 'updating',
 'this',
 'post',
 'with',
 'content',
 'pictures',
 'and',
 'quotes',
 'from',
 'throughout',
 'the',
 'day',
 'Don',
 'miss',
 'out',
 'Processing',
 'img',
 'biwbav',
 'Just',
 'about',
 'get',
 'started',
 'Question',
 'from',
 'William',
 'Mougayar',
 'Fred',
 'Ehrsam',
 'Coinbase',
 'What',
 'gives',
 'token',
 'long',
 'term',
 'value',
 'Answer',
 'goes',
 'back',
 'the',
 'team',
 'can',
 'they',
 'execute',
 'and',
 'are',
 'they',
 'executing',
 'already',
 'great',
 'crowd',
 'UCSF',
 'Processing',
 'img',
 'gvsw',
 'zdav',
 'Fireside',
 'chat',
 'with',
 'William',
 'Mougayar',

In [14]:
# df = pd.read_csv('cryptocurrency12_12_18to10_29_18.csv')
# df = pd.read_csv('cryptocurrency10_29_18to9_9_18.csv')
# df = pd.read_csv('cryptocurrency9_9_18to7_21_18.csv')
# df = pd.read_csv('cryptocurrency7_20_18to6_18_18.csv')
# df = pd.read_csv('cryptocurrency6_17_18to5_16_18.csv')
# df = pd.read_csv('cryptocurrency5_15_18to4_13_18.csv')
# df = pd.read_csv('cryptocurrency4_12_18to3_11_18.csv')
# df = pd.read_csv('cryptocurrency3_10_18to2_6_18.csv')
# df = pd.read_csv('cryptocurrency2_5_18to1_12_18.csv')
# df = pd.read_csv('cryptocurrency1_12_18to12_27_17.csv')
# df = pd.read_csv('cryptocurrency12_26_17to9_4_17.csv')
# df = pd.read_csv('environmentFinal.csv',na_filter=False)

In [198]:
#testtext = df.loc[39,'selftext']

In [12]:
#df.info()  # environmentFinal

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195717 entries, 0 to 195716
Data columns (total 9 columns):
Unnamed: 0    195717 non-null int64
date          195717 non-null object
permalink     195717 non-null object
subreddit     195717 non-null object
url           195717 non-null object
title         195716 non-null object
author        195717 non-null object
selftext      13836 non-null object
id            195717 non-null object
dtypes: int64(1), object(8)
memory usage: 13.4+ MB


In [13]:
#df.info() # cryptocurrencyFinal

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 429116 entries, 0 to 429115
Data columns (total 9 columns):
Unnamed: 0      429116 non-null int64
Unnamed: 0.1    429116 non-null object
date            429115 non-null object
full_link       429115 non-null object
subreddit       429115 non-null object
url             429115 non-null object
title           429115 non-null object
author          429115 non-null object
selftext        177171 non-null object
dtypes: int64(1), object(8)
memory usage: 29.5+ MB


In [14]:
#df.tail()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,date,full_link,subreddit,url,title,author,selftext
429111,89223,6xw0np,"(2017, 9, 4, 21, 38)",https://www.reddit.com/r/CryptoCurrency/commen...,CryptoCurrency,https://www.reddit.com/r/CryptoCurrency/commen...,What's the cheapest masternode I can get,[deleted],[removed]
429112,89224,6xw26k,"(2017, 9, 4, 22, 32)",https://www.reddit.com/r/CryptoCurrency/commen...,CryptoCurrency,https://www.reddit.com/r/CryptoCurrency/commen...,Anyone else heard of the new altcoinexchange?,snittolo,Noticing a lot of the names staring to appear ...
429113,89225,6xw340,"(2017, 9, 4, 22, 46)",https://www.reddit.com/r/CryptoCurrency/commen...,CryptoCurrency,https://i.redd.it/p6g2g5u3pqjz.jpg,WTF?,[deleted],[deleted]
429114,89226,6xw3zs,"(2017, 9, 4, 22, 53)",https://www.reddit.com/r/CryptoCurrency/commen...,CryptoCurrency,https://i.redd.it/w92kgl7wpqjz.png,Multi-Millionaire Dan Bilzerian into Cryptocur...,bizshawn,
429115,89227,6xw452,"(2017, 9, 4, 22, 35)",https://www.reddit.com/r/CryptoCurrency/commen...,CryptoCurrency,https://www.gotmonero.com/,Forget Milk... Got Monero?,Bitcoinfriend,


In [11]:
#df.to_csv('cryptocurrencyFinal.csv', mode='a', header=False)

In [1]:
# Legacy function, trying to use Reddit API to get enough posts, but on server side they now only
# allow access to 1st 1000 posts no matter what sorted listing you try to retrieve
# Reddit API not used in favor of PushShift API
def add1000posts(url, posts, headers={}, params={}) : # leaves posts as raw json dicts
    after = None
    if params == {} : params = {'after' : after, 'count' : 0}
    for i in range(40) :  # 25 posts at a time, 40 times for 1000 post limit
        res = requests.get(url, params=params, headers=headers)
        if res.status_code == 200 :
            json1 = res.json()
            posts.extend(json1['data']['children'])
            after = json1['data']['after']
            params['after'] = after
            params['count'] += 25
        else :
            print(res.status_code)
            break
        time.sleep(1)
        
    return (posts, params)