In [1]:
import requests
from bs4 import BeautifulSoup
import os.path
import numpy as np
import json
import re

BASE_URL = 'https://boards.4channel.org/biz/'

# Fetch and format list of all coin symbols from CoinGeko api
coingeko__coin_list = json.loads(requests.get('https://api.coingecko.com/api/v3/coins/list').text)
coingeko__coin_list = [coin_entry['symbol'].upper() for coin_entry in coingeko__coin_list]


In [2]:
# List of words to exclude as we do not care about them
stopwords = ['a', 'about', 'above', 'across', 'after', 'afterwards']
stopwords += ['again', 'against', 'all', 'almost', 'alone', 'along']
stopwords += ['already', 'also', 'although', 'always', 'am', 'among']
stopwords += ['amongst', 'amoungst', 'amount', 'an', 'and', 'another']
stopwords += ['any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere']
stopwords += ['are', 'around', 'as', 'at', 'back', 'be', 'became']
stopwords += ['because', 'become', 'becomes', 'becoming', 'been']
stopwords += ['before', 'beforehand', 'behind', 'being', 'below']
stopwords += ['beside', 'besides', 'between', 'beyond', 'bill', 'both']
stopwords += ['bottom', 'but', 'by', 'call', 'can', 'cannot', 'cant']
stopwords += ['co', 'computer', 'con', 'could', 'couldnt', 'cry', 'de']
stopwords += ['describe', 'i\'m', 'did', 'do', 'done', 'down', 'due']
stopwords += ['during', 'each', 'eg', 'got', 'either', 'eleven', 'else']
stopwords += ['elsewhere', 'empty', 'enough', 'etc', 'even', 'ever']
stopwords += ['every', 'everyone', 'everything', 'everywhere', 'except']
stopwords += ['few', 'fifteen', 'fifty', 'fill', 'find', 'fire', 'first']
stopwords += ['five', 'for', 'former', 'formerly', 'forty', 'found']
stopwords += ['four', 'from', 'front', 'full', 'further', 'get', 'give']
stopwords += ['go', 'had', 'has', 'hasnt', 'have', 'he', 'hence', 'her']
stopwords += ['here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers']
stopwords += ['herself', 'him', 'himself', 'his', 'how', 'however']
stopwords += ['hundred', 'i', 'ie', 'if', 'in', 'inc', 'indeed']
stopwords += ['interest', 'into', 'is', 'it', 'its', 'itself', 'keep']
stopwords += ['last', 'latter', 'latterly', 'least', 'less', 'ltd', 'made']
stopwords += ['many', 'may', 'me', 'meanwhile', 'might', 'mill', 'mine']
stopwords += ['more', 'moreover', 'most', 'mostly', 'move', 'much']
stopwords += ['must', 'my', 'myself', 'name', 'namely', 'neither', 'never']
stopwords += ['nevertheless', 'next', 'nine', 'no', 'nobody', 'none']
stopwords += ['noone', 'nor', 'not', 'nothing', 'now', 'nowhere', 'of']
stopwords += ['off', 'often', 'on','once', 'one', 'only', 'onto', 'or']
stopwords += ['other', 'others', 'otherwise', 'our', 'ours', 'ourselves']
stopwords += ['out', 'over', 'own', 'part', 'per', 'perhaps', 'please']
stopwords += ['put', 'rather', 're', 's', 'same', 'see', 'seem', 'seemed']
stopwords += ['seeming', 'seems', 'serious', 'several', 'she', 'should']
stopwords += ['show', 'side', 'since', 'sincere', 'six', 'sixty', 'so']
stopwords += ['some', 'somehow', 'someone', 'something', 'sometime', 'time']
stopwords += ['sometimes', 'somewhere', 'still', 'such', 'system', 'take']
stopwords += ['ten', 'than', 'that', 'the', 'their', 'them', 'themselves']
stopwords += ['then', 'thence', 'there', 'thereafter', 'thereby']
stopwords += ['therefore', 'therein', 'thereupon', 'these', 'they']
stopwords += ['gonna', 'thin', 'third', 'this', 'those', 'though', 'three']
stopwords += ['three', 'through', 'throughout', 'thru', 'thus', 'to']
stopwords += ['together', 'too', 'top', 'toward', 'towards', 'twelve']
stopwords += ['twenty', 'two', 'un', 'under', 'until', 'up', 'upon']
stopwords += ['us', 'very', 'via', 'was', 'we', 'well', 'were', 'what']
stopwords += ['whatever', 'when', 'whence', 'whenever', 'where', 'day', ]
stopwords += ['whereafter', 'whereas', 'whereby', 'wherein', 'whereupon']
stopwords += ['wherever', 'whether', 'which', 'while', 'whither', 'who']
stopwords += ['whoever', 'whole', 'whom', 'whose', 'why', 'will', 'with']
stopwords += ['within', 'without', 'would', 'yet', 'you', 'your', 'WTF', 'drop', 'gme,', 'gme.']
stopwords += ['yours', 'yourself', 'yourselves', 'shit', 'dip']
stopwords += ['want', 'make', 'think', 'just', 'like', 'going', 'don\'t', 'it\'s']
stopwords += ['dont', 'can\'t', 'im', 'coming', 'right', 'it.', 'know', 'right', 'said', 'does']
stopwords += ['SEC', 'BTC', 'AMC', 'NOK', 'GME', 'BUY', 'DAY', 'TRY', 'fall', 'WSB', 'fuck', 'hype']
stopwords += ['sell', 'buy', 'jew', 'jews', 'bb', 'amd', 'push', 'send', 'hold', 'hodl', 'moon']
stopwords += ['KYC', 'PUT', 'CEO', 'STOP', 'SOLD', 'LOL', 'LOOK', 'FUD', 'pump']
stopwords = [x.upper() for x in stopwords]

In [3]:
# =========== PRINT METHODS
def print_title(title):
    print('>>>' + title)
    
def print_done():
    print('...Done!')
    

def print_posts_mentioning_coin(target, posts_by_thread_dict):
    target = target.upper()
    for thread_key in posts_by_thread_dict:    
        for post in posts_by_thread_dict[thread_key]:
            
            target_found = re.search(rf"\b{target}\b", post, re.IGNORECASE)
            
            if target_found:
                print(thread_key)
                print(post + '\n')

In [4]:
# =========== SCRAPE 4CHAN METHODS

def fetch_all_thread_urls_from_page(page_soup):
    thread_reply_links = page_soup.find_all('a', attrs={'class': 'replylink'})
    thread_urls = []

    for thread_reply_link in thread_reply_links:
        target_url = BASE_URL + thread_reply_link.attrs['href']
        
        # remove optional end path to avoid duplicates and slug change issues         
        end_path = os.path.split(target_url)[1]
        if end_path.isnumeric():
            thread_urls.append(BASE_URL + thread_reply_link.attrs['href'])
        
    return thread_urls


def fetch_all_posts_from_thread(thread_url):       
    thread_page = requests.get(thread_url)
    thread_page_soup = BeautifulSoup(thread_page.content, 'html.parser')
    thread_post_soup = thread_page_soup.find_all(attrs={'class': 'postMessage'})
    
    all_thread_posts = []

    for thread_post in thread_post_soup:
        all_thread_posts.append(thread_post.get_text())
                        
    return all_thread_posts


# TODO: make requests async to speed up time
def fetch_all_posts_on_biz():
    # fetch threads linked on the home page
    print_title('fetching home page thread links...')
    home_page = requests.get(BASE_URL)
    home_soup = BeautifulSoup(home_page.content, 'html.parser')
    all_thread_urls = fetch_all_thread_urls_from_page(home_soup)
    print_done()

    # fetch threads links from all other pages
    for i in range(2, 10):
        print_title('fetching page ' + str(i) + ' thread links...')
        page = requests.get(BASE_URL)
        soup = BeautifulSoup(page.content, 'html.parser')
        all_thread_urls = all_thread_urls + fetch_all_thread_urls_from_page(soup)
        print_done()


    # fetch every post from all threads on 4chan 
    all_posts_by_thread_dict = {}
    inc_count = 1
    total_count = len(all_thread_urls)

    for thread_url in all_thread_urls:
        print_title('fetching thread content ' + str(inc_count) + ' of ' + str(total_count) + ' : ' + thread_url)    
        specific_thread_messages = fetch_all_posts_from_thread(thread_url)
        all_posts_by_thread_dict[thread_url] = specific_thread_messages
        inc_count += 1
        print_done()
    
    return all_posts_by_thread_dict

In [5]:
# =========== DATA CLEANING AN CALCULATION 

def filter_for_potential_ticker_references(sentance):
    # TODO: can probably be removed as we are now comparing against coingeko tickers     
    wordlist = sentance.split()
    wordlist = [w for w in wordlist if len(w) < 6 and len(w) > 2] # no ticker longer than 5
    wordlist = [w for w in wordlist if w.isupper()]
    wordlist = [word.replace(".", "") for word in wordlist]
    wordlist = [w for w in wordlist if w.upper() not in stopwords] # remove stop words
    
    return wordlist

def filter_for_coins_list_on_coingecko(word_list):
    return [w for w in word_list if w not in coingeko__coin_list]

def calculate_freq_of_word(wordlist, word_freq = {}):
    for word in wordlist:
        if word in word_freq:
            word_freq[word] = word_freq[word] + 1
        else:
            word_freq[word] = 1
            
    return word_freq


def sort_freq_dict(freq_dict):
    aux = [(freq_dict[key], key) for key in freq_dict]
    aux.sort()
    aux.reverse()
    return aux


def flatten_post_by_thread_dict(content_of_threads_on_page):
    all_posts_in_dict = []
    
    for thread_key in content_of_threads_on_page:
        for post in content_of_threads_on_page[thread_key]:
            all_posts_in_dict.append(post)
            
    return all_posts_in_dict


def calculate_freq_of_coin_tickers_in_posts(list_of_posts):
    coin_freq_dict = {}

    for sentance in list_of_posts:
        potential_tickers = filter_for_potential_ticker_references(sentance)
        confirmed_tickers = filter_for_coins_list_on_coingecko(potential_tickers)
        coin_freq_dict = calculate_freq_of_word(potential_tickers, coin_freq_dict)
        
    return coin_freq_dict

In [6]:
# ========== MAIN

# fetch every post from /biz/ returning a dict with the format
# { <thread url>: ['message 1', message2]}
all_posts_by_thread_dict = fetch_all_posts_on_biz()    

# flatten messages into a 1D array of strings (the post content)
all_posts_flattened = flatten_post_by_thread_dict(all_posts_by_thread_dict)

# Find coin tickers, calculate their frequency and sort by freq
coin_freq_dict = sort_freq_dict(calculate_freq_of_coin_tickers_in_posts(all_posts_flattened))

>>>fetching home page thread links...
...Done!
>>>fetching page 2 thread links...
...Done!
>>>fetching page 3 thread links...
...Done!
>>>fetching page 4 thread links...
...Done!
>>>fetching page 5 thread links...
...Done!
>>>fetching page 6 thread links...
...Done!
>>>fetching page 7 thread links...
...Done!
>>>fetching page 8 thread links...
...Done!
>>>fetching page 9 thread links...
...Done!
>>>fetching thread content 1 of 115 : https://boards.4channel.org/biz/thread/27633251
...Done!
>>>fetching thread content 2 of 115 : https://boards.4channel.org/biz/thread/27625181
...Done!
>>>fetching thread content 3 of 115 : https://boards.4channel.org/biz/thread/27633556
...Done!
>>>fetching thread content 4 of 115 : https://boards.4channel.org/biz/thread/27633019
...Done!
>>>fetching thread content 5 of 115 : https://boards.4channel.org/biz/thread/27629083
...Done!
>>>fetching thread content 6 of 115 : https://boards.4channel.org/biz/thread/27631970
...Done!
>>>fetching thread content 7 of

...Done!
>>>fetching thread content 83 of 115 : https://boards.4channel.org/biz/thread/27632478
...Done!
>>>fetching thread content 84 of 115 : https://boards.4channel.org/biz/thread/27618912
...Done!
>>>fetching thread content 85 of 115 : https://boards.4channel.org/biz/thread/27631319
...Done!
>>>fetching thread content 86 of 115 : https://boards.4channel.org/biz/thread/27625304
...Done!
>>>fetching thread content 87 of 115 : https://boards.4channel.org/biz/thread/27614095
...Done!
>>>fetching thread content 88 of 115 : https://boards.4channel.org/biz/thread/27633251
...Done!
>>>fetching thread content 89 of 115 : https://boards.4channel.org/biz/thread/27628775
...Done!
>>>fetching thread content 90 of 115 : https://boards.4channel.org/biz/thread/27600943
...Done!
>>>fetching thread content 91 of 115 : https://boards.4channel.org/biz/thread/27633461
...Done!
>>>fetching thread content 92 of 115 : https://boards.4channel.org/biz/thread/27631319
...Done!
>>>fetching thread content 93 o

In [7]:
# Print top 10 coins
for entry in coin_freq_dict[0:10]:
    print(entry)

(31, 'EXIT')
(31, 'CRASH')
(31, 'ANON')
(17, 'AAVE')
(13, 'AQB')
(10, 'ETH')
(9, 'DOGE')
(8, 'LAIN')
(7, 'PRQ')
(6, 'NEED')


In [10]:
# Print posts about the top coin
top_rated_coin_ticker = 'AQB'
print_title('Printing messages related to most mentiond coin: ' + top_rated_coin_ticker + '\n')
print_posts_mentioning_coin(top_rated_coin_ticker, all_posts_by_thread_dict)

>>>Printing messages related to most mentiond coin: AQB

https://boards.4channel.org/biz/thread/27633251
>>27633522I see you shilling AQB a lot. Must have worked because I bought 20 shares. How many much money have you invested?

https://boards.4channel.org/biz/thread/27633251
AQB AQB AQB

https://boards.4channel.org/biz/thread/27633251
>sell AQB at loss at 11.20>reinvest in NOK>AQB keeps falling>NOK goes up>???>PROFIT

https://boards.4channel.org/biz/thread/27633251
WHY IS AQB RED LIKE SALMON AHHHHHHHHHHHHHHHHHHHHH

https://boards.4channel.org/biz/thread/27633251
Explain to me exactly what AQB has to offer our society.In fact, explain to me why I should even consider adding fish to my diet.

https://boards.4channel.org/biz/thread/27633251
Anyone shilling AQB please get a trip so I can add you to the filter

https://boards.4channel.org/biz/thread/27633251
>>27634271Aquabounty has a long history, in 1989 Researchers at Memorial University succeeded in using advanced genetics to develop 