## Yahoo API

In [12]:
import yfinance as yf
import pandas as pd

In [13]:
stock_names = ['ONCO', 'CNEY', 'TNXP', 'APLD', 'KTTA']

start_date = '2022-09-26'
end_date = '2024-09-26'

stocks = []

for stock in stock_names:
    data = yf.download(stock, start=start_date, end=end_date)

    ticker = yf.Ticker(stock)
    stock_info = ticker.info

    data['Industry'] = stock_info.get('industry', 'N/A')
    data['Sector'] = stock_info.get('sector', 'N/A')
    data['Marketcap'] = stock_info.get('marketCap', 'N/A')
    data['Fulltimeemployees'] = stock_info.get('fullTimeEmployees', 'N/A')
    data['Volume'] = stock_info.get('volume', 'N/A')
    data['PEratio'] = stock_info.get('trailingPE', 'N/A')
    data['Dividendyield'] = stock_info.get('dividendYield', 'N/A')
    data['Companyinfo'] = stock_info.get('longBusinessSummary', 'N/A')
    data['Ticker'] = stock
    data.reset_index(inplace=True)

    stocks.append(data)


df = pd.concat(stocks, ignore_index=True)
print(df.shape, df.head())

df.to_csv('stocks.csv', index=False)

[*********************100%***********************]  1 of 1 completed


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


(2515, 15)         Date       Open       High        Low      Close  Adj Close  Volume  \
0 2022-09-26  70.400002  73.199997  68.800003  70.000000  70.000000   22569   
1 2022-09-27  70.800003  76.000000  69.599998  71.199997  71.199997   22569   
2 2022-09-28  70.800003  75.160004  70.800003  73.599998  73.599998   22569   
3 2022-09-29  72.400002  72.720001  67.199997  68.800003  68.800003   22569   
4 2022-09-30  68.400002  70.400002  67.599998  68.400002  68.400002   22569   

        Industry      Sector  Marketcap  Fulltimeemployees PEratio  \
0  Biotechnology  Healthcare   44626884                 12     N/A   
1  Biotechnology  Healthcare   44626884                 12     N/A   
2  Biotechnology  Healthcare   44626884                 12     N/A   
3  Biotechnology  Healthcare   44626884                 12     N/A   
4  Biotechnology  Healthcare   44626884                 12     N/A   

  Dividendyield                                        Companyinfo Ticker  
0           N/A  

In [14]:
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Industry,Sector,Marketcap,Fulltimeemployees,PEratio,Dividendyield,Companyinfo,Ticker
0,2022-09-26,70.400002,73.199997,68.800003,70.0,70.0,22569,Biotechnology,Healthcare,44626884,12,,,"Onconetix, Inc., a biotechnology company, focu...",ONCO
1,2022-09-27,70.800003,76.0,69.599998,71.199997,71.199997,22569,Biotechnology,Healthcare,44626884,12,,,"Onconetix, Inc., a biotechnology company, focu...",ONCO
2,2022-09-28,70.800003,75.160004,70.800003,73.599998,73.599998,22569,Biotechnology,Healthcare,44626884,12,,,"Onconetix, Inc., a biotechnology company, focu...",ONCO
3,2022-09-29,72.400002,72.720001,67.199997,68.800003,68.800003,22569,Biotechnology,Healthcare,44626884,12,,,"Onconetix, Inc., a biotechnology company, focu...",ONCO
4,2022-09-30,68.400002,70.400002,67.599998,68.400002,68.400002,22569,Biotechnology,Healthcare,44626884,12,,,"Onconetix, Inc., a biotechnology company, focu...",ONCO


## Reddit API

 A 2 year range was chosen because a 3 year range leads to discontinuous stock data as APLD and ONCO have been listed for less than 3 years. However, a 3 year range might have been useful to bring the value counts of reddit posts to an acceptable range. The KTTA stock may have to be cut off from the list for more promising results due to there being less than 15 posts. 

In [15]:
import praw
import pandas as pd
from datetime import datetime
import time

In [16]:
#import keys.txt into praw.Reddit
with open('keys.txt') as f:
    keys = f.readlines()
    keys = [key.strip() for key in keys]

reddit = praw.Reddit(client_id=keys[0], client_secret=keys[1], user_agent=keys[2])


In [17]:
stock_subreddits=['wallstreetbets', 'stocks', 'stockmarket', 'investing', 'valueinvesting', 'investing_discussion', \
                'pennystocks', 'pennystockswatch', 'robinhood', 'vanturetrading', 'squeezeplays', 'stocknewsimpact', \
                'stocksandtrading', 'wallstreetbetselite', 'short_selling', 'burryedge', 'shortsqueeze', 'stockinvest', \
                'stockbreakouts', 'millennialbets', 'stocktitan', 'wallstreet', 'superstonk', 'stonks']

In [18]:
biotech_subreddits=['biotech', 'biotechplays', 'biotechnology', 'pharmaindustry', 'pharmacy', 'healthcare', 'medicine']

In [19]:
queries={'APLD':{'queries':['APLD', 'applied digital'],
                'subreddits':stock_subreddits+['technology', 'cryptocurrency', 'cryptostocks']},
        'CNEY':{'queries':['CNEY', 'CN energy'],
                'subreddits':stock_subreddits+['energy', 'renewableenergy']}, 
        'KTTA':{'queries':['KTTA', 'pasithea therapeutics'],
                'subreddits':stock_subreddits+biotech_subreddits+\
                ['multiplesclerosis', 'neurofibromatosis', 'als', 'schizophrenia', 'ketaminetherapy']},
        'ONCO':{'queries':['ONCO', 'onconetix'],
                'subreddits':stock_subreddits+biotech_subreddits+\
                ['oncology', 'menshealth', 'askmen', 'iama', 'prostatecancer']},
        'TNXP':{'queries':['TNXP', 'Tonix'],
                'subreddits':stock_subreddits+biotech_subreddits+\
                ['coronavirus', 'covid19', 'covidiots', 'covid19positive', 'covidvaccinated', \
                 'vaccine', 'zerocovidcommunity', 'migraine', 'fibromyalgia', 'tnxp']}
}

In [20]:
start_date = int(datetime.strptime('2022-09-26', "%Y-%m-%d").timestamp())
end_date = int(datetime.strptime('2024-09-25', "%Y-%m-%d").timestamp())


In [21]:
def search_reddit(queries, start_date, end_date):
    results = []  

    for key in queries.keys():
        for query in queries[key]['queries']:
            for subreddit in queries[key]['subreddits']:
                print(f"Searching for '{query}' in '{subreddit}' subreddit...")
                try:
                    submissions = reddit.subreddit(subreddit).search(
                        query, time_filter='all', sort='relevance', limit=None
                    )
                except Exception as e:
                    print(f"An error occurred: {e}")
                    continue

                for submission in submissions:
                    # check if the submission is within the date range
                    if (start_date <= submission.created_utc <= end_date):
                        # extract submission details
                        post_data = {
                            'ticker': key,
                            'subreddit': subreddit,
                            'query': query,
                            'title': submission.title,
                            'score': submission.score,
                            'id': submission.id,
                            'author': submission.author.name if submission.author else None,
                            'url': submission.url,
                            'post_date': datetime.fromtimestamp(submission.created_utc),
                            'post_upvotes': submission.score,
                            'selftext': submission.selftext,
                            'num_comments': submission.num_comments,
                        }
                        submission.comments.replace_more(limit=0)
                        comments=[]
                        for comment in submission.comments.list():
                            comment_data={
                                'comment': comment.body,
                                'comment_author': comment.author.name if comment.author else None,
                                'comment_date': datetime.fromtimestamp(comment.created_utc), 
                                'comment_upvotes': comment.score
                            }
                            comments.append(comment_data)
                        post_data['comments'] = comments

                        # Append the post data to the results list
                        results.append(post_data)
    # Convert the list of dictionaries to a DataFrame
    df = pd.DataFrame(results)
    return df

In [22]:
df=search_reddit(queries, start_date, end_date)

Searching for 'APLD' in 'wallstreetbets' subreddit...
Searching for 'APLD' in 'stocks' subreddit...
Searching for 'APLD' in 'stockmarket' subreddit...
Searching for 'APLD' in 'investing' subreddit...
Searching for 'APLD' in 'valueinvesting' subreddit...
Searching for 'APLD' in 'investing_discussion' subreddit...
Searching for 'APLD' in 'pennystocks' subreddit...
Searching for 'APLD' in 'pennystockswatch' subreddit...
Searching for 'APLD' in 'robinhood' subreddit...
Searching for 'APLD' in 'vanturetrading' subreddit...
Searching for 'APLD' in 'squeezeplays' subreddit...
Searching for 'APLD' in 'stocknewsimpact' subreddit...
Searching for 'APLD' in 'stocksandtrading' subreddit...
Searching for 'APLD' in 'wallstreetbetselite' subreddit...
Searching for 'APLD' in 'short_selling' subreddit...
Searching for 'APLD' in 'burryedge' subreddit...
Searching for 'APLD' in 'shortsqueeze' subreddit...
Searching for 'APLD' in 'stockinvest' subreddit...
Searching for 'APLD' in 'stockbreakouts' subreddi

In [23]:
df.head()

Unnamed: 0,ticker,subreddit,query,title,score,id,author,url,post_date,post_upvotes,selftext,num_comments,comments
0,APLD,wallstreetbets,APLD,Found a newspaper from 2011. Imagine all the g...,5106,1aryoah,nywarpath,https://i.redd.it/s7q97fka9vic1.jpeg,2024-02-16 03:26:31,5106,,688,[{'comment': ' **User Report**| | | | :--|:--|...
1,APLD,wallstreetbets,APLD,Micro Trading Options on QQQ!,314,1fbjdhn,Vivo__,https://i.redd.it/vd1e4xdqugnd1.jpeg,2024-09-07 22:48:32,314,The good side to options is its worst side! Yo...,153,[{'comment': ' **User Report**| | | | :--|:--|...
2,APLD,wallstreetbets,APLD,Most Anticipated Earnings Releases for the wee...,259,194hj19,ItsNotYourFault,https://i.redd.it/d2nn39ignwbc1.png,2024-01-12 00:37:59,259,,244,[{'comment': 'Not open on MLK Day? He’d want p...
3,APLD,wallstreetbets,APLD,Dow drops more than 400 points as Wall Street ...,214,1bu74lx,mediterranean2,https://www.reddit.com/r/wallstreetbets/commen...,2024-04-02 19:07:01,214,The Dow Jones Industrial Average\n fell for a ...,54,[{'comment': ' **User Report**| | | | :--|:--|...
4,APLD,stocks,APLD,(9/20) Friday's Pre-Market News & Stock Movers,18,1flatw5,bigbear0083,https://www.reddit.com/r/stocks/comments/1flat...,2024-09-20 12:30:23,18,#Good Friday morning traders and investors of ...,0,[]


In [24]:
df.shape

(816, 13)

In [25]:
df.groupby('ticker').size()

ticker
APLD    423
CNEY     94
KTTA     13
ONCO     51
TNXP    235
dtype: int64

In [27]:
df.to_csv('reddit.csv', index=False)