In [40]:
import requests
import pandas as pd
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api._errors import TranscriptsDisabled, NoTranscriptFound

# Replace with your actual API key
API_KEY = 'AIzaSyA8q8h_p-mniKAVyH0OmqQwyLYNYk6PCSI'
SEARCH_QUERY = 'crypto tokens market trends'  # Replace with your search query
MAX_RESULTS = 100  # Number of results to fetch

# List of tokens to check for in video titles
TOKENS = pd.read_csv("top_1000_crypto_tokens_names.csv")["name"].tolist()

# URL to search videos by a query
base_url = 'https://www.googleapis.com/youtube/v3/search?part=snippet&type=video&maxResults=50&q={}&key={}'.format(SEARCH_QUERY, API_KEY)

# Prepare data list for DataFrame
video_data = []
total_fetched = 0
next_page_token = None

def fetch_video_details(video_id):
    """Fetch complete video details using the YouTube videos API."""
    url = f'https://www.googleapis.com/youtube/v3/videos?part=snippet&id={video_id}&key={API_KEY}'
    response = requests.get(url)
    data = response.json()
    if 'items' in data and len(data['items']) > 0:
        snippet = data['items'][0]['snippet']
        return snippet.get('description', '')  # Return full description
    return ''

def fetch_transcript(video_id):
    """Fetch transcript for a video."""
    try:
        # Get the transcript using the YouTubeTranscriptApi
        transcript_list = YouTubeTranscriptApi.get_transcript(video_id)
        # Combine all text segments into one
        transcript = " ".join([segment['text'] for segment in transcript_list])
        return transcript
    except (TranscriptsDisabled, NoTranscriptFound):
        return "Transcript not available"

while total_fetched < MAX_RESULTS:
    url = base_url
    if next_page_token:
        url += f'&pageToken={next_page_token}'
    
    # Make a GET request to fetch the data
    response = requests.get(url)
    data = response.json()
    
    # Extract required information from each video item
    for item in data.get('items', []):
        # Check if the item is a video and contains 'videoId'
        if item['id']['kind'] == 'youtube#video' and 'videoId' in item['id']:
            video_title = item['snippet']['title']
            video_id = item['id']['videoId']
            video_url = f'https://www.youtube.com/watch?v={video_id}'
            published_time = item['snippet']['publishedAt']  # Extract published time

            # Fetch full video description
            video_description = fetch_video_details(video_id)

            if video_description is None:
                video_description = ''
            if video_description:
                video_description = video_description.replace('\n', ' ')
            # Fetch video transcript
            video_transcript = fetch_transcript(video_id)

            if video_transcript is None:
                video_transcript = ''
            if video_transcript:
                video_transcript = video_transcript.replace('\n', ' ')

            # Find tokens mentioned in either the title, description, or transcript
            tags = [token for token in TOKENS if token.lower() in (video_title + " " + " " + video_transcript).lower().split()]

            # Extract thumbnail URLs
            thumbnails = item['snippet']['thumbnails']
            thumbnail_url = thumbnails.get('high', thumbnails.get('medium', thumbnails.get('default')))['url']

            # Append data to list
            video_data.append({
                'title': video_title,
                'description': video_description,
                'transcript': video_transcript,
                'url': video_url,
                'tags': tags,
                'thumbnail_url': thumbnail_url,
                'published_time': published_time
            })
            total_fetched += 1
            if total_fetched >= MAX_RESULTS:
                break
    
    # Check if there is a next page token
    next_page_token = data.get('nextPageToken')
    if not next_page_token:
        break

# Create DataFrame
df = pd.DataFrame(video_data)
df.to_csv('./uda/youtube_data_with_transcripts.csv', index=False)

# Output the DataFrame
print("Data saved to 'youtube_data_with_transcripts_10.csv'. Here's the DataFrame:")
print(df)


Data saved to 'youtube_data_with_transcripts_10.csv'. Here's the DataFrame:
                                                 title  \
0    Why is Market Cap IMPORTANT In Crypto? (BEST E...   
1               How To BEST Read Cryptocurrency Charts   
2    Coins VS Tokens: What&#39;s the Difference? | ...   
3              How to Trade Crypto Coins for Beginners   
4    What Crypto to Buy? 🚀 Crypto Market Last Big W...   
..                                                 ...   
572  Titan Capital Markets | TTT2 token Price Predi...   
573  Hamster Kombat HOLD or SELL ?? | Hamster Komba...   
574  Cryptopia: Bitcoin, Blockchains, and the Futur...   
575  Why Tits Are Up... 🤩 We Love Tits Crypto Token...   
576  🟢 What is TITS Coin 🚀 We Love TITS Crypto Toke...   

                                           description  \
0    In general, market cap refers to market capita...   
1    It's important to know how to read cryptocurre...   
2    Coins and Tokens may seem similar on the surfa..

Youtube data with title, description, published date, tags based on the tokens discussed and also the transcript of the videos collected and stored in a file.


Catch 

In [4]:
import praw
import pandas as pd

# Replace with your Reddit API credentials
client_id = 'EWTyRcDRpOyoO8u2Ebsc9Q'
client_secret = 'FdPffPBPfOfCTLtasi7nwQVn0Ty3RA'
user_agent = 'PsychologicalOwl18'

# Initialize Reddit instance
reddit = praw.Reddit(client_id=client_id, client_secret=client_secret, user_agent=user_agent)

# Load the list of crypto tokens
crypto_tokens = pd.read_csv("top_1000_crypto_tokens_names.csv")["name"].tolist()

# Define subreddits to search
subreddits = "cryptomarkets+tokens+crypto+Defi+cryptotrending"

# Define the number of posts to fetch
num_posts = 10

# Fetch posts
posts_data = []
for post in reddit.subreddit(subreddits).hot(limit=num_posts):
    content = post.selftext if post.is_self else None
    if content:
        content = " ".join(content.split())
    
    # Fetch replies (comments) for the post
    post.comments.replace_more(limit=None)  # Ensure all comments are fetched
    replies = []
    for comment in post.comments.list():  # Flatten nested comments
        if comment.body:
            replies.append(comment.body.replace("\n", " ").strip())
    
    # Combine all text (title, content, replies) for tag matching
    combined_text = f"{post.title} {content or ''} {' '.join(replies)}"
    
    # Find tags mentioned in title, content, or replies
    tags = [token for token in crypto_tokens if token.lower() in combined_text.lower()]
    
    posts_data.append({
        "Title": post.title,
        "URL": post.url,
        "Thumbnail": post.thumbnail if post.thumbnail.startswith('http') else None,
        "Score": post.score,
        "Upvote Ratio": post.upvote_ratio,
        "Comments Count": post.num_comments,
        "Author": str(post.author),
        "Created At": post.created_utc,
        "Tags": ", ".join(tags) if tags else None,
        "Content": content,
        "Replies": " || ".join(replies) if replies else None  # Combine replies into a single string
    })

# Create a DataFrame and save to CSV
df = pd.DataFrame(posts_data)
df.to_csv("./uda/reddit_crypto_posts_with_replies_and_tags.csv", index=False)
print(df)

print("CSV file 'reddit_crypto_posts_with_replies_and_tags.csv' has been created successfully.")


                                               Title  \
0  U.S. officials urge Americans to use encrypted...   
1  What alt/memecoins are you guys expecting to g...   
2                      I’m afraid of Defi. Should I?   
3  How OriginTrail TRAC Uses AI to Build a Better...   
4  I need your guys opinion about first time inve...   
5  Ctrl Launches All-in-One Browser Wallet, Simpl...   
6                                   Portfolio Review   
7               How do I use a DEX like pancakeswap?   
8                                  where do i start    
9                                Cosmos or Polkadot?   

                                                 URL  \
0  https://www.nbcnews.com/tech/security/us-offic...   
1  https://www.reddit.com/r/CryptoMarkets/comment...   
2  https://www.reddit.com/r/defi/comments/1h5xtt4...   
3  https://www.reddit.com/r/CryptoMarkets/comment...   
4  https://www.reddit.com/r/CryptoMarkets/comment...   
5  https://www.google.com/amp/s/bitcoinist.com/

We collect news articles below related to crypto and DeFi market trends.

In [8]:
import requests
from datetime import datetime, timedelta
import pandas as pd
import json

yesterday = (datetime.now() - timedelta(days=15)).strftime('%Y-%m-%d')

API_KEY = '3e0268e173224f6eb66573622eceaa92'  
url = ('https://newsapi.org/v2/everything?'
       'q=crypto&'
       'from={}&'
       'sortBy=popularity&'
       'apiKey={}'.format(yesterday, API_KEY))

data = requests.get(url)
data = data.json()

TOKENS = crypto_tokens
articles = data.get('articles', [])

# Extract required fields
titles, thumbnails, descriptions, urls, published_times, tags = [], [], [], [], [], []
# for article in articles:
#     titles.append(article['title'])
#     thumbnails.append(article['urlToImage'])
#     descriptions.append(article['description'])
#     urls.append(article['url'])
#     published_times.append(article['publishedAt'])
#     tags = [token for token in TOKENS if token.lower() in (article['title'] + " " + article['description']).lower()]

# # Create DataFrame
# df = pd.DataFrame({
#     "title": titles,
#     "thumbnail": thumbnails,
#     "description": descriptions,
#     "url": urls,
#     "published_time": published_times,
#     "tags": tags
# })

df = pd.DataFrame({
    'title': [article.get('title', 'N/A') for article in articles],
    'thumbnail': [article.get('urlToImage', 'N/A') for article in articles],
    'description': [article.get('description', 'N/A') for article in articles],
    'publishedAt': [article.get('publishedAt', 'N/A') for article in articles],
    'tags': [
        [token for token in TOKENS if token.lower() in (article.get('title', '') + " " + article.get('description', '')).lower()]
        for article in articles
    ]
})
# Generate tags from descriptions
# def generate_tags(descriptions):
#     vectorizer = CountVectorizer(max_features=5, stop_words='english')
#     X = vectorizer.fit_transform(descriptions)
#     tags_list = []
#     for row in X.toarray():
#         tags = [vectorizer.get_feature_names_out()[i] for i, val in enumerate(row) if val > 0]
#         tags_list.append(", ".join(tags))
#     return tags_list

# df['tags'] = generate_tags(df['description'])

# Display the resulting DataFrame
print(df)

df.to_csv('./uda/news_articles.csv', index=False)



                                                title  \
0   Meta Finally Breaks Its Silence on Pig Butchering   
1   The Crypto Industry Is Helping Trump Pick SEC ...   
2   Teen streamer dumped a meme coin for $30K, the...   
3   Microsoft is the mystery AI company licensing ...   
4   NBA settles with WBD as it prepares to stream ...   
..                                                ...   
95  MicroStrategy Is Halfway Through Share Sales t...   
96  After Nvidia earnings, is the AI trend still i...   
97  Bitcoin nears $100,000 as investors bet on cry...   
98  US SEC chair Gensler to step down upon Trump t...   
99                    BITCOIN FLIRTS WITH $100,000...   

                                            thumbnail  \
0   https://media.wired.com/photos/673e453447963b9...   
1   https://media.wired.com/photos/6745db10e149b18...   
2   https://cdn.vox-cdn.com/thumbor/ejx8--qQQFRjww...   
3   https://cdn.vox-cdn.com/thumbor/5X0IGMLrTUsRny...   
4   https://cdn.vox-cdn.com/th

Get current market data

In [9]:
import requests
import pandas as pd
import json

# Set Up API Key and Endpoint

# Define the API key and endpoint URL for the CoinMarketCap API
api_key = '1f96b26e-8764-4c1a-8a01-fb1046ceb1b6'  # Replace with your actual CoinMarketCap API key
url = 'https://pro-api.coinmarketcap.com/v1/cryptocurrency/listings/latest'

# Set up the headers for the API request
headers = {
    'Accepts': 'application/json',
    'X-CMC_PRO_API_KEY': api_key,
}

# Define the parameters for the API request
parameters = {
    'start': '1',
    'limit': '1000',
    'convert': 'USD'
}


# Fetch Data from CoinMarketCap API

# Make the API request
response = requests.get(url, headers=headers, params=parameters)

# Check if the request was successful
if response.status_code == 200:
    # Parse the JSON response
    data = response.json()

    #print(data)
    
    # Extract the relevant data
    crypto_data = data['data']
    
    # Convert the data to a pandas DataFrame
    df = pd.DataFrame(crypto_data)
    #print(df.head())
    
    # Save the DataFrame to a CSV file
    df.to_csv('top_1000_crypto_tokens.csv', index=False)
else:
    print(f"Failed to fetch data: {response.status_code}")

# Process and Extract Relevant Data

# Extract relevant data such as token name, symbol, market cap, price, volume, etc.
processed_data = []
for token in crypto_data:
    print(token)
    token_info = {
        'name': token['name'],
        'symbol': token['symbol'],
        'market_cap': token['quote']['USD']['market_cap'],
        'price': token['quote']['USD']['price'],
        'volume_24h': token['quote']['USD']['volume_24h'],
        'percent_change_1h': token['quote']['USD']['percent_change_1h'],
        'percent_change_24h': token['quote']['USD']['percent_change_24h'],
        'percent_change_7d': token['quote']['USD']['percent_change_7d']
    }
    processed_data.append(token_info)

# Convert the processed data to a pandas DataFrame
df_processed = pd.DataFrame(processed_data)

# Save the processed DataFrame to a CSV file
df_processed.to_csv('./uda/top_1000_crypto_tokens_processed.csv', index=False)

df_processed['name'].to_csv('./uda/top_1000_crypto_tokens_names.csv', index=False)

{'id': 1, 'name': 'Bitcoin', 'symbol': 'BTC', 'slug': 'bitcoin', 'num_market_pairs': 11818, 'date_added': '2010-07-13T00:00:00.000Z', 'tags': ['mineable', 'pow', 'sha-256', 'store-of-value', 'state-channel', 'coinbase-ventures-portfolio', 'three-arrows-capital-portfolio', 'polychain-capital-portfolio', 'binance-labs-portfolio', 'blockchain-capital-portfolio', 'boostvc-portfolio', 'cms-holdings-portfolio', 'dcg-portfolio', 'dragonfly-capital-portfolio', 'electric-capital-portfolio', 'fabric-ventures-portfolio', 'framework-ventures-portfolio', 'galaxy-digital-portfolio', 'huobi-capital-portfolio', 'alameda-research-portfolio', 'a16z-portfolio', '1confirmation-portfolio', 'winklevoss-capital-portfolio', 'usv-portfolio', 'placeholder-ventures-portfolio', 'pantera-capital-portfolio', 'multicoin-capital-portfolio', 'paradigm-portfolio', 'bitcoin-ecosystem', 'ftx-bankruptcy-estate'], 'max_supply': 21000000, 'circulating_supply': 19790568, 'total_supply': 19790568, 'infinite_supply': False, 'p

We find sentiment for each token and give weightages to each tokens that is discussed on the video, article and reddit post

News Sentiments Analysis

In [37]:
import pandas as pd
from collections import Counter
import ast
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load the news articles data
news_df = pd.read_csv("./uda/news_articles.csv")
news_df['tags'] = news_df['tags'].apply(ast.literal_eval)
# Load the list of crypto tokens
crypto_tokens = pd.read_csv("top_1000_crypto_tokens_names.csv")["name"].tolist()

# Load FinBERT model and tokenizer manually to avoid framework detection issues
model_name = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Initialize sentiment analysis pipeline using PyTorch explicitly
sentiment_analyzer = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, framework="pt", device=0 if torch.cuda.is_available() else -1)

# Function to compute sentiment for each token
def calculate_sentiment(content, tokens):
    sentiments = {}
    for token in tokens:
        # Extract sentences containing the token
        sentences = [sentence for sentence in content.split('.') if token.lower() in sentence.lower()]
        # Analyze sentiment for each sentence
        token_sentiments = []
        for sentence in sentences:
            result = sentiment_analyzer(sentence[:512])  # Truncate to 512 tokens for FinBERT
            token_sentiments.append(result[0]['label'])
        
        # Calculate sentiment summary for the token
        sentiment_score = {
            "positive": token_sentiments.count("positive"),
            "negative": token_sentiments.count("negative"),
            "neutral": token_sentiments.count("neutral")
        }
        sentiments[token.lower()] = sentiment_score  # Ensure token names are lowercased for consistency
    return sentiments

# Function to compute weightage for each token
def calculate_weightage(content, tokens):
    token_counts = Counter(token.lower() for token in tokens if token.lower() in content.lower())
    total_count = sum(token_counts.values())
    weightage = {token: count / total_count for token, count in token_counts.items()} if total_count > 0 else {}
    return weightage

# Apply the functions to the News DataFrame
  # Ensure tags are properly formatted as lists
news_df['sentiment_per_token'] = news_df.apply(lambda row: calculate_sentiment(row['title'] + ' ' + (row['description'] if pd.notna(row['description']) else ''), row['tags']), axis=1)
news_df['token_weightage'] = news_df.apply(lambda row: calculate_weightage(row['title'] + ' ' + (row['description'] if pd.notna(row['description']) else ''), row['tags']), axis=1)

# Save the updated DataFrame to a new CSV file
news_df.to_csv('./uda/news_data_with_sentiment.csv', index=False)

# Display the resulting DataFrame
print(news_df[['title', 'thumbnail', 'description', 'publishedAt', 'tags', 'sentiment_per_token', 'token_weightage']])


  return torch.load(checkpoint_file, map_location="cpu")


                                                title  \
0   Meta Finally Breaks Its Silence on Pig Butchering   
1   The Crypto Industry Is Helping Trump Pick SEC ...   
2   Teen streamer dumped a meme coin for $30K, the...   
3   Microsoft is the mystery AI company licensing ...   
4   NBA settles with WBD as it prepares to stream ...   
..                                                ...   
95  MicroStrategy Is Halfway Through Share Sales t...   
96  After Nvidia earnings, is the AI trend still i...   
97  Bitcoin nears $100,000 as investors bet on cry...   
98  US SEC chair Gensler to step down upon Trump t...   
99                    BITCOIN FLIRTS WITH $100,000...   

                                            thumbnail  \
0   https://media.wired.com/photos/673e453447963b9...   
1   https://media.wired.com/photos/6745db10e149b18...   
2   https://cdn.vox-cdn.com/thumbor/ejx8--qQQFRjww...   
3   https://cdn.vox-cdn.com/thumbor/5X0IGMLrTUsRny...   
4   https://cdn.vox-cdn.com/th

In [36]:
import pandas as pd
from collections import Counter
import ast
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load the scraped Reddit data
reddit_df = pd.read_csv("./uda/reddit_crypto_posts.csv")

# Load the list of crypto tokens
crypto_tokens = pd.read_csv("top_1000_crypto_tokens_names.csv")["name"].tolist()

# Load FinBERT model and tokenizer manually to avoid framework detection issues
model_name = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Initialize sentiment analysis pipeline using PyTorch explicitly
sentiment_analyzer = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, framework="pt", device=0 if torch.cuda.is_available() else -1)

# Function to compute sentiment for each token
def calculate_sentiment(content, tokens):
    sentiments = {}
    for token in tokens:
        # Extract sentences containing the token
        sentences = [sentence for sentence in content.split('.') if token.lower() in sentence.lower()]
        # Analyze sentiment for each sentence
        token_sentiments = []
        for sentence in sentences:
            result = sentiment_analyzer(sentence[:512])  # Truncate to 512 tokens for FinBERT
            token_sentiments.append(result[0]['label'])
        
        # Calculate sentiment summary for the token
        sentiment_score = {
            "positive": token_sentiments.count("positive"),
            "negative": token_sentiments.count("negative"),
            "neutral": token_sentiments.count("neutral")
        }
        sentiments[token] = sentiment_score
    return sentiments

# Function to compute weightage for each token
def calculate_weightage(content, tokens):
    token_counts = Counter(token.lower() for token in tokens if token.lower() in content.lower())
    total_count = sum(token_counts.values())
    weightage = {token: count / total_count for token, count in token_counts.items()} if total_count > 0 else {}
    return weightage

# Apply the functions to the Reddit DataFrame
reddit_df['tags'] = reddit_df['Tags'].apply(lambda x: x.split(', ') if pd.notna(x) else [])
reddit_df['sentiment_per_token'] = reddit_df.apply(lambda row: calculate_sentiment(row['Title'] + ' ' + (row['Content'] if pd.notna(row['Content']) else ''), row['tags']), axis=1)
reddit_df['token_weightage'] = reddit_df.apply(lambda row: calculate_weightage(row['Title'] + ' ' + (row['Content'] if pd.notna(row['Content']) else ''), row['tags']), axis=1)

# Save the updated DataFrame to a new CSV file
reddit_df.to_csv('./uda/reddit_data_with_sentiment.csv', index=False)

# Display the resulting DataFrame
print(reddit_df[['Title', 'URL', 'Thumbnail', 'Score', 'Upvote Ratio', 'Comments Count', 'Author', 'Created At', 'Tags', 'Content', 'sentiment_per_token', 'token_weightage']])

  return torch.load(checkpoint_file, map_location="cpu")


                                                 Title  \
0    If you won $10,000 today and we're going to in...   
1    Serenity and IDEMIA Unveil Biometric sAxess Ca...   
2        Weekly cryptography community and meta thread   
3                          Can you explain XRP for me?   
4                     Is now a good time to be buying?   
..                                                 ...   
319                    Brute Forcing Valid Signatures?   
320        Does this encryption method work correctly?   
321  2024: Cryptographic Right Answers: Post Quantu...   
322      Weekly cryptography community and meta thread   
323            Advice getting papers published to IACR   

                                                   URL  \
0    https://www.reddit.com/r/CryptoMarkets/comment...   
1    https://www.crypto-news.net/serenity-and-idemi...   
2    https://www.reddit.com/r/crypto/comments/1h4sg...   
3    https://www.reddit.com/r/CryptoMarkets/comment...   
4    https://

In [42]:
import pandas as pd
from collections import Counter
import ast
from transformers import pipeline
df = pd.read_csv("./uda/youtube_data_with_transcripts.csv")

df['tags'] = df['tags'].apply(ast.literal_eval)
    
# Function to compute sentiment for each token
# def calculate_sentiment(transcript, tokens):
#     sentiments = {}
#     for token in tokens:
#         # Extract sentences containing the token
#         sentences = [sentence for sentence in transcript.split('.') if token.lower() in sentence.lower()]
#         # Calculate sentiment polarity for those sentences
#         token_sentiment = sum(TextBlob(sentence).sentiment.polarity for sentence in sentences) / (len(sentences) or 1)
#         sentiments[token] = token_sentiment
#     return sentiments

# # Function to compute weightage for each token
# def calculate_weightage(transcript, tokens):
#     token_counts = Counter(token.lower() for token in tokens if token.lower() in transcript.lower())
#     total_count = sum(token_counts.values())
#     weightage = {token: count / total_count for token, count in token_counts.items()} if total_count > 0 else {}
#     return weightage

# # Apply the functions to the DataFrame
# df['sentiment_per_token'] = df.apply(lambda row: calculate_sentiment(row['transcript'], row['tags']), axis=1)
# df['token_weightage'] = df.apply(lambda row: calculate_weightage(row['transcript'], row['tags']), axis=1)

# Load tokenizer and model manually
# Load model directly
# Load pre-trained FinBERT sentiment analysis pipeline
sentiment_analyzer = pipeline("sentiment-analysis", model="ProsusAI/finbert")

# Function to compute sentiment for each token
def calculate_sentiment(transcript, tokens):
    sentiments = {}
    for token in tokens:
        # Extract sentences containing the token
        sentences = [sentence for sentence in transcript.split('.') if token.lower() in sentence.lower()]
        # Analyze sentiment for each sentence
        token_sentiments = []
        for sentence in sentences:
            result = sentiment_analyzer(sentence[:512])  # Truncate to 512 tokens for FinBERT
            token_sentiments.append(result[0]['label'])
        
        # Calculate sentiment summary for the token
        sentiment_score = {
            "positive": token_sentiments.count("positive"),
            "negative": token_sentiments.count("negative"),
            "neutral": token_sentiments.count("neutral")
        }
        sentiments[token] = sentiment_score
    return sentiments

# Function to compute weightage for each token
def calculate_weightage(transcript, tokens):
    token_counts = Counter(token.lower() for token in tokens if token.lower() in transcript.lower())
    total_count = sum(token_counts.values())
    weightage = {token: count / total_count for token, count in token_counts.items()} if total_count > 0 else {}
    return weightage

# Apply the functions to the DataFrame
df['sentiment_per_token'] = df.apply(lambda row: calculate_sentiment(row['transcript'], row['tags']), axis=1)
df['token_weightage'] = df.apply(lambda row: calculate_weightage(row['transcript'], row['tags']), axis=1)

df.to_csv('./uda/youtube_data_with_sentiment.csv', index=False)
print(df)


  return torch.load(checkpoint_file, map_location="cpu")


                                                 title  \
0    Why is Market Cap IMPORTANT In Crypto? (BEST E...   
1               How To BEST Read Cryptocurrency Charts   
2    Coins VS Tokens: What&#39;s the Difference? | ...   
3              How to Trade Crypto Coins for Beginners   
4    What Crypto to Buy? 🚀 Crypto Market Last Big W...   
..                                                 ...   
572  Titan Capital Markets | TTT2 token Price Predi...   
573  Hamster Kombat HOLD or SELL ?? | Hamster Komba...   
574  Cryptopia: Bitcoin, Blockchains, and the Futur...   
575  Why Tits Are Up... 🤩 We Love Tits Crypto Token...   
576  🟢 What is TITS Coin 🚀 We Love TITS Crypto Toke...   

                                           description  \
0    In general, market cap refers to market capita...   
1    It's important to know how to read cryptocurre...   
2    Coins and Tokens may seem similar on the surfa...   
3    Cryptocurrency Trading and Investing guide for...   
4    🔐 *Ledge

Load customer profile and give weightages to each token on their portfolio based on the amount invested on the token to the total amount, profits or loss on each token and ranking of recency.

In [51]:
import datetime as datetime
import pandas as pd
customer_portfolio = pd.read_csv("./uda/crypto_portfolio.csv")
customer_portfolio['total_investment_per_token'] = customer_portfolio['purchase_price'] * customer_portfolio['quantity']
customer_portfolio['token_weightage'] = customer_portfolio['total_investment_per_token'].apply(lambda x: x  / customer_portfolio['total_investment_per_token'].sum())
customer_portfolio['profit_loss'] = (customer_portfolio['current_price'] - customer_portfolio['purchase_price']) * customer_portfolio['quantity']
customer_portfolio['purchase_date'] = pd.to_datetime(customer_portfolio['purchase_date'])
customer_portfolio['recency'] = customer_portfolio['purchase_date'].rank(method='min', ascending=False)
customer_portfolio['final_rating'] = (customer_portfolio['token_weightage'] * 0.5 + (1/customer_portfolio['recency']) * 0.4)

top_3_tokens_total = customer_portfolio.sort_values('final_rating', ascending=False).head(3)
top_3_tokens_recency = customer_portfolio.sort_values('recency', ascending=True).head(3)
top_3_tokens_weightage = customer_portfolio.sort_values('token_weightage', ascending=False).head(3)

top_3_token = top_3_tokens_total['token_name'].tolist()

print(top_3_token)

['Dogecoin', 'UniSwap', 'Ethereum']


  customer_portfolio['purchase_date'] = pd.to_datetime(customer_portfolio['purchase_date'])


Find top 3 tokens based on each for the customer, i.e weightage of portfolio, profit and recency and intersect it to a final list

In [46]:
#Recommendor

import pandas as pd
import ast

# Example DataFrame

yt_st_ratings = pd.read_csv("./uda/youtube_data_with_sentiment.csv")
reddit_st_ratings = pd.read_csv("./uda/reddit_data_with_sentiment.csv")
news_st_ratings = pd.read_csv("./uda/news_data_with_sentiment.csv")

# Convert stringified dictionaries to actual dictionaries
yt_st_ratings['sentiment_per_token'] = yt_st_ratings['sentiment_per_token'].apply(ast.literal_eval)
yt_st_ratings['token_weightage'] = yt_st_ratings['token_weightage'].apply(ast.literal_eval)
reddit_st_ratings['sentiment_per_token'] = reddit_st_ratings['sentiment_per_token'].apply(ast.literal_eval)
reddit_st_ratings['token_weightage'] = reddit_st_ratings['token_weightage'].apply(ast.literal_eval)
news_st_ratings['sentiment_per_token'] = news_st_ratings['sentiment_per_token'].apply(ast.literal_eval)
news_st_ratings['token_weightage'] = news_st_ratings['token_weightage'].apply(ast.literal_eval)

# Function to calculate sentiment scores
def calculate_sentiment_scores(row):
    sentiment_scores = {}
    for token, sentiment in row['sentiment_per_token'].items():
        # Convert token to lowercase for consistent matching with weightage
        token_lower = token.lower()
        # Calculate the sentiment score
        if token_lower in row['token_weightage']:
            weight = row['token_weightage'][token_lower]
            score = (sentiment['positive'] * 1 +
                     sentiment['negative'] * -1 +
                     sentiment['neutral'] * 0.5) * weight
            sentiment_scores[token] = score

    if sentiment_scores:
        most_positive_token = max(sentiment_scores, key=sentiment_scores.get)
        most_negative_token = min(sentiment_scores, key=sentiment_scores.get)
        
        return pd.Series({
            "most_positive_token": most_positive_token,
            "positive_score": sentiment_scores[most_positive_token],
            "most_negative_token": most_negative_token,
            "negative_score": sentiment_scores[most_negative_token]
        })
    else:
        return pd.Series({
            "most_positive_token": None,
            "positive_score": None,
            "most_negative_token": None,
            "negative_score": None
        })

# Apply the function to calculate the required columns
yt_st_ratings = yt_st_ratings.join(yt_st_ratings.apply(calculate_sentiment_scores, axis=1))
reddit_st_ratings = reddit_st_ratings.join(reddit_st_ratings.apply(calculate_sentiment_scores, axis=1))
news_st_ratings = news_st_ratings.join(news_st_ratings.apply(calculate_sentiment_scores, axis=1))

yt_st_ratings.to_csv('./uda/youtube_data_with_sentiment_scores.csv', index=False)
reddit_st_ratings.to_csv('./uda/reddit_data_with_sentiment_scores.csv', index=False)
news_st_ratings.to_csv('./uda/news_data_with_sentiment_scores.csv', index=False)
# Display the updated DataFrame
print(yt_st_ratings)

                                                 title  \
0    Why is Market Cap IMPORTANT In Crypto? (BEST E...   
1               How To BEST Read Cryptocurrency Charts   
2    Coins VS Tokens: What&#39;s the Difference? | ...   
3              How to Trade Crypto Coins for Beginners   
4    What Crypto to Buy? 🚀 Crypto Market Last Big W...   
..                                                 ...   
572  Titan Capital Markets | TTT2 token Price Predi...   
573  Hamster Kombat HOLD or SELL ?? | Hamster Komba...   
574  Cryptopia: Bitcoin, Blockchains, and the Futur...   
575  Why Tits Are Up... 🤩 We Love Tits Crypto Token...   
576  🟢 What is TITS Coin 🚀 We Love TITS Crypto Toke...   

                                           description  \
0    In general, market cap refers to market capita...   
1    It's important to know how to read cryptocurre...   
2    Coins and Tokens may seem similar on the surfa...   
3    Cryptocurrency Trading and Investing guide for...   
4    🔐 *Ledge

Recommend Top 3 news articles, reddit posts and youtube videos based on the weightage of tokens calculated

Add a bias braker by recommending neagtive sentiment of a token which has more weightage in the user's portfolio

In [61]:
tokens_of_interest = top_3_token
# Filter DataFrame for rows where the most_positive_token is in the list of interest

filtered_df_youtube = yt_st_ratings[yt_st_ratings['most_positive_token'].isin(tokens_of_interest)]
filtered_df_reddit = reddit_st_ratings[reddit_st_ratings['most_positive_token'].isin(tokens_of_interest)]
filtered_df_news = news_st_ratings[news_st_ratings['most_positive_token'].isin(tokens_of_interest)]
# Sort the filtered DataFrame by positive_score in descending order
top_videos = filtered_df_youtube.drop_duplicates(['title']).sort_values(by="positive_score", ascending=False).head(3)
top_videos = top_videos[['url','title']]

top_reddit_posts = filtered_df_reddit.drop_duplicates(['Title']).sort_values(by="positive_score", ascending=False).head(3)
top_reddit_posts = top_reddit_posts[['Title','URL']]

top_news_articles = filtered_df_news.drop_duplicates(['title']).sort_values(by="positive_score", ascending=False).head(3)
top_news_articles = filtered_df_news[['title']]


print(top_reddit_posts)
print(top_videos)
print(top_news_articles)


                                       Title  \
110  Week in Ethereum News November 30, 2024   
153             Latest Week in Ethereum News   
194  Week in Ethereum News November 16, 2024   

                                                   URL  
110  https://weekinethereumnews.com/week-in-ethereu...  
153  https://weekinethereumnews.com/week-in-ethereu...  
194  https://weekinethereumnews.com/week-in-ethereu...  
                                             url  \
524  https://www.youtube.com/watch?v=rSbbtnKoKzM   
249  https://www.youtube.com/watch?v=CTQfhyxrHXc   
236  https://www.youtube.com/watch?v=0byXt54nL3A   

                                                 title  
524     Whales sell Andy (ETH) ☢ Crypto Token Analysis  
249  Why Ethereum Classic is up 🤩 ETC Crypto Token ...  
236  Polkadot - An Opportunity? 🤔 DOT Crypto Token ...  
Empty DataFrame
Columns: [title]
Index: []


In [65]:
#Bias Breaker
tokens_of_interest = top_3_token
# Filter DataFrame for rows where the most_positive_token is in the list of interest

filtered_df_youtube = yt_st_ratings[yt_st_ratings['most_negative_token'].isin(tokens_of_interest)]
filtered_df_reddit = reddit_st_ratings[reddit_st_ratings['most_negative_token'].isin(tokens_of_interest)]
filtered_df_news = news_st_ratings[news_st_ratings['most_negative_token'].isin(tokens_of_interest)]
# Sort the filtered DataFrame by positive_score in descending order
top_videos = filtered_df_youtube.drop_duplicates(['title']).sort_values(by="negative_score", ascending=True).head(3)
top_videos = top_videos[['url','title']]

top_reddit_posts = filtered_df_reddit.drop_duplicates(['Title']).sort_values(by="negative_score", ascending=True).head(3)
top_reddit_posts = top_reddit_posts[['Title','URL']]

top_news_articles = filtered_df_news.drop_duplicates(['title']).sort_values(by="negative_score", ascending=True).head(3)
top_news_articles = filtered_df_news[['title']]

print("Reddit Posts \n")
print(top_reddit_posts)
print("Youtube Videos \n")
print(top_videos)
print("News Articles \n")
print(top_news_articles)


Reddit Posts 

                                                 Title  \
238  When using Groth16, is it really needed to cha...   
61                         What happened to Ethereum ?   
117            Defillama: misusage of the TVL notion?    

                                                   URL  
238  https://www.reddit.com/r/crypto/comments/1fz6m...  
61   https://www.reddit.com/r/CryptoMarkets/comment...  
117  https://www.reddit.com/r/defi/comments/1h2izgr...  
Youtube Videos 

                                             url  \
169  https://www.youtube.com/watch?v=t0VkrlobaL8   
373  https://www.youtube.com/watch?v=-DiJ66Wbj_Y   
419  https://www.youtube.com/watch?v=-vH4K1GePd8   

                                                 title  
169  The Polygon Matic Collapse ☢ POL Crypto Token ...  
373  SPX6900 Whales Sell? ☢️ SPX 6900 Crypto Token ...  
419  Buy The Simon&#39;s Cat Crash? 💀 Simons Cat Cr...  
News Articles 

Empty DataFrame
Columns: [title]
Index: []


In [72]:
#Network Bias Breaker

tokens_of_interest = top_3_token
bias_dict = {"Ethereum": "Bitcoin", "Dogecoin": "Pepe"}

def replace_tokens(token, replacements):
    return replacements.get(token, token)
# Filter DataFrame for rows where the most_positive_token is in the list of interest

filtered_df_youtube = yt_st_ratings
filtered_df_reddit = reddit_st_ratings
filtered_df_news = news_st_ratings

filtered_df_youtube['most_positive_token'] = yt_st_ratings['most_positive_token'].apply(lambda x: replace_tokens(x, bias_dict))
filtered_df_reddit['most_positive_token'] = reddit_st_ratings['most_positive_token'].apply(lambda x: replace_tokens(x, bias_dict))
filtered_df_news['most_positive_token'] = news_st_ratings['most_positive_token'].apply(lambda x: replace_tokens(x, bias_dict))

filtered_df_youtube = filtered_df_youtube[filtered_df_youtube['most_positive_token'].isin(bias_dict.values())]
filtered_df_reddit = filtered_df_reddit[filtered_df_reddit['most_positive_token'].isin(bias_dict.values())]
filtered_df_news = filtered_df_news[filtered_df_news['most_positive_token'].isin(bias_dict.values())]
# Sort the filtered DataFrame by positive_score in descending order
top_videos = filtered_df_youtube.drop_duplicates(['title']).sort_values(by="positive_score", ascending=False).head(3)
top_videos = top_videos[['url','title']]

top_reddit_posts = filtered_df_reddit.drop_duplicates(['Title']).sort_values(by="positive_score", ascending=False).head(3)
top_reddit_posts = top_reddit_posts[['Title','URL']]

top_news_articles = filtered_df_news.drop_duplicates(['title']).sort_values(by="positive_score", ascending=False).head(3)
top_news_articles = filtered_df_news[['title']]


print(top_reddit_posts)
print(top_videos)
print(top_news_articles)

                                                 Title  \
195     Bitcoin Account Abstraction (AA) on Supernova.   
32                                        Bitcoin Apps   
78   If you bought $100 worth of Bitcoin every time...   

                                                   URL  
195  https://www.reddit.com/r/defi/comments/1gsb18o...  
32   https://www.reddit.com/r/CryptoMarkets/comment...  
78                 https://i.redd.it/8h6m8jbm384e1.png  
                                             url  \
574  https://www.youtube.com/watch?v=lSfqPj7PpVc   
529  https://www.youtube.com/watch?v=DTmIeGp7VDo   
95   https://www.youtube.com/watch?v=r54qvZZqeeE   

                                                 title  
574  Cryptopia: Bitcoin, Blockchains, and the Futur...  
529     Why PEPE Is Up... 🤩 PEPE Crypto Token Analysis  
95   Buy The Polygon Matic Rally? 🤩 POL Crypto Toke...  
Empty DataFrame
Columns: [title]
Index: []
