In [None]:
import pandas as pd
import json

# Load the JSON file using the new file structure
with open('../data/raw/crypto_tweets_october_2020.json', 'r') as f:
    data = json.load(f)

# Convert to DataFrame
df = pd.DataFrame(data)
print(f"Initial dataset shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")

In [41]:
relevant_keywords = {
    '$DOT', 'Polkadot', '$XTZ', 'Stellar', 'Tezos'
}
# Filter rows where 'keywords' contains at least one relevant coin
df = df[df['keywords'].apply(lambda kws: any(kw in relevant_keywords for kw in kws) if isinstance(kws, list) else False)]
# Check the shape of the filtered DataFrame
print(f"Filtered DataFrame shape: {df_filtered.shape}")

Filtered DataFrame shape: (0, 11)


In [None]:
# Get all unique keywords (only if keywords column exists)
if 'keywords' in df.columns:
    all_keywords = set()
    for keywords in df['keywords']:
        if isinstance(keywords, list):
            all_keywords.update(keywords)
        elif keywords is not None:  # Handle single keyword strings
            all_keywords.add(keywords)
    
    print(f"Total unique keywords found: {len(all_keywords)}")
    print("Sample keywords:", list(all_keywords)[:20])
else:
    print("Cannot extract keywords - column not found")

KeyError: 'keywords'

In [None]:
# Check if keywords column exists and clean the data
if 'keywords' in df.columns:
    # Remove all the tweets that have no keywords
    df = df[df['keywords'].notna()]
    # Remove the tweets that have keyword "[]"
    df = df[df['keywords'].apply(lambda x: x != [] if isinstance(x, list) else True)]
    print(f"After cleaning: {df.shape}")
else:
    print("Keywords column not found. Available columns:", df.columns.tolist())

In [18]:

df.shape

(5708189, 11)

In [None]:
# Define relevant cryptocurrency keywords for October 2020
# Focus on major cryptocurrencies that were significant during that period
relevant_keywords = {
    '$BTC', '$ETH', '$BNB', '$ADA', '$LINK', '$LTC', '$XLM', '$XMR',
    'Bitcoin', 'Ethereum', 'Binance Coin', 'Cardano', 'ChainLink',
    'Litecoin', 'Stellar', 'Monero'
}

# Filter rows where 'keywords' contains at least one relevant coin
if 'keywords' in df.columns:
    df_filtered = df[df['keywords'].apply(
        lambda kws: any(kw in relevant_keywords for kw in kws) 
        if isinstance(kws, list) else (kws in relevant_keywords if kws else False)
    )]
    print(f"Filtered dataset shape: {df_filtered.shape}")
else:
    print("Cannot filter by keywords - column not found")
    df_filtered = df.copy()

In [21]:
df_filtered.shape

(1098008, 11)

In [22]:
import pandas as pd

 # Set sample size
target_sample_size = 250000

# Step 1: Explode keywords so each coin gets its own row
df_exploded = df_filtered.explode('keywords')

# Step 2: Filter only relevant keywords again (in case any slipped through)
df_exploded = df_exploded[df_exploded['keywords'].isin(relevant_keywords)]

# Step 3: Group by coin and get their counts
coin_counts = df_exploded['keywords'].value_counts()
coin_weights = (coin_counts / coin_counts.sum()) * target_sample_size
coin_weights = coin_weights.round().astype(int)  # Number of tweets per coin

# Step 4: Sample for each coin
sampled_list = []

for coin, n_samples in coin_weights.items():
    coin_df = df_exploded[df_exploded['keywords'] == coin]
    n_to_sample = min(n_samples, len(coin_df))  # Don't oversample
    sampled = coin_df.sample(n=n_to_sample, random_state=42)
    sampled_list.append(sampled)

# Step 5: Combine back and drop duplicates
df_stratified_sample = pd.concat(sampled_list).drop_duplicates(subset='id')

# Step 6: Reset index
df_stratified_sample = df_stratified_sample.reset_index(drop=True)

print(f"Final sample shape: {df_stratified_sample.shape}")

Final sample shape: (235421, 11)


In [28]:
# i only want keywords  ['Polkadot', '$DOT', 'Stellar', '$XTZ', 'Tezos']
relevant_keywords = {
    '$DOT', 'Polkadot', '$XTZ', 'Stellar', 'Tezos'
}
# Filter rows where 'keywords' contains at least one relevant coin
df = df[df['keywords'].apply(lambda kws: any(kw in relevant_keywords for kw in kws) if isinstance(kws, list) else False)]
# Check the shape of the filtered DataFrame
print(f"Filtered DataFrame shape: {df_filtered.shape}")


Filtered DataFrame shape: (0, 11)


In [25]:
import pandas as pd

# Set sample size
target_sample_size = 250000

# Step 1: Explode keywords so each coin gets its own row
df_exploded = df_filtered.explode('keywords')

# Step 2: Filter only relevant keywords again (just to be safe)
df_exploded = df_exploded[df_exploded['keywords'].isin(relevant_keywords)]

# Step 3: Group by coin and get weights
coin_counts = df_exploded['keywords'].value_counts()
coin_weights = (coin_counts / coin_counts.sum()) * target_sample_size
coin_weights = coin_weights.round().astype(int)

# Step 4: Sample for each coin, ensuring at least 100 if possible
sampled_list = []
missing_coins = []

for coin in relevant_keywords:
    coin_df = df_exploded[df_exploded['keywords'] == coin]
    if coin_df.empty:
        print(f"⚠️  No tweets found for: {coin}")
        missing_coins.append(coin)
        continue
    
    # Use weight if available, else small fallback
    n_samples = coin_weights.get(coin, 0)
    n_to_sample = min(max(n_samples, 100), len(coin_df))  # at least 100 if possible
    sampled = coin_df.sample(n=n_to_sample, random_state=42)
    sampled_list.append(sampled)

# Step 5: Combine and drop duplicates
df_stratified_sample = pd.concat(sampled_list).drop_duplicates(subset='id')

# Step 6: Reset index
df_stratified_sample = df_stratified_sample.reset_index(drop=True)

print(f"✅ Final sample shape: {df_stratified_sample.shape}")
if missing_coins:
    print(f"\n🔍 Missing coins from dataset: {missing_coins}")

⚠️  No tweets found for: Polkadot
⚠️  No tweets found for: $DOT
⚠️  No tweets found for: Stellar
⚠️  No tweets found for: $XTZ
⚠️  No tweets found for: Tezos
✅ Final sample shape: (235428, 11)

🔍 Missing coins from dataset: ['Polkadot', '$DOT', 'Stellar', '$XTZ', 'Tezos']


In [26]:
# give me the count of tweets for each coin in the sample
df_stratified_sample['keywords'].value_counts()



keywords
Bitcoin         94238
$BTC            44100
$ETH            38369
Ethereum        29711
$BNB             6712
$LINK            5145
$LTC             4083
$ADA             3648
ChainLink        2882
Litecoin         1730
$XMR             1698
Cardano          1458
$XLM              935
Monero            631
Binance Coin       88
Name: count, dtype: int64

In [29]:
df_stratified_sample

Unnamed: 0,_id,name,content,id,text,keywords,timestamp_ms,followers_count,urls,is_retweet,retweeted_id
0,{'$oid': '5f95004c5a40375671cec3b3'},,,1.320222e+18,@IvanOnTech #Ethereum is like discovering a ne...,Ethereum,1603600361135,,,,
1,{'$oid': '5f94b6745a40375671ccad59'},,,1.320142e+18,@CryptoFun_ID @Xplosive_ETH - What deflationar...,Ethereum,1603581434565,,,,
2,{'$oid': '5f9550015a40375671d0b41c'},,,1.320307e+18,@FBI\n \n@SEC_Enforcement\n please investigate...,Ethereum,1603620718763,,,,
3,{'$oid': '5f943b625a40375671c82c94'},,,1.320010e+18,RT @RewardPortal_: The Trillion dollar printer...,Ethereum,1603549997086,,,,
4,{'$oid': '5f9848d8228456f5a5a0ef73'},,,1.320931e+18,RT @fonship: Wow $ETH\nhttps://t.co/HPcllAR0Fy,Ethereum,1603769481059,,,,
...,...,...,...,...,...,...,...,...,...,...,...
235423,{'$oid': '5f93f8925a40375671c61075'},,,1.319939e+18,RT @Bloqport: Don’t forget: \n\n$50 Bitcoin Gi...,$LINK,1603532931581,,,,
235424,{'$oid': '5f935b825a40375671c19bf2'},,,1.319770e+18,@ChainLinkGod @KeeperOfLink @ChainlinkoracIe I...,$LINK,1603492675517,,,,
235425,{'$oid': '5f94471a5a40375671c8a46c'},,,1.320022e+18,in next 2 months ~3x for $link quite possible.,$LINK,1603552761108,,,,
235426,{'$oid': '5f930f525a40375671be94f6'},,,1.319688e+18,#SERGS token is on mainnet and I'm excited to ...,$LINK,1603473224077,,,,
