In [4]:
pip install nltk

Collecting nltk
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting click (from nltk)
  Using cached click-8.2.1-py3-none-any.whl.metadata (2.5 kB)
Using cached nltk-3.9.1-py3-none-any.whl (1.5 MB)
Using cached click-8.2.1-py3-none-any.whl (102 kB)
Installing collected packages: click, nltk

   ---------------------------------------- 0/2 [click]
   -------------------- ------------------- 1/2 [nltk]
   -------------------- ------------------- 1/2 [nltk]
   -------------------- ------------------- 1/2 [nltk]
   -------------------- ------------------- 1/2 [nltk]
   -------------------- ------------------- 1/2 [nltk]
   -------------------- ------------------- 1/2 [nltk]
   -------------------- ------------------- 1/2 [nltk]
   -------------------- ------------------- 1/2 [nltk]
   -------------------- ------------------- 1/2 [nltk]
   -------------------- ------------------- 1/2 [nltk]
   -------------------- ------------------- 1/2 [nltk]
   -------------------- --

In [1]:
# Ten hard-coded sample tweet texts; this list is the raw input that the later
# preprocessing cell maps `preprocess` over.
tweets = [
    "We're going to pass a massive infrastructure bill that will create millions of jobs and rebuild America.",
    "If China doesn't make a better trade deal, we're going to impose new tariffs on their imports.",
    "Our economy is strong and the stock market continues to break records—unbelievable success!",
    "The Federal Reserve has been doing great work supporting the markets—I commend their patience and action.",
    "THIS IS A GREAT TIME TO BUY!!! DJT",
    "Record low unemployment, rising wages—America's on the move!",
    "U.S. manufacturing is growing strong—just look at these numbers!",
    "We must cut corporate tax rates to help American businesses flourish.",
    "Feeling optimistic about progress in trade talks—we're getting close!",
    "The Fed's interest rate hikes are hurting workers and businesses—time to rethink policy."
]

# Bare expression as the last line: the notebook displays the list as the cell output.
tweets

["We're going to pass a massive infrastructure bill that will create millions of jobs and rebuild America.",
 "If China doesn't make a better trade deal, we're going to impose new tariffs on their imports.",
 'Our economy is strong and the stock market continues to break records—unbelievable success!',
 'The Federal Reserve has been doing great work supporting the markets—I commend their patience and action.',
 'THIS IS A GREAT TIME TO BUY!!! DJT',
 "Record low unemployment, rising wages—America's on the move!",
 'U.S. manufacturing is growing strong—just look at these numbers!',
 'We must cut corporate tax rates to help American businesses flourish.',
 "Feeling optimistic about progress in trade talks—we're getting close!",
 "The Fed's interest rate hikes are hurting workers and businesses—time to rethink policy."]

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources used below: the WordNet data for the lemmatizer,
# the stop-word lists, and the tokenizer models for word_tokenize.
for _resource in ('wordnet', 'stopwords', 'punkt_tab'):
    nltk.download(_resource)

# Shared helpers read by preprocess(): a set for fast stop-word membership tests
# and a single lemmatizer instance.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Normalize a text into a space-joined string of lemmatized content words.

    The text is lower-cased and tokenized with ``word_tokenize``. Tokens that
    are not purely alphabetic, or that appear in ``stop_words``, are dropped;
    each remaining token is passed through ``lemmatizer.lemmatize``.

    Args:
        text: The raw text. Any non-``str`` value (including ``None``) is
            treated as missing input.

    Returns:
        The cleaned tokens joined by single spaces, or ``""`` when ``text``
        is not a string.
    """
    # None is not a str, so this single check covers every non-text input.
    if not isinstance(text, str):
        return ""
    tokens = word_tokenize(text.lower())
    return " ".join(
        lemmatizer.lemmatize(token)
        for token in tokens
        if token.isalpha() and token not in stop_words
    )

# Clean every tweet with preprocess(); the trailing bare name displays the result.
processed_tweets = [preprocess(tweet) for tweet in tweets]
processed_tweets

ModuleNotFoundError: No module named 'nltk'

In [3]:
# Initial topic groups: each key names a topic and maps to a list of keywords.
# NOTE: a later cell reassigns FINANCIAL_GROUPS with the same three keys and
# expanded keyword lists, so that later definition is the one the analysis uses.
FINANCIAL_GROUPS = {
    "Financial Markets": [
        "stocks", "oil", "dollar", "shares", "bonds", 
        "future", "forecast", "trading", "investment",
        "equities", "crude", "currency", "yield"
    ],
    "Corporate Sector": [
        "sales", "tech", "pharma", "steel", "manufacturing",
        "earnings", "company", "revenue", "production",
        "biotech", "automotive", "industrial"
    ],
    "Macro Finance": [
        "fx", "rates", "commodities", "solar", "wind",
        "gas", "bank", "credit", "debt", "fed", "fund",
        "interest", "inflation", "energy", "renewables"
    ]
}

In [5]:
import pandas as pd
import torch
from tqdm import tqdm
from transformers import pipeline, set_seed

# Set seed for reproducibility
set_seed(42)

# 1. Define the financial topic groups (expanded for better coverage).
#    The keys are passed to the zero-shot classifier as candidate labels, and the
#    list under the winning key is scanned for keywords present in the text.
#    This reassigns the FINANCIAL_GROUPS created in an earlier cell, keeping the
#    same three keys and appending extra keywords to each list.
FINANCIAL_GROUPS = {
    "Financial Markets": [
        "stocks", "oil", "dollar", "shares", "bonds", 
        "future", "forecast", "trading", "investment",
        "equities", "crude", "currency", "yield", "portfolio",
        "derivatives", "commodity", "index"
    ],
    "Corporate Sector": [
        "sales", "tech", "pharma", "steel", "manufacturing",
        "earnings", "company", "revenue", "production",
        "biotech", "automotive", "industrial", "profit",
        "merger", "acquisition", "dividend"
    ],
    "Macro Finance": [
        "fx", "rates", "commodities", "solar", "wind",
        "gas", "bank", "credit", "debt", "fed", "fund",
        "interest", "inflation", "energy", "renewables",
        "policy", "regulation", "liquidity"
    ]
}

# 2. Load models with error handling
# First CUDA GPU when one is available, otherwise the CPU (-1). Both pipelines
# receive the same explicit device so they always run on the same hardware.
DEVICE = 0 if torch.cuda.is_available() else -1

try:
    # Zero-shot classifier: assigns each text to one of the FINANCIAL_GROUPS keys.
    classifier = pipeline(
        "zero-shot-classification",
        model="MoritzLaurer/deberta-v3-base-zeroshot-v1.1-all-33",
        device=DEVICE
    )

    # Binary sentiment model (its labels are upper-cased when results are stored).
    sentiment = pipeline(
        "sentiment-analysis",
        model="distilbert-base-uncased-finetuned-sst-2-english",
        device=DEVICE
    )
except Exception as e:
    print(f"Error loading models: {e}")
    print("Try: pip install --upgrade transformers torch")
    # Re-raise rather than calling exit(): inside a Jupyter kernel exit() shuts the
    # kernel down and discards all notebook state, whereas re-raising stops this
    # cell, keeps the original traceback, and leaves the session usable.
    raise

# 3. Process tweets with progress tracking
# Loop-invariant inputs are built once up front.
candidate_labels = list(FINANCIAL_GROUPS.keys())

# The analysed text comes from preprocess(), which lemmatizes every token, so a plural
# keyword such as "stocks" may only ever appear in the text as its lemma. Pair each
# keyword with the set of forms that count as a match: the keyword itself and its lemma.
keyword_forms = {
    topic: [(kw, {kw.lower(), lemmatizer.lemmatize(kw.lower())}) for kw in keywords]
    for topic, keywords in FINANCIAL_GROUPS.items()
}

results = []
for sentence in tqdm(processed_tweets, desc="Analyzing tweets"):
    try:
        # Topic Classification: labels come back sorted by score, best first.
        topic_result = classifier(
            sentence,
            candidate_labels=candidate_labels,
            hypothesis_template="This financial news concerns {}."
        )
        top_topic = topic_result['labels'][0]

        # Sentiment Analysis: the pipeline returns a list with one dict per input.
        sentiment_result = sentiment(sentence)[0]

        # Find matched keywords by whole word, not by substring, so that e.g. "fed"
        # does not match inside "federal". preprocess() joins tokens with single
        # spaces, so a plain split() recovers the individual words.
        words = set(sentence.lower().split())
        matched_keywords = [
            kw for kw, forms in keyword_forms[top_topic]
            if forms & words
        ]

        results.append({
            "text": sentence,
            "topic": top_topic,
            "topic_confidence": round(topic_result['scores'][0], 4),
            # Report at most three keywords to keep the column compact.
            "matched_keywords": ", ".join(matched_keywords[:3]) if matched_keywords else "None",
            "sentiment": sentiment_result['label'].upper(),
            "sentiment_score": round(sentiment_result['score'], 4)
        })
    except Exception as e:
        # Best effort: report the failing tweet and carry on with the rest.
        print(f"Error processing: '{sentence[:50]}...' - {str(e)}")

# 4. Create DataFrame with type optimization
# The columns are named explicitly so the frame has the expected schema even when
# `results` is empty (every tweet failed); without them the astype() call below
# raises KeyError because 'topic' and 'sentiment' would not exist.
results_df = pd.DataFrame(
    results,
    columns=[
        "text", "topic", "topic_confidence",
        "matched_keywords", "sentiment", "sentiment_score"
    ]
).astype({
    # Low-cardinality label columns are stored as categoricals.
    'topic': 'category',
    'sentiment': 'category'
})

# 5. Enhanced analysis
print("\n=== Analysis Summary ===")
print(f"Processed {len(results_df)}/{len(processed_tweets)} tweets successfully")

# Percentage share of each category, first for topics and then for sentiments.
for heading, column in (("\nTopic Distribution:", 'topic'), ("\nSentiment Distribution:", 'sentiment')):
    print(heading)
    percentages = results_df[column].value_counts(normalize=True).mul(100).round(1)
    print(percentages.astype(str) + '%')

# Contingency table of topic (rows) against sentiment (columns), with "Total" margins.
print("\nSentiment by Topic:")
topic_by_sentiment = pd.crosstab(
    index=results_df['topic'],
    columns=results_df['sentiment'],
    margins=True,
    margins_name="Total"
)
print(topic_by_sentiment)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Device set to use cpu
Device set to use cpu
Analyzing tweets:   0%|          | 0/10 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Analyzing tweets: 100%|██████████| 10/10 [00:38<00:00,  3.85s/it]


=== Analysis Summary ===
Processed 10/10 tweets successfully

Topic Distribution:
topic
Financial Markets    90.0%
Corporate Sector     10.0%
Name: proportion, dtype: object

Sentiment Distribution:
sentiment
POSITIVE    60.0%
NEGATIVE    40.0%
Name: proportion, dtype: object

Sentiment by Topic:
sentiment          NEGATIVE  POSITIVE  Total
topic                                       
Corporate Sector          0         1      1
Financial Markets         4         5      9
Total                     4         6     10



