# FinTech Sentiment Analysis - Exploratory Data Analysis (EDA)

This notebook focuses on:
1.  **Data Inspection**: Understanding the class balance and language distribution.
2.  **Length Analysis**: Checking word counts (a rough proxy for token counts) to help choose the model's `max_length`.
3.  **Sample Predictions**: Quick interaction with the trained model.

**Note**: Core training logic is in `src/train_model.py` for reproducibility.

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sys
import os

# Add src to path
sys.path.append(os.path.abspath('../src'))
from preprocess import clean_text

## 1. Load Data

In [None]:
import json

# JSON Lines inputs: one JSON object per line. Paths are relative to the
# notebook's working directory.
data_files = ['../data/social_en.jsonl', '../data/news_sw.jsonl']
data = []

for file in data_files:
    if not os.path.exists(file):
        # Report missing inputs so a partial dataset is never analysed silently.
        print(f"Warning: data file not found, skipping: {file}")
        continue
    # Decode explicitly as UTF-8: the platform default encoding can fail or
    # garble non-ASCII text.
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank / trailing empty lines are not records; json.loads
                # would raise on them.
                continue
            data.append(json.loads(line))

# One row per parsed record; columns are whatever keys the records carry.
df = pd.DataFrame(data)
df.head()

## 2. Class Balance Analysis

In [None]:
# Grouped count plot: number of records per `label`, split by `language`,
# drawn on an explicitly created 10x5 axes.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=df, x='label', hue='language', ax=ax)
ax.set_title('Sentiment Distribution by Language')
plt.show()

## 3. Text Length Analysis

In [None]:
# Whitespace-delimited word count per record. Missing text (null) is treated
# as an empty string and therefore counts as 0 words, instead of raising
# AttributeError as a plain `x.split()` would on a non-string value.
# NOTE: this counts words, not model tokens; subword tokenizers typically
# produce more tokens than words, so treat this as a lower bound.
df['word_count'] = df['text'].fillna('').str.split().str.len()

# Histogram of word counts per language with a KDE overlay, on its own axes.
fig, ax = plt.subplots()
sns.histplot(data=df, x='word_count', hue='language', kde=True, ax=ax)
ax.set_title('Word Count Distribution')
plt.show()

## 4. Inference Test (Requires trained model)

In [None]:
from predict import SentimentPredictor

# Path of the saved model; the fallback message below points at
# src/train_model.py as the way to produce it.
model_dir = '../models/sentiment_model'

# Guard first: without a saved model there is nothing to run inference with.
if not os.path.exists(model_dir):
    print("Model not found. Run src/train_model.py first.")
else:
    predictor = SentimentPredictor(model_dir)

    # Single smoke-test sentence; prints whatever `predict` returns.
    sample_text = "Inflation is rising too fast."
    print(predictor.predict(sample_text))