In [1]:
import json
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import torch
from torch.utils.data import Dataset, DataLoader

# ---- Config ----
DATA_FILE = "/content/data_full.json"
OUTPUT_DIR = "eda_outputs"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ---- Load data ----
with open(DATA_FILE, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Expecting data keys like 'train', 'val', 'test'
records = []
for split_name in ['train', 'val', 'test']:
    for rec in data[split_name]:
        records.append({
            'split': split_name,
            'text': rec[0],
            'intent': rec[1]
        })

df = pd.DataFrame(records)
print(f"Loaded {len(df)} samples")
print(df['split'].value_counts())

# ---- Basic stats ----
df['char_count'] = df['text'].apply(len)
df['word_count'] = df['text'].apply(lambda x: len(x.split()))

print(df.groupby('split')[['char_count','word_count']].describe())

# Distribution of classes (intents)
intent_counts = df[df['split']=='train']['intent'].value_counts()
plt.figure(figsize=(12,6))
sns.histplot(intent_counts, bins=50)
plt.title("Train-split: Samples per Intent Class")
plt.xlabel("Number of samples")
plt.ylabel("Count of intent classes")
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "train_intent_counts_hist.png"))
plt.close()

print("Top 5 intents by sample count (train):")
print(intent_counts.head(5))
print("Bottom 5 intents by sample count (train):")
print(intent_counts.tail(5))

plt.figure(figsize=(12,6))
sns.histplot(df['word_count'], bins=100)
plt.title("Distribution of Word Counts (All splits)")
plt.xlabel("Words per sample")
plt.ylabel("Count of samples")
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "word_count_distribution.png"))
plt.close()


oos_label = 'oos'
df['is_oos'] = (df['intent']==oos_label)
plt.figure(figsize=(12,6))
sns.boxplot(x='is_oos', y='word_count', data=df)
plt.title("Word count: In-domain vs OOS")
plt.xlabel("Is OOS?")
plt.ylabel("Words per sample")
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "word_count_oos_vs_indomain.png"))
plt.close()

from tqdm import tqdm
counter = Counter()
for txt in tqdm(df[df['split']=='train']['text']):
    tokens = txt.split()
    counter.update(tokens)

most_common = counter.most_common(50)
print("Most common tokens (train):", most_common[:20])




Loaded 22500 samples
split
train    15000
test      4500
val       3000
Name: count, dtype: int64
      char_count                                                      \
           count       mean        std  min   25%   50%   75%    max   
split                                                                  
test      4500.0  39.308889  15.760182  2.0  28.0  37.0  48.0  125.0   
train    15000.0  39.906067  15.262904  2.0  29.0  38.0  49.0  136.0   
val       3000.0  39.825667  16.580860  2.0  28.0  37.0  49.0  114.0   

      word_count                                                 
           count      mean       std  min  25%  50%   75%   max  
split                                                            
test      4500.0  8.191111  3.242021  1.0  6.0  8.0  10.0  25.0  
train    15000.0  8.339200  3.191868  1.0  6.0  8.0  10.0  28.0  
val       3000.0  8.319333  3.434330  1.0  6.0  8.0  10.0  24.0  
Top 5 intents by sample count (train):
intent
translate          100
tran

100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 15000/15000 [00:00<00:00, 220720.31it/s]

Most common tokens (train): [('my', 5528), ('i', 5437), ('to', 5428), ('the', 3992), ('you', 2966), ('a', 2531), ('for', 2520), ('what', 2506), ('me', 2420), ('is', 2235), ('how', 2065), ('do', 1912), ('on', 1797), ('can', 1793), ('in', 1647), ('of', 1466), ('need', 1306), ('please', 1134), ('card', 1103), ('tell', 998)]





In [2]:
# =====================================
# ðŸ”¹ PREPROCESSING + ENHANCED EDA
# =====================================
import re
import string
import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# ---- Clean text function ----
def clean_text(text):
    text = text.lower()                              # lowercase
    text = re.sub(r"http\S+|www\S+", "", text)        # remove URLs
    text = re.sub(r"[^a-z\s]", "", text)              # remove punctuation & numbers
    text = re.sub(r"\s+", " ", text).strip()          # normalize spaces
    text = " ".join([w for w in text.split() if w not in stop_words])
    return text

df['clean_text'] = df['text'].apply(clean_text)
print("âœ… Text cleaning done. Example:")
print(df[['text','clean_text']].head(3))

# ---- Basic comparisons ----
df['clean_word_count'] = df['clean_text'].apply(lambda x: len(x.split()))
print("\nWord count comparison (raw vs cleaned):")
print(df[['word_count','clean_word_count']].describe())

plt.figure(figsize=(12,6))
sns.histplot(df['clean_word_count'], bins=100, color='teal', alpha=0.7)
plt.title("Distribution of Cleaned Word Counts (All splits)")
plt.xlabel("Words per sample (cleaned)")
plt.ylabel("Count of samples")
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "word_count_cleaned_distribution.png"))
plt.close()

# ---- Word Cloud for overall corpus ----
from wordcloud import STOPWORDS
text_corpus = " ".join(df[df['split']=='train']['clean_text'].values)
wordcloud = WordCloud(width=1200, height=600, background_color="white",
                      stopwords=STOPWORDS, max_words=200).generate(text_corpus)

plt.figure(figsize=(12,6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.title("Word Cloud â€” All Training Texts (Cleaned)")
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "wordcloud_train_cleaned.png"))
plt.close()

# ---- Optional: Word clouds for top few intents ----
top_intents = df[df['split']=='train']['intent'].value_counts().head(3).index.tolist()
for intent in top_intents:
    text_joined = " ".join(df[(df['split']=='train') & (df['intent']==intent)]['clean_text'].values)
    wc = WordCloud(width=1200, height=600, background_color="white").generate(text_joined)
    plt.figure(figsize=(12,6))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis("off")
    plt.title(f"Word Cloud â€” Intent: {intent}")
    plt.tight_layout()
    plt.savefig(os.path.join(OUTPUT_DIR, f"wordcloud_{intent}.png"))
    plt.close()

print(f"âœ… Word clouds generated for top intents in {OUTPUT_DIR}")

# ---- TF-IDF Analysis ----
tfidf = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf.fit_transform(df[df['split']=='train']['clean_text'])
feature_names = np.array(tfidf.get_feature_names_out())

# Find top words per intent (based on mean TF-IDF)
top_words_per_intent = {}
train_df = df[df['split']=='train']
for intent in train_df['intent'].unique():
    subset = train_df[train_df['intent']==intent]
    X_sub = tfidf.transform(subset['clean_text'])
    mean_tfidf = np.asarray(X_sub.mean(axis=0)).ravel()
    top_idx = mean_tfidf.argsort()[-10:][::-1]
    top_words = feature_names[top_idx]
    top_words_per_intent[intent] = top_words

print("\nTop words for a few sample intents:")
for k in list(top_words_per_intent.keys())[:5]:
    print(f"{k}: {', '.join(top_words_per_intent[k])}")

# ---- TF-IDF global importance plot ----
mean_tfidf_global = np.asarray(X_tfidf.mean(axis=0)).ravel()
top_global_idx = mean_tfidf_global.argsort()[-30:][::-1]
plt.figure(figsize=(12,6))
sns.barplot(x=mean_tfidf_global[top_global_idx], y=feature_names[top_global_idx])
plt.title("Top 30 Most Informative Words (TF-IDF Global, Train)")
plt.xlabel("Mean TF-IDF score")
plt.ylabel("Word")
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "tfidf_top_words.png"))
plt.close()

print("âœ… TF-IDF analysis complete.")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


âœ… Text cleaning done. Example:
                                                text  \
0  what expression would i use to say i love you ...   
1  can you tell me how to say 'i do not speak muc...   
2  what is the equivalent of, 'life is good' in f...   

                              clean_text  
0  expression would use say love italian  
1    tell say speak much spanish spanish  
2            equivalent life good french  

Word count comparison (raw vs cleaned):
         word_count  clean_word_count
count  22500.000000      22500.000000
mean       8.306933          4.052222
std        3.235613          1.713561
min        1.000000          0.000000
25%        6.000000          3.000000
50%        8.000000          4.000000
75%       10.000000          5.000000
max       28.000000         14.000000
âœ… Word clouds generated for top intents in eda_outputs

Top words for a few sample intents:
translate: say, hello, french, spanish, translate, would, thank, english, love, goodbye
trans

In [3]:
# List all unique intent classes
intent_classes = sorted(df['intent'].unique())
print(f"Total number of intent classes: {len(intent_classes)}\n")

# Display them neatly
for i, intent in enumerate(intent_classes, 1):
    print(f"{i:3d}. {intent}")


Total number of intent classes: 150

  1. accept_reservations
  2. account_blocked
  3. alarm
  4. application_status
  5. apr
  6. are_you_a_bot
  7. balance
  8. bill_balance
  9. bill_due
 10. book_flight
 11. book_hotel
 12. calculator
 13. calendar
 14. calendar_update
 15. calories
 16. cancel
 17. cancel_reservation
 18. car_rental
 19. card_declined
 20. carry_on
 21. change_accent
 22. change_ai_name
 23. change_language
 24. change_speed
 25. change_user_name
 26. change_volume
 27. confirm_reservation
 28. cook_time
 29. credit_limit
 30. credit_limit_change
 31. credit_score
 32. current_location
 33. damaged_card
 34. date
 35. definition
 36. direct_deposit
 37. directions
 38. distance
 39. do_you_have_pets
 40. exchange_rate
 41. expiration_date
 42. find_phone
 43. flight_status
 44. flip_coin
 45. food_last
 46. freeze_account
 47. fun_fact
 48. gas
 49. gas_type
 50. goodbye
 51. greeting
 52. how_busy
 53. how_old_are_you
 54. improve_credit_score
 55. income
 56. i