# INTERACTIVE DASHBOARD

> This notebook must be run from a folder that contains all of its models. You can find that folder here: [Google Drive folder](https://drive.google.com/drive/folders/1YgKDzirb00ESdaCAk8BoTtFw0bxisc8h?usp=sharing)

In [1]:
import pandas as pd
import pickle
import gensim

# Loading Data/Models

In [2]:
# Training postings, read from the working directory.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# open this file when it comes from the trusted project folder.
with open('df_train_with_fasttext.pkl', 'rb') as file:
    data = pickle.load(file)
from gensim.models import LdaModel
from gensim.corpora import Dictionary

# Saved LDA topic model and the dictionary file stored next to it.
lda_model = LdaModel.load("best_lda_model.gensim")
dictionary = Dictionary.load("lda_dictionary.dict")


In [3]:
# Display the 10 highest-weighted words of each topic as formatted strings.
# NOTE(review): num_topics=-1 is expected to select every topic — confirm against the gensim docs.
lda_model.show_topics(num_topics=-1, num_words=10, log=False, formatted=True)

[(0,
  '0.015*"research" + 0.013*"health" + 0.009*"development" + 0.008*"clinical" + 0.008*"include" + 0.007*"care" + 0.006*"medical" + 0.006*"team" + 0.006*"scientist" + 0.006*"laboratory"'),
 (1,
  '0.016*"status" + 0.014*"employment" + 0.013*"opportunity" + 0.011*"disability" + 0.011*"gender" + 0.010*"equal" + 0.010*"information" + 0.009*"applicant" + 0.009*"veteran" + 0.009*"protect"'),
 (2,
  '0.034*"business" + 0.013*"process" + 0.012*"requirement" + 0.012*"project" + 0.011*"team" + 0.010*"skill" + 0.010*"analysis" + 0.010*"management" + 0.010*"ability" + 0.009*"data"'),
 (3,
  '0.010*"system" + 0.008*"management" + 0.008*"include" + 0.008*"support" + 0.008*"require" + 0.008*"position" + 0.008*"provide" + 0.007*"program" + 0.007*"year" + 0.007*"service"'),
 (4,
  '0.016*"team" + 0.011*"data" + 0.010*"business" + 0.009*"product" + 0.008*"company" + 0.008*"build" + 0.007*"customer" + 0.007*"help" + 0.007*"analytic" + 0.007*"drive"'),
 (5,
  '0.029*"data" + 0.013*"year" + 0.013*"eng

In [4]:
import numpy as np
def get_document_topic_vector(lda_model, bow):
    """Return the topic probabilities of one document as a NumPy vector.

    Parameters
    ----------
    lda_model
        Object exposing ``get_document_topics(bow, minimum_probability=...)``
        that yields ``(topic_id, probability)`` pairs.
    bow
        Bag-of-words representation, handed to the model unchanged.

    Returns
    -------
    numpy.ndarray
        The probabilities, ordered by ascending topic id.
    """
    pairs = lda_model.get_document_topics(bow, minimum_probability=0)
    by_topic_id = sorted(pairs, key=lambda pair: pair[0])
    probabilities = [probability for _topic_id, probability in by_topic_id]
    return np.array(probabilities)

In [5]:
# Smoke test: compute the topic vector of the posting stored under label 0.
doc = data['processed_text_filtered'][0]
# The stored text is whitespace-separated tokens, so a plain split is enough.
bow = dictionary.doc2bow(doc.split())
vec = get_document_topic_vector(lda_model, bow)

In [6]:
import spacy
import re
import contractions
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
# Make sure the NLTK stop-word list is available locally.
nltk.download('stopwords')
# NOTE(review): stopwords_en is not referenced anywhere else in this notebook;
# preprocess_text relies on spaCy's token.is_stop instead.
stopwords_en = set(stopwords.words('english'))
# Only used when preprocess_text is called with use_stemmer=True.
stemmer = SnowballStemmer('english')
# spaCy pipeline used by preprocess_text; the parser and NER components are
# disabled because only POS tags, lemmas and stop-word flags are read.
nlp = spacy.load('en_core_web_md')
nlp.disable_pipe('parser')
nlp.disable_pipe('ner')
# Part-of-speech tags that preprocess_text keeps.
valid_POS = set(['VERB', 'NOUN', 'ADJ', 'PROPN'])

def preprocess_text(text, use_stemmer=False):
    """Clean a raw job description and reduce it to space-separated lemmas.

    Steps: strip HTML markup, remove URLs, expand contractions, run the
    module-level spaCy pipeline ``nlp`` and keep the lower-cased lemma of every
    alphabetic, non-stop-word token whose POS tag is in ``valid_POS``.

    Parameters
    ----------
    text : str
        Raw text, possibly containing HTML.
    use_stemmer : bool, default False
        When true, each lemma is additionally passed through the module-level
        Snowball ``stemmer``.

    Returns
    -------
    str
        The kept tokens joined by single spaces.
    """
    without_markup = BeautifulSoup(text, 'html.parser').get_text()
    without_urls = re.sub(r'https?://\S+|www\.\S+', '', without_markup)
    expanded = contractions.fix(without_urls)

    lemmas = [
        tok.lemma_.lower()
        for tok in nlp(expanded)
        if tok.is_alpha and tok.pos_ in valid_POS and not tok.is_stop
    ]
    if use_stemmer:
        lemmas = [stemmer.stem(lemma) for lemma in lemmas]

    return ' '.join(lemmas)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Iker\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
def get_most_probable_topic(text, lda_model, dictionary, preprocess):
    """Find the single most likely LDA topic for a raw text.

    Parameters
    ----------
    text
        Raw input, passed to ``preprocess``.
    lda_model
        Model exposing ``get_document_topics`` and ``print_topic``.
    dictionary
        Object exposing ``doc2bow`` for a list of tokens.
    preprocess
        Callable returning a whitespace-separated token string.

    Returns
    -------
    dict
        ``topic_id``, ``probability`` and ``keywords`` of the best topic.
        When several topics tie, the one the model listed first wins.
    """
    cleaned = preprocess(text)
    bag = dictionary.doc2bow(cleaned.split())
    ranked = sorted(
        lda_model.get_document_topics(bag, minimum_probability=0),
        key=lambda pair: -pair[1],
    )
    winner = ranked[0]
    best_id = winner[0]
    return {
        "topic_id": best_id,
        "probability": winner[1],
        "keywords": lda_model.print_topic(best_id),
    }

In [8]:
def get_top_topics(text, lda_model, dictionary, preprocess, top_n=3):
    """Return the ``top_n`` most probable LDA topics for a raw text.

    Parameters
    ----------
    text
        Raw input, passed to ``preprocess``.
    lda_model
        Model exposing ``get_document_topics`` and ``print_topic``.
    dictionary
        Object exposing ``doc2bow`` for a list of tokens.
    preprocess
        Callable returning a whitespace-separated token string.
    top_n : int, default 3
        Number of topics to keep.

    Returns
    -------
    list of dict
        One dict per topic (``topic_id``, ``probability``, ``keywords``),
        sorted by descending probability.
    """
    cleaned = preprocess(text)
    bag = dictionary.doc2bow(cleaned.split())
    scored = lda_model.get_document_topics(bag, minimum_probability=0)
    ranked = sorted(scored, key=lambda pair: -pair[1])

    summaries = []
    for entry in ranked[:top_n]:
        summaries.append({
            "topic_id": entry[0],
            "probability": entry[1],
            "keywords": lda_model.print_topic(entry[0]),
        })
    return summaries

In [9]:
# Build a bag-of-words corpus from the unfiltered `processed_text` column, so
# the corpus is not biased by the earlier TF-IDF filtering.
#
# The vocabulary is kept under its own name on purpose: rebinding `dictionary`
# here would discard the dictionary that was loaded together with the saved LDA
# model, and topic inference must keep using the token ids that model expects.
texts_bow = data['processed_text'].dropna().apply(lambda x: x.split()).tolist()
corpus_dictionary = Dictionary(texts_bow)
# Remove rare and overly frequent tokens (no_below=3, no_above=0.8).
corpus_dictionary.filter_extremes(no_below=3, no_above=0.8)
# One bag-of-words entry per document.
corpus = [corpus_dictionary.doc2bow(text) for text in texts_bow]

In [10]:
import gensim.downloader as api
# Load pretrained FastText word vectors (300 dimensions) through gensim's downloader.
# NOTE(review): presumably downloaded and cached on first use — confirm before running offline.
fasttext_model = api.load('fasttext-wiki-news-subwords-300')

In [11]:
def cosine_similarity(vec1, vec2):
    """Cosine of the angle between two vectors.

    Returns ``0.0`` when either vector has zero length, so callers never hit
    a division by zero.
    """
    magnitude1 = np.linalg.norm(vec1)
    magnitude2 = np.linalg.norm(vec2)
    has_zero_vector = magnitude1 == 0 or magnitude2 == 0
    return 0.0 if has_zero_vector else np.dot(vec1, vec2) / (magnitude1 * magnitude2)

def get_most_similar_docs(input_text, df, vectorize, vector_col='vector', text_col='text', top_n=5):
    """Rank the rows of ``df`` by cosine similarity to ``input_text``.

    Parameters
    ----------
    input_text
        Query text, embedded with ``vectorize``.
    df
        Frame holding one precomputed vector per row in ``vector_col``.
    vectorize
        Callable turning a text into a vector.
    vector_col, text_col
        Column names of the stored vectors and of the text to return.
    top_n : int, default 5
        Number of rows to return.

    Returns
    -------
    list of dict
        ``index`` (row label), ``score`` (cosine similarity) and ``text`` for
        the best matches, highest score first.
    """
    query_vec = np.array(vectorize(input_text))

    scored = [
        (label, cosine_similarity(query_vec, np.array(record[vector_col])))
        for label, record in df.iterrows()
    ]
    # Highest similarity first.
    scored.sort(key=lambda pair: -pair[1])

    matches = []
    for label, value in scored[:top_n]:
        matches.append({
            "index": label,
            "score": value,
            "text": df.loc[label, text_col],
        })
    return matches


def vectorize(doc):
    """Embed a text as the mean FastText vector of its known words.

    The text is split on whitespace; words missing from ``fasttext_model`` are
    skipped. A zero vector of the model's dimensionality is returned when no
    word is known.
    """
    known = [fasttext_model[token] for token in doc.split() if token in fasttext_model]
    if not known:
        # Use the model's own size instead of hard-coding 300.
        return np.zeros(fasttext_model.vector_size)
    return np.mean(known, axis=0)



In [12]:
# Sanity check: query the corpus with the first posting's own clean text. The
# stored 'fasttext_vector' column supplies the document vectors.
top_docs = get_most_similar_docs(
    input_text = data.iloc[0]['clean_text'],
    df = data,
    vectorize = vectorize,
    vector_col = 'fasttext_vector',
    text_col = 'clean_text',
    top_n = 5
)

In [13]:
top_docs

[{'index': 8816,
  'score': 0.87493986,
  'text': 'Agile Business Analyst 6+ month contract Jersey City, NJ YOU.S. Citizens and GC candidates are encouraged to apply. Please note this is for W2 pay only. 3rd party candidates are not accepted at this time. Please send resumes to Aleta at Aletaforbestc.com. Requirements and Responsibilities NOTE This person will be bringing Agile experience to this team as a value-add. Required Skills Minimum of 3 year Financial Services industry experience. Preferred This should be in support of financial trading or account management applications Minimum of 4 to 5 years as a Business Analyst. Required Wealth Management Highly Preferred. Financial trading experience is also preferred. Some experience in Project Management would be highly beneficial. Responsibilities Experience with various Business Analysis Methodologies including different agile disciplines. Experience with Agile methodology should include writing user stories, managing scrum calls, pr

# Regression

In [14]:
import pandas as pd
# Train/test frames for the salary regressor (the file names suggest they carry
# every embedding column).
# NOTE(review): pd.read_pickle unpickles arbitrary objects; only load trusted files.
df_train = pd.read_pickle("df_train_with_all_vector_embe.pkl")
df_test = pd.read_pickle("df_test_with_all_vector_embe.pkl")
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

def preprocess_df(df_train, df_test, df_vector_column: str): 
    """Build the feature matrices and targets for the salary regressor.

    The categorical columns are one-hot encoded, ``Rating`` is scaled with
    ``StandardScaler`` and the per-row embedding stored in ``df_vector_column``
    is appended. All transformers are fitted on the training frame only.

    Side effect: categorical values that occur exactly once in ``df_train`` are
    replaced by ``'Other'`` directly on the frames passed in, so both
    ``df_train`` and ``df_test`` are modified in place. A later cell re-fits
    the same feature union on the global ``df_train`` and therefore sees the
    replaced values.

    Parameters
    ----------
    df_train, df_test
        Frames containing the structured columns, the target column
        ``'High_salary_estimate (K)'`` and ``df_vector_column``.
    df_vector_column : str
        Name of the column holding one embedding vector per row.

    Returns
    -------
    tuple
        ``(X_train_combined, X_test_combined, y_train, y_test)``.
    """
    # Structured features
    # (Job Title and Job Description are left out of the structured inputs)
    categorical_cols = ['Company Name', 'Location', 'Headquarters', 'Size', 'Type of ownership',
                        'Industry', 'Sector', 'Revenue']
    # 'Low_salary_estimate (K)' is excluded on purpose: it would leak the target.
    numerical_cols = ['Rating', ]

    # target variable
    target_col = 'High_salary_estimate (K)'

    # Replace rare categories in both train and test with 'Other'
    def replace_rare(df_train, df_test, col):
        """Collapse values seen exactly once in the training column into 'Other'.

        Writes the result back into both frames and returns them.
        """
        freq = df_train[col].value_counts()
        # "Rare" means a value that occurs exactly once in the training frame.
        rare_vals = freq[freq == 1].index
        df_train[col] = df_train[col].replace(rare_vals, 'Other')
        df_test[col] = df_test[col].replace(rare_vals, 'Other')  # align with train
        return df_train, df_test
        
    for col in categorical_cols:
        df_train, df_test = replace_rare(df_train, df_test, col)

    # Build feature sets: structured columns plus the embedding column
    X_struct_train = df_train[categorical_cols + numerical_cols].copy()
    X_struct_train[df_vector_column] = df_train[df_vector_column]
    y_train = df_train[target_col]

    X_struct_test = df_test[categorical_cols + numerical_cols].copy()
    X_struct_test[df_vector_column] = df_test[df_vector_column]
    y_test = df_test[target_col]


    # Custom transformer that passes the embedding matrix into the pipeline
    class VectorEmbedder(BaseEstimator, TransformerMixin):
        """Stateless transformer that stacks one column of vectors into a 2-D array."""

        def __init__(self, vector_column):
            self.vector_column = vector_column

        def fit(self, X, y=None):
            # Nothing to learn.
            return self

        def transform(self, X):
            # Assumes each row in the column is a 1D NumPy array
            return np.vstack(X[self.vector_column].values)

    # Preprocess structured features; categories unseen during fit are ignored
    # at transform time (handle_unknown='ignore').
    categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    numeric_transformer = StandardScaler()

    preprocessor = ColumnTransformer([
        ("cat", categorical_transformer, categorical_cols),
        ("num", numeric_transformer, numerical_cols)
    ])

    # Combine structured features + text embedding
    combined_features = FeatureUnion([
        ("structured", preprocessor),
        ("vector_embeddings", VectorEmbedder(vector_column=df_vector_column))
    ])

    # Fit on train, then apply the fitted transformers to test
    X_train_combined = combined_features.fit_transform(X_struct_train)
    X_test_combined = combined_features.transform(X_struct_test)

    # NOTE: the fitted `combined_features` object is not returned.
    return X_train_combined, X_test_combined, y_train, y_test

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
class SimpleMLP(nn.Module):
    """Feed-forward regressor: input -> 512 -> 256 -> 64 -> 1.

    Every hidden layer is Linear -> BatchNorm1d -> ReLU -> Dropout; the output
    layer is a plain Linear that produces one value per sample.

    The attribute names (``fc1`` ... ``fc4``, ``bn1`` ... ``bn3``) are the keys
    of the state dict, so they must stay as they are for saved checkpoints to
    load.

    Parameters
    ----------
    input_size : int
        Number of input features.
    dropout_rate : float
        Dropout probability shared by the three hidden layers.
    """

    def __init__(self, input_size, dropout_rate):
        super().__init__()
        # Hidden block 1
        self.fc1 = nn.Linear(input_size, 512)
        self.bn1 = nn.BatchNorm1d(512)
        # Hidden block 2
        self.fc2 = nn.Linear(512, 256)
        self.bn2 = nn.BatchNorm1d(256)
        # Hidden block 3
        self.fc3 = nn.Linear(256, 64)
        self.bn3 = nn.BatchNorm1d(64)
        # Output head
        self.fc4 = nn.Linear(64, 1)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        """Map a ``(batch, input_size)`` tensor to ``(batch, 1)`` predictions."""
        hidden_blocks = (
            (self.fc1, self.bn1),
            (self.fc2, self.bn2),
            (self.fc3, self.bn3),
        )
        for linear, norm in hidden_blocks:
            x = self.dropout(F.relu(norm(linear(x))))
        return self.fc4(x)
    
# Fit the feature pipeline on the training frame and transform both frames.
# Note: preprocess_df also rewrites rare categories inside df_train / df_test.
X_train_combined, X_test_combined, y_train, y_test = preprocess_df(df_train, df_test, df_vector_column = "fasttext_vector")

# Split the held-out frame in half: one part for validation, one for testing.
X_val, X_test, y_val, y_test = train_test_split(X_test_combined, y_test, test_size=0.5, random_state=42)

# Convert every split to float32 tensors on the selected device.
X_train_tensor = torch.tensor(X_train_combined, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).to(device)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32).to(device)
y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32).to(device)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).to(device)

# Create TensorDatasets and DataLoaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
test_dataset  = TensorDataset(X_test_tensor, y_test_tensor)

batch_size = 128
# Only the training loader is shuffled: evaluation does not depend on sample
# order, so validation and test batches keep a fixed, reproducible order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader  = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Create model, loss function and optimizer
input_size = X_train_tensor.shape[1]
model_mlp = SimpleMLP(input_size=input_size, dropout_rate=0.4).to(device)
# A higher weight_decay means stronger regularization.
optimizer = torch.optim.AdamW(model_mlp.parameters(), lr=0.001, weight_decay=0.01)
# T_max = number of steps (usually epochs) to anneal over; eta_min = minimum learning rate.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10, eta_min=1e-5)
criterion = nn.MSELoss()
# Restore the trained weights. map_location lets a checkpoint that was saved on
# a GPU be loaded on a CPU-only machine.
# load_state_dict returns a report of missing/unexpected keys, not the model;
# the result is kept under its original name, and `model_mlp` is the object to
# use for inference.
myMOdel = model_mlp.load_state_dict(
    torch.load("best_mlp_fasttext.pt", map_location=device, weights_only=True)
)

Using device: cpu


In [15]:
def preprocess_single_row(df_row, combined_features, categorical_cols, numerical_cols, df_vector_column):
    """Turn one posting into the feature vector expected by the regressor.

    Parameters
    ----------
    df_row : pandas.Series or pandas.DataFrame
        The posting; a Series is first converted to a one-row frame.
    combined_features
        Fitted transformer exposing ``transform``.
    categorical_cols, numerical_cols : list of str
        Structured columns to select, in that order.
    df_vector_column : str
        Column holding the text embedding.

    Returns
    -------
    numpy.ndarray
        The transformed features with length-one axes squeezed away.
    """
    frame = df_row.to_frame().T if isinstance(df_row, pd.Series) else df_row

    structured_cols = categorical_cols + numerical_cols
    features = frame[structured_cols].copy()
    features[df_vector_column] = frame[df_vector_column]

    transformed = combined_features.transform(features)
    return transformed.squeeze()


In [16]:
# Structured feature columns; same lists as the ones hard-coded in preprocess_df.
categorical_cols = ['Company Name', 'Location', 'Headquarters', 'Size', 'Type of ownership',
                        'Industry', 'Sector', 'Revenue']
# 'Low_salary_estimate (K)' is excluded on purpose: it would leak the target.
numerical_cols = ['Rating', ]

# Target variable and the column holding the text embedding.
target_col = 'High_salary_estimate (K)'
df_vector_column = "fasttext_vector"
# Categories unseen during fit are ignored at transform time.
categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
numeric_transformer = StandardScaler()
preprocessor = ColumnTransformer([
        ("cat", categorical_transformer, categorical_cols),
        ("num", numeric_transformer, numerical_cols)
    ])
class VectorEmbedder(BaseEstimator, TransformerMixin):
    """Stateless transformer that exposes a column of vectors as a 2-D array.

    Parameters
    ----------
    vector_column : str
        Name of the column whose cells each hold one embedding vector.
    """

    def __init__(self, vector_column):
        self.vector_column = vector_column

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Stack the vectors of ``vector_column`` row by row."""
        # Assumes each row in the column is a 1D NumPy array
        embeddings = X[self.vector_column]
        return np.vstack(embeddings.values)
        
# Same feature union as the one built inside preprocess_df; it is rebuilt here
# because preprocess_df does not return its fitted transformer.
combined_features = FeatureUnion([
        ("structured", preprocessor),
        ("vector_embeddings", VectorEmbedder(vector_column=df_vector_column))
])

# Fit on the training frame so that single rows can be transformed later.
# NOTE(review): df_train was already modified in place by the earlier
# preprocess_df call (rare categories -> 'Other'). The one-hot layout produced
# here presumably has to match the one the saved MLP was trained on — confirm.
X_struct_train = df_train[categorical_cols + numerical_cols].copy()
X_struct_train[df_vector_column] = df_train[df_vector_column]
X_train_combined = combined_features.fit_transform(X_struct_train)

In [17]:
# Transform the first row of `data` and inspect the length of its feature vector.
to_test = preprocess_single_row(data.iloc[[0]], combined_features, categorical_cols, numerical_cols, df_vector_column)
to_test.shape


(3017,)

In [18]:
# Switch to evaluation mode before inference (the model contains Dropout and
# BatchNorm layers, which behave differently while training).
model_mlp.eval()

# Turn the feature vector into a float32 batch of size one on the model's device.
sample_input = torch.tensor(to_test, dtype=torch.float32).unsqueeze(0).to(device)

# Predict without tracking gradients
with torch.no_grad():
    prediction = model_mlp(sample_input)

print("Prediction of High_salary_estimate (K):", round(prediction.item(), 2))

Prediction of High_salary_estimate (K): 126.03


# Dashboard

In [25]:
import dash
from dash import html, dcc, Output, Input
import plotly.graph_objs as go
import numpy as np
import pandas as pd
import torch
import plotly.express as px
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud
import base64
from io import BytesIO


# Dash application instance; the layout and the callback below are attached to it.
app = dash.Dash(__name__)

# Columns collected from the input form. They repeat the lists used when
# fitting `combined_features`.
categorical_cols = ['Company Name', 'Location', 'Headquarters', 'Size', 'Type of ownership', 'Industry', 'Sector', 'Revenue']
numerical_cols = ['Rating']
df_vector_column = 'fasttext_vector'
# NOTE(review): all_feature_names is not referenced anywhere else in this notebook.
all_feature_names = categorical_cols + numerical_cols + [df_vector_column]

# Layout of the app
# NOTE(review): the style dictionaries mix CSS kebab-case keys ('text-align')
# with camelCase keys ('textAlign'); Dash documents camelCase — consider
# normalising them.
app.layout = html.Div([
    # Header banner: dashboard title and a short usage hint
    html.Div([
        html.H1("Data Careers Dashboard", style={
            'text-align': 'center',
            'color': 'white',
            'margin': '0'
        }),
        html.Div([
            html.H3(
                "Enter a job offer description to discover its most relevant topics. You will see the top topic probabilities along with their main keywords, and a list of the most semantically similar job postings.",
                style={
                    'color': 'white',
                    'margin': '15px 0 20px 0',
                    'font-weight': 'normal',
                    'line-height': '1.4',
                    'text-align': 'center'
                }
            )
        ], style={
            'max-width': '800px',
            'margin': '0 auto'
        })
    ], style={
        'background-color': '#003366',
        'padding': '40px 20px',
        'margin-bottom': '20px',
        'font-family': '"Times New Roman", Times, serif',
        'border-top-left-radius': '20px',
        'border-top-right-radius': '20px'
    })
    ,

    # Section divider for the prediction area
    html.Div("Job Offer Topics, Salary Prediction and Recommendations", style={
        'margin': '10px 0 20px 0',
        'text-align': 'center',
        'font-size': '20px',
        'font-weight': 'bold',
        'color': '#003366',
        'border-top': '2px solid #003366',
        'border-bottom': '2px solid #003366',
        'padding': '20px 0'

    }),

    # One text input per structured feature. The component id is
    # 'input-<column name, lower-cased, spaces replaced by dashes>'; the
    # callback builds its Input list with the same rule.
    html.Div([
        html.Div([
            html.Label(col),
            dcc.Input(
                id=f'input-{col.lower().replace(" ", "-")}',
                type='text',
                placeholder=f'Enter {col}',
                style={'width': '100%'}
            )
        ], style={'padding': '5px', 'width': '24%'})
        for col in categorical_cols + numerical_cols
    ], style={
        'display': 'flex',
        'flex-wrap': 'wrap',
        'gap': '10px',
        'padding': '0 40px',
        'margin-bottom': '20px'
    }),

    # Left: job description box and Predict button. Right: topic chart placeholder.
    html.Div([
        html.Div([
            dcc.Textarea(
                id='input-text',
                placeholder='Enter your job description here...',
                style={'width': '100%', 'height': 250}
            ),
            html.Button("Predict", id='submit-btn', n_clicks=0, style={'margin-top': '0px'})
        ], style={'width': '50%', 'padding': '5px', 'box-sizing': 'border-box'}),

        html.Div([
            html.Div(id='output-topic')
        ], style={'width': '50%', 'padding': '0px 20px 20px 20px', 'box-sizing': 'border-box'})
    ], style={
        'display': 'flex',
        'justify-content': 'space-between',
        'padding': '0 40px'
    }),

    # Placeholder for the predicted salary
    html.Div(id='salary-prediction-output', style={
        'textAlign': 'center',
        'fontSize': '22px',
        'color': '#003366',
        'padding': '20px'
    }),

    # Similar postings, followed by the divider of the LDA section
    html.Div([
        html.H4("Most Similar Job Offers:", style={'color': '#003366'}),
        html.Div(id='similar-docs-output'),

        html.Div("LDA Topic Understanding", style={
            'margin': '60px 0 20px 0',
            'text-align': 'center',
            'font-size': '20px',
            'font-weight': 'bold',
            'color': '#003366',
            'border-top': '2px solid #003366',
            'border-bottom': '2px solid #003366',
            'padding': '20px 0'
        })
    ], style={
        'width': '100%',
        'padding': '0 40px',
        'box-sizing': 'border-box'
    }),

    # Embedded LDA visualisation, loaded from /assets/lda_vis.html
    html.Div([
        html.Iframe(
            src='/assets/lda_vis.html',
            style={'width': '80%', 'height': '800px', 'border': 'none'}
        )
    ], style={
        'display': 'flex',
        'justify-content': 'center',
        'width': '100%',
        'height': '850px',
        'overflow': 'hidden',
        'padding': '20px',
        'box-sizing': 'border-box'
    }),

    # Section divider for the salary-range analysis
    html.Div("Distinctive Words by Salary Range (TF-IDF Contrast) and Job Title WordCloud", style={
        'margin': '60px 0 20px 0',
        'text-align': 'center',
        'font-size': '20px',
        'font-weight': 'bold',
        'color': '#003366',
        'border-top': '2px solid #003366',
        'border-bottom': '2px solid #003366',
        'padding': '20px 0'
    }),

    html.Div([
        # Slider bounds are the min/max high-salary estimate found in `data`;
        # the initial selection spans the 25th to the 75th percentile and a
        # tick mark is drawn every 20 units.
        dcc.RangeSlider(
            id='salary-range-slider',
            min=int(data['High_salary_estimate (K)'].min()),
            max=int(data['High_salary_estimate (K)'].max()),
            step=1,
            value=[
                int(data['High_salary_estimate (K)'].quantile(0.25)),
                int(data['High_salary_estimate (K)'].quantile(0.75))
            ],
            marks={i: str(i) for i in range(
                int(data['High_salary_estimate (K)'].min()),
                int(data['High_salary_estimate (K)'].max()) + 1,
                20
            )},
            tooltip={"placement": "bottom", "always_visible": True}
        ),

        # Caption and bar chart filled in by the callback
        html.Div(id='range-display', style={'margin': '20px 0', 'textAlign': 'center'}),
        dcc.Graph(id='distinctive-words-graph'),

        html.H3("Most Common Job Titles", style={
            'textAlign': 'center',
            'marginTop': '40px',
            'color': '#003366'
        }),
        # Word cloud image; the callback supplies its src
        html.Img(id='wordcloud-img', style={
            'display': 'block',
            'margin': '0 auto',
            'maxWidth': '100%'
        })
    ], style={
        'margin-left': '40px',
        'margin-right': '40px'
    }),

    # Footer with the team credits
    html.Footer("Made by Team: Iker Rosales Saiz, Cesar Álvarez-Cascos Hervías, Segio Vizcaíno Ferrer and Alejandro Ponce Fernández", style={
        'textAlign': 'center',
        'padding': '20px',
        'background-color': '#f0f0f0',
        'color': '#555',
        'margin-top': '40px',
        'fontStyle': 'italic'
    })
])


# -----------------------------
# Utility Functions
# -----------------------------

def get_top_topics(text, lda_model, dictionary, preprocess, top_n=3):
    """Return the ``top_n`` most probable LDA topics for a raw text.

    NOTE: a function with the same name and behaviour is defined in an earlier
    cell; this definition replaces it when the dashboard cell runs.

    Parameters
    ----------
    text
        Raw input, passed to ``preprocess``.
    lda_model
        Model exposing ``get_document_topics`` and ``print_topic``.
    dictionary
        Object exposing ``doc2bow`` for a list of tokens.
    preprocess
        Callable returning a whitespace-separated token string.
    top_n : int, default 3
        Number of topics to keep.

    Returns
    -------
    list of dict
        ``topic_id``, ``probability`` and ``keywords`` per topic, most probable
        first.
    """
    def describe(entry):
        # entry is a (topic_id, probability) pair
        return {
            "topic_id": entry[0],
            "probability": entry[1],
            "keywords": lda_model.print_topic(entry[0]),
        }

    bag = dictionary.doc2bow(preprocess(text).split())
    ranked = sorted(
        lda_model.get_document_topics(bag, minimum_probability=0),
        key=lambda pair: -pair[1],
    )
    return [describe(entry) for entry in ranked[:top_n]]

def cosine_similarity(vec1, vec2):
    """Cosine of the angle between two vectors; ``0.0`` if either has zero length."""
    length1 = np.linalg.norm(vec1)
    length2 = np.linalg.norm(vec2)
    if length1 != 0 and length2 != 0:
        return np.dot(vec1, vec2) / (length1 * length2)
    # At least one vector is all zeros: define the similarity as zero.
    return 0.0

def get_most_similar_docs(input_text, df, vectorize, vector_col='vector', text_col='text', top_n=5):
    """Rank the rows of ``df`` by cosine similarity to ``input_text``.

    Unlike the helper with the same name in an earlier cell, the result dicts
    use the key ``"similarity"`` (not ``"score"``).

    Parameters
    ----------
    input_text
        Query text, embedded with ``vectorize``.
    df
        Frame holding one precomputed vector per row in ``vector_col``.
    vectorize
        Callable turning a text into a vector.
    vector_col, text_col
        Column names of the stored vectors and of the text to return.
    top_n : int, default 5
        Number of rows to return.

    Returns
    -------
    list of dict
        ``index`` (row label), ``similarity`` and ``text`` of the best
        matches, most similar first.
    """
    query_vec = np.array(vectorize(input_text))

    scored = []
    for label, record in df.iterrows():
        candidate = np.array(record[vector_col])
        scored.append((label, cosine_similarity(query_vec, candidate)))

    # Most similar first.
    scored.sort(key=lambda pair: -pair[1])

    best = scored[:top_n]
    return [
        {"index": label, "similarity": value, "text": df.loc[label, text_col]}
        for label, value in best
    ]

def get_distinctive_words(df, salary_min, salary_max, top_n=20):
    """Find the words that characterise postings inside a salary range.

    The postings are split into those whose ``'High_salary_estimate (K)'`` lies
    in ``[salary_min, salary_max]`` and all the others. A TF-IDF model is
    fitted on both groups together, and words are ranked by the difference of
    their mean TF-IDF weight (inside minus outside).

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``'High_salary_estimate (K)'`` and ``'clean_text'``.
    salary_min, salary_max : number
        Inclusive bounds of the salary range.
    top_n : int, default 20
        Number of words to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``word`` and ``score``, highest score first. The frame is empty
        when either group has fewer than 3 texts or when no term survives the
        vectoriser's filtering.
    """
    salary = df['High_salary_estimate (K)']
    # A boolean mask (rather than an index lookup) keeps the split correct even
    # when the frame's index labels are not unique.
    in_range = (salary >= salary_min) & (salary <= salary_max)

    corpus_A = df.loc[in_range, 'clean_text'].dropna().astype(str).tolist()
    corpus_B = df.loc[~in_range, 'clean_text'].dropna().astype(str).tolist()

    if len(corpus_A) < 3 or len(corpus_B) < 3:
        return pd.DataFrame(columns=['word', 'score'])

    # Words of at least 4 characters that appear in at least 3 documents.
    vectorizer = TfidfVectorizer(stop_words='english', min_df=3, token_pattern=r'\b\w{4,}\b')
    try:
        vectorizer.fit(corpus_A + corpus_B)
    except ValueError:
        # Raised by the vectoriser when the vocabulary ends up empty; report
        # "no result" the same way as for too-small groups.
        return pd.DataFrame(columns=['word', 'score'])

    # Mean TF-IDF weight per term in each group, flattened to 1-D arrays.
    tfidf_A = np.asarray(vectorizer.transform(corpus_A).mean(axis=0)).flatten()
    tfidf_B = np.asarray(vectorizer.transform(corpus_B).mean(axis=0)).flatten()

    diff = tfidf_A - tfidf_B
    words = np.array(vectorizer.get_feature_names_out())
    # Indices of the largest differences, in descending order.
    top_idx = np.argsort(diff)[-top_n:][::-1]

    return pd.DataFrame({'word': words[top_idx], 'score': diff[top_idx]})

def update_graph(salary_range):
    """Build the caption and bar chart for the selected salary range.

    Parameters
    ----------
    salary_range : sequence of two numbers
        Lower and upper bound chosen on the slider.

    Returns
    -------
    tuple
        ``(caption, figure)``. The figure is a titled, empty bar chart when
        ``get_distinctive_words`` returns no rows.
    """
    low, high = salary_range
    caption = f"Showing distinctive words for jobs with salary between {low}K and {high}K"

    words = get_distinctive_words(data, low, high)
    if words.empty:
        return caption, px.bar(title="Not enough data to compute distinctive words")

    chart = px.bar(
        words,
        x='word',
        y='score',
        title=f"Top Distinctive Words ({low}K–{high}K)",
        labels={'score': 'TF-IDF Difference'},
        color='score',
        color_continuous_scale='Blues',
    )
    # White backgrounds with dark-blue text, matching the rest of the dashboard.
    chart.update_layout(
        plot_bgcolor='#ffffff',
        paper_bgcolor='#ffffff',
        font=dict(color='#003366'),
        xaxis=dict(gridcolor='white', color='#003366'),
        yaxis=dict(gridcolor='white', color='#003366'),
        title=dict(font=dict(size=18, color='#003366')),
    )
    return caption, chart

def generate_wordcloud(df):
    """Render the job titles of ``df`` as a word cloud image.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``'Job Title'`` column; missing titles are ignored.

    Returns
    -------
    str
        A ``data:image/png;base64,...`` URI usable as an image ``src``, or an
        empty string when there is nothing to draw (for example when the
        selected salary range contains no postings).
    """
    text = ' '.join(df['Job Title'].dropna().astype(str).tolist())

    # WordCloud cannot draw an image without words, so bail out early instead
    # of letting the dashboard callback fail.
    if not text.strip():
        return ''

    try:
        wordcloud = WordCloud(
            width=800,
            height=400,
            background_color='white',
            max_words=60,                  # fewer words
            prefer_horizontal=0.95,        # strongly prefer horizontal
            colormap='Blues',              # optional: color scheme
            contour_color='lightgray',     # optional: soft outline
            contour_width=1
        ).generate(text)
    except ValueError:
        # Raised when no usable word is left after WordCloud's own filtering.
        return ''

    # Encode the PNG in memory so it can be embedded directly in the page.
    buffer = BytesIO()
    wordcloud.to_image().save(buffer, format='PNG')
    encoded_image = base64.b64encode(buffer.getvalue()).decode()
    return f'data:image/png;base64,{encoded_image}'

# -----------------------------
# Callbacks
# -----------------------------

# Value used when a numeric form field is blank or not a number.
# NOTE(review): presumably the mean Rating of the training data — confirm.
_DEFAULT_NUMERIC_FILL = 3.780978090766823

# Human-readable labels for the LDA topic ids shown in the bar chart.
_TOPIC_NAMES = {
    0: "Healthcare & Research",
    1: "Equality Consciousness",
    2: "Business Analysts & Project Management",
    3: "Management & High paid roles",
    4: "Data Science & ML",
    5: "Data Engineering & Cloud"
}


def _predict_salary(text, input_values):
    """Predict the high salary estimate (K) for one posting entered in the form."""
    row = dict(zip(categorical_cols + numerical_cols, input_values))
    row['Job Description'] = text
    df = pd.DataFrame([row])

    df[df_vector_column] = [vectorize(text)]
    for col in numerical_cols:
        # Assign the result back: calling fillna(inplace=True) on df[col] acts
        # on a temporary object and does not update the frame under pandas
        # Copy-on-Write.
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(_DEFAULT_NUMERIC_FILL)

    features = preprocess_single_row(df.iloc[[0]], combined_features, categorical_cols, numerical_cols, df_vector_column)
    model_mlp.eval()
    sample_input = torch.tensor(features, dtype=torch.float32).unsqueeze(0).to(device)
    with torch.no_grad():
        prediction = model_mlp(sample_input)
    return round(prediction.item(), 2)


def _topic_probability_chart(text):
    """Build the horizontal bar chart with the probability of every LDA topic."""
    num_topics = lda_model.num_topics
    results = get_top_topics(text, lda_model, dictionary, preprocess_text, top_n=num_topics)

    # Map topic ids to their display names, with a generic fallback.
    topic_labels = [_TOPIC_NAMES.get(res['topic_id'], f"Topic {res['topic_id']}") for res in results]

    return dcc.Graph(
        figure=go.Figure(
            data=[
                go.Bar(
                    y=topic_labels,
                    x=[res['probability'] for res in results],
                    text=[f"{res['probability']:.4f}" for res in results],
                    textposition='outside',
                    orientation='h',
                    marker=dict(color='rgba(30, 144, 255, 0.8)', line=dict(width=1, color='white')),
                    hoverinfo='x+y'
                )
            ],
            layout=go.Layout(
                title=dict(text="Topic Probabilities", font=dict(size=20, color='#003366')),
                # Axis titles use title.text / title.font; the older `titlefont`
                # axis property is deprecated.
                xaxis=dict(
                    title=dict(text="Probability", font=dict(color='#003366')),
                    gridcolor='lightgrey',
                    color='#003366',  # dark-blue X axis
                    tickfont=dict(color='#003366')
                ),
                yaxis=dict(
                    title=dict(text="Topic", font=dict(color='#003366')),
                    categoryorder='array',
                    categoryarray=topic_labels,
                    tickfont=dict(color='#003366')  # colour of the Y-axis labels
                ),
                height=30 * num_topics + 120,
                plot_bgcolor='white',
                paper_bgcolor='white',
                font=dict(color='#003366'),
                margin=dict(l=180, r=20, t=60, b=40)  # wide left margin so labels are not cut off
            )
        )
    )


def _similar_postings_section(text):
    """Build one collapsible card per posting most similar to ``text``."""
    similar_docs = get_most_similar_docs(text, data, vectorize, vector_col='fasttext_vector', text_col='clean_text', top_n=5)
    return html.Div([
        html.Details([
            html.Summary(
                f"Top {i+1} - Similarity: {doc['similarity']:.4f} - {data.loc[doc['index'], 'Job Title']} at {data.loc[doc['index'], 'Company Name']}"
            ),
            html.P([html.Strong("Job Title: "), data.loc[doc['index'], 'Job Title']]),
            html.P([html.Strong("Company Name: "), data.loc[doc['index'], 'Company Name']]),
            html.P([html.Strong("Salary Range: "), f"{data.loc[doc['index'], 'Low_salary_estimate (K)']}K - {data.loc[doc['index'], 'High_salary_estimate (K)']}K"]),
            html.P(doc['text'])
        ], style={
            'margin-bottom': '15px',
            'padding': '15px',
            'border': '1px solid #ccc',
            'border-radius': '10px',
            'box-shadow': '0 2px 5px rgba(0, 0, 0, 0.05)',
            'background-color': 'white'
        })
        for i, doc in enumerate(similar_docs)
    ])


@app.callback(
    Output('range-display', 'children'),
    Output('distinctive-words-graph', 'figure'),
    Output('output-topic', 'children'),
    Output('similar-docs-output', 'children'),
    Output('salary-prediction-output', 'children'),
    Output('wordcloud-img', 'src'),
    Input('salary-range-slider', 'value'),
    Input('submit-btn', 'n_clicks'),
    Input('input-text', 'value'),
    *[Input(f'input-{col.lower().replace(" ", "-")}', 'value') for col in categorical_cols + numerical_cols]
)
def update_output(salary_range, n_clicks, text, *input_values):
    """Refresh every dynamic part of the dashboard.

    Parameters
    ----------
    salary_range : list of two numbers
        Current slider selection.
    n_clicks : int
        Number of times the Predict button was pressed.
    text : str or None
        Job description typed by the user.
    *input_values
        Values of the structured form fields, in the order
        ``categorical_cols + numerical_cols``.

    Returns
    -------
    tuple
        Range caption, distinctive-words figure, topic chart, similar postings,
        salary prediction and word-cloud image source. The three prediction
        outputs are empty strings until the button has been pressed and a
        description is present.
    """
    display_text, fig = update_graph(salary_range)

    # The word cloud only covers postings inside the selected salary range.
    salary_min, salary_max = salary_range
    df_filtered = data[(data['High_salary_estimate (K)'] >= salary_min) & (data['High_salary_estimate (K)'] <= salary_max)]
    wordcloud_src = generate_wordcloud(df_filtered)

    if not (isinstance(n_clicks, int) and n_clicks > 0 and text):
        return display_text, fig, "", "", "", wordcloud_src

    salary_estimate = _predict_salary(text, input_values)
    bar_chart = _topic_probability_chart(text)
    collapsibles = _similar_postings_section(text)

    return display_text, fig, html.Div([bar_chart]), collapsibles, html.H4(f"Predicted High Salary Estimate (K): {salary_estimate}"), wordcloud_src


# -----------------------------
# Execute app
# -----------------------------
# Start the Dash server with debug mode enabled when this code runs as the main module.
if __name__ == '__main__':
    app.run(debug=True)