## Python Modules

In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.linear_model import LinearRegression
import plotly.express as px

## Transform dataset

In [2]:

FECHA_COL = 'Fecha_Publicacion'
TOKENS_COL = 'data_tokenized'

dataset = pd.read_csv('data/processed/dataset_fw.csv', converters={TOKENS_COL: eval})
dataset[FECHA_COL] = pd.to_datetime(dataset[FECHA_COL])
dataset['time_segment'] = dataset[FECHA_COL].dt.to_period('M').astype('category')

n = dataset['time_segment'].nunique()
time_segments = sorted(dataset['time_segment'].cat.categories.tolist())
time_index_map = {seg: i for i, seg in enumerate(time_segments)}

# CREAR SERIES TEMPORALES
term_time_series = {}
for period, group in dataset.groupby('time_segment'):
    all_words = [word for tokens in group[TOKENS_COL] for word in tokens]
    term_freq = Counter(all_words)
    doc_freq = Counter([word for tokens in group[TOKENS_COL] for word in set(tokens)])

    for term in term_freq:
        if term not in term_time_series:
            term_time_series[term] = {
                'TF': [0] * n,
                'DF': [0] * n,
                'TotalTerms': [0] * n,
                'TotalDocs': [0] * n,
            }
        idx = time_index_map[period]
        term_time_series[term]['TF'][idx] = term_freq[term]
        term_time_series[term]['DF'][idx] = doc_freq[term]
        term_time_series[term]['TotalTerms'][idx] = len(all_words)
        term_time_series[term]['TotalDocs'][idx] = len(group)

# CALCULAR DoV y DoD COMO PENDIENTE
records = []
tw = 0.05

for term, stats in term_time_series.items():
    tf = np.array(stats['TF'])
    df = np.array(stats['DF'])
    total_terms = np.array(stats['TotalTerms'])
    total_docs = np.array(stats['TotalDocs'])

    DoV = (tf / total_terms) * (1 - tw * (n - np.arange(n)))
    DoD = (df / total_docs) * (1 - tw * (n - np.arange(n)))

    if np.any(np.isnan(DoV)) or np.any(np.isnan(DoD)):
        continue
    if np.any(np.isinf(DoV)) or np.any(np.isinf(DoD)):
        continue

    X = np.arange(n).reshape(-1, 1)
    slope_dov = LinearRegression().fit(X, DoV).coef_[0]
    slope_dod = LinearRegression().fit(X, DoD).coef_[0]

    records.append({
        'Keyword': term,
        'Total_TF': np.sum(tf),
        'Total_DF': np.sum(df),
        'DoV': slope_dov,
        'DoD': slope_dod
    })

result_df = pd.DataFrame(records)

# PROMEDIOS PARA DETECCIÓN DE SEÑALES DÉBILES
mean_dov = result_df['DoV'].mean()
mean_dod = result_df['DoD'].mean()
mean_tf = result_df['Total_TF'].mean()
mean_df = result_df['Total_DF'].mean()

# CLASIFICACIÓN DE CUADRANTES (DETECCIÓN DE SEÑALES DÉBILES)
result_df['Weak_DoV'] = (result_df['DoV'] > mean_dov) & (result_df['Total_TF'] < mean_tf)
result_df['Weak_DoD'] = (result_df['DoD'] > mean_dod) & (result_df['Total_DF'] < mean_df)
result_df['Weak_Signal'] = result_df['Weak_DoV'] & result_df['Weak_DoD']

# MAPA KEM
fig_kem = px.scatter(result_df, x='Total_TF', y='DoV',
                     title='Keyword Emergence Map (KEM)',
                     hover_name='Keyword',
                     color='Weak_Signal',
                     color_discrete_map={True: 'red', False: 'gray'},
                     labels={'Total_TF': 'Frecuencia Total', 'DoV': 'Pendiente de Visibilidad'})
fig_kem.add_hline(y=mean_dov, line_dash='dash', line_color='blue')
fig_kem.add_vline(x=mean_tf, line_dash='dash', line_color='blue')
fig_kem.show()

# MAPA KIM
fig_kim = px.scatter(result_df, x='Total_DF', y='DoD',
                     title='Keyword Issue Map (KIM)',
                     hover_name='Keyword',
                     color='Weak_Signal',
                     color_discrete_map={True: 'red', False: 'gray'},
                     labels={'Total_DF': 'Frecuencia Documental', 'DoD': 'Pendiente de Difusión'})
fig_kim.add_hline(y=mean_dod, line_dash='dash', line_color='blue')
fig_kim.add_vline(x=mean_df, line_dash='dash', line_color='blue')
fig_kim.show()

# EXPORTAR SEÑALES DÉBILES DETECTADAS
weak_signals = result_df[result_df['Weak_Signal'] == True]
weak_signals.to_csv('data/processed/weak_signals_regression.csv', index=False)
print(weak_signals[['Keyword', 'DoV', 'DoD']].head(10))


  for period, group in dataset.groupby('time_segment'):
  DoV = (tf / total_terms) * (1 - tw * (n - np.arange(n)))
  DoD = (df / total_docs) * (1 - tw * (n - np.arange(n)))
  DoV = (tf / total_terms) * (1 - tw * (n - np.arange(n)))
  DoD = (df / total_docs) * (1 - tw * (n - np.arange(n)))
  DoV = (tf / total_terms) * (1 - tw * (n - np.arange(n)))
  DoD = (df / total_docs) * (1 - tw * (n - np.arange(n)))
  DoV = (tf / total_terms) * (1 - tw * (n - np.arange(n)))
  DoD = (df / total_docs) * (1 - tw * (n - np.arange(n)))
  DoV = (tf / total_terms) * (1 - tw * (n - np.arange(n)))
  DoD = (df / total_docs) * (1 - tw * (n - np.arange(n)))
  DoV = (tf / total_terms) * (1 - tw * (n - np.arange(n)))
  DoD = (df / total_docs) * (1 - tw * (n - np.arange(n)))
  DoV = (tf / total_terms) * (1 - tw * (n - np.arange(n)))
  DoD = (df / total_docs) * (1 - tw * (n - np.arange(n)))
  DoV = (tf / total_terms) * (1 - tw * (n - np.arange(n)))
  DoD = (df / total_docs) * (1 - tw * (n - np.arange(n)))
  DoV = 

        Keyword       DoV       DoD
93     ministra  0.000018  0.002569
112   excepcion  0.000016  0.002399
434     interes  0.000016  0.002677
885   construir  0.000015  0.002372
917    distrito  0.000018  0.002481
1352   terminal  0.000016  0.002470
1855   palacios  0.000015  0.003266
1976  conflicto  0.000020  0.003202
2105  vehicular  0.000016  0.002394
