In [1]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPRegressor
import bokeh.plotting as bpl

In [2]:
bpl.output_notebook()

Definición de funciones necesarias para entrenar el modelo de alcance

In [3]:
def entrenador(arreglo,alcances):
    """Entrena el modelo utilizando un arreglo de publicaciones o un dataframe y sus alcances.
    
    Parameters:
        arreglo (arreglo de numpy, también puede ser un dataframe):
            Arreglo multidimensional con los valores de las métricas para cada publicación.
            Cada publicación está en una fila del arreglo.
            El orden de las métricas debe ser el siguiente [likes,love,angry,wow,haha,sad,shares].
        alcances (arreglo de numpy, tambien puede ser una serie):
            Arreglo unidimensional con los valores de los alcances para cada publicación.
            Cada publicación está en una fila del arreglo.
        
    Returns:
        red (red neuronal de Sklearn):
            Modelo de red neuronal entrenada para predecir los alcances de publicaciones.
            
    """
    logtrain = np.log1p(arreglo)
    logpredi = np.log1p(alcances)
    
    red = MLPRegressor(alpha=0.01, hidden_layer_sizes = (10,), max_iter = 50000, 
                 activation = 'logistic', learning_rate = 'adaptive',solver= 'lbfgs')
    
    red.fit(logtrain,logpredi)
    
    return red

In [4]:
def predictor(arreglo,modelo):
    """Predice los alcances para un arreglo de publicaciones o un dataframe.
    
    Parameters:
        arreglo (arreglo de numpy, también puede ser un dataframe):
            Arreglo multidimensional con los valores de las métricas para cada publicación.
            Cada publicación está en una fila del arreglo.
            El orden de las métricas debe ser el siguiente [likes,love,angry,wow,haha,sad,shares].
            
        modelo (modelo de sklearn):
            El modelo de predicción entrenado previamente
            
    Returns:
        alcances (arreglo de numpy):
            Arreglo con los alcances para cada publicación.
            
    """
    logdata = np.log1p(arreglo)
    predata = modelo.predict(logdata)
    bacdata = np.expm1(predata)
    
    return bacdata

Carga y procesado de datos para entrenar la red para predecir alcance

In [5]:
data = pd.read_csv("../data/originales/posts.csv")

In [6]:
metricas = ['likes', 'love', 'angry', 'wow', 'haha', 'sad', 'shares']

In [7]:
data["reacciones"] = data[metricas].sum(1)

In [8]:
fdata=data[(data["scope"]!=0)&(data["reacciones"]>10)&(data["reacciones"]<=data["scope"])]

In [9]:
mdata = fdata[metricas + ["scope"]]

In [10]:
mdata[:3]

Unnamed: 0,likes,love,angry,wow,haha,sad,shares,scope
0,18,7,0,0,0,0,4,3660
1,526,117,189,15,51,8,107,77468
2,28,1,0,0,0,0,13,4399


In [11]:
arr_metricas = mdata[metricas].values

In [12]:
arr_metricas

array([[ 18,   7,   0, ...,   0,   0,   4],
       [526, 117, 189, ...,  51,   8, 107],
       [ 28,   1,   0, ...,   0,   0,  13],
       ...,
       [ 23,   0,   1, ...,   0,   3,   1],
       [ 13,   0,   4, ...,   0,   0,   0],
       [ 24,   0,   0, ...,   0,   0,   0]], dtype=int64)

In [13]:
arr_alcances = mdata["scope"].values

Carga de datos de publicaciones y temas

In [14]:
datap = pd.read_csv("../data/originales/Post_CDMX.tsv",sep='\t')
datat = pd.read_csv("../data/originales/Temas_CDMX.tsv",sep='\t')

  interactivity=interactivity, compiler=compiler, result=result)


Proceso de datos de temas y publicaciones

In [15]:
datat.columns=[cadena + "_T" for cadena in datat.columns]

In [16]:
datat.rename(columns={"id_T":"idTema"},inplace=True)

Mezcla de los datos de publicaciones y de temas

In [17]:
datamix = pd.merge(datap,datat,how="left",on="idTema")

Llenado de datos vacíos a cero

In [18]:
datafp = datamix[metricas].fillna(0)

Predicción de alcances para todas las publicaciones

In [19]:
datapv = datafp.values

In [20]:
estado = ["Ciudad de México"]
alcmax = 8500000
pubmax = 200

In [21]:
def alcance_extra(serie,atope):
    nserie = serie.apply(lambda x: min(x,atope))
    serie_s = nserie.sort_values(ascending=False)
    index = serie_s.index
    a_max  = min(atope,serie_s.max())
    r = (atope - a_max)/atope
    rango = pd.Series(pd.RangeIndex(0,len(serie_s)),index=index)
    mults = np.power(r,rango)
    return (serie_s*mults).sum()-a_max

In [22]:
def scale(inp_domain,out_range,valor):
    valor_estimado = ((out_range[1]-out_range[0])*(valor-inp_domain[0])/(inp_domain[1]-inp_domain[0])) + out_range[0]
    return np.clip(valor_estimado,out_range[0],out_range[1])

In [23]:
peso_alcance = 50
peso_temas = 50

In [28]:
alcances = []
alcance_max = []
alcance_ext = []
alcance_suma = []
cal_alcance = []
calificaciones = []
for i in range(1000):
    if i % 10 == 0:
        print(i)
    red = entrenador(arr_metricas,arr_alcances)
    predicciones = predictor(datapv,red)
    prediccion = pd.DataFrame(predicciones,columns=["Alcance_estimado_" + str(i).zfill(4)],index=datafp.index)
    alcances.append(prediccion)
    datamix["Alcance_estimado"] = prediccion
    datamix["reacciones"] = datamix[metricas].sum(1)
    data_filt=datamix[(datamix["estado_T"].isin(estado))][metricas+["score_T","Alcance_estimado","reacciones","idTema","nombre_T"]]
    data_filt["Alcance_est_top"] = data_filt["Alcance_estimado"].apply(lambda x: min(x,alcmax))
    grupos = data_filt.groupby(["idTema","nombre_T"])
    por_tema = grupos.apply(lambda x: alcance_extra(x["Alcance_est_top"],alcmax)).to_frame("Alcance_extra")
    por_tema["Alcance_max_top"] = grupos["Alcance_est_top"].max()
    por_tema["Alcance_suma"] = por_tema["Alcance_max_top"] + por_tema["Alcance_extra"]
    por_tema["Publicaciones"] = grupos["Alcance_estimado"].size()
    por_tema["Cal_alcance"] = scale((0,np.log10(alcmax)),(0,peso_alcance),np.log10(por_tema["Alcance_suma"]))
    por_tema["Cal_publicaciones"] = scale((0,np.log10(pubmax)),(0,peso_temas),np.log10(por_tema["Publicaciones"]))
    por_tema["Calificacion"] = por_tema["Cal_alcance"] + por_tema["Cal_publicaciones"]
    salida = por_tema[["Alcance_max_top","Alcance_extra","Alcance_suma","Publicaciones","Cal_alcance","Cal_publicaciones","Calificacion"]]
    lista_final = salida.sort_values(["Calificacion","Alcance_suma"],ascending=False)
    alcance_max.append(lista_final["Alcance_max_top"])
    alcance_ext.append(lista_final["Alcance_extra"])
    alcance_suma.append(lista_final["Alcance_suma"])
    cal_alcance.append(lista_final["Cal_alcance"])
    calificaciones.append(lista_final["Calificacion"])

0


  result = getattr(ufunc, method)(*inputs, **kwargs)


10
20
30
40
50
60
70
80
90
100
110
120


  result = getattr(ufunc, method)(*inputs, **kwargs)


130
140
150
160
170


  result = getattr(ufunc, method)(*inputs, **kwargs)


180
190
200
210
220
230
240
250
260
270
280
290


  result = getattr(ufunc, method)(*inputs, **kwargs)


300


  result = getattr(ufunc, method)(*inputs, **kwargs)


310
320


  result = getattr(ufunc, method)(*inputs, **kwargs)


330


  result = getattr(ufunc, method)(*inputs, **kwargs)


340
350
360
370
380
390


  result = getattr(ufunc, method)(*inputs, **kwargs)


400
410


  result = getattr(ufunc, method)(*inputs, **kwargs)


420
430
440
450


  result = getattr(ufunc, method)(*inputs, **kwargs)


460
470
480
490
500
510
520
530
540
550
560


  result = getattr(ufunc, method)(*inputs, **kwargs)


570
580
590
600


  result = getattr(ufunc, method)(*inputs, **kwargs)


610
620


  result = getattr(ufunc, method)(*inputs, **kwargs)


630
640
650
660


  result = getattr(ufunc, method)(*inputs, **kwargs)


670
680
690
700
710
720
730
740
750
760
770
780
790
800
810
820
830
840
850


  result = getattr(ufunc, method)(*inputs, **kwargs)


860
870
880
890
900
910
920
930
940
950
960
970
980
990


In [29]:
alcances_ens = pd.concat(alcances,axis=1)
alcance_max_ens = pd.concat(alcance_max,axis=1)
alcance_ext_ens = pd.concat(alcance_ext,axis=1)
alcance_suma_ens = pd.concat(alcance_suma,axis=1)
cal_alcance_ens = pd.concat(cal_alcance,axis=1)
calificaciones_ens = pd.concat(calificaciones,axis=1)

In [30]:
alcances_ens.to_csv("../data/procesados/ensamble/alcances_ens.csv")
alcance_max_ens.to_csv("../data/procesados/ensamble/alcance_max_ens.csv")
alcance_ext_ens.to_csv("../data/procesados/ensamble/alcance_ext_ens.csv")
alcance_suma_ens.to_csv("../data/procesados/ensamble/alcance_suma_ens.csv")
cal_alcance_ens.to_csv("../data/procesados/ensamble/cal_alcance_ens.csv")
calificaciones_ens.to_csv("../data/procesados/ensamble/calificaciones_ens.csv")

Entrenado de la red predictora de alcance

In [36]:
calsdesc = calificaciones_ens.T.describe().T
calsdesc

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
idTema,nombre_T,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,El gran varon,1000.0,65.231961,0.508323,64.251004,64.997087,65.183236,65.418537,74.183546
2,Prueba :),1000.0,54.731092,0.251808,54.295784,54.598604,54.714080,54.847731,60.122280
3,Charly el gran varon,1000.0,45.266435,0.770983,42.564196,44.866288,45.336078,45.811648,47.717598
4,Metro cdmx,1000.0,35.457626,0.657421,33.504602,35.022065,35.432936,35.807871,38.181893
5,Tren Suburbano,1000.0,56.738188,0.574569,55.287886,56.345239,56.712638,57.083690,59.087722
...,...,...,...,...,...,...,...,...,...
19091,CDMX aumentará a 2700 pruebas diarias para detectar COVID-19,1000.0,46.969379,0.183740,46.297050,46.848487,46.977090,47.090800,47.547147
19094,Dos sujetos armados roban vehículo a pareja en col. Reforma Política Iztapalapa,1000.0,29.441846,0.512536,27.814421,29.087518,29.424876,29.812696,30.732849
19095,Dan de alta a 18 pacientes del hospital temporal COVID del Autódromo,1000.0,28.288913,0.110177,27.924450,28.218657,28.294886,28.359003,28.621621
19096,Identifican a Los Mayas cártel de Morelos aliado con La Familia Michoacana que opera en CDMX,1000.0,27.075149,0.616349,25.575781,26.643204,27.027731,27.540931,28.911565


In [38]:
calsdesc.loc[18989]

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
nombre_T,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Menor de 14 años se suicida al colgarse en cabaña del Bosque de Nativitas Xochimilco,1000.0,27.620614,3.025092,10.59788,26.418047,27.585017,28.97916,44.065365


In [61]:
alcances_ens.min().describe()

count    1000.000000
mean       45.008684
std        50.372420
min        -0.898097
25%        15.869730
50%        34.156316
75%        58.998346
max       924.174061
dtype: float64

In [64]:
alcances_ens.max().describe()

count    1.000000e+03
mean     1.270334e+07
std      3.682100e+07
min      5.063726e+06
25%      7.879568e+06
50%      9.514434e+06
75%      1.251820e+07
max      1.140812e+09
dtype: float64

In [73]:
datamix[['id', 'idTema', 'titulo','reacciones','likes', 'love', 'angry', 'sad', 'haha', 'wow', 'shares']].reset_index()

Unnamed: 0,index,id,idTema,titulo,reacciones,likes,love,angry,sad,haha,wow,shares
0,0,56880,9846,,1074.0,197.0,2.0,221.0,4.0,34.0,239.0,377.0
1,1,57185,10224,,0.0,,,,,,,
2,2,85025,15931,,16.0,7.0,1.0,0.0,0.0,0.0,3.0,5.0
3,3,59320,10653,,1858.0,205.0,0.0,808.0,6.0,22.0,35.0,782.0
4,4,65239,11823,,619.0,273.0,89.0,6.0,1.0,10.0,10.0,230.0
...,...,...,...,...,...,...,...,...,...,...,...,...
49857,49857,12190,2015,,0.0,,,,,,,
49858,49858,35749,6025,,0.0,,,,,,,
49859,49859,17073,2858,,0.0,,,,,,,
49860,49860,13544,2279,,0.0,,,,,,,


In [71]:
alcances_ens.idxmax().

Alcance_estimado_0000    18958
Alcance_estimado_0001    16087
Alcance_estimado_0002     4473
Alcance_estimado_0003    18815
Alcance_estimado_0004     4473
                         ...  
Alcance_estimado_0995    16356
Alcance_estimado_0996    18815
Alcance_estimado_0997    17355
Alcance_estimado_0998    18958
Alcance_estimado_0999    18958
Length: 1000, dtype: int64

In [39]:
alcdesc = alcances_ens.T.describe().T
alcdesc

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,1000.0,93512.062910,23635.049655,25630.549222,78272.585064,92201.058595,108685.497462,196322.097747
1,1000.0,246.472619,2755.880893,-0.898097,18.391679,41.353723,76.293949,79938.121295
2,1000.0,2311.888589,228.625492,1691.402152,2159.636672,2290.570771,2442.181511,3378.831979
3,1000.0,170800.328755,56909.102575,17544.031113,136286.743591,167078.926152,199630.299815,597944.330114
4,1000.0,57874.162834,3171.015905,49241.072624,55695.054272,57809.662149,59872.077849,68953.207953
...,...,...,...,...,...,...,...,...
49857,1000.0,246.472619,2755.880893,-0.898097,18.391679,41.353723,76.293949,79938.121295
49858,1000.0,246.472619,2755.880893,-0.898097,18.391679,41.353723,76.293949,79938.121295
49859,1000.0,246.472619,2755.880893,-0.898097,18.391679,41.353723,76.293949,79938.121295
49860,1000.0,246.472619,2755.880893,-0.898097,18.391679,41.353723,76.293949,79938.121295


In [40]:
alcdesc.loc[18989]

count    1.000000e+03
mean     6.122706e+05
std      1.819471e+05
min      1.620387e+05
25%      4.879977e+05
50%      5.975068e+05
75%      7.134182e+05
max      1.772403e+06
Name: 18989, dtype: float64

In [42]:
datamix.columns

Index(['id', 'idTema', 'titulo', 'imagen', 'url', 'fbid', 'fechaPub',
       'fechaCaptura', 'fechaMod', 'estado', 'municipio', 'categorias',
       'secretarias', 'likes', 'love', 'angry', 'sad', 'haha', 'wow', 'shares',
       'idAnalitycs', 'estatus', 'nombre_T', 'estado_T', 'municipio_T',
       'categorias_T', 'secretarias_T', 'Linea_T', 'Estacion_T', 'score_T',
       'scoreManual_T', 'Destacado_T', 'estatus_T', 'imagen_T', 'url_T',
       'fechaCreacion_T', 'fechaInsercion_T', 'fechaMod_T', 'idAnalitycs_T',
       'Alcance_estimado', 'reacciones'],
      dtype='object')

In [46]:
alcances_desc = pd.concat([datamix[['id', 'idTema', 'titulo','reacciones','likes', 'love', 'angry', 'sad', 'haha', 'wow', 'shares']],alcdesc],axis=1)

In [49]:
alcances_desc[alcances_desc["idTema"]==18989]

Unnamed: 0,id,idTema,titulo,reacciones,likes,love,angry,sad,haha,wow,shares,count,mean,std,min,25%,50%,75%,max
355,96207,18989,,3.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,1000.0,346.642505,1735.54627,0.795149,133.562412,216.029102,353.072053,45215.408063
378,96226,18989,,3.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,1000.0,1394.353754,6866.032466,0.031687,363.139795,550.153594,926.115456,129458.499363
