# Criação orientada de amostra de treino #

In [None]:
"""
Create a sample to train a model to identify extreme events
"""

from glass.fm.psql            import query_to_df
from glass.gisp.pnd.mng       import split_df_inN
from glass.gisp.psql.prop import get_row_number
from glass.gisp.psql.mng.qw   import q_to_ntbl
from glass.gisp.psql.mng      import tbls_to_tbl
from glass.gisp.psql.mng._del import del_tables
from glass.to.xls             import df_to_xls

In [None]:
# Facedata run configuration

# Connection parameters of the PostgreSQL database that holds the data
CON_PGSQL = dict(
    HOST="localhost", PORT="5432", DATABASE="dsn_collect",
    USER="postgres", PASSWORD="admin", TEMPLATE="postgis_template"
)

# Table and column where the search words are stored
WORD_SCHEMA = dict(TNAME="search_words", WORD="word")

# Description of the data source.
# TNAME is an aliased sub-query rather than a plain table name: for rows of
# type 'link' it exposes the description column under the name message.
DATA_SCHEMA = dict(
    TNAME=(
        "(SELECT post_id, CASE WHEN type = 'link' THEN description "
        "ELSE message END AS message, type, link, datahora, page_ref "
        "FROM facedata) AS foo"
    ),
    FID="post_id",                      # record identifier column
    TEXT_COL="message",                 # column with the text to search
    TIME="datahora",                    # timestamp column
    OTHER_COLS=["type", "page_ref"],    # extra columns kept in the sample
    LOWER_TIME="2017-05-31 23:59:59",   # lower bound of the time window
    HIGH_TIME="2019-01-01 00:00:00"     # upper bound of the time window
)

# Value handed to split_df_inN when the exported data is split
SPLIT_IN = 10

# Path of the Excel workbook written by the export step
OUTFILE = r'D:\gis\GEOTIMELINE\DSN_Collector\train_sample_facebook.xlsx'

In [None]:
# Twitter run configuration
# Assigns the same names as the Facedata configuration cell, so whichever of
# the two cells is executed last is the configuration in effect.

# Connection parameters of the PostgreSQL database that holds the data
CON_PGSQL = dict(
    HOST="localhost", PORT="5432", DATABASE="dsn_collect",
    USER="postgres", PASSWORD="admin", TEMPLATE="postgis_template"
)

# Table and column where the search words are stored
WORD_SCHEMA = dict(TNAME="search_words", WORD="word")

# Description of the data source (a plain table in this case)
DATA_SCHEMA = dict(
    TNAME="twitter_data",
    TEXT_COL="text",                    # column with the text to search
    TIME="tweet_time",                  # timestamp column
    FID="fid",                          # record identifier column
    OTHER_COLS=["keyword"],             # extra columns kept in the sample
    LOWER_TIME="2018-08-31 23:59:59",   # lower bound of the time window
    HIGH_TIME="2019-01-01 00:00:00"     # upper bound of the time window
)

# Value handed to split_df_inN when the exported data is split
SPLIT_IN = 10

# Path of the Excel workbook written by the export step
OUTFILE = r'D:\gis\GEOTIMELINE\DSN_Collector\train_twitter.xlsx'

In [None]:
# Get words
# Read the configured word column of the words table and keep its values
# as a plain Python list.

wordsDf = query_to_df(CON_PGSQL, "SELECT {col} FROM {tbl}".format(
    col=WORD_SCHEMA["WORD"], tbl=WORD_SCHEMA["TNAME"]
))

wordsInterest = wordsDf[WORD_SCHEMA["WORD"]].tolist()

# Single-argument print(...) gives the same output on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print(wordsInterest)

In [None]:
# Get sample table for each word
# For every search word, create a table named data_<n> holding the IDs of the
# records inside the configured time window whose text contains that word.
# Whatever q_to_ntbl returns for each table is collected in TABLES.

TABLES = []

# Sub-query, aliased jtbl, with the records strictly between LOWER_TIME and
# HIGH_TIME. The time column is parsed with TO_TIMESTAMP, so it is presumably
# stored as text in 'YYYY-MM-DD HH24:MI:SS' form -- confirm against the schema.
# This sub-query is reused by the sampling query further down the notebook.
__TBL = (
    "("
    "SELECT * FROM {t} "
    "WHERE TO_TIMESTAMP({timeCol}, 'YYYY-MM-DD HH24:MI:SS') > "
    "TO_TIMESTAMP('{lowerTime}', 'YYYY-MM-DD HH24:MI:SS') AND "
    "TO_TIMESTAMP({timeCol}, 'YYYY-MM-DD HH24:MI:SS') < "
    "TO_TIMESTAMP('{highTime}', 'YYYY-MM-DD HH24:MI:SS')"
    ") AS jtbl"
).format(
    t=DATA_SCHEMA["TNAME"],
    timeCol=DATA_SCHEMA["TIME"],
    lowerTime=DATA_SCHEMA["LOWER_TIME"],
    highTime=DATA_SCHEMA["HIGH_TIME"]
)

for w, word in enumerate(wordsInterest, 1):
    # The word ends up inside a SQL string literal. Doubling single quotes
    # keeps a word containing an apostrophe from terminating that literal
    # (which would break, or change the meaning of, the query).
    # "{}".format(word) yields exactly the text str.format would have inserted.
    sqlWord = "{}".format(word).replace("'", "''")

    ntbl = q_to_ntbl(CON_PGSQL, "data_{}".format(w), (
        "SELECT {fidCol} "
        "FROM {tbl} WHERE {txtc} LIKE '%{w}%'"
    ).format(
        fidCol=DATA_SCHEMA["FID"], tbl=__TBL,
        txtc=DATA_SCHEMA["TEXT_COL"], w=sqlWord
    ))

    TABLES.append(ntbl)

In [None]:
# Combine the per-word tables listed in TABLES into the table "sample_tbl_tmp".
# NOTE(review): presumably a union of their rows -- confirm against tbls_to_tbl.
SAMPLE_TBL = tbls_to_tbl(CON_PGSQL, TABLES, "sample_tbl_tmp")

In [None]:
# Row count of the combined sample table and 30% of that count
NROWS_sample = get_row_number(CON_PGSQL, SAMPLE_TBL)

# The float divisor avoids integer division on Python 2; int() then truncates
NROWS30 = int((30 * NROWS_sample) / 100.0)

# Single-argument print(...) gives the same output on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print(NROWS_sample)
print(NROWS30)

In [None]:
# Build the final sample table "sample_tbl_real": sanitized text plus the
# time, ID and other configured columns.
Q = (
    # Text sanitization, innermost call first:
    #   1. remove https:// URLs;
    #   2. protect spaces, dots and hyphens as XXX / YYY / ZZZ;
    #   3. drop every character that is neither a word character nor '+';
    #   4. turn the placeholders back into ' ', '.' and '-'.
    # The two fragments holding regular expressions are raw strings so that
    # \s, \S and \w reach PostgreSQL without relying on Python keeping
    # unrecognised escape sequences (deprecated on Python 3).
    "SELECT REPLACE(REPLACE(REPLACE(regexp_replace("
        "REPLACE(REPLACE(REPLACE(regexp_replace("
        r"{txtCol}, 'https://[^:\s]+(\S+)', '', 'g'), ' ', 'XXX'), '.', 'YYY'), "
        r"'-', 'ZZZ'), '[^\w+]', '', 'g'), 'XXX', ' '), 'YYY', '.'), "
        "'ZZZ', '-') AS {txtCol}, "
    "{timeCol}, {fidCol}, {ocols} FROM ("
        # NOTE(review): with the FULL JOIN, rows of the time-window sub-query
        # (jtbl) that have no match in the sample table get a NULL mtbl ID and
        # therefore fall into one single partition; the count_row filter below
        # then caps how many of those rows are kept. Confirm this is the
        # intended way of adding non-matching records to the sample.
        "SELECT jtbl.{txtCol}, jtbl.{timeCol}, jtbl.{fidCol}, {otherCols}, "
        "row_number() OVER(PARTITION BY mtbl.{fidCol} ORDER BY mtbl.{fidCol}) "
        "AS count_row FROM {sample_tbl} AS mtbl FULL JOIN {o_tbl} "
        "ON mtbl.{fidCol} = jtbl.{fidCol}"
    ") AS foo WHERE count_row < {limit}"
).format(
    txtCol=DATA_SCHEMA["TEXT_COL"],
    timeCol=DATA_SCHEMA["TIME"],
    fidCol=DATA_SCHEMA["FID"],
    ocols=", ".join(DATA_SCHEMA["OTHER_COLS"]),
    otherCols=", ".join(
        ["jtbl.{}".format(c) for c in DATA_SCHEMA["OTHER_COLS"]]
    ),
    sample_tbl=SAMPLE_TBL,
    o_tbl=__TBL,
    limit=NROWS30
)

REAL_SAMPLE_TBL = q_to_ntbl(CON_PGSQL, "sample_tbl_real", Q)

# Report how many rows ended up in the final sample table
NROWS_check = get_row_number(CON_PGSQL, REAL_SAMPLE_TBL)

# Single-argument print(...) gives the same output on Python 2 and Python 3
print(NROWS_check)

In [None]:
# Export data
# One output row per distinct non-blank text: every other column is
# aggregated into a list of values and the earliest timestamp is kept.
# The result is split with split_df_inN and written to OUTFILE, one sheet
# (sample_0, sample_1, ...) per part.

# Columns to aggregate: the configured extra columns followed by the ID.
# A new list is built instead of appending to DATA_SCHEMA["OTHER_COLS"], so
# re-running this cell cannot add the ID column a second time.
exportCols = DATA_SCHEMA["OTHER_COLS"] + [DATA_SCHEMA["FID"]]

# array_agg(...) cast to text looks like '{a,b}'; the two REPLACE calls strip
# the surrounding braces. Values are ordered by the first exported column.
aggCols = ", ".join([(
    "REPLACE(REPLACE(CAST(array_agg({a} ORDER BY "
    "{b}) AS text), '{{', ''), '}}', '') AS {a}"
).format(a=col, b=exportCols[0]) for col in exportCols])

dataDf = query_to_df(CON_PGSQL, (
    "SELECT {txtCol}, {otherCols}, "
    "to_char(MIN(TO_TIMESTAMP({timeCol}, 'YYYY-MM-DD HH24:MI:SS')), "
        "'YYYY-MM-DD HH24:MI:SS') AS {timeCol} "
    "FROM {tbl} WHERE {txtCol} <> '' AND {txtCol} <> ' ' "
    "GROUP BY {txtCol}"
).format(
    txtCol=DATA_SCHEMA["TEXT_COL"],
    otherCols=aggCols,
    timeCol=DATA_SCHEMA["TIME"],
    tbl=REAL_SAMPLE_TBL
))

dfParts = split_df_inN(dataDf, SPLIT_IN)

df_to_xls(
    dfParts, OUTFILE,
    sheetsName=["sample_{}".format(i) for i in range(len(dfParts))]
)

In [None]:
# Clean up: delete the per-word tables together with both sample tables.
del_tables(CON_PGSQL, TABLES + [SAMPLE_TBL, REAL_SAMPLE_TBL])

In [None]:
# Clean up only the per-word tables; the two sample tables are not touched here.
del_tables(CON_PGSQL, TABLES)

# Criação não orientada de uma amostra de treino #

In [None]:
# Twitter configuration for the non-oriented sample

# Connection parameters of the PostgreSQL database that holds the data
CON_PGSQL = dict(
    HOST="localhost", PORT="5432", DATABASE="dsn_collect",
    USER="postgres", PASSWORD="admin", TEMPLATE="postgis_template"
)

# Table and column where the search words are stored
# (not read by the export query in this cell)
WORD_SCHEMA = dict(TNAME="search_words", WORD="word")

# Description of the data source; the ID column is listed in OTHER_COLS here
DATA_SCHEMA = dict(
    TNAME="twitter_data",
    TEXT_COL="text",                            # column with the text
    TIME="tweet_time",                          # timestamp column
    OTHER_COLS=["fid", "username", "keyword"],  # columns to aggregate
    LOWER_TIME="2017-05-31 23:59:59",           # lower bound of the window
    HIGH_TIME="2019-01-01 00:00:00"             # upper bound of the window
)

# Value handed to split_df_inN when the exported data is split
SPLIT_IN = 10

# Path of the Excel workbook written by the export step
OUTFILE = r'D:\gis\GEOTIMELINE\DSN_Collector\train_sample_twitter.xlsx'

from glass.fm.psql import query_to_df
from glass.to.xls  import df_to_xls
from glass.gisp.pnd.mng import split_df_inN

# Export data
# Non-oriented sample: every non-blank text inside the configured time window
# is exported (no search-word filter). One output row per distinct text; the
# other columns are aggregated into lists of values and the earliest
# timestamp is kept. The result is split with split_df_inN and written to
# OUTFILE, one sheet (sample_0, sample_1, ...) per part.

# Text sanitization expression, innermost call first:
#   1. remove https:// URLs;
#   2. protect spaces, dots and hyphens as XXX / YYY / ZZZ;
#   3. drop every character that is neither a word character nor '+';
#   4. turn the placeholders back into ' ', '.' and '-'.
# The two fragments holding regular expressions are raw strings so that
# \s, \S and \w reach PostgreSQL without relying on Python keeping
# unrecognised escape sequences (deprecated on Python 3).
sanitizedText = (
    "REPLACE(REPLACE(REPLACE(regexp_replace("
    "REPLACE(REPLACE(REPLACE(regexp_replace("
    r"{}, 'https://[^:\s]+(\S+)', '', 'g'), ' ', 'XXX'), '.', 'YYY'), "
    r"'-', 'ZZZ'), '[^\w+]', '', 'g'), 'XXX', ' '), 'YYY', '.'), "
    "'ZZZ', '-')"
).format(DATA_SCHEMA["TEXT_COL"])

# array_agg(...) cast to text looks like '{a,b}'; the two REPLACE calls strip
# the surrounding braces. Values are ordered by the first configured column.
aggCols = ", ".join([(
    "REPLACE(REPLACE(CAST(array_agg({a} ORDER BY "
    "{b}) AS text), '{{', ''), '}}', '') AS {a}"
).format(
    a=col, b=DATA_SCHEMA["OTHER_COLS"][0]
) for col in DATA_SCHEMA["OTHER_COLS"]])

dataDf = query_to_df(CON_PGSQL, (
    "SELECT {santxtCol} AS {txtCol}, {otherCols}, "
    # The aggregated timestamp is aliased to the time column name; without an
    # alias the column is returned (and exported) under the name "to_char".
    "to_char(MIN(TO_TIMESTAMP({timeCol}, 'YYYY-MM-DD HH24:MI:SS')), "
        "'YYYY-MM-DD HH24:MI:SS') AS {timeCol} "
    "FROM {tbl} WHERE {txtCol} <> '' AND {txtCol} <> ' ' AND "
    "TO_TIMESTAMP({timeCol}, 'YYYY-MM-DD HH24:MI:SS') > "
    "TO_TIMESTAMP('{ltime}', 'YYYY-MM-DD HH24:MI:SS') "
    "AND TO_TIMESTAMP({timeCol}, 'YYYY-MM-DD HH24:MI:SS') < "
    "TO_TIMESTAMP('{htime}', 'YYYY-MM-DD HH24:MI:SS') "
    "GROUP BY {txtCol}"
).format(
    santxtCol=sanitizedText,
    txtCol=DATA_SCHEMA["TEXT_COL"],
    otherCols=aggCols,
    timeCol=DATA_SCHEMA["TIME"],
    ltime=DATA_SCHEMA["LOWER_TIME"], htime=DATA_SCHEMA["HIGH_TIME"],
    tbl=DATA_SCHEMA["TNAME"]
))

dfParts = split_df_inN(dataDf, SPLIT_IN)

df_to_xls(
    dfParts, OUTFILE,
    sheetsName=["sample_{}".format(i) for i in range(len(dfParts))]
)