In [17]:
# Standard-library and third-party imports for this notebook.
# The order of these lines matters: see the notes on `time` below.
import requests
import json
import csv
import multiprocessing as mp
from itertools import repeat
import os
# NOTE(review): this binding of the `time` module is overwritten further down by
# `from datetime import ..., time, ...`, so it has no lasting effect.
import time
import random
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import folium
import sqlalchemy
import tables
import pickle
from pathlib import Path
from collections import OrderedDict
import numpy as np
import pandas as pd
import shapely
import geopandas as gpd
# NOTE(review): importing `time` here rebinds the name to the `datetime.time` class,
# replacing the `time` module imported above. After this cell, `time.time()` or
# `time.sleep()` would raise AttributeError. Decide which one later cells need.
from datetime import datetime, timedelta, time, date
# NOTE(review): `vincenty` was removed in geopy 2.0, so this line presumably needs
# geopy < 2.0 to import - confirm the pinned version.
from geopy.distance import vincenty, great_circle
from sqlalchemy.orm import sessionmaker #Run pip install sqlalchemy
from sklearn.cluster import DBSCAN
from shapely.geometry import Point
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelBinarizer, LabelEncoder



# Plotting setup: draw matplotlib figures inline in the notebook output.
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's global theme: "white" style, "muted" palette, color codes enabled.
sns.set(style="white", palette="muted", color_codes=True)

# NOTE(review): the project-local imports below must be importable from the kernel's
# current working directory / sys.path. The disabled chdir suggests they may live
# under src/ - confirm how the notebook is expected to be launched.
# os.chdir('src')

# Project-local modules: settings (aliased as `s`), I/O and database helpers, and the
# grid/mode/topic prediction module.
import settings as s
import utility_io
import utility_database
import prediction_gps_grid_mode_topic
# Seed NumPy's global RNG and Python's `random` module with the same fixed value.
# Only these two generators are seeded here.
np.random.seed(1)
random.seed(1)

# Set pandas' display width option to 200.
pd.set_option('display.width',200)



In [None]:
# Call the project's database helper and unpack its three return values.
# This cell has no execution count in the saved notebook (In [None]), and none of the
# cells visible below reference engine, conn or metadata.
# NOTE(review): presumably this opens a live remote MySQL connection - confirm it is
# still needed before running it.
engine, conn, metadata = utility_database.establish_db_connection_mysql_twitter_remote()

In [6]:
# Copy the experiment parameters and artifact paths out of the settings module into
# the notebook namespace, then load the saved gensim objects from those paths.

EXPERIMENT_PARAMETERS = s.EXPERIMENT_PARAMETERS

# Paths of the saved LSI / LDA / Doc2Vec models.
LSI_MODEL_FILE, LDA_MODEL_FILE, DOC2VEC_MODEL_FILE = (
    s.LSI_MODEL_FILE,
    s.LDA_MODEL_FILE,
    s.DOC2VEC_MODEL_FILE,
)

# Paths of the stoplist, dictionary, MM corpus and TF-IDF files.
STOPLIST_FILE, DICT_FILE, MM_CORPUS_FILE, TFIDF_FILE = (
    s.STOPLIST_FILE,
    s.DICT_FILE,
    s.MM_CORPUS_FILE,
    s.TFIDF_FILE,
)

# Paths of the per-model topic files.
LSI_TOPIC_FILE, LDA_TOPIC_FILE, DOC2VEC_TOPIC_FILE = (
    s.LSI_TOPIC_FILE,
    s.LDA_TOPIC_FILE,
    s.DOC2VEC_TOPIC_FILE,
)

# Load each saved object through the `load` method of its own class.
dictionary = gensim.corpora.Dictionary.load(DICT_FILE)
lsi = gensim.models.LsiModel.load(LSI_MODEL_FILE)
lda = gensim.models.LdaModel.load(LDA_MODEL_FILE)
doc2vec = Doc2Vec.load(DOC2VEC_MODEL_FILE)
tfidf = gensim.models.TfidfModel.load(TFIDF_FILE)


In [11]:
# Display topics of the loaded LDA model: up to 50 topics, 20 words per topic, as
# formatted "weight*word + ..." strings (see the saved output below).
# NOTE(review): the saved output is dominated by single characters and function words
# ("the", "i", "q", "v"), which suggests the stoplist / tokenization may
# need another look - confirm.
lda.show_topics(num_topics=50, num_words=20, formatted=True)

# Alternative, shorter listing (disabled):
# lda.print_topics(num_topics=20, num_words=10)

[(0,
  '0.040*"the" + 0.030*"just" + 0.029*"i" + 0.018*"ousted" + 0.018*"on" + 0.014*"time" + 0.012*"q" + 0.012*"of" + 0.012*"v" + 0.011*"d" + 0.009*"mayor" + 0.009*"号" + 0.008*"amp" + 0.008*"宜しく" + 0.008*"神戸" + 0.007*"kt" + 0.007*"z" + 0.007*"a" + 0.007*"u" + 0.007*"as"'),
 (1,
  '0.062*"o" + 0.028*"ﾟ" + 0.023*"д" + 0.014*"ﾉ" + 0.014*"の" + 0.009*"ねー" + 0.009*"昨日" + 0.009*"ヽ" + 0.009*"ノ" + 0.009*"目" + 0.008*"いっ" + 0.008*"ｗ" + 0.008*"二" + 0.007*"しか" + 0.006*"為" + 0.006*"回" + 0.006*"て" + 0.005*"規制" + 0.005*"ら" + 0.005*"台風"'),
 (2,
  '0.044*"一" + 0.035*"高温" + 0.012*"中" + 0.011*"てしょ" + 0.011*"やん" + 0.010*"海" + 0.010*"郡" + 0.007*"rt" + 0.007*"にて" + 0.007*"ねぇ" + 0.007*"本" + 0.006*"生" + 0.006*"わかり" + 0.006*"やっは" + 0.005*"マシ" + 0.005*"こ飯" + 0.005*"読む" + 0.005*"ぅ" + 0.005*"ㅋ" + 0.004*"わかち"'),
 (3,
  '0.065*"て" + 0.058*"た" + 0.040*"か" + 0.039*"の" + 0.034*"は" + 0.033*"に" + 0.019*"ー" + 0.019*"し" + 0.018*"な" + 0.017*"と" + 0.016*"も" + 0.015*"ん" + 0.013*"ない" + 0.013*"笑" + 0.012*"よ" + 0.012*"てす" + 0.0

In [18]:
# Copy dataset / model / output paths from the settings module, rebuild the label
# encoders from their saved class arrays, and load the grid, mode and topic datasets.
FIGURE_DIR = s.FIGURE_DIR

# X / Y input files for the grid, mode and topic datasets.
X_GRID_FILE, Y_GRID_FILE = s.X_GRID_FILE, s.Y_GRID_FILE
X_MODE_FILE, Y_MODE_FILE = s.X_MODE_FILE, s.Y_MODE_FILE
X_TOPIC_FILE, Y_TOPIC_FILE = s.X_TOPIC_FILE, s.Y_TOPIC_FILE

# Disabled alternative: point the same six names at the *_EVALUATION_FILE settings.
# X_GRID_FILE = s.X_GRID_EVALUATION_FILE
# Y_GRID_FILE = s.Y_GRID_EVALUATION_FILE
# X_MODE_FILE = s.X_MODE_EVALUATION_FILE
# Y_MODE_FILE = s.Y_MODE_EVALUATION_FILE
# X_TOPIC_FILE = s.X_TOPIC_EVALUATION_FILE
# Y_TOPIC_FILE = s.Y_TOPIC_EVALUATION_FILE

# Saved encoder classes, combined X / Y files and prediction outputs.
LE_GRID_CLASSES_FILE, LB_MODE_CLASSES_FILE = s.LE_GRID_CLASSES_FILE, s.LB_MODE_CLASSES_FILE
X_FILE, Y_FILE = s.X_FILE, s.Y_FILE
Y_FILE_PREDICTED_LSTM, Y_FILE_PREDICTED_VELOCITY = s.Y_FILE_PREDICTED_LSTM, s.Y_FILE_PREDICTED_VELOCITY

# LSTM grid model files and GeoJSON output files.
MODEL_FILE_LSTM_GRID, MODEL_WEIGHT_FILE_LSTM_GRID = s.MODEL_FILE_LSTM_GRID, s.MODEL_WEIGHT_FILE_LSTM_GRID
GEOJSON_FILE_OBSERVATION_GRID = s.GEOJSON_FILE_OBSERVATION_GRID
GEOJSON_FILE_TRUE_GRID = s.GEOJSON_FILE_TRUE_GRID
GEOJSON_FILE_PREDICTED_LSTM_GRID = s.GEOJSON_FILE_PREDICTED_LSTM_GRID

# Rebuild the encoders by assigning previously saved `classes_` arrays instead of
# calling fit() again.
le_grid, lb_mode = LabelEncoder(), LabelBinarizer()
le_grid.classes_ = np.load(LE_GRID_CLASSES_FILE)
lb_mode.classes_ = np.load(LB_MODE_CLASSES_FILE)

# EXPERIMENT_PARAMETERS is defined in the model-loading cell earlier in the notebook.
# NOTE(review): the grid results are unpacked as (X_train, y_train, X_test, y_test),
# while mode and topic are unpacked as (X_*_train, X_*_test, y_*_train, y_*_test).
# Confirm this matches the loader's actual return order.
# NOTE(review): the saved output shows a `column_or_1d(y, warn=True)` warning line,
# presumably raised inside the loader - confirm where the column-shaped y comes from.
(
    X_train, y_train, X_test, y_test,
    X_mode_train, X_mode_test, y_mode_train, y_mode_test,
    X_topic_train, X_topic_test, y_topic_train, y_topic_test,
) = prediction_gps_grid_mode_topic.load_grid_mode_topic_dataset(
    X_GRID_FILE,
    Y_GRID_FILE,
    X_MODE_FILE,
    Y_MODE_FILE,
    X_TOPIC_FILE,
    Y_TOPIC_FILE,
    le_grid,
    lb_mode,
    EXPERIMENT_PARAMETERS,
)


  y = column_or_1d(y, warn=True)


In [22]:
# Display the topic feature array for the test split returned by the dataset loader.
# The saved output below is a 3-level nested float array.
# Disabled alternative that shows only the array's dimensions:
# X_topic_test.shape
X_topic_test

array([[[-1.23133099, -1.07775474, -1.64188182, ...,  0.85134816,
          1.57933021, -2.11856055],
        [-1.62418473, -0.81821078, -1.61588955, ...,  0.21380956,
          0.20901272, -2.50474024],
        [-1.55443156, -0.18334562, -1.7333405 , ..., -0.32025668,
          0.79164249, -2.79221916],
        ...,
        [-0.73518175, -0.99641067, -1.91284704, ...,  1.57473397,
         -0.48859885, -2.20515037],
        [ 0.36108688, -1.13614345, -2.2031548 , ...,  1.39662278,
         -0.48712844, -1.86307263],
        [ 0.91818339, -1.0984956 , -2.40233541, ...,  0.90901917,
         -0.13185288, -1.74400342]],

       [[-0.92359304, -1.67816079, -3.0649035 , ..., -0.72474325,
          2.3668282 , -0.33827776],
        [-0.57502818, -1.70732784, -3.02119327, ..., -0.12723143,
          1.47198987, -0.40845656],
        [-0.04099904, -1.68510449, -2.52037954, ...,  0.34432566,
          1.99489498, -0.12592867],
        ...,
        [ 0.31750965, -1.28177428, -1.63847899, ..., -