In [None]:
# Mount Google Drive so the data files under "My Drive" can be read from Colab.
from google.colab import drive

# force_remount=True asks Colab to mount again even if the path is already mounted.
drive.mount('/content/gdrive', force_remount=True)

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (6,6)

from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints

from keras.layers import Dense, Input, LSTM, Bidirectional, Activation, Conv1D, GRU, TimeDistributed
from keras.layers import Dropout, Embedding, GlobalMaxPooling1D, MaxPooling1D, Add, Flatten, SpatialDropout1D
from keras.layers import GlobalAveragePooling1D, BatchNormalization, concatenate
from keras.layers import Reshape, merge, Concatenate, Lambda, Average
from keras.models import Sequential, Model, load_model
from keras.callbacks import ModelCheckpoint
from keras.initializers import Constant
from keras.layers.merge import add

from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.utils import np_utils
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

# Input data files are read from Google Drive, mounted at /content/gdrive by the first cell.
# The CSV files and the GloVe vectors live under "My Drive/data_science/text_mining/".

import os

In [None]:
# Load the training documents; the 'doc_text' column is what gets tokenized below.
df = pd.read_csv('/content/gdrive/My Drive/data_science/text_mining/train_values.csv', sep=',')

In [None]:
# Learn the vocabulary from every document, then encode each document as a
# list of integer word indices. The encoded lists are kept both in X and in
# the new 'words' column of df.
doc_texts = df['doc_text']

tokenizer = Tokenizer()
tokenizer.fit_on_texts(doc_texts)

X = tokenizer.texts_to_sequences(doc_texts)
df['words'] = X

In [None]:
# Number of tokens in each encoded document.
df['word_length'] = df['words'].apply(len)

In [None]:
# Bring every encoded document to exactly `maxlen` token ids and keep the
# result as a list with one fixed-length array per document.
# Reference: https://www.smwenku.com/a/5c113708bd9eee5e40bb23af/
maxlen = 50
padded_docs = sequence.pad_sequences(df['words'], maxlen=maxlen)
X = list(padded_docs)

In [None]:
# Word -> integer index mapping learned by the fitted tokenizer.
word_index = tokenizer.word_index

In [None]:
EMBEDDING_DIM = 50

# Parse the pre-trained GloVe file into a {word: vector} dictionary.
# Each non-empty line is a token followed by its space-separated coefficients.
embeddings_index = {}
# The context manager closes the file even if parsing fails part-way, and the
# explicit UTF-8 encoding avoids depending on the platform's default codec.
with open('/content/gdrive/My Drive/data_science/text_mining/glove.6B.50d.txt',
          encoding='utf-8') as glove_file:
    for line in glove_file:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s unique tokens.' % len(word_index))
print('Total %s word vectors.' % len(embeddings_index))

In [None]:
# Build the embedding weight matrix: row i receives the GloVe vector of the
# word whose tokenizer index is i. Rows for words that GloVe does not contain
# keep their zero initialisation.
vocab_size = len(word_index) + 1
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
for word, i in word_index.items():
    if word in embeddings_index:
        embedding_matrix[i] = embeddings_index[word]

# Frozen embedding layer initialised from the matrix above.
embedding_layer = Embedding(
    input_dim=vocab_size,
    output_dim=EMBEDDING_DIM,
    embeddings_initializer=Constant(embedding_matrix),
    input_length=maxlen,
    trainable=False,
)

In [None]:
# Load the training labels, one row per document.
# NOTE(review): rows are assumed to be in the same order as train_values.csv — confirm.
Y=pd.read_csv('/content/gdrive/My Drive/data_science/text_mining/train_labels.csv',sep=',')
# Column names of Y: ['row_id', 'information_and_communication_technologies', 'governance',
#       'urban_development', 'law_and_development', 'public_sector_development',
#       'agriculture', 'communities_and_human_settlements',
#       'health_and_nutrition_and_population', 'culture_and_development',
#       'environment', 'social_protections_and_labor', 'industry',
#       'macroeconomics_and_economic_growth',
#       'international_economics_and_trade', 'conflict_and_development',
#       'finance_and_financial_sector_development',
#       'science_and_technology_development', 'rural_development',
#       'poverty_reduction', 'private_sector_development', 'informatics',
#       'energy', 'social_development', 'water_resources', 'education',
#       'transport', 'water_supply_and_sanitation', 'gender',
#       'infrastructure_economics_and_finance']

In [None]:
# Remove the identifier column so Y holds only the label columns.
# errors='ignore' keeps this cell idempotent: re-running it after 'row_id' has
# already been dropped no longer raises KeyError.
Y = Y.drop('row_id', axis=1, errors='ignore')

In [None]:
# Stack the per-document padded sequences into a single array for Keras.
X = np.array(X)

# Hold out 16% of the samples for validation; the fixed seed makes the split
# reproducible across runs.
seed = 15
split_parts = train_test_split(
    X,
    Y,
    test_size=0.16,
    random_state=seed,
)
x_train, x_val, y_train, y_val = split_parts

In [None]:
# Bidirectional GRU followed by a 1-D convolution, on top of the frozen
# GloVe embedding layer.
inp = Input(shape=(maxlen,), dtype='int32')
x = embedding_layer(inp)
x = SpatialDropout1D(0.2)(x)
x = Bidirectional(GRU(256, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = Conv1D(64, kernel_size=3)(x)
# Summarise the convolved sequence by both its average and its maximum.
avg_pool = GlobalAveragePooling1D()(x)
max_pool = GlobalMaxPooling1D()(x)
x = concatenate([avg_pool, max_pool])
# One independent sigmoid per label column. The evaluation cells binarise every
# label of a row and compare whole rows, so a document can carry several labels
# at once; softmax would force the label scores to compete and sum to 1.
# NOTE(review): assumes Y is multi-hot (0/1 per column) — confirm against the data.
outp = Dense(Y.shape[1], activation="sigmoid")(x)

BiGRU = Model(inp, outp)
# binary_crossentropy scores each label independently, matching the sigmoid outputs.
BiGRU.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

In [None]:
# Train for 40 epochs with mini-batches of 128 samples, evaluating on the
# held-out split after each epoch. The object returned by fit() is kept in
# bigru_history.
bigru_history = BiGRU.fit(
    x=x_train,
    y=y_train,
    batch_size=128,
    epochs=40,
    validation_data=(x_val, y_val),
)

In [None]:
# Model outputs for every validation sample (one score per label column of Y).
predicted1 = BiGRU.predict(x_val)

In [None]:
# Wrap the prediction array in a DataFrame so rows can be addressed with .iloc below.
save=pd.DataFrame(predicted1)

In [None]:
from sklearn import cluster

# Binarise each row of predicted scores by splitting its values into two
# 1-D k-means clusters: scores in the cluster with the larger centre become 1
# ("label present"), the rest become 0.

# Allocate the result frame here so the cell works on a fresh kernel; its shape
# follows `save` instead of a hard-coded label count.
f_result_copy = pd.DataFrame(0, index=save.index, columns=save.columns, dtype=int)

for j in tqdm(range(save.shape[0])):
    # KMeans expects a 2-D array: one sample per label score, a single feature.
    row_scores = save.iloc[j, :].values.reshape(-1, 1)
    # Fixed random_state makes the clustering reproducible between runs.
    kmeans_fit = cluster.KMeans(n_clusters=2, random_state=0).fit(row_scores)
    cluster_ids = kmeans_fit.predict(row_scores)
    # k-means numbers its clusters arbitrarily, so cluster id 1 is not
    # guaranteed to be the high-score group; pick it by comparing the centres.
    positive_cluster = int(np.argmax(kmeans_fit.cluster_centers_.ravel()))
    f_result_copy.iloc[j, :] = (cluster_ids == positive_cluster).astype(int)
  

In [None]:
# Count the validation rows whose complete binarised label vector equals the
# ground-truth row (every label must agree for the row to count).
k = sum(
    1
    for row in range(f_result_copy.shape[0])
    if list(f_result_copy.iloc[row, :].astype(int)) == list(y_val.iloc[row, :])
)


In [None]:
# Exact-match accuracy: fraction of validation rows whose labels were all predicted correctly.
k/f_result_copy.shape[0]

In [None]:
# Resources used: https://drive.google.com/open?id=13Jqaug5zRdwscA82O-UhYoeM5WZ_dK1V