In [1]:
# Mount Google Drive at /content/drive; every data/output path used below lives under this mount point.
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import os
import time
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split,StratifiedKFold
from sklearn import metrics
import re
import random

import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras.models import load_model
from keras import initializers, regularizers, constraints, optimizers, layers
import torch


# Experiment-wide settings: the RNG seed and the number of cross-validation folds.
seed, n_folds = 42, 5

# Feed the same seed to Python's, NumPy's and TensorFlow's global random generators.
for _seed_rng in (random.seed, np.random.seed, tf.random.set_seed):
    _seed_rng(seed)

In [3]:
# Directory layout of the project on the mounted Drive.
DIR = '/content/drive/MyDrive/Competitions/Signate/MUFJ'
INPUT_DIR = os.path.join(DIR, 'input')
OUTPUT_DIR = os.path.join(DIR, 'output')
OUTPUT_SUB_DIR = os.path.join(OUTPUT_DIR, 'submission')
# The final '' component keeps the trailing separator: model file paths are built
# elsewhere in this notebook by plain string concatenation (OUTPUT_MODEL_DIR + filename).
OUTPUT_MODEL_DIR = os.path.join(OUTPUT_DIR, 'model', 'GRU', '')
# exist_ok=True replaces the check-then-create pair; it is a no-op when the
# directory already exists, so the cell can be re-run safely.
os.makedirs(OUTPUT_MODEL_DIR, exist_ok=True)

In [4]:
def _input_csv(filename):
    """Return the full path of ``filename`` inside INPUT_DIR."""
    return os.path.join(INPUT_DIR, filename)


# Load the train/test tables. The sample submission is read with header=None,
# so its two columns are named explicitly at read time.
train = pd.read_csv(_input_csv('train.csv'))
test = pd.read_csv(_input_csv('test.csv'))
sub = pd.read_csv(_input_csv('sample_submit.csv'), header=None, names=['id', 'state'])

In [5]:
# Show the first rows and the shape of each loaded table, in load order.
for _frame in (train, test, sub):
    display(_frame.head())
    print(_frame.shape)

Unnamed: 0,id,goal,country,duration,category1,category2,html_content,state
0,train_00000,20001-21000,US,45,art,mixed media,"<div class=""contents""><div><p><a href=""http://...",1
1,train_00001,19001-20000,US,59,food,restaurants,"<div class=""contents""><div><p>Cultural Pretzel...",0
2,train_00002,2001-3000,US,38,art,performance art,"<div class=""contents""><div><p>I want to perfor...",0
3,train_00003,1001-2000,US,30,art,mixed media,"<div class=""contents""><div><div class=""templat...",1
4,train_00004,1001-2000,US,29,film & video,webseries,"<div class=""contents""><div><p>The story of the...",1


(9791, 8)


Unnamed: 0,id,goal,country,duration,category1,category2,html_content
0,test_00000,5001-6000,FR,30,dance,performances,"<div class=""contents""><div><p>Bonjour ,</p><p>..."
1,test_00001,6001-7000,GB,23,publishing,children's books,"<div class=""contents""><div><p><span class=""bol..."
2,test_00002,6001-7000,GB,30,theater,plays,"<div class=""contents""><div><p>COW is a rural t..."
3,test_00003,1001-2000,CA,14,art,digital art,"<div class=""contents""><div><p>I've been creati..."
4,test_00004,1-1000,US,30,music,hip-hop,"<div class=""contents""><div><div class=""templat..."


(9800, 7)


Unnamed: 0,id,state
0,test_00000,1
1,test_00001,0
2,test_00002,0
3,test_00003,0
4,test_00004,1


(9800, 2)


In [6]:
def remove_URL(text):
    """Return ``text`` with every ``http(s)://`` or ``www.`` URL deleted.

    A URL is matched from its prefix up to (not including) the next whitespace
    character; the surrounding whitespace is left in place.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)


def remove_html(text):
    """Return ``text`` with every ``<...>`` tag deleted; text between tags is kept."""
    return re.sub(r"<[^>]*?>", '', text)

def cleaning(texts):
    """Strip URLs and HTML tags from every element of ``texts``.

    Args:
        texts: iterable of raw HTML strings (e.g. a pandas Series). Elements that
            are not ``str`` -- such as ``None`` or a float NaN standing for a
            missing value -- are passed through unchanged instead of raising,
            so the caller can still ``fillna`` them afterwards.

    Returns:
        list: one entry per input element, in the same order.
    """
    clean_texts = []
    for text in texts:
        if not isinstance(text, str):
            # Missing values cannot be regex-processed; keep them as they are.
            clean_texts.append(text)
            continue
        # Remove URLs first, then the HTML tags.
        text = remove_URL(text)
        text = remove_html(text)
        clean_texts.append(text)
    return clean_texts

In [7]:
# Overwrite the raw HTML column of both tables with its cleaned version.
for _frame in (train, test):
    _frame['html_content'] = cleaning(_frame['html_content'])

In [8]:
## Text-model hyperparameters
embed_size = 300 # size of each word vector (output dimension of the Embedding layer)
max_features = 50000 # vocabulary cap: passed as Tokenizer num_words and as the Embedding input dimension
maxlen = 100 # sequence length passed to pad_sequences and used as the model's Input shape


## Assign every training row to one of n_folds validation folds, stratified on `state`
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)
# skf.split yields *positional* row indices, so fold ids are collected in a
# positional array rather than written through label-based .loc (which is only
# equivalent when the index is exactly 0..n-1). Starting from an integer array
# also avoids building a float/NaN column that must be cast back afterwards.
fold_ids = np.full(len(train), -1, dtype=int)
for fold, (_, val_) in enumerate(skf.split(train, train.state)):
    fold_ids[val_] = fold
assert (fold_ids >= 0).all(), "every row must belong to exactly one validation fold"
train["kfold"] = fold_ids


# Train one bidirectional-GRU classifier per fold. For every fold this collects
# the out-of-fold (OOF) predictions for the held-out rows and the predictions of
# that fold's model on the full test set, then saves the model to disk.
preds = []      # per-fold test-set prediction arrays, as returned by model.predict
oof_folds = []  # per-fold validation rows with their OOF prediction attached
for fold in range(n_folds):
  print("="*10+f" fold = {fold} "+"="*10)

  train_df = train.loc[train.kfold!=fold]
  # .copy(): a prediction column is added to this frame below; writing to a
  # slice of `train` is what raised pandas' SettingWithCopyWarning.
  val_df = train.loc[train.kfold==fold].copy()

  ## fill up the missing values
  train_X = train_df["html_content"].fillna("_na_").values
  val_X = val_df["html_content"].fillna("_na_").values
  test_X = test["html_content"].fillna("_na_").values

  ## Tokenize the sentences (vocabulary is fitted on this fold's training texts only)
  tokenizer = Tokenizer(num_words=max_features)
  tokenizer.fit_on_texts(list(train_X))
  train_X = tokenizer.texts_to_sequences(train_X)
  val_X = tokenizer.texts_to_sequences(val_X)
  test_X = tokenizer.texts_to_sequences(test_X)

  ## Pad the sentences to a fixed length of maxlen tokens
  train_X = pad_sequences(train_X, maxlen=maxlen)
  val_X = pad_sequences(val_X, maxlen=maxlen)
  test_X = pad_sequences(test_X, maxlen=maxlen)

  ## Get the target values
  train_y = train_df['state'].values
  val_y = val_df['state'].values

  ## Embedding -> BiGRU -> global max pooling -> small dense head with sigmoid output
  inp = Input(shape=(maxlen,))
  x = Embedding(max_features, embed_size)(inp)
  x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
  x = GlobalMaxPool1D()(x)
  x = Dense(16, activation="relu")(x)
  x = Dropout(0.1)(x)
  x = Dense(1, activation="sigmoid")(x)
  model = Model(inputs=inp, outputs=x)
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

  model.fit(train_X, train_y, batch_size=512, epochs=3, validation_data=(val_X, val_y))

  pred_noemb_val_y = model.predict([val_X], batch_size=1024, verbose=1)
  pred_noemb_test_y = model.predict([test_X], batch_size=1024, verbose=1)

  # The model has a single output unit, so predict returns one column per row;
  # flatten it to 1-D before storing it as a DataFrame column.
  val_df['pred_noemb'] = pred_noemb_val_y.ravel()
  preds.append(pred_noemb_test_y)
  oof_folds.append(val_df)

  model.save(OUTPUT_MODEL_DIR+f'GRU_model_fold{fold}.h5')

  del model,train_df,val_df,train_X,val_X,test_X,pred_noemb_val_y,pred_noemb_test_y

# Concatenate once at the end instead of re-copying a growing DataFrame every fold.
oof_df = pd.concat(oof_folds)

Epoch 1/3
Epoch 2/3
Epoch 3/3


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [9]:
# Sweep candidate decision thresholds (0.20 to 0.80, step 0.01) over the
# out-of-fold predictions, print F1 at each one, and keep the first threshold
# that reaches the highest F1 (ties do not replace an earlier best).
y_true = oof_df.state
y_prob = oof_df.pred_noemb
best_score, best_thresh = 0, 0.5
for thresh in np.round(np.arange(0.2, 0.801, 0.01), 2):
    score = metrics.f1_score(y_true, (y_prob > thresh).astype(int))
    print("F1 score at threshold {0} is {1}".format(thresh, score))
    if score > best_score:
        best_score, best_thresh = score, thresh
print()
best_labels = (y_prob > best_thresh).astype(int)
print("best F1 score at threshold {0} is {1}".format(best_thresh, metrics.f1_score(y_true, best_labels)))

F1 score at threshold 0.2 is 0.67994289793005
F1 score at threshold 0.21 is 0.6826916156890968
F1 score at threshold 0.22 is 0.6846729107674436
F1 score at threshold 0.23 is 0.6869189664007028
F1 score at threshold 0.24 is 0.6902825025883744
F1 score at threshold 0.25 is 0.6923363928065069
F1 score at threshold 0.26 is 0.6942584092106254
F1 score at threshold 0.27 is 0.6958504331965345
F1 score at threshold 0.28 is 0.6970464135021096
F1 score at threshold 0.29 is 0.6990532360701537
F1 score at threshold 0.3 is 0.7012640339169349
F1 score at threshold 0.31 is 0.7021749484045087
F1 score at threshold 0.32 is 0.7058634538152609
F1 score at threshold 0.33 is 0.7073567708333333
F1 score at threshold 0.34 is 0.7092409240924092
F1 score at threshold 0.35 is 0.7089452603471295
F1 score at threshold 0.36 is 0.7096610169491525
F1 score at threshold 0.37 is 0.7095278969957083
F1 score at threshold 0.38 is 0.7092495636998256
F1 score at threshold 0.39 is 0.70911183919242
F1 score at threshold 0.4 

In [10]:
# Attach each row's out-of-fold prediction to the training table (left join on
# id + fold), write the result to CSV, and display it.
oof_pred_cols = oof_df[["id","kfold","pred_noemb"]]
oof = pd.merge(train, oof_pred_cols, how='left', on=["id","kfold"])
oof_csv_path = os.path.join(OUTPUT_DIR,"GRUmodel_oof.csv")
oof.to_csv(oof_csv_path, index=False)
oof

Unnamed: 0,id,goal,country,duration,category1,category2,html_content,state,kfold,pred_noemb
0,train_00000,20001-21000,US,45,art,mixed media,"The Shillito's Elves attracted close to 100,00...",1,4,0.551170
1,train_00001,19001-20000,US,59,food,restaurants,Cultural Pretzel Sports Bar is a place where p...,0,3,0.240610
2,train_00002,2001-3000,US,38,art,performance art,"I want to perform this piece guerilla style, o...",0,0,0.382052
3,train_00003,1001-2000,US,30,art,mixed media,"\n\n\n\n\n\nCanyon de Chelley, Dine' (Navajo) ...",1,1,0.738643
4,train_00004,1001-2000,US,29,film & video,webseries,"The story of the show, both on and off screen,...",1,2,0.707163
...,...,...,...,...,...,...,...,...,...,...
9786,train_09786,1-1000,US,15,music,electronic music,So the story behind this is that I've been mak...,0,2,0.187603
9787,train_09787,3001-4000,CA,30,fashion,ready-to-wear,THE HIGH CLOTHINGMy visio...,0,3,0.150799
9788,train_09788,100000+,GB,30,technology,software,We don't think anybody looks forward to fillin...,0,0,0.468017
9789,train_09789,79001-80000,US,35,technology,gadgets,\n\n\n\n\n\n\n\n\n\n \n\n\n\n\n\n\n\n What is...,1,0,0.715171


In [11]:
# Average the per-fold test predictions element-wise, store the result as a new
# column on the test table, write the table to CSV, and display it.
predictions = np.asarray(preds).mean(axis=0)
test["pred_noemb"] = predictions
test_csv_path = os.path.join(OUTPUT_DIR,"GRUmodel_test.csv")
test.to_csv(test_csv_path, index=False)
test

Unnamed: 0,id,goal,country,duration,category1,category2,html_content,pred_noemb
0,test_00000,5001-6000,FR,30,dance,performances,"Bonjour ,Je m'appelle Morgane Hilgers. Je suis...",0.306340
1,test_00001,6001-7000,GB,23,publishing,children's books,The projectThe hidden world of microorganisms ...,0.594024
2,test_00002,6001-7000,GB,30,theater,plays,"COW is a rural tragicomedy with songs, written...",0.623170
3,test_00003,1001-2000,CA,14,art,digital art,I've been creating landscape images using crea...,0.281092
4,test_00004,1-1000,US,30,music,hip-hop,\n\n\n\nIve been making hip hop music for ten ...,0.361561
...,...,...,...,...,...,...,...,...
9795,test_09795,4001-5000,US,29,music,world music,How Tibetana Started\nIt all began after the t...,0.711607
9796,test_09796,10001-11000,US,30,publishing,children's books,The Wild Waves Whist is a board book that take...,0.678455
9797,test_09797,2001-3000,US,30,music,hip-hop,,0.391358
9798,test_09798,7001-8000,US,30,theater,plays,Have you ever read a book or seen a movie and ...,0.357092


In [12]:
# Reload each saved fold model from disk and regenerate its predictions on that
# fold's validation rows and on the test set. The tokenizer is re-fitted on the
# same fold-training texts used at training time, since the fitted tokenizer
# itself was not saved alongside the model.
preds2 = []          # per-fold test-set prediction arrays from the reloaded models
reloaded_folds = []  # per-fold validation rows with the regenerated prediction
for fold in range(n_folds):
  print("="*10+f" fold = {fold} "+"="*10)

  # .copy(): the prediction column is overwritten on this frame below; writing
  # to a slice of `oof` is what raised pandas' SettingWithCopyWarning.
  oof_val = oof.loc[oof.kfold==fold].copy()
  oof_train = oof.loc[oof.kfold!=fold]

  ## fill up the missing values
  train_X = oof_train["html_content"].fillna("_na_").values
  val_X = oof_val["html_content"].fillna("_na_").values
  test_X = test["html_content"].fillna("_na_").values

  ## Tokenize the sentences (fit on the fold's training texts; only val/test are encoded)
  tokenizer = Tokenizer(num_words=max_features)
  tokenizer.fit_on_texts(list(train_X))
  val_X = tokenizer.texts_to_sequences(val_X)
  test_X = tokenizer.texts_to_sequences(test_X)

  ## Pad the sentences to a fixed length of maxlen tokens
  val_X = pad_sequences(val_X, maxlen=maxlen)
  test_X = pad_sequences(test_X, maxlen=maxlen)

  model = load_model(OUTPUT_MODEL_DIR+f'GRU_model_fold{fold}.h5')

  pred_noemb_val_y = model.predict([val_X], batch_size=1024, verbose=1)
  pred_noemb_test_y = model.predict([test_X], batch_size=1024, verbose=1)

  # Flatten the single-output prediction array to 1-D before storing it as a column.
  oof_val['pred_noemb'] = pred_noemb_val_y.ravel()
  preds2.append(pred_noemb_test_y)
  reloaded_folds.append(oof_val)

  del model,oof_train,oof_val,val_X,test_X,pred_noemb_val_y,pred_noemb_test_y

# Concatenate once at the end instead of re-copying a growing DataFrame every fold.
_df = pd.concat(reloaded_folds)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy




In [13]:
# Same threshold sweep as for the training-time OOF predictions, but run on the
# predictions regenerated from the reloaded models (`_df`): print F1 for each
# threshold from 0.20 to 0.80 (step 0.01) and keep the first one with the top F1.
y_true = _df.state
y_prob = _df.pred_noemb
best_score, best_thresh = 0, 0.5
for thresh in np.round(np.arange(0.2, 0.801, 0.01), 2):
    score = metrics.f1_score(y_true, (y_prob > thresh).astype(int))
    print("F1 score at threshold {0} is {1}".format(thresh, score))
    if score > best_score:
        best_score, best_thresh = score, thresh
print()
best_labels = (y_prob > best_thresh).astype(int)
print("best F1 score at threshold {0} is {1}".format(best_thresh, metrics.f1_score(y_true, best_labels)))

F1 score at threshold 0.2 is 0.67994289793005
F1 score at threshold 0.21 is 0.6826916156890968
F1 score at threshold 0.22 is 0.6846729107674436
F1 score at threshold 0.23 is 0.6869189664007028
F1 score at threshold 0.24 is 0.6902825025883744
F1 score at threshold 0.25 is 0.6923363928065069
F1 score at threshold 0.26 is 0.6942584092106254
F1 score at threshold 0.27 is 0.6958504331965345
F1 score at threshold 0.28 is 0.6970464135021096
F1 score at threshold 0.29 is 0.6990532360701537
F1 score at threshold 0.3 is 0.7012640339169349
F1 score at threshold 0.31 is 0.7021749484045087
F1 score at threshold 0.32 is 0.7058634538152609
F1 score at threshold 0.33 is 0.7073567708333333
F1 score at threshold 0.34 is 0.7092409240924092
F1 score at threshold 0.35 is 0.7089452603471295
F1 score at threshold 0.36 is 0.7096610169491525
F1 score at threshold 0.37 is 0.7095278969957083
F1 score at threshold 0.38 is 0.7092495636998256
F1 score at threshold 0.39 is 0.70911183919242
F1 score at threshold 0.4 

In [14]:
# Element-wise average of the test predictions produced by the reloaded fold models.
a = np.asarray(preds2).mean(axis=0)
a

array([[0.30634007],
       [0.59402406],
       [0.6231698 ],
       ...,
       [0.391358  ],
       [0.3570916 ],
       [0.72740537]], dtype=float32)