In [1]:
# https://tianchi.aliyun.com/notebook-ai/detail?spm=5176.12586969.1002.15.6406111aIKCSLV&postId=118255

# FastText
# FastText是一种典型的深度学习词向量的表示方法，它非常简单通过Embedding层将单词映射到稠密空间，
# 然后将句子中所有的单词在Embedding空间中进行平均，进而完成分类操作。

# 所以FastText是一个三层的神经网络，输入层、隐含层和输出层。


# FastText在文本分类任务上，是优于TF-IDF的：

# FastText用单词的Embedding叠加获得的文档向量，将相似的句子分为一类
# FastText学习到的Embedding空间维度比较低，可以快速进行训练

# 使用10折交叉验证，每折使用9/10的数据进行训练，剩余1/10作为验证集检验模型的效果。

In [2]:
# FastText官方教程 https://fasttext.cc/docs/en/supervised-tutorial.html

import fasttext

help(fasttext.FastText)

Help on module fasttext.FastText in fasttext:

NAME
    fasttext.FastText

DESCRIPTION
    # Copyright (c) 2017-present, Facebook, Inc.
    # All rights reserved.
    #
    # This source code is licensed under the MIT license found in the
    # LICENSE file in the root directory of this source tree.

FUNCTIONS
    cbow(*kargs, **kwargs)
    
    eprint(*args, **kwargs)
    
    load_model(path)
        Load a model given a filepath and return a model object.
    
    read_args(arg_list, arg_dict, arg_names, default_values)
    
    skipgram(*kargs, **kwargs)
    
    supervised(*kargs, **kwargs)
    
    tokenize(text)
        Given a string of text, tokenize it and return a list of tokens
    
    train_supervised(*kargs, **kwargs)
        Train a supervised model and return a model object.
        
        input must be a filepath. The input text does not need to be tokenized
        as per the tokenize function, but it must be preprocessed and encoded
        as UTF-8. You might wan

In [3]:
!head cooking/cooking.stackexchange.txt

__label__sauce __label__cheese How much does potato starch affect a cheese sauce recipe?
__label__food-safety __label__acidity Dangerous pathogens capable of growing in acidic environments
__label__cast-iron __label__stove How do I cover up the white spots on my cast iron stove?
__label__restaurant Michelin Three Star Restaurant; but if the chef is not there
__label__knife-skills __label__dicing Without knife skills, how can I quickly and accurately dice vegetables?
__label__storage-method __label__equipment __label__bread What's the purpose of a bread box?
__label__baking __label__food-safety __label__substitutions __label__peanuts how to seperate peanut oil from roasted peanuts at home?
__label__chocolate American equivalent for British chocolate terms
__label__baking __label__oven __label__convection Fan bake vs bake
__label__sauce __label__storage-lifetime __label__acidity __label__mayonnaise Regulation and balancing of readymade packed mayonnaise and other sauces


In [4]:
!wc cooking/cooking.stackexchange.txt

   15404  169582 1401900 cooking/cooking.stackexchange.txt


In [5]:
!head -n 12404 cooking/cooking.stackexchange.txt > cooking/cooking.train
!tail -n 3000 cooking/cooking.stackexchange.txt > cooking/cooking.valid

In [6]:
model = fasttext.train_supervised(input="cooking/cooking.train")

In [7]:
model.save_model("cooking/model_cooking.bin")

In [8]:
model.predict("Which baking dish is best to bake a banana bread ?")

(('__label__baking',), array([0.0736256]))

In [9]:
model.predict("Why not put knives in the dishwasher?")

(('__label__baking',), array([0.0705011]))

In [10]:
model.test("cooking/cooking.valid")

(3000, 0.144, 0.06227475854115612)

In [11]:
model.test("cooking/cooking.valid", k=5)

(3000, 0.06886666666666667, 0.14891163327086637)

In [12]:
model.predict("Why not put knives in the dishwasher?", k=5)

(('__label__baking',
  '__label__food-safety',
  '__label__bread',
  '__label__equipment',
  '__label__substitutions'),
 array([0.0705011 , 0.06283189, 0.04247243, 0.03194532, 0.02659538]))

In [13]:
model.predict("Why not put knives in the dishwasher?", k=5)

(('__label__baking',
  '__label__food-safety',
  '__label__bread',
  '__label__equipment',
  '__label__substitutions'),
 array([0.0705011 , 0.06283189, 0.04247243, 0.03194532, 0.02659538]))

In [14]:
!cat cooking/cooking.stackexchange.txt | sed -e "s/\([.\!?,'/()]\)/ \1 /g" | tr "[:upper:]" "[:lower:]" > cooking/cooking.preprocessed.txt

In [15]:
!head -n 12404 cooking/cooking.preprocessed.txt > cooking/cooking.train
!tail -n 3000 cooking/cooking.preprocessed.txt > cooking/cooking.valid

In [16]:
model = fasttext.train_supervised(input="cooking/cooking.train")

In [17]:
model.test("cooking/cooking.valid")

(3000, 0.17466666666666666, 0.07553697563788381)

In [18]:
model = fasttext.train_supervised(input="cooking/cooking.train", epoch=25)

In [19]:
model.test("cooking/cooking.valid")

(3000, 0.5146666666666667, 0.2225745999711691)

In [20]:
model = fasttext.train_supervised(input="cooking/cooking.train", lr=1.0, epoch=25)

In [21]:
model.test("cooking/cooking.valid")

(3000, 0.5806666666666667, 0.25111719763586565)

In [22]:
model = fasttext.train_supervised(input="cooking/cooking.train", lr=1.0, epoch=25, wordNgrams=2)

In [23]:
model.test("cooking/cooking.valid")

(3000, 0.6036666666666667, 0.2610638604584114)

In [24]:
model = fasttext.train_supervised(input="cooking/cooking.train", lr=1.0, epoch=25, wordNgrams=2, bucket=200000, dim=50, loss='hs')

In [32]:
model = fasttext.train_supervised(input="cooking/cooking.train", lr=0.5, epoch=25, wordNgrams=2, bucket=200000, dim=50, loss='ova')

In [33]:
model.predict("Which baking dish is best to bake a banana bread ?", k=-1, threshold=0.5)

(('__label__baking',
  '__label__bread',
  '__label__equipment',
  '__label__bananas'),
 array([1.00001001, 0.96790934, 0.9615438 , 0.84390509]))

In [35]:
model.test("cooking/cooking.valid", k=-1)

(3000, 0.003146031746031746, 1.0)

In [42]:
import fasttext
import pandas as pd
from sklearn.metrics import f1_score

# 转换为FastText需要的格式
train_df = pd.read_csv('./data/train_set.csv', sep='\t', nrows=15000)
train_df['label_ft'] = '__label__' + train_df['label'].astype(str)
train_df[['text','label_ft']].iloc[:-5000].to_csv('train.csv', index=None, header=None, sep='\t')

import fasttext
model = fasttext.train_supervised('train.csv', lr=1.0, wordNgrams=3, 
                                  verbose=2, minCount=1, epoch=35, loss="hs")

val_pred = [model.predict(x)[0][0].split('__')[-1] for x in train_df.iloc[-5000:]['text']]
print(f1_score(train_df['label'].values[-5000:].astype(str), val_pred, average='macro'))
# 0.82

0.8256579322660659


In [29]:
# 本章作业
# 阅读FastText的文档，尝试修改参数，得到更好的分数
# 基于验证集的结果调整超参数，使得模型性能更优

# https://fasttext.cc/



