# ML Pipeline 
按照如下的指导要求，搭建你的机器学习管道。
### 1. 导入与加载
- 导入 Python 库
- 使用 [`read_sql_table`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_table.html) 从数据库中加载数据集
- 定义特征变量X 和目标变量 Y

In [1]:
# import libraries
import pandas as pd
import re
from sqlalchemy import create_engine
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
import nltk
# nltk.download(["punkt", "wordnet", "averaged_perceptron_tagger"])

In [2]:
# Load the data from the local SQLite database
engine = create_engine('sqlite:///../data/disaster_response.db')
df = pd.read_sql("SELECT * FROM Message", con=engine)

# The model input is the message text
X = df.loc[:, "message"]

# The prediction targets are the 36 category columns (fifth column onwards)
y = df.iloc[:, 4:]

In [3]:
# Preview the first ten message texts
X.head(10)

0    Weather update - a cold front from Cuba that c...
1              Is the Hurricane over or is it not over
2                      Looking for someone but no name
3    UN reports Leogane 80-90 destroyed. Only Hospi...
4    says: west side of Haiti, rest of the country ...
5               Information about the National Palace-
6                       Storm at sacred heart of jesus
7    Please, we need tents and water. We are in Sil...
8      I would like to receive the messages, thank you
9    I am in Croix-des-Bouquets. We have health iss...
Name: message, dtype: object

In [4]:
# Preview the first ten rows of the category (target) columns
y.head(10)

Unnamed: 0,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
7,1,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,1,1,0,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


### 2. 编写分词函数，开始处理文本

In [5]:
# Tokenize English text and lemmatize every token according to its part of speech
def tokenize(text):
    """Split a message into lowercase, lemmatized tokens.

    The text is lowercased and stripped, tokenized with ``word_tokenize`` and
    tagged with ``pos_tag``. Each token is then lemmatized with the WordNet
    part of speech that matches the prefix of its tag (``NN`` -> noun,
    ``VB`` -> verb, ``JJ`` -> adjective, ``R`` -> adverb); tokens with any
    other tag are kept unchanged.

    Tokens that contain no letter or digit at all (".", ",", "!", "?",
    quote marks, ...) are dropped because standalone punctuation carries no
    meaning. Punctuation inside a token (e.g. "80-90", "st.") is preserved.

    Args:
        text: The raw message string.

    Returns:
        A list of cleaned token strings, in their original order.
    """
    text = text.lower().strip()  # normalize to lowercase
    # Punctuation is deliberately NOT blanked out with a regex up front:
    # inside a token it can be meaningful (ranges, abbreviations).
    lemmatizer = WordNetLemmatizer()

    # Tag prefix -> WordNet part-of-speech code; first matching prefix wins.
    pos_by_tag_prefix = (("NN", "n"), ("VB", "v"), ("JJ", "a"), ("R", "r"))

    clean_words = []
    for word, tag in pos_tag(word_tokenize(text)):
        # Skip tokens made up solely of punctuation/symbols.
        if not any(ch.isalnum() for ch in word):
            continue

        wordnet_pos = next(
            (pos for prefix, pos in pos_by_tag_prefix if tag.startswith(prefix)),
            None,
        )
        if wordnet_pos is None:
            # No lemmatization rule for this tag: keep the token as is.
            clean_words.append(word)
        else:
            clean_words.append(lemmatizer.lemmatize(word, pos=wordnet_pos))

    return clean_words

In [6]:
# Try tokenization and lemmatization on one sample message:
# print the raw text first, then the resulting token list.
sample_message = X[3]
print(sample_message)
print(tokenize(sample_message))

UN reports Leogane 80-90 destroyed. Only Hospital St. Croix functioning. Needs supplies desperately.
['un', 'report', 'leogane', '80-90', 'destroyed', 'only', 'hospital', 'st.', 'croix', 'functioning', 'need', 'supply', 'desperately']


### 3. 创建机器学习管道 
这个机器学习管道应该接收 `message` 列作输入，输出分类结果，分类结果属于该数据集中的 36 个类。你会发现 [MultiOutputClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputClassifier.html) 在预测多目标变量时很有用。

In [7]:
# Text -> token counts -> TF-IDF weights -> one random forest per category.
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(estimator=RandomForestClassifier())),
])

### 4. 训练管道
- 将数据分割成训练和测试集
- 训练管道

In [8]:
# Hold out a test set, then fit the pipeline on the training portion.
# The split is seeded: without a fixed random_state every rerun shuffles the
# rows differently, so scores from separate runs or models are not comparable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
pipeline.fit(X_train, y_train)



Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function tokenize at...
                 MultiOutputClassifier(estimator=RandomForestClassifier(bootstrap=True,
                                                                        class_weight=None,
                                                                        criterion='gini',
                                                                  

### 5. 测试模型
报告数据集中每个输出类别的 f1 得分、精确率（precision）和召回率（recall）。你可以对列进行遍历，并对每一列调用 sklearn 的 `classification_report`。

In [9]:
# Predict all categories for the held-out messages, then print one
# classification report per category column.
y_pred = pipeline.predict(X_test)
for col, col_pred in zip(y_test.columns, y_pred.T):
    print("--------category: {}--------".format(col))
    print(classification_report(y_test[col], col_pred))

--------category: related--------
              precision    recall  f1-score   support

           0       0.64      0.36      0.46      1575
           1       0.82      0.94      0.87      4932
           2       0.50      0.32      0.39        47

    accuracy                           0.79      6554
   macro avg       0.65      0.54      0.57      6554
weighted avg       0.77      0.79      0.77      6554

--------category: request--------
              precision    recall  f1-score   support

           0       0.89      0.98      0.93      5448
           1       0.83      0.37      0.51      1106

    accuracy                           0.88      6554
   macro avg       0.86      0.68      0.72      6554
weighted avg       0.88      0.88      0.86      6554

--------category: offer--------
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6532
           1       0.00      0.00      0.00        22

    accuracy                

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

           0       0.84      0.96      0.90      4757
           1       0.83      0.51      0.63      1797

    accuracy                           0.84      6554
   macro avg       0.84      0.74      0.76      6554
weighted avg       0.84      0.84      0.82      6554

--------category: floods--------
              precision    recall  f1-score   support

           0       0.93      1.00      0.96      6002
           1       0.87      0.22      0.35       552

    accuracy                           0.93      6554
   macro avg       0.90      0.61      0.66      6554
weighted avg       0.93      0.93      0.91      6554

--------category: storm--------
              precision    recall  f1-score   support

           0       0.94      0.99      0.96      5967
           1       0.79      0.31      0.45       587

    accuracy                           0.93      6554
   macro avg       0.86      0.65      0.71      6554
weighted 

### 6. 优化模型
使用网格搜索来找到最优的参数组合。 

In [45]:
# Inspect the parameters of the pipeline and its steps
pipeline.get_params()

{'memory': None,
 'steps': [('vect',
   CountVectorizer(analyzer='word', binary=False, decode_error='strict',
           dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
           lowercase=True, max_df=1.0, max_features=None, min_df=1,
           ngram_range=(1, 1), preprocessor=None, stop_words=None,
           strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
           tokenizer=<function tokenize at 0x7f36cd9090d0>, vocabulary=None)),
  ('tfidf',
   TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)),
  ('clf',
   MultiOutputClassifier(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
               max_depth=None, max_features='auto', max_leaf_nodes=None,
               min_impurity_decrease=0.0, min_impurity_split=None,
               min_samples_leaf=1, min_samples_split=2,
               min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
               oob_score=False, random_state=None,

In [59]:
# Check the dimensions of the training features and training targets
X_train.shape, y_train.shape

((19662,), (19662, 36))

In [60]:
# A grid search is very expensive, so only a few parameters are tried for now.
# (vect__ngram_range with (1, 1) vs (1, 2) is a candidate that is not searched yet.)
parameters = dict(
    vect__max_df=(0.75, 1.0),
    vect__max_features=(None, 5000),
    tfidf__use_idf=(True, False),
)

cv = GridSearchCV(estimator=pipeline, param_grid=parameters, verbose=1)
cv.fit(X_train, y_train)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=1)]: Done  24 out of  24 | elapsed: 40.8min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...oob_score=False, random_state=None, verbose=0,
            warm_start=False),
           n_jobs=1))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'vect__max_df': (0.75, 1.0), 'vect__max_features': (None, 5000), 'tfidf__use_idf': (True, False)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

### 7. 测试模型
打印微调后的模型的准确率（accuracy）、精确率（precision）和召回率（recall）。  

因为本项目主要关注代码质量、开发流程和管道技术，所以没有模型性能指标的最低要求。但是，微调模型以提高准确率、精确率和召回率可以让你的项目脱颖而出——特别是让你的简历更出彩。

In [62]:
# Show the parameter combination selected by the grid search
cv.best_params_

{'tfidf__use_idf': False, 'vect__max_df': 0.75, 'vect__max_features': 5000}

In [63]:
# Predict with the grid-searched model and print one classification
# report per category column.
y_pred = cv.predict(X_test)
for col, col_pred in zip(y_test.columns, y_pred.T):
    print("--------category: {}--------".format(col))
    print(classification_report(y_test[col], col_pred))

--------category: related--------
             precision    recall  f1-score   support

          0       0.63      0.35      0.45      1585
          1       0.81      0.93      0.87      4926
          2       0.43      0.23      0.30        43

avg / total       0.77      0.79      0.77      6554

--------category: request--------
             precision    recall  f1-score   support

          0       0.89      0.98      0.94      5463
          1       0.83      0.41      0.55      1091

avg / total       0.88      0.89      0.87      6554

--------category: offer--------
             precision    recall  f1-score   support

          0       1.00      1.00      1.00      6527
          1       0.00      0.00      0.00        27

avg / total       0.99      1.00      0.99      6554

--------category: aid_related--------
             precision    recall  f1-score   support

          0       0.75      0.86      0.81      3935
          1       0.74      0.58      0.65      2619

avg

  'precision', 'predicted', average, warn_for)


### 8. 继续优化模型，比如：
* 尝试其他的机器学习算法
* 尝试除 TF-IDF 外其他的特征

In [65]:
# Try multinomial naive Bayes in place of the random forest and print the
# same per-category reports to see how it compares.
from sklearn.naive_bayes import MultinomialNB

pipeline_new = Pipeline(steps=[
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(estimator=MultinomialNB())),
])
# fit() returns the fitted pipeline itself, so prediction can be chained.
y_pred = pipeline_new.fit(X_train, y_train).predict(X_test)
for col, col_pred in zip(y_test.columns, y_pred.T):
    print("--------category: {}--------".format(col))
    print(classification_report(y_test[col], col_pred))

  self.class_log_prior_ = (np.log(self.class_count_) -


--------category: related--------
             precision    recall  f1-score   support

          0       0.86      0.04      0.07      1585
          1       0.76      1.00      0.86      4926
          2       0.00      0.00      0.00        43

avg / total       0.78      0.76      0.67      6554

--------category: request--------
             precision    recall  f1-score   support

          0       0.85      1.00      0.92      5463
          1       0.85      0.14      0.24      1091

avg / total       0.85      0.85      0.80      6554

--------category: offer--------
             precision    recall  f1-score   support

          0       1.00      1.00      1.00      6527
          1       0.00      0.00      0.00        27

avg / total       0.99      1.00      0.99      6554

--------category: aid_related--------
             precision    recall  f1-score   support

          0       0.77      0.87      0.82      3935
          1       0.76      0.61      0.68      2619

avg

  'precision', 'predicted', average, warn_for)


### 9. 导出模型为 pickle file

In [10]:
# Export the fitted pipeline to a pickle file
import pickle

# The context manager closes the file deterministically, so the pickle is
# fully written to disk before anything tries to read it back.
with open("./classifier.pkl", "wb") as model_file:
    pickle.dump(pipeline, model_file)

In [12]:
# Reload the exported model and print its per-category reports so they can be
# compared with the reports printed before the export.
# NOTE: unpickling can execute arbitrary code; only load pickle files you trust.
with open("./classifier.pkl", "rb") as model_file:
    loaded_model = pickle.load(model_file)

y_pred = loaded_model.predict(X_test)
for i, col in enumerate(y_test.columns):
    print("--------category: {}--------".format(col))
    print(classification_report(y_test[col], y_pred[:, i]))

--------category: related--------
              precision    recall  f1-score   support

           0       0.64      0.36      0.46      1575
           1       0.82      0.94      0.87      4932
           2       0.50      0.32      0.39        47

    accuracy                           0.79      6554
   macro avg       0.65      0.54      0.57      6554
weighted avg       0.77      0.79      0.77      6554

--------category: request--------
              precision    recall  f1-score   support

           0       0.89      0.98      0.93      5448
           1       0.83      0.37      0.51      1106

    accuracy                           0.88      6554
   macro avg       0.86      0.68      0.72      6554
weighted avg       0.88      0.88      0.86      6554

--------category: offer--------
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6532
           1       0.00      0.00      0.00        22

    accuracy                

              precision    recall  f1-score   support

           0       0.94      0.99      0.96      5967
           1       0.79      0.31      0.45       587

    accuracy                           0.93      6554
   macro avg       0.86      0.65      0.71      6554
weighted avg       0.92      0.93      0.92      6554

--------category: fire--------
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      6483
           1       0.00      0.00      0.00        71

    accuracy                           0.99      6554
   macro avg       0.49      0.50      0.50      6554
weighted avg       0.98      0.99      0.98      6554

--------category: earthquake--------
              precision    recall  f1-score   support

           0       0.96      0.99      0.97      5947
           1       0.89      0.57      0.69       607

    accuracy                           0.95      6554
   macro avg       0.92      0.78      0.83      6554
weight

### 10. Use this notebook to complete `train.py`
使用资源 (Resources)文件里附带的模板文件编写脚本，运行上述步骤，创建一个数据库，并基于用户指定的新数据集输出一个模型。