In [1]:
# Packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import re
from string import punctuation
# English stop-word list from NLTK; rm_tags filters these tokens out.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be
# downloaded beforehand — confirm for fresh environments.
stop = stopwords.words('english')

In [2]:
# Load the training features, training labels and test features from CSV
train, train_label, test = (
    pd.read_csv(csv_name)
    for csv_name in ('train_values.csv', 'train_labels.csv', 'test_values.csv')
)

In [3]:
# Inspect the data: the row counts of features and labels should agree
(train.shape, train_label.shape, test.shape)

((18687, 2), (18687, 25), (18699, 2))

In [4]:
# Text-cleaning step applied to every document before vectorisation.

# (pattern, replacement) pairs used to expand contractions. They are applied
# BEFORE apostrophes are stripped, otherwise the patterns could never match.
# Extend this table with further contractions as needed.
_CONTRACTIONS = (
    (r"didn't", "did not"),
    (r"haven't", "have not"),
    (r"can't", "can not"),
    (r"it's", "it is"),
    (r"won't", "will not"),
    (r"wouldn't", "would not"),
)


def rm_tags(text, stop_words=None):
    """Normalise one raw document into a space-separated string of tokens.

    Steps, in order: lower-case, strip ``<...>`` tags, expand a few English
    contractions, replace the symbols ``% & ' ( ) * + , - . /`` with spaces,
    then drop stop words, all-digit tokens and punctuation-only tokens.

    Parameters
    ----------
    text : str
        Raw document text. Any non-string value (e.g. ``None`` or a NaN from
        a missing cell) is treated as an empty document.
    stop_words : iterable of str, optional
        Tokens to discard. Defaults to the notebook-level ``stop`` list.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces (``''`` if none remain).
    """
    if not isinstance(text, str):
        return ''
    if stop_words is None:
        stop_words = stop
    # Set membership is O(1); scanning the stop-word list for every token
    # dominated the runtime on long documents.
    stop_lookup = set(stop_words)

    text = re.sub(r'<[^>]+>', '', text.lower())
    for pattern, replacement in _CONTRACTIONS:
        text = re.sub(pattern, replacement, text)

    # Explicit spelling of what the former class "[&-/()),%']" matched: its
    # "&-/" part was a character RANGE (& through /), not three literals.
    # "." is part of that range, so no separate dot substitution is needed.
    text = re.sub(r"[%&'()*+,\-./]", ' ', text)

    # str.split() already collapses runs of whitespace. A token is kept only
    # if it is not a stop word, not purely digits, and still has characters
    # left once punctuation is stripped (i.e. it is not punctuation-only).
    kept = [
        token for token in text.split()
        if token not in stop_lookup
        and not token.isdigit()
        and token.strip(punctuation)
    ]
    return ' '.join(kept)

In [5]:
# Raw text of the first training document (uncomment to inspect)
#train['doc_text'][0]

In [6]:
# Check the cleaning on the first training document (uncomment to inspect)
#rm_tags(train['doc_text'][0])

In [7]:
%%time
train['doc_text_cleaned'] = train.doc_text.apply(rm_tags)
test['doc_text_cleaned'] = test.doc_text.apply(rm_tags)

Wall time: 5min 31s


In [8]:
# Persist the cleaned frames so the slow cleaning step need not be repeated
train.to_pickle(path="./train.pkl")
test.to_pickle(path="./test.pkl")
print('Save to pickle done!!')

Save to pickle done!!


In [68]:
# Verification: reload both pickles and look at their shapes
train_2 = pd.read_pickle("./train.pkl")
test_2 = pd.read_pickle("./test.pkl")
(train_2.shape, test_2.shape)

((18687, 3), (18699, 3))

In [10]:
# Verification: show the first three rows of each reloaded frame
(train_2.iloc[:3], test_2.iloc[:3])

(   row_id                                           doc_text  \
 0       0  84327 v2\nThe findings, interpretations, and c...   
 1       1                                                ...   
 2       2                             78156\n\n\n\n\nRisk...   
 
                                     doc_text_cleaned  
 0  v2 findings interpretations conclusions expres...  
 1  decpg daily economics financial market comment...  
 2  risk taking: corporate governance perspective ...  ,
    row_id                                           doc_text  \
 0       0                                                ...   
 1       1   EARLY LEARNING PARTNERSHIP\n\n\n\n\n E L P\n ...   
 2       2                                                ...   
 
                                     doc_text_cleaned  
 0  roma inclusion smart economics illustrations b...  
 1  early learning partnership e l p september ear...  
 2  wps5739 policy research working paper entrepre...  )

In [11]:
# Drop the verification copies that are no longer needed
del train_2, test_2

In [50]:
# Label names: every column of the label frame except the first one
categories = train_label.columns[1:].tolist()

In [89]:
# NOTE: only the first `top` rows are used here.
# The label frame carries a leading row_id column, hence iloc[:, 1:] to drop it.
top = 100
X = train['doc_text_cleaned'].iloc[:top]  # Series
y = train_label.iloc[:top, 1:]  # DataFrame

In [97]:
# Build the model: a classical (statistical) text-classification pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer  # builds the vocabulary and token counts
from sklearn.feature_extraction.text import TfidfTransformer  # re-weights the counts with TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer  # not used by this pipeline; import kept for other cells
from sklearn.linear_model import LogisticRegression  # base classifier, one fitted per label
from sklearn.pipeline import Pipeline  # chains the steps above
from sklearn.multiclass import OneVsRestClassifier
from sklearn import metrics

# Hold out 20% of the rows for evaluation; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipeline = Pipeline([
    ('vect', CountVectorizer()),
    # TfidfTransformer must be imported explicitly (see above): previously only
    # TfidfVectorizer was imported, so this line raised NameError on a fresh kernel.
    ('tfidf', TfidfTransformer()),
    # OneVsRestClassifier fits one binary LogisticRegression per label column,
    # so the (deprecated) multi_class option is unnecessary here. 'sag' is a
    # stochastic solver, hence the fixed random_state for reproducible fits.
    ('clf', OneVsRestClassifier(LogisticRegression(solver='sag', random_state=42))),
])

In [98]:
# Train the statistical model on the training split
pipeline.fit(X_train, y=y_train)



Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 OneVsRestClassifier(estimator=LogisticRegression(C=1.0,
                                                                  class_weight=None,
                      

In [106]:
from sklearn.metrics import accuracy_score

# Predict the label matrix for the held-out documents.
prediction = pipeline.predict(X_test)

# NOTE(review): with a multi-label target, accuracy_score presumably counts a
# row as correct only when every label matches — confirm against the
# scikit-learn documentation before interpreting the number.
print('Test accuracy is {}'.format(accuracy_score(y_test.values, prediction)))

# Per-label precision / recall / F1 on the held-out split.
report = metrics.classification_report(
    y_test,
    prediction,
    target_names=categories,
)
print(report)

Test accuracy is 0.1
                                            precision    recall  f1-score   support

information_and_communication_technologies       0.00      0.00      0.00         3
                                governance       0.00      0.00      0.00         0
                         urban_development       0.00      0.00      0.00         0
                       law_and_development       0.00      0.00      0.00         2
                 public_sector_development       0.00      0.00      0.00         3
                               agriculture       0.00      0.00      0.00         2
         communities_and_human_settlements       0.00      0.00      0.00         0
       health_and_nutrition_and_population       0.00      0.00      0.00         3
                   culture_and_development       0.00      0.00      0.00         0
              social_protections_and_labor       0.00      0.00      0.00         0
         international_economics_and_trade       0.00 

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
