In [None]:
import os
# Show the kernel's current working directory (the CSV/pickle paths in this notebook are relative).
print(os.getcwd())

In [None]:
# Packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import re
from string import punctuation
# English stopword list from NLTK; rm_tags drops every token found in it.
# NOTE: needs the NLTK 'stopwords' corpus to be available locally (e.g. nltk.download('stopwords')).
stop = stopwords.words('english')

In [None]:
# Load the training features, the training labels and the test features from CSV.
train, train_label, test = (
    pd.read_csv(csv_name)
    for csv_name in ('train_values.csv', 'train_labels.csv', 'test_values.csv')
)

In [None]:
# Sanity check: (rows, columns) of each frame.
# Original author's note: the row counts are correct.
tuple(frame.shape for frame in (train, train_label, test))

In [None]:
# Text cleaning (original note: the cleaning step used for the high-scoring run).

# Ordered (pattern, replacement) rewrite rules for English contractions.
# Order matters: the specific forms come before the generic suffix rules, and
# "'scuse" must come before "'s" -- otherwise "'s" is consumed first and the
# "'scuse" rule can never match.
_CONTRACTION_RULES = (
    (r"didn't", "did not"),
    (r"haven't", "have not"),
    (r"can't", "can not"),
    (r"it's", "it is"),
    (r"won't", "will not"),
    (r"wouldn't", "would not"),
    (r"what’s", "what is "),
    (r"\'scuse", " excuse "),
    (r"\'s", " "),
    (r"\'ve", " have "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"\'re", " are "),
    (r"\'d", " would "),
    (r"\'ll", " will "),
)


def rm_tags(text, stop_words=None):
    """Normalise one raw document into a space-separated string of tokens.

    Steps, in order: lower-case, strip ``<...>`` tags, expand contractions,
    drop non-ASCII characters, replace a fixed set of symbols with spaces,
    collapse whitespace, then drop stopwords, all-digit tokens and tokens
    that occur as a substring of ``string.punctuation``.

    Parameters
    ----------
    text : str
        Raw document text. ``None`` and NaN are treated as an empty document;
        any other non-string value is converted with ``str()``.
    stop_words : iterable of str, optional
        Tokens to remove. Defaults to the module-level ``stop`` list.

    Returns
    -------
    str
        The cleaned text, tokens joined by single spaces (may be empty).
    """
    # Missing values do not arrive as str; treat them as an empty document
    # instead of failing on .lower().
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()

    # Strip HTML-like tags.
    text = re.sub(r'<[^>]+>', '', text)

    # Contractions
    for pattern, replacement in _CONTRACTION_RULES:
        text = re.sub(pattern, replacement, text)

    # remove non-ascii characters
    text = ''.join(ch for ch in text if ord(ch) < 128)

    # Add further rules here as needed.
    # Replace these symbols with a space. The set is spelled out explicitly;
    # the previous class "[&-/()),%']" contained the range '&'..'/' and so
    # matched exactly these characters (including '.', '*', '+' and '-').
    text = re.sub(r"[&'()*+,\-./%]", ' ', text)
    text = re.sub(r"\s+", ' ', text)  # collapse runs of whitespace

    if stop_words is None:
        stop_words = stop
    if not isinstance(stop_words, (set, frozenset)):
        # Set membership instead of a list scan for every token.
        stop_words = frozenset(stop_words)

    # Single filtering pass. `tok not in punctuation` is a substring test
    # against the punctuation string, as in the original implementation.
    kept = [
        tok for tok in text.split()
        if tok not in stop_words and not tok.isdigit() and tok not in punctuation
    ]
    return ' '.join(kept)

In [None]:
# Raw text of the document at index label 0.
train.doc_text[0]

In [None]:
# Verify the cleaning on that same document.
rm_tags(train.doc_text[0])

In [None]:
%%time
# ********************************************************* NOTE
# Run rm_tags over the doc_text column of both frames and store the result
# in a new doc_text_cleaned column.
train['doc_text_cleaned'] = train.doc_text.apply(rm_tags)
test['doc_text_cleaned'] = test.doc_text.apply(rm_tags)
# Alternative (disabled): reload previously pickled frames instead of recomputing.
#train, test =  pd.read_pickle("./train20191110.pkl"),  pd.read_pickle("./test20191110.pkl")

In [None]:
# Persist the cleaned frames to disk ********************************* NOTE
for frame, pickle_path in ((train, "./train20191110.pkl"), (test, "./test20191110.pkl")):
    frame.to_pickle(pickle_path)
print('Save to pickle done!!')

In [None]:
# Word-frequency plot over a sample of the cleaned training documents.
from nltk import FreqDist
# Label-based slice: .loc includes the end label 1000.
sample = train.loc[:1000, 'doc_text_cleaned']
plt.figure(figsize=(16, 6))
# Count every whitespace-separated token across the sampled documents.
FreqDistSend = FreqDist(token for doc in sample for token in doc.split())
FreqDistSend.plot(80)
plt.show()

In [None]:
# Label names: every column of train_label except the first.
categories = train_label.columns[1:].tolist()

In [None]:
# NOTE: `top` was meant as a "first N rows" limit, but it is not referenced in
# this cell; the [:] slice below takes every row.
top = 100 #************************************************************* NOTE
# X: Series of cleaned document text.
X = train['doc_text_cleaned'][:]
# y: DataFrame of labels; the first column (row_id) is dropped via iloc[:, 1:].
y = train_label.iloc[:, 1:]

In [None]:
# Build the model: a classical (statistical) text-classification pipeline.
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer  # token / n-gram counts
from sklearn.feature_extraction.text import TfidfVectorizer  # counts with TF-IDF weighting (not used in this cell)
from sklearn.feature_extraction.text import TfidfTransformer  # TF-IDF weighting of the counts
from sklearn.linear_model import LogisticRegression  # multi-label classification (not used in this cell)
from sklearn.pipeline import Pipeline  # chains the steps above
from sklearn.multiclass import OneVsRestClassifier
from sklearn import metrics
from sklearn.svm import LinearSVC

# Hold out 10% of the rows for evaluation; seeded so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
# ***************************** ngram_range=(1, N): N can be 3, 4, 5, 6
pipeline = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 3))),
    ('tfidf', TfidfTransformer()),
    # random_state pins LinearSVC's internal data shuffling so repeated runs
    # of the notebook produce the same model.
    ('clf', OneVsRestClassifier(LinearSVC(multi_class='ovr', random_state=42))),
])

In [None]:
%%time
# Train the pipeline on the training split (machine learning - statistical model).
pipeline.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score

# Predict on the held-out split and score it.
prediction = pipeline.predict(X_test)
test_accuracy = accuracy_score(y_test.values, prediction)
print('Test accuracy is {}'.format(test_accuracy))

# Per-category precision / recall / F1.
report = metrics.classification_report(y_true=y_test, y_pred=prediction, target_names=categories)
print(report)

In [None]:
####################
# Prepare the submission data
####################
X_submission = test['doc_text_cleaned']
X_submission.shape

In [None]:
# Predict on the submission data -- this must cover every row.
prediction_test = pipeline.predict(X_submission[:])
print(prediction_test)

In [None]:
# Turn the predictions into a DataFrame (later written out as CSV).
# The two prints let the prediction shape be compared against the number of categories by eye.
print(prediction_test.shape)
print(len(categories))
final = pd.DataFrame(prediction_test, columns=categories)

In [None]:
# Export the submission file.
import os
# reset_index() turns the default 0..n-1 index of `final` into the first column,
# which is then named row_id.
# NOTE(review): row_id is therefore purely positional -- confirm it matches the
# ids expected for the test set.
output = final.reset_index()
output.columns = ['row_id'] + categories
# index=False is the documented way to omit the index column
# (index=None only worked because it is falsy).
output.to_csv('submission_format_20191110_3.csv', sep=',', index=False)
print(os.getcwd())

In [None]:
# Analyse the final submission: column-wise sum of the predictions per category.
plt.figure(figsize=(10, 10))
final[categories].sum(axis=0).plot.barh()
plt.show()