In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
import joblib
import sys
sys.path.append("..")
import warnings
warnings.filterwarnings('ignore')

from ml_editor.data_processing import (
    format_raw_df,
    get_split_by_author
)
from ml_editor.model_v2 import (
    add_char_count_features,
    get_word_stats,
    get_sentiment_score,
    POS_NAMES,
    get_question_score_from_input
)
from ml_editor.model_evaluation import (
    get_feature_importance,
    get_roc_plot,
    get_confusion_matrix_plot,
    get_calibration_plot
)
# Enable IPython's autoreload extension in mode 2 so edits to imported
# modules (e.g. ml_editor) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Seed NumPy's global random number generator.
np.random.seed(35)

# Load the raw CSV and run it through the project's formatting helper.
data_path = Path('../data/writers.csv')
df = pd.read_csv(data_path)
df = format_raw_df(df.copy())

# Keep only the rows whose "is_question" column is truthy.
df = df.loc[df["is_question"]].copy()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/johanjun/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
# Join "Title" and "body_text" into one "full_text" column, separated by a
# single space; missing values are replaced by an empty string (na_rep="").
df["full_text"] = df["Title"].str.cat(df["body_text"], sep=" ", na_rep="")


In [3]:
# Run the three feature-generation helpers in sequence; each one receives a
# fresh copy of the frame produced by the previous step.
for feature_step in (add_char_count_features, get_word_stats, get_sentiment_score):
    df = feature_step(df.copy())

100%|██████████| 7971/7971 [03:50<00:00, 34.63it/s]
100%|██████████| 7971/7971 [00:11<00:00, 679.67it/s]


In [4]:
# Names of the scalar feature columns used as model inputs: the explicitly
# listed columns first, followed by one entry per key of POS_NAMES.
feature_arr = [
    "num_questions",
    "num_periods",
    "num_commas",
    "num_exclam",
    "num_quotes",
    "num_colon",
    "num_stops",
    "num_semicolon",
    "num_words",
    "num_chars",
    "num_diff_words",
    "avg_word_len",
    "polarity",
    *POS_NAMES.keys(),
]

In [5]:
# New features were added, so the data has to be split again.
# NOTE(review): judging by its name, get_split_by_author keeps each author's
# rows in a single split — confirm in ml_editor.data_processing.
train_df, test_df = get_split_by_author(df, test_size=0.2, random_state=40)

In [8]:
# Preview the first rows of the feature columns, with every value multiplied
# by 100 (presumably to make small ratios easier to read — confirm).
df[feature_arr].head()*100

Unnamed: 0_level_0,num_questions,num_periods,num_commas,num_exclam,num_quotes,num_colon,num_stops,num_semicolon,num_words,num_chars,...,NOUN,NUM,PART,PRON,PROPN,PUNCT,SCONJ,SYM,VERB,X
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,56.818182,56.818182,28.409091,0.0,0.0,0.0,1022.727273,0.0,2073.863636,35200,...,3.409091,0.0,1.136364,2.272727,0.0,1.988636,0.0,0.0,3.409091,0.0
2,90.634441,30.21148,60.422961,0.0,60.422961,0.0,1087.613293,0.0,2024.169184,33100,...,4.229607,0.0,0.0,1.510574,0.0,2.416918,0.302115,0.0,1.510574,0.0
3,175.438596,87.719298,87.719298,0.0,0.0,0.0,1491.22807,0.0,2719.298246,11400,...,2.631579,0.877193,0.0,6.140351,0.0,3.508772,0.0,0.0,5.263158,0.0
5,70.257611,46.838407,117.096019,0.0,0.0,0.0,843.091335,0.0,2154.566745,42700,...,3.981265,0.0,1.17096,0.702576,0.0,2.810304,0.234192,0.234192,3.278689,0.0
7,135.135135,67.567568,67.567568,0.0,0.0,0.0,945.945946,0.0,2027.027027,14800,...,2.702703,0.0,0.0,2.702703,1.351351,2.702703,0.675676,0.0,2.027027,0.0


In [9]:
def get_feature_vector_and_label(df, feature_names, score_threshold=None):
    """
    Build the model input matrix and the binary target from a DataFrame.

    :param df: input DataFrame; must contain every column listed in
        ``feature_names`` as well as a "Score" column
    :param feature_names: names of the (non-vectorized) feature columns
    :param score_threshold: score above which a row is labelled positive.
        When None (the default) the median of ``df["Score"]`` is used, which
        is the historical behaviour. Pass the training split's median when
        labelling a validation/test split so that every split shares the
        same cut-off.
    :return: tuple ``(features, labels)`` where ``features`` is the selected
        columns cast to float and ``labels`` is a boolean Series
    """
    features = df[feature_names].astype(float)
    if score_threshold is None:
        # Fall back to this frame's own median score.
        score_threshold = df["Score"].median()
    labels = df["Score"] > score_threshold
    return features, labels

# Build the feature matrix and boolean label vector for each split.
# NOTE(review): the labels are derived from the median "Score" of the frame
# that is passed in, so the train and test splits are each thresholded at
# their own median.
X_train, y_train = get_feature_vector_and_label(train_df, feature_arr)
X_test, y_test = get_feature_vector_and_label(test_df, feature_arr)

In [10]:

# Fit a 100-tree random forest with balanced class weights; oob_score=True is
# requested because the out-of-bag decision function is read after fitting.
# NOTE(review): no random_state is passed here, so reproducibility presumably
# relies on the np.random.seed call in the setup cell — confirm.
clf = RandomForestClassifier(n_estimators=100, class_weight='balanced', oob_score=True)
clf.fit(X_train, y_train)

# Hard class predictions and per-class probabilities for the held-out split.
y_predicted = clf.predict(X_test)
y_predicted_proba = clf.predict_proba(X_test)

In [11]:
def get_metrics(y_test, y_predicted):
    """
    Score binary predictions against the true labels.

    :param y_test: true labels
    :param y_predicted: predicted labels
    :return: tuple ``(accuracy, precision, recall, f1)``
    """
    # precision = true positives / (true positives + false positives)
    # recall    = true positives / (true positives + false negatives)
    # f1        = harmonic mean of precision and recall
    # All three treat the label True as the positive class.
    precision, recall, f1 = (
        scorer(y_test, y_predicted, pos_label=True, average='binary')
        for scorer in (precision_score, recall_score, f1_score)
    )

    # accuracy = (true positives + true negatives) / total
    accuracy = accuracy_score(y_test, y_predicted)
    return accuracy, precision, recall, f1


# Training-set performance, estimated from the forest's out-of-bag decision
# function rather than from predictions on data the trees were fitted on.
# See https://datascience.stackexchange.com/questions/13151/randomforestclassifier-oob-scoring-method
# np.argmax yields column indices into oob_decision_function_; indexing
# clf.classes_ with them converts those indices back into the actual class
# labels, so the predictions have the same label type as y_train.
y_train_pred = clf.classes_[np.argmax(clf.oob_decision_function_, axis=1)]

accuracy, precision, recall, f1 = get_metrics(y_train, y_train_pred)
print("훈련 정확도 = %.3f, 정밀도 = %.3f, 재현율 = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))

훈련 정확도 = 0.571, 정밀도 = 0.542, 재현율 = 0.427, f1 = 0.478


In [12]:
# Same four metrics, this time for the predictions on the held-out split.
accuracy, precision, recall, f1 = get_metrics(y_test, y_predicted)
print("검증 정확도 = %.3f, 정밀도 = %.3f, 재현율 = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))


검증 정확도 = 0.589, 정밀도 = 0.602, 재현율 = 0.468, f1 = 0.527


In [13]:
# Persist the fitted classifier to disk.
model_path = Path("../models/model_3.pkl")
# Create the target directory first so the dump does not fail on a fresh
# checkout where ../models does not exist yet.
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(clf, model_path)

['../models/model_3.pkl']

In [14]:
# NumPy array copy of the feature-name list, in the same order as the
# columns the classifier was trained on.
all_feature_names = np.array(feature_arr)

In [16]:
k = 20
# Compute the (name, importance) ranking once and slice it for both listings
# instead of recomputing it for every print.
# NOTE(review): the ranking appears to be sorted from most to least important
# (as the printed output suggests) — confirm in ml_editor.model_evaluation.
feature_importances = get_feature_importance(clf, all_feature_names)

print("상위 %s개 중요도:\n" % k)
print('\n'.join(["%s: %.2g" % (tup[0], tup[1]) for tup in feature_importances[:k]]))

print("\n하위 %s개 중요도:\n" % k)
print('\n'.join(["%s: %.2g" % (tup[0], tup[1]) for tup in feature_importances[-k:]]))

상위 20개 중요도:

num_chars: 0.049
num_periods: 0.048
num_questions: 0.047
ADJ: 0.047
num_diff_words: 0.046
ADV: 0.044
DET: 0.043
ADP: 0.043
PRON: 0.043
AUX: 0.043
PUNCT: 0.042
NOUN: 0.042
VERB: 0.041
num_commas: 0.041
PART: 0.04
num_stops: 0.04
num_words: 0.04
polarity: 0.039
avg_word_len: 0.038
SCONJ: 0.035

하위 20개 중요도:

PUNCT: 0.042
NOUN: 0.042
VERB: 0.041
num_commas: 0.041
PART: 0.04
num_stops: 0.04
num_words: 0.04
polarity: 0.039
avg_word_len: 0.038
SCONJ: 0.035
PROPN: 0.03
NUM: 0.026
num_colon: 0.022
num_quotes: 0.019
INTJ: 0.014
SYM: 0.014
X: 0.01
num_semicolon: 0.0075
num_exclam: 0.0066
CONJ: 0
