In [4]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingClassifier
import joblib

# 데이터 불러오기
os.chdir('C:/Users/USER/Desktop/핵심역량 프로젝트/데이터/라벨링 데이터')

data = pd.read_csv('0727_masking_labeling_data.csv')

data.columns

Index(['Unnamed: 0', 'number', 'title', 'title_morphed',
       'title_essential_morphed', 'title_removeNN_essential_morphed',
       'title_eomi_giho_essential_morphed',
       'title_eomi_giho_essential_removeNN_morphed', 'title_morphed_masked',
       'title_essential_morphed_masked',
       'title_removeNN_essential_morphed_masked',
       'title_eomi_giho_essential_morphed_masked',
       'title_eomi_giho_essential_removeNN_morphed_masked', 'name', 'date',
       'answer_state', 'contents', 'text_morphed', 'text_essential_morphed',
       'text_removeNN_essential_morphed', 'text_eomi_giho_essential_morphed',
       'text_eomi_giho_essential_removeNN_morphed', 'text_morphed_masked',
       'text_essential_morphed_masked',
       'text_removeNN_essential_morphed_masked',
       'text_eomi_giho_essential_morphed_masked',
       'text_eomi_giho_essential_removeNN_morphed_masked', 'has_attachment',
       'answer_contents', 'registration_number', 'charge_name', 'charge',
       'answer

In [5]:
len(data[data.aggr == 0])

6990

In [6]:
titles = data.columns[3:13]
texts = data.columns[17:27]

## TfidfVectorizer 마스킹 X, 파라미터 default - best

In [19]:
import json
import os
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import ExtraTreesClassifier, VotingClassifier
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score
import joblib
import pandas as pd

# 모델 점수를 저장할 경로
model_scores_save_path = 'C:/Users/USER/Desktop/핵심역량 프로젝트/데이터/모델/scores/all_range_traing/'

# 'scores'와 'cases_best_scores' 키로 초기화된 딕셔너리
All_model_scores = {'scores': {}, 'cases_best_scores': {}}

target_columns = ['aggr', '욕설_모욕', '비꼼_시비', '반복', '요지불명', '저격성 민원']
best_title = ['title_eomi_giho_essential_morphed', 'title_morphed_masked', 'title_eomi_giho_essential_morphed', 'title_essential_morphed', 'title_morphed', 'title_eomi_giho_essential_morphed_masked']
best_text = ['text_eomi_giho_essential_morphed', 'text_morphed_masked', 'text_eomi_giho_essential_morphed', 'text_essential_morphed', 'text_morphed', 'text_eomi_giho_essential_morphed_masked']
best_model = ['LogisticRegression','RidgeClassifier','RidgeClassifier','hard_model','soft_model','SVC']

# 각 타겟 컬럼에 대해 모델 학습 및 평가
for title, text, target_column in zip(best_title, best_text, target_columns):

    print(f'{target_column} 학습 시작...\n')
    All_model_scores[target_column] = {'scores': {}, 'cases_best_scores': {}, 'range_test_scores': {}}
    
    # 데이터 설정
    selected_columns = [title, text, 'aggr', '욕설_모욕', '비꼼_시비', '반복', '요지불명', '저격성 민원']
    data1 = data[selected_columns][data[target_column] == 0]
    data2 = data[selected_columns][data[target_column] == 1]
    data1 = data1[300:300 + len(data2)]
    
    data3 = pd.concat([data1, data2])
    data_contents = data3[title] + ' ' + data3[text]
    data_labeling = data3[target_column]

    # 데이터 분할
    X_train, X_test, y_train, y_test = train_test_split(data_contents, data_labeling, stratify=data_labeling, test_size=0.2, random_state=42)

    pipelines = {
        'extra_trees': Pipeline([
            ('vect', TfidfVectorizer()),
            ('et', ExtraTreesClassifier())
        ]),
        'ridge_classifier': Pipeline([
            ('vect', TfidfVectorizer()),
            ('ridge', RidgeClassifier())
        ]),
        'logistic_regression': Pipeline([
            ('vect', TfidfVectorizer()),
            ('lr', LogisticRegression())
        ]),
        'naive_bayes': Pipeline([
            ('vect', TfidfVectorizer()),
            ('nb', MultinomialNB())
        ]),
        'svm': Pipeline([
            ('vect', TfidfVectorizer()),
            ('svm', SVC())
        ]),
        'hard_model': Pipeline([
            ('vect', TfidfVectorizer()),
            ('hard_model', VotingClassifier([('LR', LogisticRegression()), ('ridge', RidgeClassifier()), ('svm', SVC()), ('et', ExtraTreesClassifier()), ('nb', MultinomialNB())], voting='hard'))
        ]),
        'soft_model': Pipeline([
            ('vect', TfidfVectorizer()),
            ('soft_model', VotingClassifier([('LR', LogisticRegression()), ('svm', SVC(probability=True)), ('et', ExtraTreesClassifier()), ('nb', MultinomialNB())], voting='soft'))
        ])
    }

    case_model_scores = {}
    best_model_name = None
    best_test_score = 0
    best_cv_score = 0

    # 모델 훈련 및 평가
    for model_name, pipeline in pipelines.items():
        print(f"{model_name} 학습 중... (타겟: {target_column})")

        X_train_tfidf = pipeline.named_steps['vect'].fit_transform(X_train)
        X_test_tfidf = pipeline.named_steps['vect'].transform(X_test)

        print(f"{model_name} - X_train 크기: {X_train_tfidf.shape}")
        print(f"{model_name} - X_test 크기: {X_test_tfidf.shape}")

        pipeline.fit(X_train, y_train)

        cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='accuracy')

        train_score = pipeline.score(X_train, y_train)
        test_score = pipeline.score(X_test, y_test)
        print(f"{model_name} 학습 데이터 점수: {train_score:.4f}")
        print(f"{model_name} 테스트 데이터 점수: {test_score:.4f}")
        print(f"{model_name} 교차 검증 점수: {cv_scores}")
        print(f"{model_name} 교차 검증 평균 점수: {cv_scores.mean():.4f}\n")

        model_save_path = os.path.join('C:/Users/USER/Desktop/핵심역량 프로젝트/데이터/모델/', f'{text}_{model_name}_tfidf_model.pkl')
        joblib.dump(pipeline, model_save_path)
        case_model_scores[model_name] = {
            'test_score': test_score,
            'cv_mean_score': cv_scores.mean(),
            'data_range': f'300~{300 + len(data2)}'  # 데이터 범위 추가
        }

        # 최고 점수 갱신
        if test_score > best_test_score:
            best_test_score = test_score
            best_cv_score = cv_scores.mean()
            best_model_name = model_name

    print(f'{text}_케이스 완료\n')

    # 각 케이스별 모델 점수 저장
    All_model_scores[target_column]['scores'][text] = case_model_scores
    # 최고 점수를 가진 모델 정보 저장
    All_model_scores[target_column]['cases_best_scores'][text] = {
        'model_name': best_model_name,
        'test_score': best_test_score,
        'cv_mean_score': best_cv_score,
        'data_range': f'300~{300 + len(data2)}'  # 데이터 범위 추가
    }

    # 전 범위를 테스트하기 위한 설정
    count = 0
    data_len = len(data2)

    while count < len(data[selected_columns][data[target_column] == 0]):

        print(f'훈련 인덱스: {count}~{count + data_len}\n')

        data1 = data[selected_columns][data[target_column] == 0][count:count + data_len]
        data2 = data[selected_columns][data[target_column] == 1]

        if len(data1) == 0 or len(data2) == 0:
            break

        min_len = min(len(data1), len(data2))
        data1 = data1[:min_len]
        data2 = data2[:min_len]

        data3 = pd.concat([data1, data2])
        data_contents = data3[title] + ' ' + data3[text]
        data_labeling = data3[target_column]

        if len(set(data_labeling)) < 2:  # 클래스가 2개 미만이면 넘어가기
            count += data_len
            continue

        if len(data_labeling) < 10:  # 최소 데이터 크기 확인
            print(f"데이터 크기가 너무 작아서 건너뜁니다: {len(data_labeling)}")
            count += data_len
            continue

        X_train, X_test, y_train, y_test = train_test_split(data_contents, data_labeling, stratify=data_labeling, test_size=0.2, random_state=42)

        range_test_scores = {}
        for model_name, pipeline in pipelines.items():
            print(f"{model_name} 전 범위 테스트 중... (타겟: {target_column})")

            X_test_tfidf = pipeline.named_steps['vect'].transform(X_test)
            test_score = pipeline.score(X_test, y_test)
            print(f"{model_name} 테스트 데이터 점수 (전 범위): {test_score:.4f}\n")

            range_test_scores[model_name] = {
                'test_score': test_score,
                'data_range': f'{count}~{count + data_len}'
            }

        All_model_scores[target_column]['range_test_scores'][f'{count}~{count + data_len}'] = range_test_scores
        count += data_len

    # 각 타겟 컬럼별 점수 JSON 파일로 저장
    text_cases_scores_path = os.path.join(model_scores_save_path, f'{target_column}_text_cases_scores.json')
    with open(text_cases_scores_path, 'w', encoding='utf-8') as json_file:
        json.dump(All_model_scores, json_file, ensure_ascii=False, indent=4)

print("모든 모델이 학습되고 저장되었습니다.")



aggr 학습 시작...

extra_trees 학습 중... (타겟: aggr)
extra_trees - X_train 크기: (819, 10466)
extra_trees - X_test 크기: (205, 10466)
extra_trees 학습 데이터 점수: 0.9988
extra_trees 테스트 데이터 점수: 0.8390
extra_trees 교차 검증 점수: [0.89634146 0.84756098 0.89634146 0.8902439  0.8404908 ]
extra_trees 교차 검증 평균 점수: 0.8742

ridge_classifier 학습 중... (타겟: aggr)
ridge_classifier - X_train 크기: (819, 10466)
ridge_classifier - X_test 크기: (205, 10466)
ridge_classifier 학습 데이터 점수: 0.9963
ridge_classifier 테스트 데이터 점수: 0.8780
ridge_classifier 교차 검증 점수: [0.88414634 0.86585366 0.85365854 0.87195122 0.85276074]
ridge_classifier 교차 검증 평균 점수: 0.8657

logistic_regression 학습 중... (타겟: aggr)
logistic_regression - X_train 크기: (819, 10466)
logistic_regression - X_test 크기: (205, 10466)
logistic_regression 학습 데이터 점수: 0.9719
logistic_regression 테스트 데이터 점수: 0.8927
logistic_regression 교차 검증 점수: [0.87195122 0.88414634 0.85365854 0.88414634 0.86503067]
logistic_regression 교차 검증 평균 점수: 0.8718

naive_bayes 학습 중... (타겟: aggr)
naive_bayes - X_trai

In [20]:
import json
import os

# 각 JSON 파일의 경로를 리스트로 저장
json_files = [
    'C:/Users/USER/Desktop/핵심역량 프로젝트/데이터/모델/scores/all_range_traing/aggr_text_cases_scores.json',
    'C:/Users/USER/Desktop/핵심역량 프로젝트/데이터/모델/scores/all_range_traing/반복_text_cases_scores.json',
    'C:/Users/USER/Desktop/핵심역량 프로젝트/데이터/모델/scores/all_range_traing/비꼼_시비_text_cases_scores.json',
    'C:/Users/USER/Desktop/핵심역량 프로젝트/데이터/모델/scores/all_range_traing/요지불명_text_cases_scores.json',
    'C:/Users/USER/Desktop/핵심역량 프로젝트/데이터/모델/scores/all_range_traing/욕설_모욕_text_cases_scores.json',
    'C:/Users/USER/Desktop/핵심역량 프로젝트/데이터/모델/scores/all_range_traing/저격성 민원_text_cases_scores.json'
]

# 합칠 모든 데이터를 저장할 딕셔너리
combined_data = {}

# 각 JSON 파일을 읽어서 합치기
for json_file in json_files:
    with open(json_file, 'r', encoding='utf-8') as file:
        data = json.load(file)
        combined_data.update(data)

# 합친 결과를 새로운 JSON 파일로 저장
combined_json_path = 'C:/Users/USER/Desktop/핵심역량 프로젝트/데이터/모델/scores/all_range_traing/combined_text_cases_scores.json'
with open(combined_json_path, 'w', encoding='utf-8') as file:
    json.dump(combined_data, file, ensure_ascii=False, indent=4)

print("모든 JSON 파일이 하나로 합쳐졌습니다.")


모든 JSON 파일이 하나로 합쳐졌습니다.


In [6]:
import json
import os
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import ExtraTreesClassifier, VotingClassifier
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score
import joblib
import pandas as pd

model_scores_save_path = 'C:/Users/USER/Desktop/핵심역량 프로젝트/데이터/모델/scores/add/'

# Initialize the dictionary with 'scores' and 'cases_best_scores' keys
All_model_scores = {'scores': {}, 'cases_best_scores': {}}

target_column = 'aggr'

for title, text in zip(titles, texts):
    
    selected_columns = [f'{title}', f'{text}', 'aggr', '욕설_모욕', '비꼼_시비', '반복', '요지불명', '저격성 민원']
    data1 = data[selected_columns][data[target_column] == 0]
    data2 = data[selected_columns][data[target_column] == 1]
    data1 = data1[300:300+len(data2)]
    data2 = data2

    data3 = pd.concat([data1, data2])
    data_contents = data3[f'{title}'] + ' ' + data3[f'{text}']
    data_labeling = data3[target_column]

    X_train, X_test, y_train, y_test = train_test_split(data_contents, data_labeling, stratify=data_labeling, test_size=0.2, random_state=42)

    pipelines = {
        'extra_trees': Pipeline([
            ('vect', CountVectorizer()),  # TF-IDF -> Count
            ('et', ExtraTreesClassifier())
        ]),
        'ridge_classifier': Pipeline([
            ('vect', CountVectorizer()),  # TF-IDF -> Count
            ('ridge', RidgeClassifier())
        ]),
        'logistic_regression': Pipeline([
            ('vect', CountVectorizer()),  # TF-IDF -> Count
            ('lr', LogisticRegression())
        ]),
        'naive_bayes': Pipeline([
            ('vect', CountVectorizer()),  # TF-IDF -> Count
            ('nb', MultinomialNB())
        ]),
        'svm': Pipeline([
            ('vect', CountVectorizer()),  # TF-IDF -> Count
            ('svm', SVC())
        ]),
        'hard_model': Pipeline([
            ('vect', CountVectorizer()),  # TF-IDF -> Count
            ('hard_model', VotingClassifier([('LR', LogisticRegression()), ('ridge', RidgeClassifier()), ('svm', SVC()), ('et', ExtraTreesClassifier()), ('nb', MultinomialNB())], voting='hard'))
        ]),
        'soft_model': Pipeline([
            ('vect', CountVectorizer()),  # TF-IDF -> Count
            ('soft_model', VotingClassifier([('LR', LogisticRegression()), ('svm', SVC(probability=True)), ('et', ExtraTreesClassifier()), ('nb', MultinomialNB())], voting='soft'))
        ])
    }

    case_model_scores = {}
    best_model_name = None
    best_test_score = 0
    best_cv_score = 0

    # 모델 훈련 및 평가
    for model_name, pipeline in pipelines.items():
        print(f"Training {model_name}...")

        X_train_count = pipeline.named_steps['vect'].fit_transform(X_train)
        X_test_count = pipeline.named_steps['vect'].transform(X_test)

        print(f"{model_name} - X_train shape: {X_train_count.shape}")
        print(f"{model_name} - X_test shape: {X_test_count.shape}")

        pipeline.fit(X_train, y_train)

        cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='accuracy')

        train_score = pipeline.score(X_train, y_train)
        test_score = pipeline.score(X_test, y_test)
        print(f"{model_name} Train set score: {train_score:.4f}")
        print(f"{model_name} Test set score: {test_score:.4f}")
        print(f"{model_name} Cross-validation scores: {cv_scores}")
        print(f"{model_name} Cross-validation mean score: {cv_scores.mean():.4f}\n")

        model_save_path = os.path.join('C:/Users/USER/Desktop/핵심역량 프로젝트/데이터/모델/', f'{text}_{model_name}_count_model.pkl')
        joblib.dump(pipeline, model_save_path)
        case_model_scores[model_name] = {
            'test_score': test_score,
            'cv_scores': cv_scores.tolist(),  # Convert numpy array to list for JSON serialization
            'cv_mean_score': cv_scores.mean()
        }

        # 최고 점수 갱신
        if test_score > best_test_score:
            best_test_score = test_score
            best_cv_score = cv_scores.mean()
            best_model_name = model_name

    print(f'{text}_case_complete')

    # 각 케이스별 모델 점수 저장
    All_model_scores['scores'][text] = case_model_scores
    # 최고 점수를 가진 모델 정보 저장
    All_model_scores['cases_best_scores'][text] = {
        'model_name': best_model_name,
        'test_score': best_test_score,
        'cv_mean_score': best_cv_score
    }

text_cases_scores_path = os.path.join(model_scores_save_path, f'{target_column}_countvec_text_cases_scores.json')
with open(f'{text_cases_scores_path}', 'w', encoding='utf-8') as json_file:
    json.dump(All_model_scores, json_file, ensure_ascii=False, indent=4)

print("All models have been trained and saved.")


Training extra_trees...
extra_trees - X_train shape: (819, 11590)
extra_trees - X_test shape: (205, 11590)
extra_trees Train set score: 0.9988
extra_trees Test set score: 0.8146
extra_trees Cross-validation scores: [0.87195122 0.7804878  0.85365854 0.87804878 0.79141104]
extra_trees Cross-validation mean score: 0.8351

Training ridge_classifier...
ridge_classifier - X_train shape: (819, 11590)
ridge_classifier - X_test shape: (205, 11590)
ridge_classifier Train set score: 0.9988
ridge_classifier Test set score: 0.8000
ridge_classifier Cross-validation scores: [0.75       0.80487805 0.7804878  0.81097561 0.79754601]
ridge_classifier Cross-validation mean score: 0.7888

Training logistic_regression...
logistic_regression - X_train shape: (819, 11590)
logistic_regression - X_test shape: (205, 11590)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

logistic_regression Train set score: 0.9988
logistic_regression Test set score: 0.8732
logistic_regression Cross-validation scores: [0.84146341 0.8597561  0.86585366 0.87195122 0.83435583]
logistic_regression Cross-validation mean score: 0.8547

Training naive_bayes...
naive_bayes - X_train shape: (819, 11590)
naive_bayes - X_test shape: (205, 11590)
naive_bayes Train set score: 0.9792
naive_bayes Test set score: 0.8488
naive_bayes Cross-validation scores: [0.85365854 0.81097561 0.81097561 0.85365854 0.82208589]
naive_bayes Cross-validation mean score: 0.8303

Training svm...
svm - X_train shape: (819, 11590)
svm - X_test shape: (205, 11590)
svm Train set score: 0.9365
svm Test set score: 0.8146
svm Cross-validation scores: [0.82926829 0.80487805 0.80487805 0.85365854 0.77300613]
svm Cross-validation mean score: 0.8131

Training hard_model...
hard_model - X_train shape: (819, 11590)
hard_model - X_test shape: (205, 11590)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

hard_model Train set score: 0.9988
hard_model Test set score: 0.8537
hard_model Cross-validation scores: [0.85365854 0.84756098 0.85365854 0.8902439  0.84662577]
hard_model Cross-validation mean score: 0.8583

Training soft_model...
soft_model - X_train shape: (819, 11590)
soft_model - X_test shape: (205, 11590)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

soft_model Train set score: 0.9963
soft_model Test set score: 0.8732
soft_model Cross-validation scores: [0.88414634 0.85365854 0.8597561  0.89634146 0.84662577]
soft_model Cross-validation mean score: 0.8681

text_morphed_case_complete
Training extra_trees...
extra_trees - X_train shape: (819, 10326)
extra_trees - X_test shape: (205, 10326)
extra_trees Train set score: 0.9988
extra_trees Test set score: 0.8244
extra_trees Cross-validation scores: [0.82926829 0.7804878  0.84756098 0.84146341 0.77300613]
extra_trees Cross-validation mean score: 0.8144

Training ridge_classifier...
ridge_classifier - X_train shape: (819, 10326)
ridge_classifier - X_test shape: (205, 10326)
ridge_classifier Train set score: 0.9988
ridge_classifier Test set score: 0.7610
ridge_classifier Cross-validation scores: [0.78658537 0.82926829 0.76829268 0.81097561 0.79141104]
ridge_classifier Cross-validation mean score: 0.7973

Training logistic_regression...
logistic_regression - X_train shape: (819, 10326)
logi

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

logistic_regression Train set score: 0.9988
logistic_regression Test set score: 0.8341
logistic_regression Cross-validation scores: [0.82926829 0.8597561  0.84146341 0.87804878 0.82822086]
logistic_regression Cross-validation mean score: 0.8474

Training naive_bayes...
naive_bayes - X_train shape: (819, 10326)
naive_bayes - X_test shape: (205, 10326)
naive_bayes Train set score: 0.9756
naive_bayes Test set score: 0.8585
naive_bayes Cross-validation scores: [0.84756098 0.81707317 0.80487805 0.82317073 0.81595092]
naive_bayes Cross-validation mean score: 0.8217

Training svm...
svm - X_train shape: (819, 10326)
svm - X_test shape: (205, 10326)
svm Train set score: 0.9377
svm Test set score: 0.8098
svm Cross-validation scores: [0.79268293 0.79268293 0.76829268 0.82926829 0.76687117]
svm Cross-validation mean score: 0.7900

Training hard_model...
hard_model - X_train shape: (819, 10326)
hard_model - X_test shape: (205, 10326)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

hard_model Train set score: 0.9988
hard_model Test set score: 0.8634
hard_model Cross-validation scores: [0.83536585 0.8597561  0.85365854 0.86585366 0.81595092]
hard_model Cross-validation mean score: 0.8461

Training soft_model...
soft_model - X_train shape: (819, 10326)
soft_model - X_test shape: (205, 10326)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

soft_model Train set score: 0.9951
soft_model Test set score: 0.8585
soft_model Cross-validation scores: [0.84756098 0.85365854 0.8597561  0.87804878 0.85276074]
soft_model Cross-validation mean score: 0.8584

text_essential_morphed_case_complete
Training extra_trees...
extra_trees - X_train shape: (819, 2925)
extra_trees - X_test shape: (205, 2925)
extra_trees Train set score: 0.9976
extra_trees Test set score: 0.7854
extra_trees Cross-validation scores: [0.78658537 0.76219512 0.80487805 0.81707317 0.79754601]
extra_trees Cross-validation mean score: 0.7937

Training ridge_classifier...
ridge_classifier - X_train shape: (819, 2925)
ridge_classifier - X_test shape: (205, 2925)
ridge_classifier Train set score: 0.9963
ridge_classifier Test set score: 0.7073
ridge_classifier Cross-validation scores: [0.70731707 0.70121951 0.73170732 0.7195122  0.74846626]
ridge_classifier Cross-validation mean score: 0.7216

Training logistic_regression...
logistic_regression - X_train shape: (819, 2925)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

logistic_regression Train set score: 0.9988
logistic_regression Test set score: 0.8488
logistic_regression Cross-validation scores: [0.84756098 0.86585366 0.8597561  0.88414634 0.8404908 ]
logistic_regression Cross-validation mean score: 0.8596

Training naive_bayes...
naive_bayes - X_train shape: (819, 10466)
naive_bayes - X_test shape: (205, 10466)
naive_bayes Train set score: 0.9780
naive_bayes Test set score: 0.8683
naive_bayes Cross-validation scores: [0.86585366 0.80487805 0.81707317 0.84146341 0.82822086]
naive_bayes Cross-validation mean score: 0.8315

Training svm...
svm - X_train shape: (819, 10466)
svm - X_test shape: (205, 10466)
svm Train set score: 0.9353
svm Test set score: 0.8146
svm Cross-validation scores: [0.82926829 0.78658537 0.7804878  0.85365854 0.78527607]
svm Cross-validation mean score: 0.8071

Training hard_model...
hard_model - X_train shape: (819, 10466)
hard_model - X_test shape: (205, 10466)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

hard_model Train set score: 0.9988
hard_model Test set score: 0.8537
hard_model Cross-validation scores: [0.8597561  0.84756098 0.87195122 0.88414634 0.84662577]
hard_model Cross-validation mean score: 0.8620

Training soft_model...
soft_model - X_train shape: (819, 10466)
soft_model - X_test shape: (205, 10466)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

soft_model Train set score: 0.9963
soft_model Test set score: 0.8829
soft_model Cross-validation scores: [0.8902439  0.86585366 0.89634146 0.88414634 0.85276074]
soft_model Cross-validation mean score: 0.8779

text_eomi_giho_essential_morphed_case_complete
Training extra_trees...
extra_trees - X_train shape: (819, 3066)
extra_trees - X_test shape: (205, 3066)
extra_trees Train set score: 0.9988
extra_trees Test set score: 0.8146
extra_trees Cross-validation scores: [0.81707317 0.80487805 0.82926829 0.84146341 0.81595092]
extra_trees Cross-validation mean score: 0.8217

Training ridge_classifier...
ridge_classifier - X_train shape: (819, 3066)
ridge_classifier - X_test shape: (205, 3066)
ridge_classifier Train set score: 0.9963
ridge_classifier Test set score: 0.7268
ridge_classifier Cross-validation scores: [0.7195122  0.70121951 0.72560976 0.77439024 0.74233129]
ridge_classifier Cross-validation mean score: 0.7326

Training logistic_regression...
logistic_regression - X_train shape: (

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


logistic_regression Train set score: 0.9866
logistic_regression Test set score: 0.8049
logistic_regression Cross-validation scores: [0.76219512 0.78658537 0.83536585 0.81097561 0.82208589]
logistic_regression Cross-validation mean score: 0.8034

Training naive_bayes...
naive_bayes - X_train shape: (819, 3066)
naive_bayes - X_test shape: (205, 3066)
naive_bayes Train set score: 0.9060
naive_bayes Test set score: 0.7951
naive_bayes Cross-validation scores: [0.79268293 0.82317073 0.79268293 0.7195122  0.79754601]
naive_bayes Cross-validation mean score: 0.7851

Training svm...
svm - X_train shape: (819, 3066)
svm - X_test shape: (205, 3066)
svm Train set score: 0.9035
svm Test set score: 0.7902
svm Cross-validation scores: [0.75609756 0.76829268 0.82317073 0.83536585 0.80981595]
svm Cross-validation mean score: 0.7985

Training hard_model...
hard_model - X_train shape: (819, 3066)
hard_model - X_test shape: (205, 3066)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


hard_model Train set score: 0.9902
hard_model Test set score: 0.8146
hard_model Cross-validation scores: [0.76219512 0.80487805 0.84146341 0.84756098 0.82822086]
hard_model Cross-validation mean score: 0.8169

Training soft_model...
soft_model - X_train shape: (819, 3066)
soft_model - X_test shape: (205, 3066)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


soft_model Train set score: 0.9890
soft_model Test set score: 0.8146
soft_model Cross-validation scores: [0.82317073 0.83536585 0.83536585 0.82926829 0.85276074]
soft_model Cross-validation mean score: 0.8352

text_eomi_giho_essential_removeNN_morphed_case_complete
Training extra_trees...
extra_trees - X_train shape: (819, 11061)
extra_trees - X_test shape: (205, 11061)
extra_trees Train set score: 0.9988
extra_trees Test set score: 0.8537
extra_trees Cross-validation scores: [0.84146341 0.79268293 0.84756098 0.85365854 0.79141104]
extra_trees Cross-validation mean score: 0.8254

Training ridge_classifier...
ridge_classifier - X_train shape: (819, 11061)
ridge_classifier - X_test shape: (205, 11061)
ridge_classifier Train set score: 0.9988
ridge_classifier Test set score: 0.7707
ridge_classifier Cross-validation scores: [0.73780488 0.7804878  0.75       0.81707317 0.7791411 ]
ridge_classifier Cross-validation mean score: 0.7729

Training logistic_regression...
logistic_regression - X_t

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

logistic_regression Train set score: 0.9988
logistic_regression Test set score: 0.8634
logistic_regression Cross-validation scores: [0.82926829 0.81097561 0.84756098 0.86585366 0.81595092]
logistic_regression Cross-validation mean score: 0.8339

Training naive_bayes...
naive_bayes - X_train shape: (819, 11061)
naive_bayes - X_test shape: (205, 11061)
naive_bayes Train set score: 0.9768
naive_bayes Test set score: 0.8488
naive_bayes Cross-validation scores: [0.84756098 0.80487805 0.80487805 0.82926829 0.81595092]
naive_bayes Cross-validation mean score: 0.8205

Training svm...
svm - X_train shape: (819, 11061)
svm - X_test shape: (205, 11061)
svm Train set score: 0.9280
svm Test set score: 0.8146
svm Cross-validation scores: [0.82926829 0.78658537 0.79268293 0.84756098 0.77300613]
svm Cross-validation mean score: 0.8058

Training hard_model...
hard_model - X_train shape: (819, 11061)
hard_model - X_test shape: (205, 11061)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

hard_model Train set score: 0.9988
hard_model Test set score: 0.8537
hard_model Cross-validation scores: [0.8597561  0.83536585 0.85365854 0.88414634 0.83435583]
hard_model Cross-validation mean score: 0.8535

Training soft_model...
soft_model - X_train shape: (819, 11061)
soft_model - X_test shape: (205, 11061)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

soft_model Train set score: 0.9963
soft_model Test set score: 0.8634
soft_model Cross-validation scores: [0.86585366 0.84756098 0.8597561  0.88414634 0.8404908 ]
soft_model Cross-validation mean score: 0.8596

text_morphed_masked_case_complete
Training extra_trees...
extra_trees - X_train shape: (819, 9873)
extra_trees - X_test shape: (205, 9873)
extra_trees Train set score: 0.9988
extra_trees Test set score: 0.8098
extra_trees Cross-validation scores: [0.82317073 0.80487805 0.84756098 0.83536585 0.79141104]
extra_trees Cross-validation mean score: 0.8205

Training ridge_classifier...
ridge_classifier - X_train shape: (819, 9873)
ridge_classifier - X_test shape: (205, 9873)
ridge_classifier Train set score: 0.9988
ridge_classifier Test set score: 0.7463
ridge_classifier Cross-validation scores: [0.75609756 0.7804878  0.75609756 0.81707317 0.7791411 ]
ridge_classifier Cross-validation mean score: 0.7778

Training logistic_regression...
logistic_regression - X_train shape: (819, 9873)
lo

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

logistic_regression Train set score: 0.9976
logistic_regression Test set score: 0.8244
logistic_regression Cross-validation scores: [0.79878049 0.81097561 0.84146341 0.84756098 0.82208589]
logistic_regression Cross-validation mean score: 0.8242

Training naive_bayes...
naive_bayes - X_train shape: (819, 9873)
naive_bayes - X_test shape: (205, 9873)
naive_bayes Train set score: 0.9707
naive_bayes Test set score: 0.8634
naive_bayes Cross-validation scores: [0.83536585 0.82317073 0.78658537 0.80487805 0.79141104]
naive_bayes Cross-validation mean score: 0.8083

Training svm...
svm - X_train shape: (819, 9873)
svm - X_test shape: (205, 9873)
svm Train set score: 0.9243
svm Test set score: 0.7902
svm Cross-validation scores: [0.79268293 0.79268293 0.75609756 0.83536585 0.74233129]
svm Cross-validation mean score: 0.7838

Training hard_model...
hard_model - X_train shape: (819, 9873)
hard_model - X_test shape: (205, 9873)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

hard_model Train set score: 0.9976
hard_model Test set score: 0.8537
hard_model Cross-validation scores: [0.81707317 0.81707317 0.85365854 0.8597561  0.81595092]
hard_model Cross-validation mean score: 0.8327

Training soft_model...
soft_model - X_train shape: (819, 9873)
soft_model - X_test shape: (205, 9873)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

soft_model Train set score: 0.9939
soft_model Test set score: 0.8683
soft_model Cross-validation scores: [0.83536585 0.8597561  0.85365854 0.8597561  0.82208589]
soft_model Cross-validation mean score: 0.8461

text_essential_morphed_masked_case_complete
Training extra_trees...
extra_trees - X_train shape: (819, 2925)
extra_trees - X_test shape: (205, 2925)
extra_trees Train set score: 0.9976
extra_trees Test set score: 0.7902
extra_trees Cross-validation scores: [0.76829268 0.76219512 0.78658537 0.83536585 0.80368098]
extra_trees Cross-validation mean score: 0.7912

Training ridge_classifier...
ridge_classifier - X_train shape: (819, 2925)
ridge_classifier - X_test shape: (205, 2925)
ridge_classifier Train set score: 0.9963
ridge_classifier Test set score: 0.7073
ridge_classifier Cross-validation scores: [0.70731707 0.70121951 0.73170732 0.7195122  0.74846626]
ridge_classifier Cross-validation mean score: 0.7216

Training logistic_regression...
logistic_regression - X_train shape: (819

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

logistic_regression Train set score: 0.9988
logistic_regression Test set score: 0.8341
logistic_regression Cross-validation scores: [0.82926829 0.84756098 0.8597561  0.88414634 0.83435583]
logistic_regression Cross-validation mean score: 0.8510

Training naive_bayes...
naive_bayes - X_train shape: (819, 10013)
naive_bayes - X_test shape: (205, 10013)
naive_bayes Train set score: 0.9719
naive_bayes Test set score: 0.8585
naive_bayes Cross-validation scores: [0.8597561  0.80487805 0.79878049 0.81707317 0.80368098]
naive_bayes Cross-validation mean score: 0.8168

Training svm...
svm - X_train shape: (819, 10013)
svm - X_test shape: (205, 10013)
svm Train set score: 0.9304
svm Test set score: 0.8195
svm Cross-validation scores: [0.81707317 0.78658537 0.77439024 0.86585366 0.7607362 ]
svm Cross-validation mean score: 0.8009

Training hard_model...
hard_model - X_train shape: (819, 10013)
hard_model - X_test shape: (205, 10013)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

hard_model Train set score: 0.9988
hard_model Test set score: 0.8439
hard_model Cross-validation scores: [0.85365854 0.8597561  0.87195122 0.88414634 0.82822086]
hard_model Cross-validation mean score: 0.8595

Training soft_model...
soft_model - X_train shape: (819, 10013)
soft_model - X_test shape: (205, 10013)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

soft_model Train set score: 0.9963
soft_model Test set score: 0.8439
soft_model Cross-validation scores: [0.87804878 0.8597561  0.8902439  0.8597561  0.8404908 ]
soft_model Cross-validation mean score: 0.8657

text_eomi_giho_essential_morphed_masked_case_complete
Training extra_trees...
extra_trees - X_train shape: (819, 3066)
extra_trees - X_test shape: (205, 3066)
extra_trees Train set score: 0.9988
extra_trees Test set score: 0.8244
extra_trees Cross-validation scores: [0.81097561 0.79878049 0.81707317 0.81707317 0.80368098]
extra_trees Cross-validation mean score: 0.8095

Training ridge_classifier...
ridge_classifier - X_train shape: (819, 3066)
ridge_classifier - X_test shape: (205, 3066)
ridge_classifier Train set score: 0.9963
ridge_classifier Test set score: 0.7268
ridge_classifier Cross-validation scores: [0.7195122  0.70121951 0.72560976 0.77439024 0.74233129]
ridge_classifier Cross-validation mean score: 0.7326

Training logistic_regression...
logistic_regression - X_train s

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


logistic_regression Train set score: 0.9866
logistic_regression Test set score: 0.8049
logistic_regression Cross-validation scores: [0.76219512 0.78658537 0.83536585 0.81097561 0.82208589]
logistic_regression Cross-validation mean score: 0.8034

Training naive_bayes...
naive_bayes - X_train shape: (819, 3066)
naive_bayes - X_test shape: (205, 3066)
naive_bayes Train set score: 0.9060
naive_bayes Test set score: 0.7951
naive_bayes Cross-validation scores: [0.79268293 0.82317073 0.79268293 0.7195122  0.79754601]
naive_bayes Cross-validation mean score: 0.7851

Training svm...
svm - X_train shape: (819, 3066)
svm - X_test shape: (205, 3066)
svm Train set score: 0.9035
svm Test set score: 0.7902
svm Cross-validation scores: [0.75609756 0.76829268 0.82317073 0.83536585 0.80981595]
svm Cross-validation mean score: 0.7985

Training hard_model...
hard_model - X_train shape: (819, 3066)
hard_model - X_test shape: (205, 3066)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


hard_model Train set score: 0.9902
hard_model Test set score: 0.8146
hard_model Cross-validation scores: [0.77439024 0.79878049 0.82926829 0.84756098 0.84662577]
hard_model Cross-validation mean score: 0.8193

Training soft_model...
soft_model - X_train shape: (819, 3066)
soft_model - X_test shape: (205, 3066)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


soft_model Train set score: 0.9890
soft_model Test set score: 0.8244
soft_model Cross-validation scores: [0.81707317 0.83536585 0.83536585 0.82317073 0.85889571]
soft_model Cross-validation mean score: 0.8340

text_eomi_giho_essential_removeNN_morphed_masked_case_complete
All models have been trained and saved.


## TfidfVectorizer 명사제거, 마스킹 X, 파라미터 default - 0726

In [38]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingClassifier
import joblib

# 데이터 불러오기
os.chdir('C:/Users/USER/Desktop/핵심역량 프로젝트/데이터/라벨링 데이터')

data1 = pd.read_csv('0726_masking_labeling_data_0.csv')
data2 = pd.read_csv('0726_masking_labeling_data_1.csv')

data1 = data1.rename(columns={'text_remove_noun_morphed':'text_remove_noun_morphed'})
data2 = data2.rename(columns={'text_remove_noun_morphed':'text_remove_noun_morphed'})

data1 = data1.rename(columns={'저격성 민원':'저격성민원_'})
data2 = data2.rename(columns={'저격성 민원':'저격성민원_'})

selected_columns1 = ['text_remove_noun_morphed', 'aggr', '욕설_모욕', '비꼼_시비', '성희롱', '요지불명', '저격성민원_']
data1 = data1[selected_columns1]
data2 = data2[selected_columns1]
data1 = data1[:366]
data2 = data2[data2.aggr == 1]

data3 = pd.concat([data1, data2])
# 텍스트와 라벨 분리
data_contents = data3['text_remove_noun_morphed']
data_labeling = data3['aggr']

# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(data_contents, data_labeling, stratify=data_labeling, test_size=0.2, random_state=42)

# 파이프라인 정의
pipelines = {

    'extra_trees': Pipeline([
        ('vect', TfidfVectorizer()),
        ('et', ExtraTreesClassifier())
    ]),

    'ridge_classifier': Pipeline([
        ('vect', TfidfVectorizer()),
        ('ridge', RidgeClassifier())
    ]),

    'logistic_regression': Pipeline([
        ('vect', TfidfVectorizer()),
        ('lr', LogisticRegression())
    ]),

    'naive_bayes': Pipeline([
        ('vect', TfidfVectorizer()),
        ('nb', MultinomialNB())
    ]),

    'svm': Pipeline([
        ('vect', TfidfVectorizer()),
        ('svm', SVC())
    ]),


    'hard_model' : Pipeline([
        ('vect', TfidfVectorizer()),
        ('hard_model',VotingClassifier([('LR',LogisticRegression()),('ridge',RidgeClassifier()),('svm',SVC()),('et',ExtraTreesClassifier()),('nb',MultinomialNB())],voting='hard'))

    ]),

    'soft_model' : Pipeline([
        ('vect', TfidfVectorizer()),
        ('soft_model',VotingClassifier([('LR',LogisticRegression()),('svm',SVC(probability=True)),('et',ExtraTreesClassifier()),('nb',MultinomialNB())],voting='soft'))

    ])
    

}

# 모델 훈련 및 평가
for model_name, pipeline in pipelines.items():
    print(f"Training {model_name}...")

    # 텍스트 데이터를 TF-IDF 벡터로 변환
    X_train_tfidf = pipeline.named_steps['vect'].fit_transform(X_train)
    X_test_tfidf = pipeline.named_steps['vect'].transform(X_test)
    
    # 차원 출력
    print(f"{model_name} - X_train shape: {X_train_tfidf.shape}")
    print(f"{model_name} - X_test shape: {X_test_tfidf.shape}")
    
    # 모델 훈련
    pipeline.fit(X_train, y_train)
    
    # 교차검증 점수 계산
    cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='accuracy')
    
    # 성능 출력
    train_score = pipeline.score(X_train, y_train)
    test_score = pipeline.score(X_test, y_test)
    print(f"{model_name} Train set score: {train_score:.4f}")
    print(f"{model_name} Test set score: {test_score:.4f}")
    print(f"{model_name} Cross-validation scores: {cv_scores}")
    print(f"{model_name} Cross-validation mean score: {cv_scores.mean():.4f}\n")
    
    # 모델 저장
    model_save_path = os.path.join('C:/Users/USER/Desktop/핵심역량 프로젝트/데이터/모델/', f'aggr_{model_name}_tfidf_nonemasking_model.pkl')
    joblib.dump(pipeline, model_save_path)

print("All models have been trained and saved.")

Training extra_trees...
extra_trees - X_train shape: (585, 2620)
extra_trees - X_test shape: (147, 2620)
extra_trees Train set score: 0.9983
extra_trees Test set score: 0.7483
extra_trees Cross-validation scores: [0.75213675 0.74358974 0.73504274 0.7008547  0.73504274]
extra_trees Cross-validation mean score: 0.7333

Training ridge_classifier...
ridge_classifier - X_train shape: (585, 2620)
ridge_classifier - X_test shape: (147, 2620)
ridge_classifier Train set score: 0.9846
ridge_classifier Test set score: 0.8027
ridge_classifier Cross-validation scores: [0.76923077 0.73504274 0.70940171 0.67521368 0.70940171]
ridge_classifier Cross-validation mean score: 0.7197

Training logistic_regression...
logistic_regression - X_train shape: (585, 2620)
logistic_regression - X_test shape: (147, 2620)
logistic_regression Train set score: 0.9248
logistic_regression Test set score: 0.7823
logistic_regression Cross-validation scores: [0.74358974 0.70940171 0.73504274 0.73504274 0.70940171]
logistic_

## TfidfVectorizer 마스킹 X, 파라미터 default - best 그리드 서치

In [2]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import ExtraTreesClassifier, VotingClassifier
from sklearn.pipeline import Pipeline
import joblib

# 데이터 불러오기
os.chdir('C:/Users/USER/Desktop/핵심역량 프로젝트/데이터/라벨링 데이터')

data = pd.read_csv('0727_masking_labeling_data.csv')

selected_columns1 = ['text_morphed','title_morphed', 'aggr', '욕설_모욕', '비꼼_시비', '성희롱', '요지불명', '저격성 민원']
data1 = data[selected_columns1][data.aggr == 0]
data2 = data[selected_columns1][data.aggr == 1]
data1 = data1[300:812]
data2 = data2


data3 = pd.concat([data1, data2])
# 텍스트와 라벨 분리
data_contents = data3['text_morphed'] + ' ' + data3['title_morphed']
# 
# 원래의 'text_morphed_'와 'title' 컬럼 삭제
data_labeling = data3['aggr']

# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(data_contents, data_labeling, stratify=data_labeling, test_size=0.2, random_state=42)

# 파이프라인 정의
pipelines = {
    'extra_trees': Pipeline([
        ('vect', TfidfVectorizer()),
        ('et', ExtraTreesClassifier())
    ]),
    'ridge_classifier': Pipeline([
        ('vect', TfidfVectorizer()),
        ('ridge', RidgeClassifier())
    ]),
    'logistic_regression': Pipeline([
        ('vect', TfidfVectorizer()),
        ('lr', LogisticRegression())
    ]),
    'naive_bayes': Pipeline([
        ('vect', TfidfVectorizer()),
        ('nb', MultinomialNB())
    ]),
    'svm': Pipeline([
        ('vect', TfidfVectorizer()),
        ('svm', SVC())
    ]),
    'hard_model' : Pipeline([
        ('vect', TfidfVectorizer()),
        ('hard_model',VotingClassifier([('LR',LogisticRegression()),('ridge',RidgeClassifier()),('svm',SVC()),('et',ExtraTreesClassifier()),('nb',MultinomialNB())],voting='hard'))

    ]),

    'soft_model' : Pipeline([
        ('vect', TfidfVectorizer()),
        ('soft_model',VotingClassifier([('LR',LogisticRegression()),('svm',SVC(probability=True)),('et',ExtraTreesClassifier()),('nb',MultinomialNB())],voting='soft'))
    ])
}

# 하이퍼파라미터 그리드 설정
param_grids = {
    'extra_trees': {
        'vect__ngram_range': [(1, 1), (1, 2)],
        'vect__max_df': [0.9, 1.0],
        'vect__min_df': [1, 2],
        'vect__max_features': [None, 1000],
        'et__n_estimators': [100],
        'et__max_features': ['auto', 'sqrt'],
        'et__max_depth': [None, 50],
    },
    'ridge_classifier': {
        'vect__ngram_range': [(1, 1), (1, 2)],
        'vect__max_df': [0.9, 1.0],
        'vect__min_df': [1, 2],
        'vect__max_features': [None, 1000],
        'ridge__alpha': [0.1, 1, 10],
    },
    'logistic_regression': {
        'vect__ngram_range': [(1, 1), (1, 2)],
        'vect__max_df': [0.9, 1.0],
        'vect__min_df': [1, 2],
        'vect__max_features': [None, 1000],
        'lr__C': [0.1, 1, 10],
        'lr__penalty': ['l1', 'l2'],
        'lr__solver': ['liblinear'],
    },
    'naive_bayes': {
        'vect__ngram_range': [(1, 1), (1, 2)],
        'vect__max_df': [0.9, 1.0],
        'vect__min_df': [1, 2],
        'vect__max_features': [None, 1000],
        'nb__alpha': [0.1, 1, 10],
    },
    'svm': {
        'vect__ngram_range': [(1, 1), (1, 2)],
        'vect__max_df': [0.9, 1.0],
        'vect__min_df': [1, 2],
        'vect__max_features': [None, 1000],
        'svm__C': [0.1, 1, 10],
        'svm__kernel': ['linear', 'rbf'],
        'svm__gamma': ['scale'],
    },
    'hard_model': {
        'vect__ngram_range': [(1, 1), (1, 2)],
        'vect__max_df': [0.9, 1.0],
        'vect__min_df': [1, 2],
        'vect__max_features': [None, 1000],
        'hard_model__LR__C': [0.1, 1],
        'hard_model__ridge__alpha': [0.1, 1],
        'hard_model__svm__C': [0.1, 1],
    },
    'soft_model': {
        'vect__ngram_range': [(1, 1), (1, 2)],
        'vect__max_df': [0.9, 1.0],
        'vect__min_df': [1, 2],
        'vect__max_features': [None, 1000],
        'soft_model__LR__C': [0.1, 1],
        'soft_model__svm__C': [0.1, 1],
        'soft_model__et__n_estimators': [100],
        'soft_model__nb__alpha': [0.1, 1],
    },
}

# 모델 훈련 및 평가
for model_name, pipeline in pipelines.items():
    print(f"Training {model_name}...")

    # 그리드 서치 설정
    param_grid = param_grids[model_name]
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
    
    # 그리드 서치 훈련
    grid_search.fit(X_train, y_train)
    
    # 최적 하이퍼파라미터 출력
    print(f"Best parameters for {model_name}: {grid_search.best_params_}")
    
    # 최적 모델 평가
    best_model = grid_search.best_estimator_
    train_score = best_model.score(X_train, y_train)
    test_score = best_model.score(X_test, y_test)
    
    # 교차 검증 점수 계산
    cv_scores = cross_val_score(best_model, X_train, y_train, cv=5, scoring='accuracy')
    
    # 성능 출력
    print(f"{model_name} Train set score: {train_score:.4f}")
    print(f"{model_name} Test set score: {test_score:.4f}")
    print(f"{model_name} Cross-validation scores: {cv_scores}")
    print(f"{model_name} Cross-validation mean score: {cv_scores.mean():.4f}\n")
    
    # 모델 저장
    model_save_path = os.path.join('C:/Users/USER/Desktop/핵심역량 프로젝트/데이터/모델/', f'aggr_{model_name}_tfidf_nonemasking_model.pkl')
    joblib.dump(best_model, model_save_path)

print("All models have been trained and saved.")



Training extra_trees...


KeyboardInterrupt: 