In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import optuna
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

# Read the three split files, then derive the model inputs (every column except
# 'category' and 'title') and the targets (the 'category' column) for each split.
df_train, df_valid, df_test = (
    pd.read_csv(path)
    for path in ('train.feature.txt', 'valid.feature.txt', 'test.feature.txt')
)
X_train, X_valid, X_test = (
    frame.drop(columns=['category', 'title'])
    for frame in (df_train, df_valid, df_test)
)
y_train, y_valid, y_test = (
    frame['category'] for frame in (df_train, df_valid, df_test)
)

def objective_lr(trial):
    """Optuna objective: validation accuracy of an elastic-net logistic regression.

    Asks ``trial`` for an integer ``random_state`` in [0, 10] and an
    ``l1_ratio`` in [0, 1] (step 0.1), fits a SAGA-solved elastic-net
    ``LogisticRegression`` (``max_iter=1000``) on the module-level
    ``X_train``/``y_train``, and returns its accuracy on ``X_valid``/``y_valid``.
    """
    # The suggestions are requested in the same order as the parameter names
    # appear here; dict values are evaluated top to bottom.
    sampled = {
        'random_state': trial.suggest_int('random_state', 0, 10),
        'l1_ratio': trial.suggest_float('l1_ratio', 0, 1, step=0.1),
    }
    classifier = LogisticRegression(
        penalty='elasticnet', solver='saga', max_iter=1000, **sampled
    )
    classifier.fit(X_train, y_train)
    valid_predictions = classifier.predict(X_valid)
    return accuracy_score(y_valid, valid_predictions)

def objective_rfc(trial):
    """Optuna objective: validation accuracy of a random forest classifier.

    Asks ``trial`` for ``n_estimators`` in [90, 110], ``max_depth`` in [1, 5]
    and ``random_state`` in [0, 10] (all integers), fits a
    ``RandomForestClassifier`` on the module-level ``X_train``/``y_train``,
    and returns its accuracy on ``X_valid``/``y_valid``.
    """
    # The suggestions are requested in the same order as the parameter names
    # appear here; dict values are evaluated top to bottom.
    sampled = {
        'n_estimators': trial.suggest_int('n_estimators', 90, 110),
        'max_depth': trial.suggest_int('max_depth', 1, 5),
        'random_state': trial.suggest_int('random_state', 0, 10),
    }
    forest = RandomForestClassifier(**sampled)
    forest.fit(X_train, y_train)
    valid_predictions = forest.predict(X_valid)
    return accuracy_score(y_valid, valid_predictions)

# Random search (sampler seeded with 0) over objective_lr for 20 trials,
# keeping the trial with the highest returned score.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.RandomSampler(seed=0),
)
study.optimize(objective_lr, n_trials=20)

# Refit on the training split using the best trial's parameters. best_params
# carries the 'random_state' and 'l1_ratio' values suggested in objective_lr,
# so it can be unpacked straight into the constructor.
model_lr = LogisticRegression(
    penalty='elasticnet', solver='saga', max_iter=1000, **study.best_params
).fit(X_train, y_train)

print('logistic regression')
print('train:', accuracy_score(y_train, model_lr.predict(X_train)))
print('test:', accuracy_score(y_test, model_lr.predict(X_test)))

# Random search (sampler seeded with 0) over objective_rfc for 20 trials,
# keeping the trial with the highest returned score. Rebinds `study`.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.RandomSampler(seed=0),
)
study.optimize(objective_rfc, n_trials=20)

# Refit on the training split using the best trial's parameters. best_params
# carries the 'n_estimators', 'max_depth' and 'random_state' values suggested
# in objective_rfc, which are exactly the constructor arguments needed.
model_rfc = RandomForestClassifier(**study.best_params).fit(X_train, y_train)

print('random forest classifier')
print('train:', accuracy_score(y_train, model_rfc.predict(X_train)))
print('test:', accuracy_score(y_test, model_rfc.predict(X_test)))

[I 2023-09-18 21:30:06,117] A new study created in memory with name: no-name-3fa6d95f-48bd-4709-9c08-781e4a6de43a
[I 2023-09-18 21:30:21,165] Trial 0 finished with value: 0.7316341829085458 and parameters: {'random_state': 6, 'l1_ratio': 0.7000000000000001}. Best is trial 0 with value: 0.7316341829085458.
[I 2023-09-18 21:30:32,113] Trial 1 finished with value: 0.7301349325337332 and parameters: {'random_state': 6, 'l1_ratio': 0.5}. Best is trial 0 with value: 0.7316341829085458.
[I 2023-09-18 21:30:48,485] Trial 2 finished with value: 0.7316341829085458 and parameters: {'random_state': 4, 'l1_ratio': 0.7000000000000001}. Best is trial 0 with value: 0.7316341829085458.
[I 2023-09-18 21:31:20,914] Trial 3 finished with value: 0.7361319340329835 and parameters: {'random_state': 4, 'l1_ratio': 0.9}. Best is trial 3 with value: 0.7361319340329835.
[I 2023-09-18 21:31:30,955] Trial 4 finished with value: 0.7293853073463268 and parameters: {'random_state': 10, 'l1_ratio': 0.4}. Best is trial

logistic regression
train: 0.7391304347826086
test: 0.7338830584707646


[I 2023-09-18 21:37:08,136] Trial 0 finished with value: 0.7256371814092953 and parameters: {'n_estimators': 101, 'max_depth': 4, 'random_state': 6}. Best is trial 0 with value: 0.7256371814092953.
[I 2023-09-18 21:37:11,388] Trial 1 finished with value: 0.7263868065967016 and parameters: {'n_estimators': 101, 'max_depth': 3, 'random_state': 7}. Best is trial 1 with value: 0.7263868065967016.
[I 2023-09-18 21:37:16,246] Trial 2 finished with value: 0.7286356821589205 and parameters: {'n_estimators': 99, 'max_depth': 5, 'random_state': 10}. Best is trial 2 with value: 0.7286356821589205.
[I 2023-09-18 21:37:20,321] Trial 3 finished with value: 0.7256371814092953 and parameters: {'n_estimators': 98, 'max_depth': 4, 'random_state': 5}. Best is trial 2 with value: 0.7286356821589205.
[I 2023-09-18 21:37:25,249] Trial 4 finished with value: 0.7293853073463268 and parameters: {'n_estimators': 101, 'max_depth': 5, 'random_state': 0}. Best is trial 4 with value: 0.7293853073463268.
[I 2023-09-

random forest classifier
train: 0.736319340329835
test: 0.7233883058470765


### logistic regression
parameters: {'random_state': 6, 'l1_ratio': 1.0}  
train: 0.7391304347826086  
valid: 0.7368815592203898  
test: 0.7338830584707646
### random forest classifier
parameters: {'n_estimators': 101, 'max_depth': 5, 'random_state': 0}  
train: 0.736319340329835  
valid: 0.7293853073463268  
test: 0.7233883058470765