In [None]:
# Show which Python interpreter version this runtime provides.
!python --version

In [None]:
# List the packages (and versions) already installed in this runtime.
!pip list

In [None]:
!pip install transformers sentencepiece==0.1.97 fugashi ipadic datasets unidic-lite

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load the labelled data set from CSV (presumably a mounted Google Drive path on Colab).
# The later cells read the columns 'label', 'from', 'title' and 'content' from this frame.
row_data = pd.read_csv('/content/drive/MyDrive/ML/new_data_v2.csv')

# Preview the first rows.
row_data.head()

In [None]:
def transform_label(x):
  """Convert a raw label string into a zero-based integer class id.

  A label containing the letter "a" (e.g. "3a") is reduced to its first
  character before conversion; any other label is converted as-is.
  The result is that integer value minus one.
  """
  digits = x[0] if "a" in x else x
  return int(digits) - 1

# Replace the raw labels with zero-based integer class ids.
# NOTE: the column is overwritten in place, so running this cell a second time
# passes ints to transform_label, whose `"a" in x` check raises TypeError on an int.
row_data["label"] = row_data["label"].map(transform_label)
row_data.head()

In [None]:
# from transformers import pipeline

# classifier = pipeline("summarization", model="megagonlabs/t5-base-japanese-web", tokenizer="megagonlabs/t5-base-japanese-web")

# def transform_content(x):
#   summary_content = classifier(x, truncation=True)[0]["summary_text"]
#   print("summary content: ")
#   print(summary_content)
#   return summary_content

import re

def transform_content(x):
  """Strip decorative separators and line breaks from a text.

  Every occurrence of "--", "ーー", "＝＝", "==", "━━", "__", "…" and of the
  line breaks CRLF, LF and CR is deleted; everything else is returned
  unchanged.
  """
  # Disabled alternative that also removed URLs:
  # return re.sub(r"https?://[\w!?/+\-_~;.,*&@#$%()'[\]]+", '',re.sub("\r\n|\n|\r ", "", x))
  noise = re.compile("--|ーー|＝＝|==|━━|__|…|\r\n|\n|\r")
  return noise.sub("", x)

# Clean every 'content' value with transform_content, overwriting the column.
row_data["content"] = row_data["content"].map(transform_content)
row_data.head()

# row_data.to_csv('/content/drive/MyDrive/ML/new_data_v2_summarized.csv')

In [None]:
def transform_merged(x, max_chars=2000):
  """Truncate a text to at most ``max_chars`` leading characters.

  Args:
    x: Sliceable value, normally a string.
    max_chars: Maximum number of leading characters to keep. Defaults to
      2000, the limit that used to be hard-coded. ``None`` keeps the
      whole text.

  Returns:
    ``x[:max_chars]``; values shorter than the limit come back unchanged.
  """
  return x[:max_chars]
# Cap each 'content' value with transform_merged (first 2000 characters)
# before it is joined with the other columns in the next cell.
row_data["content"] = row_data["content"].map(transform_merged)
row_data.head()

In [None]:
# Build the model input text: the 'from', 'title' and 'content' columns
# joined in that order with "[SEP]" between them.
row_data["merged"] = row_data['from'].str.cat(
    [row_data['title'], row_data['content']],
    sep='[SEP]',
)
# The three source columns are dropped once they have been merged.
row_data = row_data.drop(columns=['from', 'title', 'content'])
row_data.head()

In [None]:
from sklearn.model_selection import train_test_split

# First split: 60% of the rows for training, 40% held out (shuffled, fixed seed).
train, valid_test = train_test_split(row_data, test_size=0.4, shuffle=True, random_state=1)
# Second split: halve the held-out rows into validation and test (about 20% of all rows each).
valid, test = train_test_split(valid_test, test_size=0.5, shuffle=True, random_state=1)

In [None]:
from datasets import Dataset, DatasetDict

# Wrap each pandas split in a Dataset and group the three under their split names.
dataset = DatasetDict({
    split_name: Dataset.from_pandas(frame)
    for split_name, frame in (("train", train), ("test", test), ("validation", valid))
})
dataset["train"].features

In [None]:
from transformers import AutoTokenizer
   
# Pretrained checkpoint; used for the tokenizer here and for the classifier further down.
model_ckpt = "cl-tohoku/bert-base-japanese-v2"
# Alternative checkpoints (disabled):
# model_ckpt = "nlp-waseda/roberta-base-japanese"
# model_ckpt="bandainamco-mirai/distilbert-base-japanese"
# model_ckpt = "ken11/albert-base-japanese-v1"

# Load the tokenizer that belongs to the selected checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
# tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-whole-word-masking")

In [None]:
def tokenize(batch):
    """Tokenize the "merged" texts of a batch with the module-level tokenizer.

    Sequences are truncated to at most 512 tokens and padded to a common
    length within the batch.
    """
    texts = batch["merged"]
    return tokenizer(texts, max_length=512, truncation=True, padding=True)

# Tokenize every split.
# NOTE(review): `datasets` documents batch_size=None as "the whole split in one batch",
# which would make padding=True pad each split to one common length — confirm
# against the installed version.
dataset_encoded = dataset.map(tokenize, batched=True, batch_size=None)

In [None]:
import torch
from transformers import AutoModelForSequenceClassification, AutoConfig

# Number of target classes for the classification head.
num_labels = 7
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pretrained checkpoint with a classification head sized for
# `num_labels`, then move it to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=num_labels,
)
model = model.to(device)

In [None]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    """Compute accuracy and macro-averaged F1 for a prediction object.

    `pred` must expose `label_ids` (true class ids) and `predictions`
    (per-class scores); the predicted class is the arg-max over the last axis.

    Returns:
        dict with the keys "accuracy" and "f1".
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_hat),
        "f1": f1_score(y_true, y_hat, average="macro"),
    }

In [None]:
from transformers import TrainingArguments

# Per-device batch size, passed to TrainingArguments for both training and evaluation.
batch_size = 16
# NOTE(review): set to the number of training examples, but never passed to
# TrainingArguments in the visible code — looks unused.
logging_steps = len(dataset_encoded["train"])
# Passed to TrainingArguments as the output directory.
model_name = "mail_classification_model"

# Dropout
# model = (AutoModelForSequenceClassification
#     .from_pretrained(
#         model_ckpt,
#         config=AutoConfig.from_pretrained(
#           model_ckpt,
#           hidden_dropout_prob=0.2,
#           attention_probs_dropout_prob=0.2,
#           num_labels=num_labels
#         )
#     )
#     .to(device)
# )

# Training configuration: 4 epochs, with evaluation, logging and checkpointing
# all set to the 'epoch' strategy.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy`; transformers is installed unpinned in this notebook, so
# confirm this keyword against the installed version.
training_args = TrainingArguments(
    output_dir=model_name,
    num_train_epochs=4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.0001,
    evaluation_strategy='epoch',
    logging_strategy='epoch',
    save_strategy='epoch',
    push_to_hub=False,
    # learning_rate=5e-05,
    # label_smoothing_factor=0.1
)

In [None]:
from transformers import Trainer

# class CustomTrainer(Trainer):
#     # focal loss
#     def compute_loss(self, model, inputs, return_outputs=False):
#         labels = inputs.pop("labels")
#         outputs = model(**inputs)
#         logits = outputs[0]
#         alpha = 0.25
#         gamma = 2
#         ce_loss = torch.nn.CrossEntropyLoss(reduction='none')(logits, labels)
#         pt = torch.exp(-ce_loss)
#         focal_loss = alpha * (1-pt)**gamma * ce_loss
#         loss = torch.mean(focal_loss)
#         return (loss, outputs) if return_outputs else loss

# Fine-tune the classifier: train on the "train" split, evaluate on the
# "validation" split and score with compute_metrics.
# NOTE(review): recent transformers versions appear to deprecate Trainer's
# `tokenizer` argument in favour of `processing_class` — confirm against the
# installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    # data_collator=mail_collator,
    compute_metrics=compute_metrics,
    train_dataset=dataset_encoded["train"],
    eval_dataset=dataset_encoded["validation"],
    tokenizer=tokenizer
)
# Run the training loop.
trainer.train()

In [None]:
# Run the trained model over the validation split; the result feeds the
# metrics and confusion matrix in the following cells.
preds_output = trainer.predict(dataset_encoded["validation"])

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# Report accuracy and macro-F1 for the validation predictions.
print(compute_metrics(preds_output))

# Ground-truth class ids, and predicted class ids (highest-scoring column per row).
y_valid = np.array(dataset_encoded["validation"]["label"])
y_preds = np.argmax(preds_output.predictions, axis=1)

# Class ids 0 .. num_labels-1, used as the confusion-matrix tick labels.
labels = list(range(num_labels))

In [None]:
# Human-readable name of every class id (position in the list == class id).
id2label = dict(enumerate([
    "就活 - インターン",
    "就活 - 説明会",
    "就活 - 本選考",
    "就活 - その他",
    "大学 - 課題",
    "大学 - その他",
    "その他",
]))

# Inverse mapping (name -> class id) for the first `num_labels` ids.
label2id = {id2label[class_id]: class_id for class_id in range(num_labels)}

# Attach both mappings to the trained model's config.
trainer.model.config.id2label = id2label
trainer.model.config.label2id = label2id

In [None]:
from matplotlib.axis import font_manager
def plot_confusion_matrix(y_preds, y_true, labels):
    """Plot the confusion matrix of predictions against the ground truth.

    Args:
        y_preds: Predicted class ids.
        y_true: Ground-truth class ids.
        labels: Class ids in display order. They fix the rows/columns of the
            matrix and double as tick labels, so they must be the same
            values that occur in ``y_true`` / ``y_preds``.
    """
    # Pin the matrix to the full label set. Without `labels=`, the classes are
    # derived from the data, so a class that is absent from this split would
    # shrink the matrix and it would no longer line up with the tick labels.
    cm = confusion_matrix(y_true, y_preds, labels=labels)
    fig, ax = plt.subplots(figsize=(6, 6))
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
    # NOTE(review): the cells hold raw counts, which ".2f" renders as e.g. "12.00".
    disp.plot(cmap="Blues", values_format=".2f", ax=ax, colorbar=False)
    plt.title("Confusion matrix")
    plt.show()

# Confusion matrix for the validation split; the printed mapping translates
# the numeric tick labels into class names.
plot_confusion_matrix(y_preds, y_valid, labels)
print(id2label)

In [None]:
# test data: repeat the evaluation (metrics + confusion matrix) on the held-out test split.
preds_output_test = trainer.predict(dataset_encoded["test"])
print(compute_metrics(preds_output_test))

# Predicted and ground-truth class ids for the test split.
y_preds_test = np.argmax(preds_output_test.predictions, axis=1)
y_test = np.array(dataset_encoded["test"]["label"])

# Plot, then print the id -> name mapping for reading the numeric tick labels.
plot_confusion_matrix(y_preds_test, y_test, labels)
print(id2label)

In [None]:
# Persist the fine-tuned model under the Drive folder.
trainer.save_model("/content/drive/MyDrive/ML/mail_classification_model")