### Задание

Попробуйте поработать с датасетом юридических текстов. В датасете всего две важных колонки признаков: заголовок дела и его текст, а целевая переменная - case_outcome (мультиклассовая классификация). 

В базовом варианте можно оставить только текст дела, если хотите поинтереснее - можно попробовать распарсить case_title, добыв оттуда дополнительные признаки. 

https://www.kaggle.com/datasets/amohankumar/legal-text-classification-dataset

In [1]:
import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.metrics import classification_report, accuracy_score, f1_score

In [2]:
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer() 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kiril\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kiril\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [14]:
data = pd.read_csv('legal_text_classification.csv')
data.head()

Unnamed: 0,case_id,case_outcome,case_title,case_text
0,Case1,cited,Alpine Hardwood (Aust) Pty Ltd v Hardys Pty Lt...,Ordinarily that discretion will be exercised s...
1,Case2,cited,Black v Lipovac [1998] FCA 699 ; (1998) 217 AL...,The general principles governing the exercise ...
2,Case3,cited,Colgate Palmolive Co v Cussons Pty Ltd (1993) ...,Ordinarily that discretion will be exercised s...
3,Case4,cited,Dais Studio Pty Ltd v Bullett Creative Pty Ltd...,The general principles governing the exercise ...
4,Case5,cited,Dr Martens Australia Pty Ltd v Figgins Holding...,The preceding general principles inform the ex...


In [15]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24985 entries, 0 to 24984
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   case_id       24985 non-null  object
 1   case_outcome  24985 non-null  object
 2   case_title    24985 non-null  object
 3   case_text     24809 non-null  object
dtypes: object(4)
memory usage: 780.9+ KB


In [16]:
data.dropna(inplace=True)

In [17]:
data.case_outcome.unique()

array(['cited', 'applied', 'followed', 'referred to', 'related',
       'considered', 'discussed', 'distinguished', 'affirmed', 'approved'],
      dtype=object)

In [18]:
data.case_outcome.value_counts()

case_outcome
cited            12110
referred to       4363
applied           2438
followed          2252
considered        1699
discussed         1018
distinguished      603
related            112
approved           108
affirmed           106
Name: count, dtype: int64

Выборка несбалансирована.

In [19]:
def preprocess_text(text):
    """ 
    Предобработка текстовых данных.
    """
    text = re.sub(r'https?://\S+|www\.\S+', '', text) # Удаление ссылок
    text = text.lower() # Приведение к нижнему регистру
    text = re.sub(r'[^\w\s]', '', text) # Удаление пунктуации
    tokens = nltk.word_tokenize(text) # Токенизация
    tokens = [word for word in tokens if word not in stop_words] # Удаление стоп-слов
    tokens = [lemmatizer.lemmatize(word) for word in tokens] # Лемматизация
    return ' '.join(tokens)

In [20]:
def feature_engineering(choice_transformer, choice_ngrams):
    """ 
    Возвращает векторизатор текста с указанными параметрами. 
    Параметр max_features=5000 выбран для оптимизации скорости обработки данных и не должен существенно влиять на качество модели.
    """
    if choice_transformer == 'tfidf':
        text_transformer = TfidfVectorizer(ngram_range=choice_ngrams, max_features=5000)
    else:
        text_transformer = CountVectorizer(ngram_range=choice_ngrams, max_features=5000)

    return text_transformer

In [21]:
def modelfit(model):
    """
    Обучает переданную модель и выводит оценки точности на тестовом и обучающем наборах данных.
    """
    model.fit(Xtrain, ytrain)
    
    ypredtest = model.predict(Xtest)
    ypredtrain = model.predict(Xtrain)
    
    print(accuracy_score(ytest, ypredtest), accuracy_score(ytrain, ypredtrain))

In [22]:
data['processed'] = data['case_text'].apply(preprocess_text)
data.head()

Unnamed: 0,case_id,case_outcome,case_title,case_text,processed
0,Case1,cited,Alpine Hardwood (Aust) Pty Ltd v Hardys Pty Lt...,Ordinarily that discretion will be exercised s...,ordinarily discretion exercised cost follow ev...
1,Case2,cited,Black v Lipovac [1998] FCA 699 ; (1998) 217 AL...,The general principles governing the exercise ...,general principle governing exercise discretio...
2,Case3,cited,Colgate Palmolive Co v Cussons Pty Ltd (1993) ...,Ordinarily that discretion will be exercised s...,ordinarily discretion exercised cost follow ev...
3,Case4,cited,Dais Studio Pty Ltd v Bullett Creative Pty Ltd...,The general principles governing the exercise ...,general principle governing exercise discretio...
4,Case5,cited,Dr Martens Australia Pty Ltd v Figgins Holding...,The preceding general principles inform the ex...,preceding general principle inform exercise di...


In [23]:
X = data['processed']  
y = data['case_outcome']  

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42)

TF-IDF unigrams

In [176]:
preprocessor = feature_engineering('tfidf', (1, 1))

clfLR = Pipeline(steps=[
    ("preprocessor", preprocessor), 
    ("classifier", LogisticRegression(max_iter=1000, random_state=42))
])

clfSVC = Pipeline(steps=[
    ("preprocessor", preprocessor), 
    ("classifier", SVC(random_state=42))
])

In [177]:
modelfit(clfLR)
modelfit(clfSVC)

0.5330511890366788 0.6169698191162393
0.586658605401048 0.7908500025192724


TF-IDF bigrams

In [178]:
preprocessor = feature_engineering('tfidf', (2, 2))

clfLR = Pipeline(steps=[
    ("preprocessor", preprocessor), 
    ("classifier", LogisticRegression(max_iter=1000, random_state=42))
])

clfSVC = Pipeline(steps=[
    ("preprocessor", preprocessor), 
    ("classifier", SVC(random_state=42))
])

In [179]:
modelfit(clfLR)
modelfit(clfSVC)

0.5322450624748085 0.6055323222653298
0.5751713018943975 0.7681261651635007


In [180]:
preprocessor = feature_engineering('tfidf', (1, 2))

clfLR = Pipeline(steps=[
    ("preprocessor", preprocessor), 
    ("classifier", LogisticRegression(max_iter=1000, random_state=42))
])

clfSVC = Pipeline(steps=[
    ("preprocessor", preprocessor), 
    ("classifier", SVC(random_state=42))
])

In [181]:
modelfit(clfLR)
modelfit(clfSVC)

0.5388956066102378 0.6168690482188743
0.5900846432889963 0.7898422935456241


BOW unigrams

In [182]:
preprocessor = feature_engineering('bow', (1, 1))

clfLR = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", LogisticRegression(max_iter=1000, random_state=42))]
)

clfSVC = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", SVC(random_state=42))]
)

In [183]:
modelfit(clfLR)
modelfit(clfSVC)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.5342603788794841 0.8828034463646899
0.5004030632809351 0.5465813473068978


BOW bigrams

In [184]:
preprocessor = feature_engineering('bow', (2, 2))

clfLR = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", LogisticRegression(max_iter=1000, random_state=42))]
)

clfSVC = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", SVC(random_state=42))]
)

In [185]:
modelfit(clfLR)
modelfit(clfSVC)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.5439338976219267 0.8285887035824054
0.5110842402257154 0.5551972590315917


BOW Bagging

In [166]:
preprocessor = feature_engineering('bow', (1, 1))

bagging = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", BaggingClassifier(random_state=42, n_jobs=-1))]
)

In [167]:
modelfit(bagging)

0.5705360741636437 0.9421575049125812


In [168]:
preprocessor = feature_engineering('bow', (2, 2))

bagging = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", BaggingClassifier(random_state=42, n_jobs=-1))]
)

In [169]:
modelfit(bagging)

0.5634824667472793 0.9375220436337985


In [211]:
preprocessor = feature_engineering('bow', (1, 2))

bagging = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", BaggingClassifier(random_state=42, n_jobs=-1))]
)

In [212]:
modelfit(bagging)

0.5745667069729947 0.9431148284375472


Random Forest

In [24]:
preprocessor = feature_engineering('bow', (1, 1))

clf = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", RandomForestClassifier(random_state=42, n_jobs=-1))]
)

modelfit(clf)

0.592503022974607 0.9577266085554492


In [174]:
preprocessor = feature_engineering('bow', (2, 2))

clf = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", RandomForestClassifier(random_state=42, n_jobs=-1))]
)

modelfit(clf)

0.5923014913341395 0.9575250667607195


In [175]:
preprocessor = feature_engineering('bow', (1, 2))

clf = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", RandomForestClassifier(random_state=42, n_jobs=-1))]
)

modelfit(clf)

0.5929060862555421 0.9577266085554492


1. Комбинация униграмм и биграмм ((1, 2)) обычно дает лучшие результаты.
2. TF-IDF в целом показывает более высокие результаты, чем BOW.
3. Среди использованных классификаторов (логистическая регрессия, SVC, бэггинг, случайный лес), случайный лес показывает одни из лучших результатов, хотя стоит также отметить высокий уровень переобучения (большая разница между точностью на тренировочной и тестовой выборках).  
SVC с TF-IDF (униграммы и биграммы) оказался почти так же эффективен, как и Random Forest.

Doc2Vec

In [25]:
import gensim
import multiprocessing

from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from sklearn import utils
from tqdm import tqdm

In [62]:
def preprocess_text_doc2vec(text):
    """ 
    Предобработка текстовых данных.
    """
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # Удаление ссылок
    text = text.lower()  # Приведение к нижнему регистру
    text = re.sub(r'[^\w\s]', '', text)  # Удаление пунктуации
    tokens = nltk.word_tokenize(text)  # Токенизация
    tokens = [word for word in tokens if word not in stop_words]  # Удаление стоп-слов
    tokens = [lemmatizer.lemmatize(word) for word in tokens]  # Лемматизация
    return tokens

In [63]:
data['processed_doc2vec'] = data['case_text'].apply(preprocess_text_doc2vec)
data.head()

Unnamed: 0,case_id,case_outcome,case_title,case_text,processed,processed_doc2vec
0,Case1,cited,Alpine Hardwood (Aust) Pty Ltd v Hardys Pty Lt...,Ordinarily that discretion will be exercised s...,ordinarily discretion exercised cost follow ev...,"[ordinarily, discretion, exercised, cost, foll..."
1,Case2,cited,Black v Lipovac [1998] FCA 699 ; (1998) 217 AL...,The general principles governing the exercise ...,general principle governing exercise discretio...,"[general, principle, governing, exercise, disc..."
2,Case3,cited,Colgate Palmolive Co v Cussons Pty Ltd (1993) ...,Ordinarily that discretion will be exercised s...,ordinarily discretion exercised cost follow ev...,"[ordinarily, discretion, exercised, cost, foll..."
3,Case4,cited,Dais Studio Pty Ltd v Bullett Creative Pty Ltd...,The general principles governing the exercise ...,general principle governing exercise discretio...,"[general, principle, governing, exercise, disc..."
4,Case5,cited,Dr Martens Australia Pty Ltd v Figgins Holding...,The preceding general principles inform the ex...,preceding general principle inform exercise di...,"[preceding, general, principle, inform, exerci..."


In [64]:
train, test = train_test_split(data[['processed_doc2vec', 'case_outcome']], test_size=0.2, random_state=42)

In [65]:
# Создание TaggedDocument
train_tagged = [TaggedDocument(words=text, tags=[str(tag)]) for text, tag in zip(train['processed_doc2vec'], train['case_outcome'])]
test_tagged = [TaggedDocument(words=text, tags=[str(tag)]) for text, tag in zip(test['processed_doc2vec'], test['case_outcome'])]

In [66]:
cores = multiprocessing.cpu_count()

model_dbow = Doc2Vec(dm=0, vector_size=100, negative=5, hs=0, min_count=2, sample=0, workers=cores)
model_dbow.build_vocab(train_tagged)

for epoch in range(30):
    model_dbow.train(utils.shuffle(train_tagged), total_examples=len(train_tagged), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

In [67]:
def vec_for_learning(model, tagged_docs):
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words)) for doc in tagged_docs])
    return targets, regressors

y_train, X_train = vec_for_learning(model_dbow, train_tagged)
y_test, X_test = vec_for_learning(model_dbow, test_tagged)

logreg = LogisticRegression(solver='liblinear', n_jobs=1, C=1e5)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

print(f'Testing accuracy {accuracy_score(y_test, y_pred)}')
print(f'Testing F1 score: {f1_score(y_test, y_pred, average="weighted")}')

Testing accuracy 0.49113260781942764
Testing F1 score: 0.3248769908599491


In [69]:
random_forest_clf = RandomForestClassifier(n_estimators=500, random_state=42, n_jobs=-1)

random_forest_clf.fit(X_train, y_train)
y_pred_rf = random_forest_clf.predict(X_test)

print(f'Testing accuracy {accuracy_score(y_test, y_pred_rf)}')
print(f'Testing F1 score: {f1_score(y_test, y_pred_rf, average="weighted")}')

Testing accuracy 0.4913341394598952
Testing F1 score: 0.3265479763232657


Doc2Vec дал результаты ниже ожидаемого.  
В целом, выбор метода векторизации и классификатора довольно сильно влияет на качество.