In [1]:
# put absolute path here
DATA_DIR = r"C:\Users\User\Downloads\saos-dump-23.02.2018.tar\saos-dump-23.02.2018\data\json"

In [2]:
import os
import json

class DataManager:
    JUDGMENT_DATE_KEY = "judgmentDate"
    COURT_TYPE_KEY = "courtType"

    def __init__(self, data_dir):
        self.data_dir = data_dir

    def get_judgment_year(self, json_content):
        return int(json_content[self.JUDGMENT_DATE_KEY][:4])

    def judgments_generator(self, year='all'):
        for filename in os.listdir(self.data_dir):
            if filename.endswith('.json') and filename.startswith('judgments'):
                with open(os.path.join(self.data_dir, filename), 'r', encoding="utf8") as content_file:
                    content = content_file.read()
                    parsed = json.loads(content)
                    for judgment in parsed["items"]:
                        if year != 'all' and year == self.get_judgment_year(judgment) or year == 'all':           
                            try:
                                if(judgment[self.COURT_TYPE_KEY] in ["COMMON", "SUPREME"]):
                                    yield judgment
                            except:
                                print("not common or supreme: " + str(judgment["caseNumber"]))

data_manager = DataManager(DATA_DIR)

In [3]:
# Actual homework starts here
import re
generator = data_manager.judgments_generator(year=2009)

In [4]:
def contain_digit(x):
    return re.search(r'\d', x) is not None

def is_top_20(word):
    tops = ['na', 'do', 'art', 'nie', 'że', 'przez', 'ust', 'się', 'dnia', 'jest', 'oraz', 'ustawy', 'od', 'sąd', 'nr',
           'postępowania', 'pkt', 'tym', 'za', 'sądu']
    return word in tops

def create_bag_of_words(text):
    return [x.lower() for x in re.findall(r'\b\w\w+\b', text, re.UNICODE) 
                       if not contain_digit(x) and not is_top_20(x)]

def remove_html(x):
    return re.sub("<[^>]*>", "", x)

def remove_linebreaks(x):
    return re.sub("-\n", "", x)

In [5]:
def extract_justification(content):
    text = remove_linebreaks(remove_html(content["textContent"]))
    justification_string = 'UZASADNIENIE'
    splitted = re.split(justification_string, text, flags=re.IGNORECASE, maxsplit=1)
    if len(splitted) > 1:
        return splitted[1]
    print("No justification: %d" % content['id'])
    return False

In [6]:
import re

judgments_groups_regex = {
    r'A?C.*': ('sprawy cywilne', []),
    r'A?U.*': ('sprawy z zakresu ubezpieczenia społecznego', []),
    r'A?K.*': ('sprawy karne', []),
    r'G.*': ('sprawy gospodarcze', []),
    r'A?P.*': ('sprawy w zakresie prawa pracy', []),
    r'R.*': ('sprawy w zakresie prawa rodzinnego', []),
    r'W.*': ('sprawy o wykroczenia', []),
    r'Am.*': ('sprawy w zakresie prawa konkurencji', []),
}

def assign_to_group(content):
    for judg_regex in judgments_groups_regex.keys():
        if re.match(judg_regex, content['courtCases'][0]['caseNumber'].split()[1]):
            justification = extract_justification(content)
            if justification:
                judgments_groups_regex[judg_regex][1].append(create_bag_of_words(justification))
            return
    print('No groups ' + str(content['courtCases']))       

In [7]:
c = 0
while True: 
    try: 
        content = generator.__next__()
        assign_to_group(content)
        c += 1
    except StopIteration:
        break

for judg_regex in judgments_groups_regex.keys():
    print("%s: %d" % (judgments_groups_regex[judg_regex][0], len(judgments_groups_regex[judg_regex][1])))
print("Total number of judgments: %d" % c)

No groups [{'caseNumber': 'I BP 12/08'}]
No justification: 89404
No justification: 89408
No justification: 89409
No justification: 89411
No justification: 89422
No justification: 89423
No justification: 89424
No justification: 89425
No justification: 89426
No groups [{'caseNumber': 'III SK 16/08'}]
No groups [{'caseNumber': 'III SK 17/08'}]
No justification: 89431
No groups [{'caseNumber': 'SNO 90/08'}]
No groups [{'caseNumber': 'SNO 93/08'}]
No groups [{'caseNumber': 'SNO 94/08'}]
No justification: 89436
No justification: 89440
No justification: 89441
No justification: 89452
No groups [{'caseNumber': 'III SK 24/08'}]
No groups [{'caseNumber': 'III SK 26/08'}]
No justification: 89439
No justification: 20527
No justification: 89433
No justification: 89434
No justification: 89515
No justification: 89509
No justification: 89513
No justification: 89493
No justification: 89556
No groups [{'caseNumber': 'SNO 85/08'}]
No groups [{'caseNumber': 'SNO 86/08'}]
No groups [{'caseNumber': 'SNO 95/0

No justification: 90775
No justification: 90783
No groups [{'caseNumber': 'III SW 48/09'}]
No justification: 90778
No justification: 90779
No justification: 90780
No justification: 90799
No justification: 90801
No justification: 90802
No groups [{'caseNumber': 'KSP 13/09'}]
No justification: 90811
No groups [{'caseNumber': 'III SK 9/09'}]
No justification: 90816
No justification: 90817
No justification: 90818
No groups [{'caseNumber': 'SNO 58/09'}]
No justification: 90850
No justification: 90851
No justification: 90873
No justification: 90858
No justification: 90859
No justification: 189530
No justification: 15749
No justification: 90870
No groups [{'caseNumber': 'SNO 62/09'}]
No groups [{'caseNumber': 'SNO 63/09'}]
No groups [{'caseNumber': 'SNO 64/09'}]
No justification: 90903
No justification: 90904
No justification: 90909
No justification: 90913
No groups [{'caseNumber': 'III SW 47/09'}]
No groups [{'caseNumber': 'SNO 59/09'}]
No groups [{'caseNumber': 'SNO 60/09'}]
No groups [{'ca

In [8]:
import random

def split_dataset(data, test_ratio=0.25):   
    random.shuffle(data)
    split_num = int(test_ratio * len(data))
    train_data = data[split_num:]
    test_data = data[:split_num]
    return train_data, test_data

In [9]:
import requests 

def send_post_request(sample):
    text = ' '.join(sample)
    r = requests.post("http://192.168.99.100:9200/", data=text.encode('utf-8') )
    r.encoding = 'utf-8'
    return r.text

def process_post_response(text):
    flexed = []
    base = []
    lines = text.splitlines()
    for index, line in enumerate(lines):
        if ':' in line:
            try:
                splitted = line.split(':', 1)[0].split()
                line_before = lines[index-1]
                base.append(splitted[0].lower())
                flexed.append(line_before.split()[0])
            except:
                print(line)
    return flexed, base

In [10]:
%%time

training_x = []
training_y = []
test_x = []
test_y = []
base_training_x= []
base_test_x = []
for key in (r'A?C.*', r'A?U.*', r'A?K.*', r'A?P.*'):
    data = judgments_groups_regex[key][1]
    train_data, test_data = split_dataset(data)
    for sample in train_data:
        flexed, base = process_post_response(send_post_request(sample))
        base_training_x.append(base)
        training_x.append(flexed)
        training_y.append(key)
    for sample in test_data:
        flexed, base = process_post_response(send_post_request(sample))
        base_test_x.append(base)
        test_x.append(flexed)
        test_y.append(key)

Wall time: 2h 10min 56s


In [46]:
print(len(training_x))
print(len(training_y))
print(len(test_x))
print(len(test_y))
print(len(base_training_x))
print(len(base_test_x))

1405
1405
468
468
1405
468


In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer
def prepare_tfidf(training, test):
    vectorizer = TfidfVectorizer()
    training = vectorizer.fit_transform([' '.join(sample) for sample in training])
    test = vectorizer.transform([' '.join(sample) for sample in test])
    return training, test

transformed_training_x, transformed_test_x = prepare_tfidf(training_x, test_x)
transformed_base_training_x, transformed_base_test_x = prepare_tfidf(base_training_x, base_test_x)

print(transformed_training_x.shape)
print(transformed_base_training_x.shape)
print(transformed_test_x.shape)
print(transformed_base_test_x.shape)

(1405, 46198)
(1405, 14861)
(468, 46198)
(468, 14861)


In [83]:
clf_base = svm.SVC(C=100, kernel='rbf', gamma=0.01)
clf_base.fit(transformed_base_training_x, training_y)

clf_flexed = svm.SVC(C=100, kernel='rbf', gamma=0.01)
clf_flexed.fit(transformed_training_x, training_y)

SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.01, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [84]:
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support, confusion_matrix
def make_predition(clf, test_x, test_y, text=""):
    prediction_result = clf.predict(test_x)
#     print(prediction_result)
    print(text)
    print("accuracy " + str(accuracy_score(test_y, prediction_result)))
    print("confusion matrix: \n" + str(confusion_matrix(test_y, prediction_result)))
    print(str(classification_report(test_y, prediction_result, 
                                    target_names=[judgments_groups_regex[key][0] for key in clf.classes_])))
    print("micro-average: " + str(precision_recall_fscore_support(test_y, prediction_result, average='micro')[:-1]))
    print("macro-average: " + str(precision_recall_fscore_support(test_y, prediction_result, average='macro')[:-1]))
    
make_predition(clf_base, transformed_base_test_x, test_y, text="BASE")
make_predition(clf_flexed, transformed_test_x, test_y, text="\nFLEXED") 

BASE
accuracy 0.961538461538
confusion matrix: 
[[368   0   2   0]
 [  0  25   0   0]
 [  9   1  32   0]
 [  3   0   3  25]]
                                            precision    recall  f1-score   support

                            sprawy cywilne       0.97      0.99      0.98       370
                              sprawy karne       0.96      1.00      0.98        25
             sprawy w zakresie prawa pracy       0.86      0.76      0.81        42
sprawy z zakresu ubezpieczenia społecznego       1.00      0.81      0.89        31

                               avg / total       0.96      0.96      0.96       468

micro-average: (0.96153846153846156, 0.96153846153846156, 0.96153846153846156)
macro-average: (0.94870609475872636, 0.89073774235064551, 0.9161773038329255)

FLEXED
accuracy 0.963675213675
confusion matrix: 
[[368   0   2   0]
 [  0  25   0   0]
 [ 10   0  32   0]
 [  4   0   1  26]]
                                            precision    recall  f1-score   support