Simple evaluation of the features extracted for each paragraph against the manual classifications

In [None]:
import numpy as np
from matplotlib import pyplot as plt
import matplotlib as mpl
from mpl_toolkits.mplot3d import Axes3D, axes3d
from matplotlib import cm
import pandas as pd
import seaborn as sns
from matplotlib.artist import setp
from matplotlib.ticker import FormatStrFormatter
from IPython.display import set_matplotlib_formats
from matplotlib.collections import PolyCollection
# Global plotting setup: seaborn defaults, the "paper" context and the pastel
# colour codes.
sns.set()
sns.set_context("paper")
sns.set_color_codes("pastel")

# NOTE(review): the figure size is passed through set_context here; confirm it
# actually takes effect (plot_cat_box and the scatter cell set their own size).
sns.set_context({"figure.figsize": (16, 10)})
# NOTE(review): newer matplotlib releases renamed the bundled seaborn styles;
# confirm 'seaborn-white' is still available in the target environment.
plt.style.use('seaborn-white')

In [None]:
def read_classification_file(path: str):
    """Load the manual paragraph classifications from a whitespace-separated file.

    Every valid line holds exactly two fields: a paragraph id and its category.
    An id made of four dash-separated parts (``a-b-c-n``) is normalised to
    ``abc-n``; any other id is kept verbatim.

    Blank lines are skipped silently. Lines with any other field count are
    printed (so they can be fixed in the source file) and ignored.

    Args:
        path: Location of the classification file.

    Returns:
        A ``(lookup, categories)`` tuple where ``lookup`` maps id -> category
        and ``categories`` maps category -> set of ids.
    """
    lookup = {}
    categories = {}
    with open(path) as f:
        for line in f:
            parts = line.split()
            if not parts:
                # Blank line: there is no id to parse (parts[0] would raise).
                continue
            if len(parts) != 2:
                # Malformed line: echo it and move on.
                print(line)
                continue

            raw_id, category = parts
            id_parts = raw_id.split('-')
            if len(id_parts) == 4:
                _id = ''.join(id_parts[:3]) + '-' + id_parts[3]
            else:
                _id = raw_id

            categories.setdefault(category, set()).add(_id)
            lookup[_id] = category

    return lookup, categories

In [None]:
def read_features_file(path: str, classified):
    """Load per-paragraph feature vectors from a whitespace-separated file.

    Each non-blank line is ``<id> <value> <value> ...``. The vector stored for
    a paragraph is ``[n, *values, seen_law]`` where ``n`` is the integer after
    the last dash of the raw id, ``values`` are the remaining fields as floats
    and ``seen_law`` is a per-document flag derived from ``classified``.

    An id made of four dash-separated parts (``a-b-c-n``) is normalised to
    ``abc-n`` and belongs to document ``abc``; any other id is kept verbatim
    and belongs to the shared document ``''``. Blank lines are skipped.

    Args:
        path: Location of the features file.
        classified: Mapping of normalised paragraph id -> category.

    Returns:
        Dict mapping normalised paragraph id -> feature list.

    Raises:
        ValueError: If the id suffix is not an integer or a value is not a
            number.
    """
    out = {}
    # Per-document flag: 1 once a paragraph classified 'law' has been read.
    seen = {}
    with open(path) as f:
        for line in f:
            parts = line.split()
            if not parts:
                # Blank line: there is no id to parse (parts[0] would raise).
                continue

            id_parts = parts[0].split('-')
            doc = ''
            if len(id_parts) == 4:
                doc = ''.join(id_parts[:3])
                _id = doc + '-' + id_parts[3]
            else:
                _id = parts[0]

            features = [int(id_parts[-1])]
            features.extend(float(p) for p in parts[1:])

            if doc in seen and _id in classified:
                # Classified paragraph in a document that already has a flag:
                # record the current flag value.
                features.append(seen[doc])
            else:
                features.append(0)
                if doc not in seen or seen[doc] == 0:
                    if _id in classified and classified[_id] == 'law':
                        seen[doc] = 1
                else:
                    # An unclassified paragraph resets a flag that was 1.
                    # NOTE(review): once reset to 0 the flag can never become 1
                    # again for this document, because later classified
                    # paragraphs take the branch above -- confirm intended.
                    seen[doc] = 0

            out[_id] = features
    return out

In [None]:
# Input files: the manual paragraph labels and the per-paragraph features.
classified_path = '../features/para-classification.txt'
features_path = '../features/out.txt'

# classified_lookup maps id -> label; categories maps label -> set of ids.
classified_lookup, categories = read_classification_file(classified_path)

In [None]:
# Feature vector per paragraph id; the labels are needed for the seen_law flag.
features_lookup = read_features_file(features_path, classified_lookup)

In [None]:
def combine_lookup(features, classified):
    """Fold quote paragraphs into the paragraph that precedes them.

    Paragraphs are visited in the iteration order of ``features``. A paragraph
    whose element at index 2 equals 1 (the quote flag), and which follows
    another paragraph of the same document, is not emitted on its own: its
    values from index 1 onwards are added element-wise to the most recently
    emitted paragraph of that document. Every other paragraph starts a new
    entry, including the first paragraph of each document and a quote that
    opens a document.

    The input lists are copied, so ``features`` is left unmodified.

    Args:
        features: Mapping of paragraph id (``<doc>-<n>``) -> feature list.
        classified: Unused; kept so existing callers keep working.

    Returns:
        Dict mapping the id of each emitted paragraph -> combined feature list.
    """
    out = {}
    prev_doc = None  # document of the previously visited paragraph
    prev = None      # id of the entry that quotes are merged into
    for _id, values in features.items():
        doc = _id.split('-')[0]
        if doc == prev_doc and values[2] == 1:
            # Same document, so `prev` is guaranteed to belong to it as well.
            target = out[prev]
            for i in range(1, len(target)):
                target[i] += values[i]
        else:
            prev = _id
            # Copy so that later merges never touch the caller's list.
            out[_id] = list(values)

        prev_doc = doc

    return out

# Variant of the feature lookup built by combine_lookup, which folds quote
# paragraphs into another paragraph instead of keeping them as separate rows.
combined_lookup = combine_lookup(features_lookup, classified_lookup)

In [None]:
# Keep only paragraphs that have a manual label and append that label as the
# last element of a new list.
features = {x: features_lookup[x] + [classified_lookup[x]] for x in features_lookup if x in classified_lookup}

In [None]:
# Same as above for the combined lookup: labelled paragraphs only, with the
# label appended as the last element.
combined_features = {x: combined_lookup[x] + [classified_lookup[x]] for x in combined_lookup if x in classified_lookup}

In [None]:
# Column names shared by both data frames, in the order of the feature lists:
# the leading position value, the values from the features file, the seen_law
# flag and finally the manual label ('type').
FEATURE_COLUMNS = [
    'pos', 'para_num', 'quote', 'num_tokens', 'num_day', 'num_date',
    'num_time', 'num_month', 'num_cit', 'num_case', 'num_cl', 'num_sec',
    'num_leg', 'num_pin', 'num_ref', 'num_para', 'num_enum', 'num_judg',
    'num_court', 'num_person', 'num_party', 'num_coy', 'num_acn', 'num_abn',
    'num_ent', 'num_money', 'num_percent', 'num_ground', 'num_ex', 'num_pld',
    'num_tscpt', 'num_ab', 'num_decno', 'num_fileno', 'num_report',
    'num_order', 'num_secondary', 'seen_law', 'type',
]

features_df = pd.DataFrame.from_dict(features, orient='index', columns=FEATURE_COLUMNS)
combined_features_df = pd.DataFrame.from_dict(combined_features, orient='index', columns=FEATURE_COLUMNS)

# Earlier column schema, kept for reference:
# features_df = pd.DataFrame.from_dict(features, orient='index', columns=['pos', 'para_num', 'quote', 'num_tokens', 'num_person', 'num_entity', 'num_case', 'num_cit', 'num_sec', 'num_leg', 'num_ref', 'num_para', 'num_clause', 'num_money', 'num_pinpoint', 'num_judge', 'num_date', 'num_acn', 'num_abn', 'num_percent', 'num_time', 'num_order', 'seen_law', 'type'])
# combined_features_df = pd.DataFrame.from_dict(combined_features, orient='index', columns=['pos', 'para_num', 'quote', 'num_tokens', 'num_person', 'num_entity', 'num_case', 'num_cit', 'num_sec', 'num_leg', 'num_ref', 'num_para', 'num_clause', 'num_money', 'num_pinpoint', 'num_judge', 'num_date', 'num_acn', 'num_abn', 'num_percent', 'num_time', 'num_order', 'seen_law', 'type'])

In [None]:
# Display the labelled per-paragraph feature table.
features_df

In [None]:
def num_para(df):
    """Return the highest paragraph number found for each document.

    Index labels of ``df`` are expected to look like ``<doc>-<n>``: the text
    before the first dash is the document key and the second dash-separated
    piece is parsed as an integer.

    Returns:
        Dict mapping document key -> largest ``n`` among its index labels.
    """
    highest = {}
    for label in df.index:
        pieces = label.split('-')
        doc_key = pieces[0]
        para_no = int(pieces[1])
        if doc_key not in highest or para_no > highest[doc_key]:
            highest[doc_key] = para_no
    return highest
# Highest paragraph number per document, for the raw and the combined frames.
doc_paras = num_para(features_df)
comb_doc_paras = num_para(combined_features_df)


def add_rel_pos(df, lookup):
    """Add a ``rel_pos`` column holding each paragraph's relative position.

    For an index label ``<doc>-<n>`` the value is ``n / lookup[doc]``, both
    converted to float. ``df`` is modified in place; nothing is returned.
    """
    def _relative(label):
        pieces = label.split('-')
        total = lookup[pieces[0]]
        return float(pieces[1]) / float(total)

    df['rel_pos'] = [_relative(label) for label in df.index]
    
# Add the rel_pos column to both frames, each scaled by its own per-document
# maximum paragraph number.
add_rel_pos(features_df, doc_paras)
add_rel_pos(combined_features_df, comb_doc_paras)

In [None]:
def calc(df):
    """Count paragraph types overall and per feature with a positive value.

    Returns:
        A ``(comp, overall)`` tuple. ``overall`` maps type -> number of rows.
        ``comp`` maps each column name (except 'heading', 'pos', 'type',
        'para_num' and 'num_tokens') -> a dict of type -> number of rows in
        which that column is greater than zero.
    """
    skipped = ('heading', 'pos', 'type', 'para_num', 'num_tokens')

    per_feature = {}
    for column in df.columns:
        if any(column == name for name in skipped):
            continue
        positive_rows = df[df[column] > 0]
        per_feature[column] = positive_rows['type'].value_counts().to_dict()

    totals = df['type'].value_counts().to_dict()
    return per_feature, totals

# Per-feature and overall type counts for the raw and the combined frames.
comp, overall = calc(features_df)
comp_comb, overall_comb = calc(combined_features_df)

In [None]:
# Rows are types, columns are features: share of each type's paragraphs in
# which the feature is positive (per-feature count divided by the type total).
(pd.DataFrame.from_dict(comp).T / pd.Series(overall)).T

In [None]:
# Same share table (types as rows, features as columns) for the combined frame.
(pd.DataFrame.from_dict(comp_comb).T / pd.Series(overall_comb)).T

In [None]:
def calc_rule_var1(df, overall):
    """Rule variant 1: share of each type matched by the reference counts.

    A row matches when any of ``num_case``, ``num_cit``, ``num_sec`` or
    ``num_leg`` is greater than zero. The per-type match counts are divided by
    the per-type totals in ``overall``; a type without matches yields NaN.
    """
    has_case = df.num_case > 0
    has_cit = df.num_cit > 0
    has_sec = df.num_sec > 0
    has_leg = df.num_leg > 0
    matched = df[has_case | has_cit | has_sec | has_leg]

    hits = matched['type'].value_counts().to_dict()
    return pd.Series(hits) / pd.Series(overall)
    
def calc_rule_var2(df, overall):
    """Rule variant 2: variant 1 plus the judge count.

    A row matches when any of ``num_case``, ``num_cit``, ``num_sec``,
    ``num_leg`` or the judge-count column is greater than zero. The per-type
    match counts are divided by the per-type totals in ``overall``.

    The judge-count column is called ``num_judg`` in the current column schema;
    frames built with the earlier schema (``num_judge``) are accepted as well.
    """
    judge_column = 'num_judg' if 'num_judg' in df.columns else 'num_judge'
    matched = df[
        (df.num_case > 0)
        | (df.num_cit > 0)
        | (df.num_sec > 0)
        | (df.num_leg > 0)
        | (df[judge_column] > 0)
    ]
    hits = matched['type'].value_counts().to_dict()
    return pd.Series(hits) / pd.Series(overall)

def calc_rule_var3(df, overall):
    """Rule variant 3: variant 1 plus the ``seen_law`` flag.

    A row matches when any of ``num_case``, ``num_cit``, ``num_sec``,
    ``num_leg`` or ``seen_law`` is greater than zero. The per-type match counts
    are divided by the per-type totals in ``overall``.
    """
    has_reference = (
        (df.num_case > 0)
        | (df.num_cit > 0)
        | (df.num_sec > 0)
        | (df.num_leg > 0)
    )
    after_law = df.seen_law > 0
    matched = df[has_reference | after_law]

    hits = matched['type'].value_counts().to_dict()
    return pd.Series(hits) / pd.Series(overall)



# Print every rule variant's per-type ratio, first for the raw frame and then
# for the combined frame, separated by a dashed line.
print(calc_rule_var1(features_df, overall))
print('-'*30)
print(calc_rule_var1(combined_features_df, overall_comb))
print('-'*30)
print(calc_rule_var2(features_df, overall))
print('-'*30)
print(calc_rule_var2(combined_features_df, overall_comb))
print('-'*30)
print(calc_rule_var3(features_df, overall))
print('-'*30)
print(calc_rule_var3(combined_features_df, overall_comb))

In [None]:
# Number of paragraphs per value of the seen_law flag.
features_df['seen_law'].value_counts()

In [None]:
# Mean of every column per paragraph type.
features_df.groupby('type').mean()

In [None]:
def plot_cat_box(df, feature: str):
    """Draw one box plot of ``feature`` per paragraph type, side by side.

    The subplots follow the order in which the types first appear in
    ``df['type']``; each subplot is labelled with its type on the x axis.

    Args:
        df: Frame with a 'type' column and a ``feature`` column.
        feature: Name of the column to plot.
    """
    groups = df.groupby('type')
    types = df['type'].unique()
    # squeeze=False keeps `axs` two-dimensional even for a single type;
    # otherwise plt.subplots returns a bare Axes that cannot be indexed.
    fig, axs = plt.subplots(1, len(types), squeeze=False)
    fig.set_size_inches(16, 8)
    for ax, type_name in zip(axs[0], types):
        ax.boxplot(df.loc[groups.groups[type_name]][feature])
        ax.set_xlabel(type_name, fontsize='20', rotation=90)

# Per-type box plots of token count and relative position (raw frame).
plot_cat_box(features_df, 'num_tokens')
plot_cat_box(features_df, 'rel_pos')

In [None]:
# Per-type box plots of token count and relative position (combined frame).
plot_cat_box(combined_features_df, 'num_tokens')
plot_cat_box(combined_features_df, 'rel_pos')

In [None]:
# Number of paragraphs per value of the quote column.
features_df['quote'].value_counts()

In [None]:
# Scatter plot of num_tokens (x) against rel_pos (y), one marker series per
# paragraph type.
# Earlier single-series attempt, kept for reference:
# fig = plt.subplot()
# fig.set_size_inches(16, 8)
# fig.scatter(features_df['num_tokens'], features_df['rel_pos'])
groups = features_df.groupby('type')

fig, ax = plt.subplots()
fig.set_size_inches(16, 8)
ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling
for name, group in groups:
    # linestyle='' draws markers only, so each type becomes a point cloud.
    ax.plot(group.num_tokens, group.rel_pos, marker='o', linestyle='', label=name)
ax.legend()

plt.show()

In [None]:
# Paragraphs whose id contains '2000QCA011' and for which any of num_case,
# num_cit, num_sec or num_leg is positive (the rule variant 1 condition).
features_df[(features_df.index.str.contains('2000QCA011')) & ((features_df.num_case > 0) | (features_df.num_cit > 0) | (features_df.num_sec > 0) | (features_df.num_leg > 0))]

In [None]:
# All labelled paragraphs whose id contains '2000QCA011'.
features_df[features_df.index.str.contains('2000QCA011')]