In [1]:
import pandas as pd

import sys  
sys.path.append('../')

import modules.exploratory_data_analysis.base_rates as base_rates
import modules.data_wrangling.feature_selection as feature_selection
import modules.utils.utils as utils
import modules.classification.classifiers as classifiers

# Pre-computations

## Get features classification

In [2]:
fc = feature_selection.features_classification_lists(file_name = r"..\..\data\features_classification.csv")
fc

Group
Court decisions            [CD_JOINT, CD_JO_CF, WIN_SL, WINWIN, CD_FH_AT,...
Defendants's gender                                               [DEFEN_ML]
Facts                      [FT_RP_b, FT_RP, FT_CW_b, FT_CW, FT_CC_b, FT_C...
Facts detail               [FT_RP_SL_b, FT_RP_SL, FT_RP_JN_b, FT_RP_JN, F...
Facts index                     [FT_SL_IN_b, FT_JN_IN_b, FT_SL_IN, FT_JN_IN]
Judge gender                                                      [JUDGE_ML]
Judicial resolution                                    [ID, URL, DATE, YEAR]
Legal norms                                         [LN_JNPREF_b, LN_JNPREF]
Legal principles           [LP_BI_b, LP_BI, LP_PE_b, LP_PE, LP_RA_b, LP_R...
Legal principles detail    [LP_BI_SL_b, LP_BI_SL, LP_BI_JN_b, LP_BI_JN, L...
Plainfiff's gender                                                [PLAIN_ML]
Plainfiff's requests       [RQ_JOINT, RQ_FH_AT, RQ_FH_SP, RQ_MP_AT, RQ_MP...
Name: Short name, dtype: object

## Define main cols to remove

In [3]:
remove_cols = fc["Judicial resolution"] + fc["Court decisions"] + ['JUDGE_ID', 'AUT_COMM', 'HQ']
print(remove_cols)

['ID', 'URL', 'DATE', 'YEAR', 'CD_JOINT', 'CD_JO_CF', 'WIN_SL', 'WINWIN', 'CD_FH_AT', 'CD_FH_SP', 'CD_MP_AT', 'CD_MP_SP', 'JUDGE_ID', 'AUT_COMM', 'HQ']


## Get feature importances

In [4]:
# Load data
df = pd.read_csv("..\..\output\df_cleaned.csv", sep=";")

# Divide by request type
df_request_joint, df_request_joint = base_rates.df_by_group(df, "RQ_JOINT")

In [5]:
# Define function to get feature importances
def get_feature_importances(df):
    hide_cols_model = fc["Judicial resolution"] + fc["Court decisions"] + ['AUT_COMM', 'HQ', 'DEFEN_ML']
    feature_importances = classifiers.rf_classifier(df, "WINWIN", hide_cols_model, printInfo = False)
    feature_importances = classifiers.sort_feature_importances(feature_importances).drop(columns='std')
    feature_importances = feature_importances[~feature_importances['features'].isin(remove_cols)]
    return feature_importances

### All data

In [6]:
feature_importances = get_feature_importances(df)
feature_importances

Unnamed: 0,features,coefficients
0,RQ_JOINT,0.181706
1,FT_RA,0.077591
2,LP_BI,0.066642
3,FT_RD,0.063286
4,FT_CC,0.06034
6,PLAIN_ML,0.05867
7,LP_BI_b,0.053715
8,FT_CC_b,0.045894
9,FT_RA_b,0.032079
10,RQ_MP_SP,0.02997


### Request sole

In [7]:
feature_importances_sole = get_feature_importances(df_request_joint)
feature_importances_sole

Unnamed: 0,features,coefficients
0,LP_BI,0.156832
1,LP_BI_b,0.153141
2,FT_RA,0.136817
3,FT_CC,0.100524
4,FT_CC_b,0.091845
5,FT_RA_b,0.072623
6,FT_RD,0.056845
8,FT_RD_b,0.032685
9,FT_CW,0.020035
10,FT_CR,0.018326


### Request joint

In [8]:
feature_importances_joint = get_feature_importances(df_request_joint)
feature_importances_joint

Unnamed: 0,features,coefficients
0,LP_BI_b,0.156138
1,LP_BI,0.152217
2,FT_RA,0.137072
3,FT_CC,0.096343
4,FT_CC_b,0.09532
5,FT_RA_b,0.068306
6,FT_RD,0.056327
7,FT_RD_b,0.034485
9,FT_CW,0.020565
10,FT_CR,0.017298


# Distance 1 (D1) - All

Euclidean distance considering all the arguments used by the court as well as all the request variables

In [9]:
# Define cols to use
all_cols = utils.flatten_list_of_lists(fc)
D1_cols = [i for i in all_cols if i not in (remove_cols)]

# Visualize used and removed cols
print("The columns REMOVED for distance 1 computation are: \n" + str(remove_cols))
print("\nThe columns USED for distance 1 computation are: \n" + str(D1_cols))

The columns REMOVED for distance 1 computation are: 
['ID', 'URL', 'DATE', 'YEAR', 'CD_JOINT', 'CD_JO_CF', 'WIN_SL', 'WINWIN', 'CD_FH_AT', 'CD_FH_SP', 'CD_MP_AT', 'CD_MP_SP', 'JUDGE_ID', 'AUT_COMM', 'HQ']

The columns USED for distance 1 computation are: 
['DEFEN_ML', 'FT_RP_b', 'FT_RP', 'FT_CW_b', 'FT_CW', 'FT_CC_b', 'FT_CC', 'FT_CR_b', 'FT_CR', 'FT_RA_b', 'FT_RA', 'FT_RD_b', 'FT_RD', 'FT_PD_b', 'FT_PD', 'FT_AG_b', 'FT_AG', 'FT_RP_SL_b', 'FT_RP_SL', 'FT_RP_JN_b', 'FT_RP_JN', 'FT_CW_SL_b', 'FT_CW_SL', 'FT_CW_JN_b', 'FT_CW_JN', 'FT_CC_SL_b', 'FT_CC_SL', 'FT_CC_JN_b', 'FT_CC_JN', 'FT_CR_SL_b', 'FT_CR_SL', 'FT_CR_JN_b', 'FT_CR_JN', 'FT_RA_SL_b', 'FT_RA_SL', 'FT_RA_JN_b', 'FT_RA_JN', 'FT_RD_SL_b', 'FT_RD_SL', 'FT_RD_JN_b', 'FT_RD_JN', 'FT_PD_SL_b', 'FT_PD_SL', 'FT_PD_JN_b', 'FT_PD_JN', 'FT_AG_SL_b', 'FT_AG_SL', 'FT_AG_JN_b', 'FT_AG_JN', 'FT_SL_IN_b', 'FT_JN_IN_b', 'FT_SL_IN', 'FT_JN_IN', 'JUDGE_ML', 'LN_JNPREF_b', 'LN_JNPREF', 'LP_BI_b', 'LP_BI', 'LP_PE_b', 'LP_PE', 'LP_RA_b', 'LP_RA', 'LP

In [10]:
D1_cols_df = pd.DataFrame(D1_cols)
D1_cols_df.to_csv('D1_cols.csv', sep=";", index=False)

# Distance 2 (D2) - Relevant

Weighted euclidean distance with the most relevant variables in the probability of winning.

In [11]:
D2_cols_df = feature_importances
D2_cols_df.to_csv('D2_cols.csv', sep=";", index=False)

In [12]:
D2_cols_df_sole = feature_importances_sole
D2_cols_df_sole.to_csv('D2_cols_sole.csv', sep=";", index=False)

In [13]:
D2_cols_df_joint = feature_importances_joint
D2_cols_df_joint.to_csv('D2_cols_joint.csv', sep=";", index=False)

# Distance 3 (D3) - Top

Weighted euclidean distance with the top 10 most relevant variables in the probability of winning.

In [14]:
D3_cols_df = feature_importances.head(10)
D3_cols_df.to_csv('D3_cols.csv', sep=";", index=False)

In [15]:
D3_cols_df_sole = feature_importances_sole.head(10)
D3_cols_df_sole.to_csv('D3_cols_sole.csv', sep=";", index=False)

In [16]:
D3_cols_df_joint = feature_importances_joint.head(10)
D3_cols_df_joint.to_csv('D3_cols_joint.csv', sep=";", index=False)