In [1]:
import os
import warnings

import numpy as np
import pandas as pd
from autogluon.tabular import TabularDataset, TabularPredictor

import dacon_law_class as dlc
from dacon_law_class import SimpleOps as so

warnings.filterwarnings('ignore')


 ___________________________
|                           |
|==== DLC Well Imported ====|
|_______28th_Jun_2023_______|



### Training

In [2]:
# Read the three competition files (training set, test set, submission
# template) from the working directory, in that order.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in ('./train.csv', './test.csv', './sample_submission.csv')
)

In [3]:
# Print the schema / non-null summary of both raw frames, separated by a rule
# so the two reports are easy to tell apart in the output.
rule = "\n ------------------------------------ \n"
train.info()
print(rule)
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2478 entries, 0 to 2477
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   ID                  2478 non-null   object
 1   first_party         2478 non-null   object
 2   second_party        2478 non-null   object
 3   facts               2478 non-null   object
 4   first_party_winner  2478 non-null   int64 
dtypes: int64(1), object(4)
memory usage: 96.9+ KB

 ------------------------------------ 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1240 entries, 0 to 1239
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   ID            1240 non-null   object
 1   first_party   1240 non-null   object
 2   second_party  1240 non-null   object
 3   facts         1240 non-null   object
dtypes: object(4)
memory usage: 38.9+ KB


In [4]:
# Run every free-text column of `train` through dlc.law_preprocessor and write
# the cleaned text back into `train`, then drop the ID column.
# NOTE(review): this overwrites `train` in place, so re-running the cell feeds
# already-cleaned text through the preprocessor again — confirm that is harmless.
for text_col in ('facts', 'first_party', 'second_party'):
    # law_preprocessor is called with a one-column frame plus the column name.
    df = dlc.law_preprocessor(pd.DataFrame(train[text_col]), text_col)
    train[text_col] = df[text_col]

train_cleansed = train.drop(columns='ID')
train_cleansed

Unnamed: 0,first_party,second_party,facts,first_party_winner
0,phil st amant,herman thompson,on june 27 1962 phil st amant cidate for publi...,1
1,stephen duncan,lawrence owens,ramon nelson was riding his bike when he suffe...,0
2,billy joe magwood,tony patterson warden,an alabama state court convicted billy joe mag...,1
3,linkletter,walker,victor linkletter was convicted in state court...,0
4,william earl fikes,alabama,on april 24 1953 in selma alabama an intruder ...,1
...,...,...,...,...
2473,hollyfrontier cheyenne refining llc,renewable fuels association,congress amended the clean air act through the...,1
2474,grupo mexicano de desarrollo,alliance bond fund,alliance bond fund an investment fund purchase...,1
2475,peguero,united states,in 1992 the district court sentenced manuel pe...,0
2476,immigration naturalization service,st cyr,on march 1996 enrico st cyr lawful permanent r...,0


In [5]:
# Apply the same dlc.law_preprocessor cleaning to the text columns of `test`,
# writing results back into `test`, then drop the ID column.
# NOTE(review): this overwrites `test` in place, so re-running the cell feeds
# already-cleaned text through the preprocessor again — confirm that is harmless.
for text_col in ('facts', 'first_party', 'second_party'):
    # law_preprocessor is called with a one-column frame plus the column name.
    df = dlc.law_preprocessor(pd.DataFrame(test[text_col]), text_col)
    test[text_col] = df[text_col]

test_cleansed = test.drop(columns='ID')
test_cleansed

Unnamed: 0,first_party,second_party,facts
0,salerno,united states,the 1984 bail reform act allowed the federal c...
1,milberg weiss bershad hynes lerach,lexecon,lexecon was defendant in class action lawsuit ...
2,no 07 582 title federal communications commission,fox television stations,in 2002 2003 fox television stations broadcast...
3,harold kaufman,united states,during his trial for armed robbery of federall...
4,berger,hanlon,in 1993 magistrate judge issued warrant author...
...,...,...,...
1235,haitian centers council,chris sale acting commissioner immigration nat...,according to executive order no 12807 signed b...
1236,whitman,american trucking associations,section 109 of the clean air act requires the ...
1237,linda matteo john madigan,william barr,linda matteo john madigan created plan for uti...
1238,washington state apple advertising commission,hunt,in 1972 the north carolina board of agricultur...


In [6]:
# One-column frame holding only the cleaned training `facts` text.
train_facts = train_cleansed['facts'].to_frame()
train_facts

Unnamed: 0,facts
0,on june 27 1962 phil st amant cidate for publi...
1,ramon nelson was riding his bike when he suffe...
2,an alabama state court convicted billy joe mag...
3,victor linkletter was convicted in state court...
4,on april 24 1953 in selma alabama an intruder ...
...,...
2473,congress amended the clean air act through the...
2474,alliance bond fund an investment fund purchase...
2475,in 1992 the district court sentenced manuel pe...
2476,on march 1996 enrico st cyr lawful permanent r...


In [7]:
# One-column frame holding only the cleaned test `facts` text.
test_facts = test_cleansed['facts'].to_frame()
test_facts

Unnamed: 0,facts
0,the 1984 bail reform act allowed the federal c...
1,lexecon was defendant in class action lawsuit ...
2,in 2002 2003 fox television stations broadcast...
3,during his trial for armed robbery of federall...
4,in 1993 magistrate judge issued warrant author...
...,...
1235,according to executive order no 12807 signed b...
1236,section 109 of the clean air act requires the ...
1237,linda matteo john madigan created plan for uti...
1238,in 1972 the north carolina board of agricultur...


In [8]:
# first_party_berted = dlc.auto_tokenizer(train_cleansed, 'first_party')
# first_party_berted = first_party_berted.rename(columns={0:'first_party_berted'})
# first_party_berted

In [9]:
# second_party_berted = dlc.auto_tokenizer(train_cleansed, 'second_party')
# second_party_berted = second_party_berted.rename(columns={0:'second_party_berted'})
# second_party_berted

In [10]:
# facts_berted = dlc.auto_tokenizer(train_cleansed, 'facts')
# facts_berted = facts_berted.rename(columns={0:'facts_berted'})
# facts_berted

In [11]:
# Load the cached first-party embeddings, pass them through
# dlc.new_tensor_separator on the 'first_party_berted' column, and cast the
# result to float64.
# NOTE(review): new_tensor_separator presumably expands the stored embedding
# column into one numeric column per dimension — confirm in dacon_law_class.
first_party_berted = (
    dlc.new_tensor_separator(
        pd.read_csv('./embeddings/first_party_berted.csv'),
        'first_party_berted',
    )
    .astype('float64')
)

100%|█████████████████████████████████████| 2478/2478 [00:01<00:00, 1945.97it/s]


In [12]:
# Load the cached second-party embeddings, pass them through
# dlc.new_tensor_separator on the 'second_party_berted' column, and cast the
# result to float64.
# NOTE(review): new_tensor_separator presumably expands the stored embedding
# column into one numeric column per dimension — confirm in dacon_law_class.
second_party_berted = (
    dlc.new_tensor_separator(
        pd.read_csv('./embeddings/second_party_berted.csv'),
        'second_party_berted',
    )
    .astype('float64')
)


100%|█████████████████████████████████████| 2478/2478 [00:01<00:00, 1954.88it/s]


In [13]:
# Load the cached `facts` embeddings, pass them through
# dlc.new_tensor_separator on the 'facts_berted' column, and cast the result
# to float64.
# NOTE(review): new_tensor_separator presumably expands the stored embedding
# column into one numeric column per dimension — confirm in dacon_law_class.
facts_berted = (
    dlc.new_tensor_separator(
        pd.read_csv('./embeddings/facts_berted.csv'),
        'facts_berted',
    )
    .astype('float64')
)

100%|█████████████████████████████████████| 2478/2478 [00:01<00:00, 1959.76it/s]


In [14]:
# Build the training matrix: the three embedding blocks side by side, columns
# relabelled "0".."N-1" as strings, with the target appended as the last column.
embedding_blocks = [first_party_berted, second_party_berted, facts_berted]
all_ready_to_ml = pd.concat(embedding_blocks, axis=1)

# Positional string labels; presumably the blocks would otherwise share column
# names — verify against new_tensor_separator's output.
all_ready_to_ml.columns = list(map(str, range(all_ready_to_ml.shape[1])))

# axis=1 concat aligns on the row index, so the label is matched by index.
all_ready_to_ml = pd.concat(
    [all_ready_to_ml, train_cleansed['first_party_winner']],
    axis=1,
)
all_ready_to_ml

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2295,2296,2297,2298,2299,2300,2301,2302,2303,first_party_winner
0,0.005036,0.006439,-0.012888,-0.007303,-0.017571,-0.029552,-0.006318,0.025047,-0.009394,-0.015124,...,-0.018174,0.001737,0.003518,-0.010968,-0.030724,-0.035971,-0.038689,-0.006275,-0.010507,1
1,-0.003902,0.026781,-0.044628,-0.032344,-0.001729,-0.018735,0.002111,-0.018207,-0.009919,-0.075807,...,-0.010969,0.006232,-0.012417,-0.020754,-0.016356,-0.039208,-0.031188,-0.014293,-0.026212,0
2,0.003050,0.009259,-0.017064,-0.019458,-0.006200,-0.008553,0.050010,0.035903,-0.030393,-0.020106,...,-0.011412,-0.006155,-0.010546,-0.024606,-0.025278,-0.054787,-0.016296,-0.009530,-0.012053,1
3,-0.015234,-0.000875,0.015897,-0.018534,0.054608,-0.030493,0.041842,0.057857,-0.013273,-0.019490,...,-0.019637,-0.005936,-0.003335,-0.026111,-0.015403,0.000960,-0.027738,0.005133,-0.012935,0
4,-0.011956,-0.009987,0.020198,0.009508,0.024592,-0.037784,0.020635,-0.023835,-0.029138,-0.031670,...,-0.033741,0.007560,-0.023639,-0.028438,-0.010833,-0.030627,-0.035033,-0.000732,-0.024343,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2473,-0.020591,-0.056972,-0.033879,0.031382,0.079985,0.025763,-0.006569,0.044251,-0.008797,-0.060979,...,-0.027422,0.000112,0.015122,0.009355,-0.046905,-0.002683,-0.008214,-0.000282,-0.009470,1
2474,-0.023678,-0.022937,-0.022214,0.013166,0.054840,0.011287,0.016339,0.090016,0.000314,0.001474,...,-0.017667,0.015822,-0.002527,0.004892,-0.039568,-0.052614,-0.034712,-0.004466,0.008349,1
2475,-0.004295,-0.029256,-0.015316,-0.010641,0.007141,0.021234,0.005180,0.064935,-0.022453,-0.034121,...,-0.011819,0.019265,-0.019696,-0.024496,-0.042203,-0.044682,-0.023756,-0.004682,0.003538,0
2476,0.021019,-0.003120,-0.007031,0.007962,0.045658,0.014659,0.024269,-0.021894,-0.032928,-0.029230,...,-0.023799,0.002132,-0.011824,-0.020932,-0.033988,-0.028924,-0.045331,-0.013245,-0.016124,0


In [15]:
# test_first_party_berted = dlc.auto_tokenizer(test_cleansed, 'first_party')
# test_first_party_berted = test_first_party_berted.rename(columns={0:'first_party_berted'})
# test_first_party_berted

In [16]:
# test_second_party_berted = dlc.auto_tokenizer(test_cleansed, 'second_party')
# test_second_party_berted = test_second_party_berted.rename(columns={0:'second_party_berted'})
# test_second_party_berted

In [17]:
# test_facts_berted = dlc.auto_tokenizer(test_cleansed, 'facts')
# test_facts_berted = test_facts_berted.rename(columns={0:'facts_berted'})
# test_facts_berted

In [18]:
# Load the cached test-set first-party embeddings, pass them through
# dlc.new_tensor_separator on the 'first_party_berted' column (same column
# name as the training file), and cast the result to float64.
test_first_party_berted = (
    dlc.new_tensor_separator(
        pd.read_csv('./embeddings/test_first_party_berted.csv'),
        'first_party_berted',
    )
    .astype('float64')
)

100%|█████████████████████████████████████| 1240/1240 [00:00<00:00, 1980.47it/s]


In [19]:
# Load the cached test-set second-party embeddings, pass them through
# dlc.new_tensor_separator on the 'second_party_berted' column (same column
# name as the training file), and cast the result to float64.
test_second_party_berted = (
    dlc.new_tensor_separator(
        pd.read_csv('./embeddings/test_second_party_berted.csv'),
        'second_party_berted',
    )
    .astype('float64')
)


100%|█████████████████████████████████████| 1240/1240 [00:00<00:00, 1983.49it/s]


In [20]:
# Load the cached test-set `facts` embeddings, pass them through
# dlc.new_tensor_separator on the 'facts_berted' column (same column name as
# the training file), and cast the result to float64.
test_facts_berted = (
    dlc.new_tensor_separator(
        pd.read_csv('./embeddings/test_facts_berted.csv'),
        'facts_berted',
    )
    .astype('float64')
)


100%|█████████████████████████████████████| 1240/1240 [00:00<00:00, 1978.81it/s]


In [21]:
# Build the test matrix in the same block order as the training matrix
# (first party, second party, facts) and relabel the columns "0".."N-1" as
# strings.
test_embedding_blocks = [
    test_first_party_berted,
    test_second_party_berted,
    test_facts_berted,
]
test_ready_to_ml = pd.concat(test_embedding_blocks, axis=1)
test_ready_to_ml.columns = list(map(str, range(test_ready_to_ml.shape[1])))
test_ready_to_ml

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2294,2295,2296,2297,2298,2299,2300,2301,2302,2303
0,-0.006672,-0.080011,-0.014580,-0.038840,0.048455,-0.040639,0.039663,0.068547,0.002239,-0.029425,...,-0.012600,-0.023597,-0.003138,-0.008353,-0.034796,-0.020804,-0.006913,-0.038063,0.008657,-0.021999
1,-0.003053,-0.011769,-0.034803,-0.041324,-0.026498,-0.014561,0.012917,-0.009658,-0.000563,-0.018914,...,-0.018383,-0.027521,0.004810,-0.006279,-0.019910,-0.044318,-0.049594,-0.022413,0.012250,0.003738
2,-0.025638,-0.016005,-0.021478,0.014413,0.048054,-0.036683,-0.034054,0.043429,-0.017806,-0.045973,...,-0.023804,-0.017397,0.008190,-0.010989,-0.014011,-0.041264,-0.009615,-0.012293,-0.000732,0.010219
3,0.028991,0.036599,-0.059128,-0.030722,-0.017587,-0.028222,0.052064,-0.009327,0.006705,-0.008745,...,-0.021711,-0.015069,-0.001111,-0.014473,-0.025876,-0.006428,-0.018803,-0.028281,-0.004001,-0.012030
4,0.036941,-0.039247,-0.026675,-0.006587,-0.022472,-0.016113,0.046348,-0.012971,0.029547,-0.051456,...,-0.016862,-0.034918,0.005033,0.000040,-0.037341,-0.040531,-0.014745,-0.042890,-0.000467,-0.010147
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1235,-0.017606,-0.026507,-0.038503,-0.013017,0.036128,-0.006750,-0.023582,0.004234,-0.010011,-0.024067,...,-0.010572,-0.034447,-0.002972,-0.012302,-0.004949,-0.046846,-0.039191,-0.034175,-0.013973,0.006327
1236,-0.001220,-0.002364,-0.025906,-0.023937,0.002847,-0.021732,0.076602,-0.014772,-0.012813,-0.014318,...,-0.000953,-0.013990,0.002872,-0.030309,-0.013520,-0.053392,-0.012986,-0.014506,-0.011636,0.024630
1237,0.001026,0.023440,0.013377,0.016586,0.024960,-0.001395,0.063505,-0.017101,-0.032935,-0.016052,...,-0.018592,-0.016234,0.018842,-0.002355,-0.015415,-0.033917,-0.057270,-0.028927,-0.005387,0.000249
1238,0.001634,-0.000814,-0.014695,0.003300,0.001420,0.028769,-0.025222,0.065449,-0.011581,-0.044858,...,0.004752,0.003640,0.013376,-0.006307,-0.005089,-0.044533,-0.016824,-0.018459,-0.004400,0.022938


In [22]:
# Split the training matrix into the target series and the feature frame.
# NOTE(review): the cells shown here hand the full frame to AutoGluon, so X / y
# do not appear to be used afterwards — confirm before relying on them.
y = all_ready_to_ml['first_party_winner']
X = all_ready_to_ml.drop(columns=['first_party_winner'])

In [23]:
# Make sure the output directory exists. os.makedirs is used instead of the
# `!mkdir -p` shell escape so the cell also works where no POSIX shell is
# available.
os.makedirs('model', exist_ok=True)

# Fit an AutoGluon tabular predictor on the full training matrix; the label
# column is 'first_party_winner' and artefacts are saved under ./model.
predictor = TabularPredictor(label='first_party_winner', path='model').fit(
    all_ready_to_ml,
    presets=['best_quality'],
)


Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=0, num_bag_folds=8, num_bag_sets=1
Beginning AutoGluon training ...
AutoGluon will save models to "model/"
AutoGluon Version:  0.8.0
Python Version:     3.10.9
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 22.5.0: Mon Apr 24 20:53:44 PDT 2023; root:xnu-8796.121.2~5/RELEASE_ARM64_T8103
Disk Space Avail:   38.62 GB / 245.11 GB (15.8%)
Train Data Rows:    2478
Train Data Columns: 2304
Label Column: first_party_winner
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [1, 0]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generato

In [24]:
# Score the test matrix: hard class labels first, then per-class probabilities.
# NOTE(review): preds_proba is only referenced in a commented-out line in the
# cells shown here.
preds_solid = predictor.predict(data=test_ready_to_ml)
preds_proba = predictor.predict_proba(data=test_ready_to_ml)


In [25]:
# Distribution of the predicted labels.
# NOTE(review): the recorded output is 1228 ones vs 12 zeros, i.e. the model
# predicts class 1 almost everywhere — worth checking the class balance of the
# training labels and the decision threshold.
predicted_label_counts = preds_solid.value_counts()
print(predicted_label_counts)
# preds_proba


1    1228
0      12
Name: first_party_winner, dtype: int64


In [26]:
# Display the submission template (columns: ID, first_party_winner) to check
# the expected output layout before building the real submission.
sample_submission

Unnamed: 0,ID,first_party_winner
0,TEST_0000,0
1,TEST_0001,0
2,TEST_0002,0
3,TEST_0003,0
4,TEST_0004,0
...,...,...
1235,TEST_1235,0
1236,TEST_1236,0
1237,TEST_1237,0
1238,TEST_1238,0


In [27]:
# Pair the template IDs with the predicted labels. The axis=1 concat aligns the
# two series on their row index.
# NOTE(review): assumes sample_submission rows are in the same order as the
# test set — TODO confirm.
submission_columns = [sample_submission['ID'], preds_solid]
final_submission = pd.concat(submission_columns, axis=1)
final_submission

Unnamed: 0,ID,first_party_winner
0,TEST_0000,1
1,TEST_0001,1
2,TEST_0002,1
3,TEST_0003,1
4,TEST_0004,1
...,...,...
1235,TEST_1235,1
1236,TEST_1236,1
1237,TEST_1237,1
1238,TEST_1238,1


In [28]:
# Write the submission file without the index column.
final_submission.to_csv(
    path_or_buf="./full_bert_autogluon_submission.csv",
    index=False,
)