In [1]:
import pandas as pd
import numpy as np

data = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sub = pd.read_csv('sample_submission.csv')

# FEATRUE ENGINEERING

In [2]:
data['first_party'].value_counts()

United States       154
Illinois              9
Maryland              8
Florida               8
New York              7
                   ... 
David Carpenter       1
Larry Gene Heath      1
PGA TOUR, Inc.        1
PPL Montana, LLC      1
Markman               1
Name: first_party, Length: 2110, dtype: int64

In [3]:
data['second_party'].value_counts()

United States                        240
California                            19
United States of America              15
Illinois                              13
Federal Communications Commission     10
                                    ... 
David Boren, Governor of Oklahoma      1
Federal Bureau of Prisons et al.       1
Town of Harrison                       1
Charles Burr et al.                    1
Westview Instruments, Inc.             1
Name: second_party, Length: 1974, dtype: int64

In [4]:
# 출력 옵션 설정
# pd.set_option("display.max_rows", None)  # 모든 행 표시
pd.set_option("display.max_columns", None)  # 모든 열 표시
pd.set_option("display.width", None)  # 줄 바꿈 없이 전체 내용 표시
data['facts']

0       On June 27, 1962, Phil St. Amant, a candidate ...
1       Ramon Nelson was riding his bike when he suffe...
2       An Alabama state court convicted Billy Joe Mag...
3       Victor Linkletter was convicted in state court...
4       On April 24, 1953 in Selma, Alabama, an intrud...
                              ...                        
2473    Congress amended the Clean Air Act through the...
2474    Alliance Bond Fund, Inc., an investment fund, ...
2475    In 1992, the District Court sentenced Manuel D...
2476    On March 8, 1996, Enrico St. Cyr, a lawful per...
2477    Herbert Markman owns the patent to a system th...
Name: facts, Length: 2478, dtype: object

In [5]:
data.iloc[1]['facts']

'Ramon Nelson was riding his bike when he suffered a lethal blow to the back of his head with a baseball bat. After two eyewitnesses identified Lawrence Owens from an array of photos and then a lineup, he was tried and convicted for Nelson’s death. Because Nelson was carrying cocaine and crack cocaine potentially for distribution, the judge at Owens’ bench trial ruled that Owens was probably also a drug dealer and was trying to “knock [Nelson] off.” Owens was found guilty of first-degree murder and sentenced to 25 years in prison.\nOwens filed a petition for a writ of habeas corpus on the grounds that his constitutional right to due process was violated during the trial. He argued that the eyewitness identification should have been inadmissible based on unreliability and that the judge impermissibly inferred a motive when a motive was not an element of the offense. The district court denied the writ of habeas corpus, and Owens appealed. The U.S. Court of Appeals for the Seventh Circuit

In [6]:
data.iloc[0]['facts']


'On June 27, 1962, Phil St. Amant, a candidate for public office, made a television speech in Baton Rouge, Louisiana.  During this speech, St. Amant accused his political opponent of being a Communist and of being involved in criminal activities with the head of the local Teamsters Union.  Finally, St. Amant implicated Herman Thompson, an East Baton Rouge deputy sheriff, in a scheme to move money between the Teamsters Union and St. Amant’s political opponent. \nThompson successfully sued St. Amant for defamation.  Louisiana’s First Circuit Court of Appeals reversed, holding that Thompson did not show St. Amant acted with “malice.”  Thompson then appealed to the Supreme Court of Louisiana.  That court held that, although public figures forfeit some of their First Amendment protection from defamation, St. Amant accused Thompson of a crime with utter disregard of whether the remarks were true.  Finally, that court held that the First Amendment protects uninhibited, robust debate, rather t

In [2]:
data

Unnamed: 0,ID,first_party,second_party,facts,first_party_winner
0,TRAIN_0000,Phil A. St. Amant,Herman A. Thompson,"On June 27, 1962, Phil St. Amant, a candidate ...",1
1,TRAIN_0001,Stephen Duncan,Lawrence Owens,Ramon Nelson was riding his bike when he suffe...,0
2,TRAIN_0002,Billy Joe Magwood,"Tony Patterson, Warden, et al.",An Alabama state court convicted Billy Joe Mag...,1
3,TRAIN_0003,Linkletter,Walker,Victor Linkletter was convicted in state court...,0
4,TRAIN_0004,William Earl Fikes,Alabama,"On April 24, 1953 in Selma, Alabama, an intrud...",1
...,...,...,...,...,...
2473,TRAIN_2473,"HollyFrontier Cheyenne Refining, LLC, et al.","Renewable Fuels Association, et al.",Congress amended the Clean Air Act through the...,1
2474,TRAIN_2474,"Grupo Mexicano de Desarrollo, S. A.","Alliance Bond Fund, Inc.","Alliance Bond Fund, Inc., an investment fund, ...",1
2475,TRAIN_2475,Peguero,United States,"In 1992, the District Court sentenced Manuel D...",0
2476,TRAIN_2476,Immigration and Naturalization Service,St. Cyr,"On March 8, 1996, Enrico St. Cyr, a lawful per...",0


# TF-IDF

In [2]:
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
# 특수 문자 및 숫자 제거
class TextVectorizer:
    def __init__(self, data):
        self.data = data
        
    def remove_special_characters(self, text):
        text = re.sub(r'\W+', ' ', text)
        text = re.sub(r'\d+', ' ', text)
        return text

    # 소문자 변환
    def convert_to_lowercase(self, text):
        return text.lower()

    # 토큰화
    def tokenize_text(self, text):
        tokens = word_tokenize(text)
        return tokens

    # 불용어 제거
    def remove_stopwords(self, tokens):
        stop_words = set(stopwords.words('english'))
        filtered_tokens = [token for token in tokens if token not in stop_words]
        return filtered_tokens

    # 어간 추출
    # def stem_tokens(tokens):
    #     stemmer = PorterStemmer()
    #     stemmed_tokens = [stemmer.stem(token) for token in tokens]
    #     return stemmed_tokens

    # 표제어 추출
    def lemmatize_tokens(self, tokens):
        lemmatizer = WordNetLemmatizer()
        lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
        return lemmatized_tokens
    
    def run(self):
        def apy(text):
            text = self.remove_special_characters(text)
            text = self.convert_to_lowercase(text)
            tokens = self.tokenize_text(text)
            tokens = self.remove_stopwords(tokens)
            # tokens = stem_tokens(tokens)
            tokens = self.lemmatize_tokens(tokens)
            preprocessed_text = ' '.join(tokens)
            return preprocessed_text
        
        data = self.data.apply(apy)

        return data
        

V = TextVectorizer(data = data['facts'])
data['facts'] = V.run()
V = TextVectorizer(data = test['facts'])
test['facts'] = V.run()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kweon\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kweon\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# TfidfVectorizer 적용
vectorizer = TfidfVectorizer()
vectorizer.fit(data['facts'])

train = vectorizer.transform(data['facts'])
ptest = vectorizer.transform(test['facts'])




In [25]:
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, Pool

X_train, X_val, y_train, y_val = train_test_split(train, data['first_party_winner'], test_size=0.2, random_state=23, shuffle=True, stratify=data['first_party_winner'])
train_data = Pool(data=X_train, label=y_train)
valid_data = Pool(data=X_val, label=y_val)

model_cat = CatBoostClassifier(iterations = 10000, eval_metric='F1',
                               task_type='GPU')
model_cat.fit(train_data, eval_set=valid_data, use_best_model=True, early_stopping_rounds=500, verbose=100)

Learning rate set to 0.025898
0:	learn: 0.8035441	test: 0.7956204	best: 0.7956204 (0)	total: 71.6ms	remaining: 11m 55s
100:	learn: 0.8129430	test: 0.7980416	best: 0.8004866 (32)	total: 6.07s	remaining: 9m 54s
200:	learn: 0.8340183	test: 0.7861557	best: 0.8004866 (32)	total: 11.8s	remaining: 9m 35s
300:	learn: 0.8482315	test: 0.7784731	best: 0.8004866 (32)	total: 17.4s	remaining: 9m 19s
400:	learn: 0.8605942	test: 0.7773585	best: 0.8004866 (32)	total: 23s	remaining: 9m 9s
500:	learn: 0.8739217	test: 0.7735369	best: 0.8004866 (32)	total: 28.6s	remaining: 9m 2s
bestTest = 0.800486618
bestIteration = 32
Shrink model to first 33 iterations.


<catboost.core.CatBoostClassifier at 0x2ae9b52fa60>

In [50]:
pred = model.predict(ptest)

In [53]:
pd.DataFrame(pred)

1    1218
0      22
dtype: int64

In [8]:
from catboost import CatBoostClassifier

model = CatBoostClassifier(loss_function='CrossEntropy')

model.fit(train.toarray(), data['first_party_winner'])

0:	learn: 0.6896416	total: 53.4ms	remaining: 53.4s
1:	learn: 0.6862988	total: 105ms	remaining: 52.5s
2:	learn: 0.6828840	total: 158ms	remaining: 52.6s
3:	learn: 0.6797233	total: 209ms	remaining: 52s
4:	learn: 0.6768517	total: 260ms	remaining: 51.8s
5:	learn: 0.6739597	total: 312ms	remaining: 51.8s
6:	learn: 0.6715038	total: 373ms	remaining: 52.9s
7:	learn: 0.6689507	total: 423ms	remaining: 52.5s
8:	learn: 0.6666376	total: 475ms	remaining: 52.3s
9:	learn: 0.6646623	total: 530ms	remaining: 52.4s
10:	learn: 0.6628060	total: 581ms	remaining: 52.2s
11:	learn: 0.6608170	total: 632ms	remaining: 52s
12:	learn: 0.6586216	total: 683ms	remaining: 51.8s
13:	learn: 0.6568222	total: 736ms	remaining: 51.9s
14:	learn: 0.6549812	total: 787ms	remaining: 51.7s
15:	learn: 0.6533353	total: 836ms	remaining: 51.4s
16:	learn: 0.6519037	total: 888ms	remaining: 51.3s
17:	learn: 0.6504220	total: 940ms	remaining: 51.3s
18:	learn: 0.6491022	total: 994ms	remaining: 51.3s
19:	learn: 0.6477260	total: 1.04s	remaining:

<catboost.core.CatBoostClassifier at 0x1cb3a459460>

In [9]:
pred = model.predict(ptest)

In [5]:
trainset = pd.DataFrame(train.toarray())
trainset['label']= data['first_party_winner']

In [6]:
trainset

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14868,14869,14870,14871,14872,14873,14874,14875,14876,label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2473,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2474,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2475,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2476,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


# autoglone

In [2]:
data = data.drop('ID', axis=1)
data

Unnamed: 0,first_party,second_party,facts,first_party_winner
0,Phil A. St. Amant,Herman A. Thompson,"On June 27, 1962, Phil St. Amant, a candidate ...",1
1,Stephen Duncan,Lawrence Owens,Ramon Nelson was riding his bike when he suffe...,0
2,Billy Joe Magwood,"Tony Patterson, Warden, et al.",An Alabama state court convicted Billy Joe Mag...,1
3,Linkletter,Walker,Victor Linkletter was convicted in state court...,0
4,William Earl Fikes,Alabama,"On April 24, 1953 in Selma, Alabama, an intrud...",1
...,...,...,...,...
2473,"HollyFrontier Cheyenne Refining, LLC, et al.","Renewable Fuels Association, et al.",Congress amended the Clean Air Act through the...,1
2474,"Grupo Mexicano de Desarrollo, S. A.","Alliance Bond Fund, Inc.","Alliance Bond Fund, Inc., an investment fund, ...",1
2475,Peguero,United States,"In 1992, the District Court sentenced Manuel D...",0
2476,Immigration and Naturalization Service,St. Cyr,"On March 8, 1996, Enrico St. Cyr, a lawful per...",0


In [4]:
from autogluon.text import TextPredictor

label = 'first_party_winner'
eval_metric = 'accuracy'
time_limit = 3600 * 3

predictor = TextPredictor(
    label=label, eval_metric=eval_metric
).fit(data, presets='best_quality', time_limit=time_limit)



OSError: [WinError 127] 지정된 프로시저를 찾을 수 없습니다

# predict

In [10]:
sub = pd.read_csv('sample_submission.csv')
sub['first_party_winner'] = pred
sub

Unnamed: 0,ID,first_party_winner
0,TEST_0000,1
1,TEST_0001,0
2,TEST_0002,1
3,TEST_0003,1
4,TEST_0004,1
...,...,...
1235,TEST_1235,1
1236,TEST_1236,0
1237,TEST_1237,1
1238,TEST_1238,1


In [11]:
sub['first_party_winner'].value_counts()

1    1086
0     154
Name: first_party_winner, dtype: int64

In [12]:
sub.to_csv('submit_230613_catboost_original.csv', index=False)