In [27]:
#데이터 핸들링
import pandas as pd
import numpy as np

#전처리
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from statsmodels.stats.outliers_influence import variance_inflation_factor

#모델
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

#하이퍼 파라미터 튜닝: 보통 랜덤서치로 대략 파악한 다음 그리드 서치로 미세조정
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV


#평가
from sklearn.metrics import f1_score

In [8]:
# Data Import
train_data=pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test_data=pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')
sample_submission=pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [9]:
# x,y 정의
y=train_data['target'] # a real disaster (1) or not (0)
x=train_data.iloc[:,1:4]
x.info()
print('\n')
y.value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   keyword   7552 non-null   object
 1   location  5080 non-null   object
 2   text      7613 non-null   object
dtypes: object(3)
memory usage: 178.6+ KB




target
0    4342
1    3271
Name: count, dtype: int64

In [10]:
# location 결측치 너무 많음
print(x.isnull().sum(),'\n')
for i in range(len(x.columns)):
    print(x.columns[i],'의 결측치 비율:', round(x.isnull().sum().iloc[i]/len(y),3))

keyword       61
location    2533
text           0
dtype: int64 

keyword 의 결측치 비율: 0.008
location 의 결측치 비율: 0.333
text 의 결측치 비율: 0.0


In [11]:
#Case1 : 결측치를 전부 결측치가 아닌 공란으로 만들기
x1=x.fillna('unknown')
x1.isnull().sum()

keyword     0
location    0
text        0
dtype: int64

In [16]:
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import DataLoader
import torch

# 1. 트위터 특화 BERT 로드
model_name = "cardiffnlp/twitter-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# 2. 문장 리스트를 CLS 임베딩으로 벡터화하는 함수
def get_cls_batch(texts, batch_size=16, max_len=128):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    all_embeddings = []
    dataloader = DataLoader(texts, batch_size=batch_size)

    for batch in dataloader:
        # 트윗 전처리 (선택적)
        batch = [text.replace("http://", "").replace("https://", "").strip() for text in batch]

        # 토크나이징
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=max_len).to(device)

        with torch.no_grad():
            outputs = model(**inputs)
            cls_embeddings = outputs.last_hidden_state[:, 0, :]  # CLS 토큰 임베딩

        all_embeddings.extend(cls_embeddings.cpu().numpy())

    return all_embeddings

vector1 = get_cls_batch(x1['keyword'].tolist())
vector2 = get_cls_batch(x1['location'].tolist())
vector3 = get_cls_batch(x1['text'].tolist())

In [20]:
keyword=np.stack(vector1)
location=np.stack(vector2)
text=np.stack(vector3)

7613 7613 7613


In [21]:
# Case1-1 : 데이터를 합쳐서 사용
df=np.concatenate([keyword,location,text],axis=1)
print(df.shape)

(7613, 2304)


In [31]:
p=pd.DataFrame(np.cumsum(PCA().fit(df).explained_variance_ratio_))
p[p.iloc[:,0]>0.9]

Unnamed: 0,0
301,0.900293
302,0.900760
303,0.901225
304,0.901685
305,0.902144
...,...
2299,0.999999
2300,0.999999
2301,0.999999
2302,0.999999


In [34]:
pca = PCA(n_components=301)
df= pd.DataFrame(pca.fit_transform(df))
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,300
0,0.014092,-0.123945,-0.142184,-0.425904,0.685301,0.149631,0.353998,0.445765,-0.107565,-0.185683,...,0.027942,-0.022497,0.074955,-0.012075,0.012085,0.026739,0.014626,-0.010231,0.009789,0.007717
1,0.219201,0.029803,-0.187321,-0.340861,-0.064373,-0.309095,-0.012501,-0.141574,-0.089826,-0.212193,...,-0.023303,-0.000163,0.026552,0.003693,-0.065101,-0.002748,-0.004274,0.040424,-0.035756,-0.015860
2,0.166935,0.050866,-0.148126,-0.514017,-0.060950,-0.182907,0.074843,0.140386,0.166474,-0.384613,...,-0.006713,0.008448,-0.010423,-0.002486,0.004433,-0.004818,-0.007612,-0.031186,0.030037,0.011061
3,0.092933,0.034081,-0.151584,-0.492058,0.111674,-0.116132,0.324615,0.349019,-0.054421,-0.337842,...,-0.003601,-0.051723,-0.000733,-0.029129,0.011546,0.001578,0.003999,-0.008043,-0.043656,-0.032592
4,0.280417,0.082046,-0.136604,-0.431632,-0.075906,-0.313430,0.293265,0.065464,-0.364440,-0.027242,...,0.002079,-0.001229,0.055245,-0.034899,-0.019940,-0.000396,-0.010321,0.010310,0.067268,-0.022750
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7608,-0.135857,-0.135019,-0.248022,-0.171471,-0.340348,-0.085507,0.131151,-0.005382,-0.009572,-0.004848,...,0.000275,0.011613,0.000212,-0.024138,0.021211,0.008918,-0.020473,-0.014121,0.008027,0.006852
7609,0.120862,-0.064160,-0.164857,-0.259708,0.100073,0.103671,0.258643,0.200761,-0.272180,-0.245657,...,-0.021092,-0.004242,-0.038088,-0.014086,-0.016554,0.008352,0.009711,0.002904,0.002505,0.004681
7610,-0.488671,-0.229063,-0.255212,-0.154704,-0.114804,0.034344,0.223804,-0.073223,0.009734,0.181741,...,-0.006033,0.020926,0.005501,0.036226,-0.020360,-0.048811,0.026738,-0.075187,-0.041987,-0.049796
7611,0.210056,0.055381,-0.104942,-0.557153,0.002414,-0.232016,-0.059444,-0.001768,-0.271950,-0.059093,...,-0.011622,-0.051627,-0.004642,0.012247,-0.016464,-0.041628,0.036180,0.036378,0.032289,-0.014012


In [35]:
#스케일링 (향후 다른 방법으로 스케일링이 필요할 수도 있습니다.)
df=StandardScaler().fit_transform(df)
df=pd.DataFrame(df)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,300
0,0.033274,-0.371930,-0.470534,-1.491461,2.582176,0.617263,1.637028,2.165637,-0.538849,-1.000218,...,0.803058,-0.646782,2.172337,-0.351186,0.352222,0.781682,0.431089,-0.302140,0.291240,0.231544
1,0.517580,0.089433,-0.619907,-1.193649,-0.242552,-1.275088,-0.057812,-0.687800,-0.449984,-1.143016,...,-0.669735,-0.004673,0.769521,0.107407,-1.897399,-0.080348,-0.125985,1.193748,-1.063761,-0.475853
2,0.394170,0.152636,-0.490198,-1.800019,-0.229657,-0.754533,0.346103,0.682030,0.833953,-2.071790,...,-0.192939,0.242870,-0.302079,-0.072312,0.129205,-0.140845,-0.224367,-0.920947,0.893605,0.331874
3,0.219434,0.102269,-0.501643,-1.723121,0.420783,-0.479072,1.501151,1.695623,-0.272622,-1.819848,...,-0.103489,-1.487013,-0.021248,-0.847174,0.336499,0.046120,0.117856,-0.237529,-1.298783,-0.977871
4,0.662124,0.246201,-0.452068,-1.511519,-0.286008,-1.292972,1.356175,0.318043,-1.825669,-0.146742,...,0.059745,-0.035323,1.601110,-1.014999,-0.581159,-0.011567,-0.304213,0.304461,2.001264,-0.682564
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7608,-0.320787,-0.405159,-0.820788,-0.600468,-1.282411,-0.352735,0.606494,-0.026145,-0.047951,-0.026117,...,0.007902,0.333858,0.006132,-0.702035,0.618201,0.260711,-0.603410,-0.417010,0.238792,0.205596
7609,0.285382,-0.192529,-0.545567,-0.909464,0.377071,0.427666,1.196071,0.975350,-1.363492,-1.323279,...,-0.606187,-0.121954,-1.103882,-0.409670,-0.482474,0.244142,0.286229,0.085761,0.074517,0.140440
7610,-1.153857,-0.687364,-0.844582,-0.541752,-0.432576,0.141675,1.034960,-0.355738,0.048764,0.978982,...,-0.173383,0.601622,0.159423,1.053579,-0.593410,-1.426920,0.788071,-2.220323,-1.249145,-1.494037
7611,0.495987,0.166186,-0.347289,-1.951078,0.009094,-0.957117,-0.274891,-0.008587,-1.362338,-0.318315,...,-0.334003,-1.484259,-0.134549,0.356191,-0.479856,-1.216937,1.066368,1.074252,0.960616,-0.420391


In [36]:
x_train,x_test,y_train,y_test=train_test_split(df,y,train_size=0.7,random_state=123)

x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, train_size=0.85, random_state=123)


* 과적합 확인: 학습데이터 정확도- 검증 데이터 정확도
* 0 ~ 0.05 (0~5%): 과적합 의심 적음, 안정적


In [37]:
# Case1-1-1: 랜덤포레스트

RFmodel=RandomForestClassifier(random_state=123).fit(x_train,y_train)
y_pred1=RFmodel.predict(x_test)

train_score=RFmodel.score(x_train,y_train)
validation_score=RFmodel.score(x_val,y_val)

differ=round((train_score-validation_score),3)


if not 0<=differ<=0.05:
    print(differ,'=> 과적합')
else:
    print(differ,'=> 과적합 아님')


print('f1 score =',round(f1_score(y_test,y_pred1),3)) 

0.226 => 과적합
f1 score = 0.714


In [None]:
#하이퍼 파라미터 튜닝

p = {
    'n_estimators': [100, 300, 500],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 0.5],
    'bootstrap': [True, False]
}
g1=RandomizedSearchCV(RFmodel,param_distributions=p,cv=5,n_iter=50,random_state).fit(x_train,y_train)
g2=GridSearchCV(RFmodel,param_grid=p,cv=5).fit(x_train,y_train)
g1.best_params_
g2.best_params_

In [None]:
RFmodel=RandomForestClassifier(n_estimators=200,max_depth=10,bootstrap=True,max_depth=,min_samples_split,min_samples_leaf=,max_features=).fit(x_train,y_train)
y_pred1=RFmodel.predict(x_test)


train_score=RFmodel.score(x_train,y_train)
validation_score=RFmodel.score(x_val,y_val)

differ=round((train_score-validation_score),3)


if not 0<=differ<=0.05:
    print(differ,'=> 과적합')
else:
    print(differ,'=> 과적합 아님')


print('f1 score =',round(f1_score(y_test,y_pred1),3)) 

In [38]:
# Case1-1-2: NaiveBayes

NBmodel=GaussianNB().fit(x_train,y_train)
y_pred2=NBmodel.predict(x_test)


train_score=NBmodel.score(x_train,y_train)
validation_score=NBmodel.score(x_val,y_val)

differ=round((train_score-validation_score),3)


if not 0<=differ<=0.05:
    print(differ,'=> 과적합')
else:
    print(differ,'=> 과적합 아님')


print('f1 score =',round(f1_score(y_test,y_pred2),3)) 

0.034 => 과적합 아님
f1 score = 0.614


In [40]:
# Case 1-1-3: 로지스틱

LRmodel=LogisticRegression(max_iter=1000).fit(x_train,y_train)
y_pred3=LRmodel.predict(x_test)

train_score=LRmodel.score(x_train,y_train)
validation_score=LRmodel.score(x_val,y_val)

differ=round((train_score-validation_score),3)


if not 0<=differ<=0.05:
    print(differ,'=> 과적합')
else:
    print(differ,'=> 과적합 아님')


print('f1 score =',round(f1_score(y_test,y_pred3),3)) 

0.041 => 과적합 아님
f1 score = 0.778


In [None]:
p={'C':[0.001,0.01,0.1,1],'solver':['lbfgs'],'max_iter':[500,1000,1500],'class_weight': [None, 'balanced']}

g1=RandomizedSearchCV(LRmodel,param_distributions=p,cv=5,n_iter=50,random_state).fit(x_train,y_train)
g2=GridSearchCV(LRmodel,param_grid=p,cv=5).fit(x_train,y_train)
print(g1.best_params_)
print(g2.best_params_)

In [None]:
LRmodel=LogisticRegression(max_iter= 500,C= 0.001,class_weight= None).fit(x_train,y_train)
y_pred3=LRmodel.predict(x_test)

train_score=LRmodel.score(x_train,y_train)
validation_score=LRmodel.score(x_val,y_val)

differ=round((train_score-validation_score),3)


if not 0<=differ<=0.05:
    print(differ,'=> 과적합')
else:
    print(differ,'=> 과적합 아님')


print('f1 score =',round(f1_score(y_test,y_pred3),3)) 

In [41]:
# Case 1-1-4: SVM

SVCmodel=SVC().fit(x_train,y_train)
y_pred4=SVCmodel.predict(x_test)


train_score=SVCmodel.score(x_train,y_train)
validation_score=SVCmodel.score(x_val,y_val)

differ=round((train_score-validation_score),3)


if not 0<=differ<=0.05:
    print(differ,'=> 과적합')
else:
    print(differ,'=> 과적합 아님')


print('f1 score =',round(f1_score(y_test,y_pred4),3)) 

0.127 => 과적합
f1 score = 0.779


In [None]:
#하이퍼 파라미터 튜닝
from sklearn.model_selection import RandomizedSearchCV
p = {'C': [0.1, 1, 10, 100], 'gamma': ['scale', 0.1, 0.01, 0.001], 'kernel': ['rbf', 'linear']}

g1=RandomizedSearchCV(SVCmodel,param_distributions=p,cv=5,n_iter=50,random_state).fit(x_train,y_train)
g2=GridSearchCV(SVCmodel,param_grid=p,cv=5).fit(x_train,y_train)
print(g1.best_params_)
print(g2.best_params_)

In [None]:
SVCmodel=SVC(C=,gamma=,kernel='').fit(x_train,y_train)
y_pred4=SVCmodel.predict(x_test)


train_score=SVCmodel.score(x_train,y_train)
validation_score=SVCmodel.score(x_val,y_val)

differ=round((train_score-validation_score),3)


if not 0<=differ<=0.05:
    print(differ,'=> 과적합')
else:
    print(differ,'=> 과적합 아님')


print('f1 score =',round(f1_score(y_test,y_pred4),3)) 

In [43]:
# Case 1-1-5: 의사결정트리

DTCmodel=DecisionTreeClassifier(random_state=123).fit(x_train,y_train)
y_pred5=DTCmodel.predict(x_test)

train_score=DTCmodel.score(x_train,y_train)
validation_score=DTCmodel.score(x_val,y_val)

differ=round((train_score-validation_score),3)


if not 0<=differ<=0.05:
    print(differ,'=> 과적합')
else:
    print(differ,'=> 과적합 아님')


print('f1 score =',round(f1_score(y_test,y_pred5),3)) 

0.31 => 과적합
f1 score = 0.644


In [None]:
p={'criterion':['gini','entrophy'],'max_depth':range(5,21).tolist,'class_weight':[None,'balanced']}

g1=RandomizedSearchCV(DTCmodel,param_distributions=p,cv=5,n_iter=50,random_state).fit(x_train,y_train)
g2=GridSearchCV(DTCmodel,param_grid=p,cv=5).fit(x_train,y_train)
print(g1.best_params_)
print(g2.best_params_)

In [None]:
DTCmodel=DecisionTreeClassifier(criterion='',max_depth=,class_weight=).fit(x_train,y_train)
y_pred5=DTCmodel.predict(x_test)


train_score=DTCmodel.score(x_train,y_train)
validation_score=DTCmodel.score(x_val,y_val)

differ=round((train_score-validation_score),3)


if not 0<=differ<=0.05:
    print(differ,'=> 과적합')
else:
    print(differ,'=> 과적합 아님')


print('f1 score =',round(f1_score(y_test,y_pred5),3)) 

In [44]:
# Case 1-1-6: XGBooster
XGBmodel= XGBClassifier().fit(x_train,y_train)
y_pred6=XGBmodel.predict(x_test)

train_score=XGBmodel.score(x_train,y_train)
validation_score=XGBmodel.score(x_val,y_val)

differ=round((train_score-validation_score),3)


if not 0<=differ<=0.05:
    print(differ,'=> 과적합')
else:
    print(differ,'=> 과적합 아님')


print('f1 score =',round(f1_score(y_test,y_pred6),3)) 

0.215 => 과적합
f1 score = 0.757


In [None]:
p = {'max_depth': [3, 5, 7],             
    'learning_rate': [0.01, 0.1, 0.2],  
    'n_estimators': [100, 200, 300],   
    'subsample': [0.6, 0.8, 1.0],       
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.3],             
    'min_child_weight': [1, 3, 5],      
    'scale_pos_weight': [1, sum(negative) / sum(positive)]}

g1=RandomizedSearchCV(XGBmodel,param_distributions=p,cv=5,n_iter=50,random_state).fit(x_train,y_train)
g2=GridSearchCV(XGBmodel,param_grid=p,cv=5).fit(x_train,y_train)
print(g1.best_params_)
print(g2.best_params_)

In [None]:
XGBmodel= XGBClasifier(max_depth=,learning_rate=,n_estimators=,subsamle=,colsample_bytree=,gamma=,min_child_weight=,scale_pos_weight=).fit(x_train,y_train)
y_pred6=XGBmodel.predict(x_test)

train_score=XGBmodel.score(x_train,y_train)
validation_score=XGBmodel.score(x_val,y_val)

differ=round((train_score-validation_score),3)


if not 0<=differ<=0.05:
    print(differ,'=> 과적합')
else:
    print(differ,'=> 과적합 아님')


print('f1 score =',round(f1_score(y_test,y_pred6),3)) 