**변수 선택법**  
label을 결정하는데 있어 중요하지 않은 변수를 제거하는 것

**PCA**  
변수의 개수를 줄이면서도 원래 데이터를 잘 표현할 수 있도록 하는 것

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score
import plotly.express as px

  import pandas.util.testing as tm


In [2]:
# 데이터 불러오기
train_df = pd.read_csv('/content/drive/MyDrive/최종프로젝트_7조/dataset/NJ_Dataset/train_features.csv')
test_df = pd.read_csv('/content/drive/MyDrive/최종프로젝트_7조/dataset/NJ_Dataset/test_features.csv')

train_df = train_df[train_df['label'] != -1]
test_df = test_df[test_df['label'] != -1]

train_df = train_df.set_index('sha256')
test_df = test_df.set_index('sha256')

x_train, y_train = train_df.drop('label', axis=1), train_df['label']
x_test, y_test = test_df.drop('label', axis=1), test_df['label']

In [3]:
# 스케일 조정
scaler = StandardScaler()
x_train = pd.DataFrame(scaler.fit_transform(x_train), columns=x_train.columns)
x_test = pd.DataFrame(scaler.transform(x_test), columns=x_test.columns)

In [4]:
# Random Forest (기본)
model = RandomForestClassifier()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

f1_score(y_test, y_pred), accuracy_score(y_test, y_pred)

(0.9860346590250954, 0.986135)

# 변수 선택법

In [5]:
## 전진 단계별 선택법 : https://zephyrus1111.tistory.com/65
variables = x_train.columns.tolist()  # 설명 변수 리스트

y = y_train.tolist()  # 반응 변수
selected_variables = list()  # 선택된 변수들
sl_enter = 0.05
sl_remove = 0.05

sv_per_step = list()  # 각 스텝별로 선택된 변수들
adjusted_r_squared = list()  # 각 스텝별 수정된 결정계수
steps = list()  # 스텝
step = 0
while len(variables) > 0:
    remainder = list(set(variables) - set(selected_variables))
    pval = pd.Series(index=remainder)  # 변수의 p-value
    # 기존에 포함된 변수와 새로운 변수 하나씩 돌아가면서 
    # 선형 모형을 적합한다.
    for col in remainder: 
        X = x_train[selected_variables + [col]]
        X = sm.add_constant(X)
        model = sm.OLS(y,X).fit()
        pval[col] = model.pvalues[col]
 
    min_pval = pval.min()
    if min_pval < sl_enter:  # 최소 p-value 값이 기준 값보다 작으면 포함
        selected_variables.append(pval.idxmin())
        # 선택된 변수들에대해서
        # 어떤 변수를 제거할지 고른다.
        while len(selected_variables) > 0:
            selected_X = x_train[selected_variables]
            selected_X = sm.add_constant(selected_X)
            selected_pval = sm.OLS(y,selected_X).fit().pvalues[1:]  # 절편항의 p-value는 뺀다
            max_pval = selected_pval.max()
            if max_pval >= sl_remove:  # 최대 p-value값이 기준값보다 크거나 같으면 제외
                remove_variable = selected_pval.idxmax()
                selected_variables.remove(remove_variable)
            else:
                break
        
        step += 1
        steps.append(step)
        adj_r_squared = sm.OLS(y,sm.add_constant(x_train[selected_variables])).fit().rsquared_adj
        adjusted_r_squared.append(adj_r_squared)
        sv_per_step.append(selected_variables.copy())
    else:
        break





In [6]:
# 선택된 변수들
x_train = x_train[selected_variables]
x_test = x_test[selected_variables]

selected_variables

['g_imports',
 'g_symbols',
 'g_has_relocations',
 's_numstrings',
 's_printables',
 'g_has_resources',
 's_entropy',
 'g_has_debug',
 'h_minor_linker_version',
 'g_has_tls',
 'g_has_signature',
 'h_timestamp',
 's_avlength',
 's_MZ',
 'g_exports',
 's_registry',
 'h_major_linker_version',
 'g_size',
 'h_minor_subsystem_version',
 'h_sizeof_headers',
 'h_major_image_version',
 'h_major_operating_system_version',
 's_urls',
 's_paths',
 'h_minor_operating_system_version']

In [7]:
# Random Forest (변수 선택법)
model = RandomForestClassifier()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

f1_score(y_test, y_pred), accuracy_score(y_test, y_pred)

(0.9838112369061475, 0.983935)

# PCA

In [8]:
# 설명력 확인
pca = PCA()
pca.fit(x_train)
exp_var_cumul = np.cumsum(pca.explained_variance_ratio_)
exp_var_cumul

array([0.12347179, 0.20906825, 0.28989579, 0.35024773, 0.40132277,
       0.44853576, 0.49279165, 0.53522984, 0.57612029, 0.61628486,
       0.65626286, 0.6955874 , 0.73298102, 0.76697328, 0.79873819,
       0.82820487, 0.85610692, 0.88212794, 0.9065763 , 0.93067595,
       0.9507259 , 0.96923091, 0.98348886, 0.9973516 , 1.        ])

In [9]:
# 변수의 개수와 설명력 관계 그래프 : 사용할 PCA 수 결정
px.area(
    x=range(1, exp_var_cumul.shape[0] + 1),
    y=exp_var_cumul,
    labels={'x': '변수 개수', 'y': '변수의 설명력'}
)

PCA가 너무 적으면 원래 데이터를 잘 설명하지 못하고 너무 많으면 변수를 축소시키는 의미가 없다.

In [10]:
# 50%를 넘기는 변수 8개를 기준으로 진행
N_COMPONENTS = 8
pca = PCA(n_components=N_COMPONENTS)
pca_train = pca.fit_transform(x_train)
pca_test = pca.transform(x_test)

In [11]:
# PCA(components 9개)의 설명력
sum(pca.explained_variance_ratio_)

0.5349995581217004

In [12]:
# 데이터프레임으로 변환
pd.DataFrame(pca_train, columns=[f'PCA{i+1}' for i in range(N_COMPONENTS)])

Unnamed: 0,PCA1,PCA2,PCA3,PCA4,PCA5,PCA6,PCA7,PCA8
0,0.598167,-0.696391,0.385714,0.021177,-0.694376,0.413482,0.425079,-0.057985
1,0.415171,-1.785572,0.969295,-1.851573,-1.116640,-1.111638,0.611813,0.292474
2,-0.104198,11.868150,17.908060,-2.149997,0.549648,0.674557,2.719409,-0.813886
3,0.819653,-1.734550,0.954151,-1.808556,-1.157341,-1.200440,0.001180,0.107620
4,0.074495,-0.043358,-0.052675,-0.304237,-0.553127,-0.192067,0.406907,-0.190429
...,...,...,...,...,...,...,...,...
599995,-0.399919,1.053042,-0.778125,-0.424486,0.516459,-0.416782,0.230769,-0.612208
599996,-0.273116,-0.579035,0.260567,-0.585059,-0.359482,-0.055326,-0.314781,-0.005606
599997,-0.478335,-0.670449,0.308242,0.619397,-0.322310,0.163106,0.971858,0.099533
599998,-0.427842,-0.616242,0.275682,0.676224,-0.099920,0.224303,0.967554,0.058877


In [13]:
# Random Forest (변수 선택법 + PCA)
model = RandomForestClassifier()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

f1_score(y_test, y_pred), accuracy_score(y_test, y_pred)

(0.9839496221662469, 0.98407)