In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import make_scorer
from tqdm import trange
from utils import *

2024-06-03 01:47:51.944730: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [18]:
# Fraction of the labelled data held out as the test split (passed as test_size).
SPLIT = 0.2
# Largest number of model-selected features tried by the grid-search loop.
max_features = 5
# Threshold handed to drop_colinear; presumably a correlation cut-off — TODO confirm in utils.
th = 0.9

In [30]:
# Load features and labels, then carve out a hold-out set of fraction SPLIT.
# SEED pins the shuffle so the same rows land in each split on every run.
df, y = get_data()
X_train, X_test, y_train, y_test = train_test_split(df, y, random_state=SEED, test_size=SPLIT)

In [31]:
# Columns to discard are decided from the training split only, then removed
# from both splits so they keep identical layouts.
# NOTE(review): this cell rebinds X_train / X_test / y_train / y_test, so it is
# not safe to run twice without re-running the split cell first.
to_drop = drop_colinear(X_train, th)
X_train = X_train.drop(to_drop, axis=1)
X_test = X_test.drop(to_drop, axis=1)
cols = X_train.columns

# Scaling statistics come from the training split and are reused for the test split.
scaler = StandardScaler().fit(X_train)
X_train_base = scaler.transform(X_train)
X_test_base = scaler.transform(X_test)

# Work with the underlying arrays for the labels from here on.
y_train = y_train.values
y_test = y_test.values

In [42]:
# Search space for the SVC grid search.
C_values = [1e-3, 1e-2, 1e-1, 1, 10]
kernel_values = [
    'linear',
    'poly',
    'rbf',
    'sigmoid',
]
# Polynomial degrees; the search loop only iterates these for the 'poly' kernel.
degree_values = [2, 3, 4, 5]
gamma_values = [
    'scale',
    'auto',
]

In [43]:
# Grid search over SVC hyper-parameters on a polynomial expansion of the
# top-`num` features ranked by a random forest, for num = 1..max_features.
# Each evaluated configuration appends one row to `results`.
results = []
for num in trange(1, max_features + 1):
    # Seed the forest so the importance ranking, and therefore the selected
    # columns, is the same on every run.
    selector = SelectFromModel(
        RandomForestClassifier(random_state=SEED),
        max_features=num,
        threshold=-np.inf,  # select purely by rank: keep exactly `num` features
    )
    selector.fit(X_train_base, y_train)

    # The mask and column labels depend only on `num`; compute them once here
    # rather than for every hyper-parameter combination.
    support = selector.get_support()
    selected_cols = cols[support].tolist()

    # Dedicated names keep the X_train / X_test frames from the split cell
    # intact, so earlier cells can still be re-run after this loop.
    poly = PolynomialFeatures()
    X_train_poly = poly.fit_transform(X_train_base[:, support])
    X_test_poly = poly.transform(X_test_base[:, support])

    for C in C_values:
        print(C)
        for kernel in kernel_values:
            # Degree only matters for the polynomial kernel; other kernels get the default of 3.
            for degree in degree_values if kernel == 'poly' else [3]:
                for gamma in gamma_values:
                    model = SVC(C=C, kernel=kernel, degree=degree, gamma=gamma, probability=True, random_state=SEED)
                    model.fit(X_train_poly, y_train)
                    y_pred = model.predict_proba(X_test_poly)
                    # eval_model comes from utils; it returns an (accuracy, metric) pair.
                    # NOTE(review): how it uses the feature count `num` is not visible here.
                    acc, metric = eval_model(y_test, y_pred, num)
                    results.append([num, list(selected_cols), acc, metric, C, kernel, degree, gamma])


0.001
0.01
0.1
1
10
0.001
0.01
0.1
1
10


In [44]:
# Turn the accumulated grid-search rows into a table, one named column per recorded field.
results = pd.DataFrame(
    data=results,
    columns=[
        "num_features",
        "columns",
        "accuracy",
        "metric",
        "C",
        "kernel",
        "degree",
        "gamma",
    ],
)

In [96]:
# Order the grid-search rows by the `metric` column, highest first, and display them.
results = results.sort_values(by="metric", ascending=False)
results

Unnamed: 0,num_features,columns,accuracy,metric,C,kernel,degree,gamma
0,4,"[100, 102, 103, 105]",0.635,7400,10.000,poly,4,auto
2,4,"[100, 102, 103, 105]",0.639,7400,0.100,poly,4,auto
3,4,"[100, 102, 103, 105]",0.635,7400,10.000,poly,4,scale
1,4,"[100, 102, 103, 105]",0.639,7400,0.100,poly,4,scale
4,4,"[100, 102, 103, 105]",0.636,7350,1.000,poly,4,scale
...,...,...,...,...,...,...,...,...
345,5,"[100, 102, 103, 104, 105]",0.491,3900,0.001,poly,3,auto
346,5,"[100, 102, 103, 104, 105]",0.491,3900,0.001,linear,3,auto
347,5,"[100, 102, 103, 104, 105]",0.491,3900,0.001,linear,3,scale
348,5,"[100, 102, 103, 104, 105]",0.513,3850,1.000,poly,5,scale


In [97]:
# Order the grid-search rows by the `accuracy` column, highest first, and display them.
results = results.sort_values(by="accuracy", ascending=False)
results

Unnamed: 0,num_features,columns,accuracy,metric,C,kernel,degree,gamma
29,5,"[100, 102, 103, 104, 105]",0.676,7150,0.01,poly,4,auto
24,5,"[100, 102, 103, 104, 105]",0.676,7150,0.01,poly,4,scale
6,5,"[100, 102, 103, 104, 105]",0.672,7300,0.01,rbf,3,auto
7,5,"[100, 102, 103, 104, 105]",0.672,7300,0.01,rbf,3,scale
14,5,"[100, 102, 103, 104, 105]",0.671,7250,0.01,poly,2,auto
...,...,...,...,...,...,...,...,...
251,1,[100],0.483,4600,10.00,sigmoid,3,auto
252,1,[100],0.483,4600,1.00,sigmoid,3,scale
253,1,[100],0.483,4600,1.00,sigmoid,3,auto
250,1,[100],0.482,4600,0.10,sigmoid,3,scale


In [47]:
# Uncomment to save the grid-search table for the polynomial-interaction run:
# results.to_csv("../results/final_interactions.csv", index=False)

In [48]:
# Grid search over SVC hyper-parameters on the top-`num` features ranked by a
# random forest (no polynomial expansion), for num = 1..max_features.
# Each evaluated configuration appends one row to `results`.
results = []
# Start at 1: max_features=0 would select no columns, and SVC cannot be fitted
# on an empty matrix. The upper bound mirrors the interaction-feature loop.
for num in trange(1, max_features + 1):
    # Seed the forest so the importance ranking, and therefore the selected
    # columns, is the same on every run.
    selector = SelectFromModel(
        RandomForestClassifier(random_state=SEED),
        max_features=num,
        threshold=-np.inf,  # select purely by rank: keep exactly `num` features
    )
    selector.fit(X_train_base, y_train)

    # The mask and column labels depend only on `num`; compute them once here
    # rather than for every hyper-parameter combination.
    support = selector.get_support()
    selected_cols = cols[support].tolist()

    # Dedicated names keep the X_train / X_test frames from the split cell
    # intact, so earlier cells can still be re-run after this loop.
    X_train_sel = X_train_base[:, support]
    X_test_sel = X_test_base[:, support]

    for C in C_values:
        print(C)
        for kernel in kernel_values:
            # Degree only matters for the polynomial kernel; other kernels get the default of 3.
            for degree in degree_values if kernel == 'poly' else [3]:
                for gamma in gamma_values:
                    model = SVC(C=C, kernel=kernel, degree=degree, gamma=gamma, probability=True, random_state=SEED)
                    model.fit(X_train_sel, y_train)
                    y_pred = model.predict_proba(X_test_sel)
                    # eval_model comes from utils; it returns an (accuracy, metric) pair.
                    # NOTE(review): how it uses the feature count `num` is not visible here.
                    acc, metric = eval_model(y_test, y_pred, num)
                    results.append([num, list(selected_cols), acc, metric, C, kernel, degree, gamma])


0.001


0.01
0.1
1
10
0.001
0.01
0.1
1
10


In [49]:
# Turn the accumulated grid-search rows into a table, one named column per recorded field.
results = pd.DataFrame(
    data=results,
    columns=[
        "num_features",
        "columns",
        "accuracy",
        "metric",
        "C",
        "kernel",
        "degree",
        "gamma",
    ],
)

In [99]:
# Order the grid-search rows by the `metric` column, highest first, and display them.
results = results.sort_values(by="metric", ascending=False)
results

Unnamed: 0,num_features,columns,accuracy,metric,C,kernel,degree,gamma
0,5,"[100, 102, 103, 104, 105]",0.677,7300,0.010,rbf,3,auto
4,5,"[100, 102, 103, 104, 105]",0.679,7300,0.001,linear,3,auto
5,5,"[100, 102, 103, 104, 105]",0.587,7300,0.001,poly,2,scale
1,5,"[100, 102, 103, 104, 105]",0.671,7300,0.010,poly,2,scale
3,5,"[100, 102, 103, 104, 105]",0.676,7300,0.010,rbf,3,scale
...,...,...,...,...,...,...,...,...
345,3,"[100, 102, 105]",0.495,4250,1.000,poly,5,auto
346,3,"[100, 102, 105]",0.491,4250,1.000,poly,5,scale
347,3,"[100, 102, 105]",0.491,4250,0.100,poly,5,auto
348,3,"[100, 102, 105]",0.491,4150,1.000,sigmoid,3,auto


In [100]:
# Order the grid-search rows by the `accuracy` column, highest first, and display them.
results = results.sort_values(by="accuracy", ascending=False)
results

Unnamed: 0,num_features,columns,accuracy,metric,C,kernel,degree,gamma
55,5,"[100, 102, 103, 104, 105]",0.679,7100,0.010,sigmoid,3,scale
2,5,"[100, 102, 103, 104, 105]",0.679,7300,0.001,linear,3,scale
4,5,"[100, 102, 103, 104, 105]",0.679,7300,0.001,linear,3,auto
150,5,"[100, 102, 103, 104, 105]",0.678,6800,10.000,poly,2,auto
0,5,"[100, 102, 103, 104, 105]",0.677,7300,0.010,rbf,3,auto
...,...,...,...,...,...,...,...,...
332,1,[100],0.484,4950,10.000,sigmoid,3,scale
333,1,[100],0.482,4950,1.000,sigmoid,3,scale
336,1,[100],0.478,4900,10.000,sigmoid,3,auto
334,1,[100],0.478,4900,1.000,sigmoid,3,auto


In [11]:
# Uncomment to save the grid-search table for the run without interaction terms:
# results.to_csv("../results/final.csv", index=False)

In [21]:
# Display the column labels that drop_colinear flagged for removal.
to_drop

[6, 7, 9, 8]

In [32]:
# Hard-coded feature subset for the final model; it matches the five-column
# selection listed in the grid-search result tables.
# This rebinds `cols` from the pandas column Index set during preprocessing to a plain list.
cols = [100, 102, 103, 104, 105]

In [39]:
# Rebuild the training matrix from the complete labelled set (no hold-out),
# restricted to the chosen columns, and refit the scaler on it.
df, y = get_data()
df = df.drop(to_drop, axis=1)[cols]
scaler = StandardScaler().fit(df)
X_train_base = scaler.transform(df)
y_train = y.values

In [40]:
# Read the space-separated, header-less test matrix, apply the same column
# selection as the training data, and scale it with the already-fitted scaler.
X_test = (
    pd.read_table("../data/x_test.txt", sep=" ", header=None)
    .drop(columns=to_drop)[cols]
)
X_test_base = scaler.transform(X_test)

In [41]:
# Expand both scaled matrices with polynomial terms; the expansion is fitted
# on the training matrix and then applied unchanged to the test matrix.
poly = PolynomialFeatures().fit(X_train_base)
X_train = poly.transform(X_train_base)
X_test = poly.transform(X_test_base)

In [42]:
# Final classifier, fitted on the polynomial expansion of the full training set.
# With kernel="linear", scikit-learn's SVC does not use `degree` or `gamma`.
# probability=True must be set before fitting so that predict_proba is available afterwards.
# NOTE(review): C=0.001 with a linear kernel matches a top row of the results table
# from the search WITHOUT interaction terms, yet the inputs here are
# PolynomialFeatures-expanded — confirm this combination is intended.
model = SVC(C=0.001, kernel="linear", degree=3, gamma="auto", probability=True, random_state=SEED)
model.fit(X_train, y_train)

In [43]:
# Per-class probability estimates for the matrix loaded from x_test.txt;
# column 1 is what the ranking step reads.
y_pred = model.predict_proba(X_test)

In [135]:
# Row positions of the 1000 largest column-1 probabilities, ordered from the
# lowest to the highest of those 1000.
idx = y_pred[:, 1].argsort()[-1000:]
idx

array([ 367, 3657, 2641, 1007, 3852, 4279, 1689, 1324, 3683, 4353, 3001,
       1787, 1068,  632, 1070, 3140,  428, 2222, 3675, 3847, 2075, 4710,
       4624, 1734, 2409, 3327, 2303, 4159, 3165, 3637, 3623, 4260, 3506,
       3120, 2049, 2490, 2876, 3159, 2482, 4252, 1236, 1234, 2874,   71,
       2932, 4322,   65, 2057, 2391, 2179, 3916, 2212,  427,  284, 1521,
       4286, 4899, 3192, 4859,  517, 2673, 4573, 1484, 1338, 1993, 3823,
        357,  488, 3553, 1077, 4131, 3791, 2548, 1155, 1554, 4054, 2639,
       3851, 1097, 1166, 3696, 3830, 3345, 1462, 4094, 3105, 1237, 4142,
       4452, 2190, 3446, 1424, 4407, 1931, 3868, 1890, 4347, 4234, 1940,
        605, 3354, 1897, 4232, 1963,   88, 4112, 1446, 3687, 2527, 1069,
        518, 3334, 1048, 1143,  446, 3301, 4189, 1607, 3743, 2093,  712,
       1255, 3566,  606, 3014, 2722,  703, 3169, 4474, 3908, 2547, 2735,
       3434, 2743, 4477,  777, 3948, 3634, 3666, 3310, 1361, 3280, 1214,
        571, 4643, 1749,  353,  165, 2839, 1022, 34

In [137]:
# Save the selected feature labels, one per line, shifted by one (0-based -> 1-based).
with open("../../313356_vars.txt", "w") as f:
    f.writelines(f"{col+1}\n" for col in cols)

In [138]:
# Save the chosen observation positions, one per line, shifted by one (0-based -> 1-based).
with open("../../313356_obs.txt", "w") as f:
    f.writelines(f"{i+1}\n" for i in idx)