In [4]:
import pandas as pd

def get_data():

    file_path_train = "artificial_train.data"

    df_train = pd.read_csv(file_path_train, sep=" ", header=None)
    df_train = df_train.drop(df_train.columns[-1], axis=1)

    X_train = df_train

    file_path_test = "artificial_test.data"

    df_test = pd.read_csv(file_path_test, sep=" ", header=None)
    df_test = df_test.drop(df_test.columns[-1], axis=1)

    X_test = df_test

    file_path_train_labels = "artificial_train.labels"

    df_train_labels = pd.read_csv(file_path_train_labels, sep=" ", header=None)

    y_train = df_train_labels

    return X_train, y_train, X_test

X_train, y_train, X_test = get_data()

print(f"Done downloading")


Done downloading


In [5]:
# 2.1 Klasyfikacja przy użyciu RandomForest

import numpy
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score

X_train_raw, y_train_raw, X_test_raw = get_data()
X_train, X_val, y_train, y_val = train_test_split(X_train_raw, y_train_raw, test_size=0.2, random_state=54)

rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train.values.ravel())

y_val_pred_rf = rf_classifier.predict(X_val)

balanced_accuracy_rf = balanced_accuracy_score(y_val, y_val_pred_rf)
print(f"Balanced Accuracy dla ręcznego modelu: {balanced_accuracy_rf}")


Balanced Accuracy dla ręcznego modelu: 0.6539115646258504


In [6]:
# 2.2 Tworzenie komitetu modeli

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.svm import SVC

rf_classifier = RandomForestClassifier(n_estimators=100, random_state=54)
gb_classifier = GradientBoostingClassifier(n_estimators=100, random_state=54)
svm_classifier = SVC(probability=True, random_state=54)

voting_classifier = VotingClassifier(
estimators=[('rf', rf_classifier), ('gb', gb_classifier), ('svm', svm_classifier)],voting='soft'
)

voting_classifier.fit(X_train, y_train.values.ravel())

y_val_pred_voting = voting_classifier.predict(X_val)

balanced_accuracy_voting = balanced_accuracy_score(y_val, y_val_pred_voting)
print(f"Balanced Accuracy dla modelu z komitetu: {balanced_accuracy_voting}")


Balanced Accuracy dla modelu z komitetu: 0.725890356142457


In [7]:
# 2.3 Wybór najważniejszych cech za pomocą RandomForest

from sklearn.feature_selection import SelectFromModel

rf_classifier = RandomForestClassifier(n_estimators=100, random_state=54)
sfm = SelectFromModel(rf_classifier, max_features=50)
X_train_selected = sfm.fit_transform(X_train, y_train.values.ravel())
X_val_selected = sfm.transform(X_val)

rf_classifier_selected = RandomForestClassifier(n_estimators=100, random_state=54)
gb_classifier_selected = GradientBoostingClassifier(n_estimators=100, random_state=54)
svm_classifier_selected = SVC(probability=True, random_state=54)

voting_classifier_selected = VotingClassifier(
    estimators=[('rf', rf_classifier_selected), ('gb', gb_classifier_selected), ('svm', svm_classifier_selected)],
    voting='soft'
)

voting_classifier_selected.fit(X_train_selected, y_train.values.ravel())

y_val_pred_voting_selected = voting_classifier_selected.predict(X_val_selected)

balanced_accuracy_voting_selected = balanced_accuracy_score(y_val, y_val_pred_voting_selected)
print(f"Balanced Accuracy dla modelu z komitetu z wybranymi cechami: {balanced_accuracy_voting_selected}")


Balanced Accuracy dla modelu z komitetu z wybranymi cechami: 0.8004201680672269


In [8]:
# 2.4 Optymalizacja ilości cech

rf_classifier = RandomForestClassifier(n_estimators=100, random_state=54)
gb_classifier = GradientBoostingClassifier(n_estimators=100, random_state=54)
svm_classifier = SVC(probability=True, random_state=54)

max_features_to_try = 200
step = 10

best_balanced_accuracy = 0
best_num_features = 0

for num_features in range(10, max_features_to_try + 1, step):
    sfm = SelectFromModel(rf_classifier, max_features=num_features)
    X_train_selected = sfm.fit_transform(X_train, y_train.values.ravel())
    X_val_selected = sfm.transform(X_val)

    voting_classifier_selected = VotingClassifier(
        estimators=[('rf', rf_classifier), ('gb', gb_classifier), ('svm', svm_classifier)],
        voting='soft'
    )

    voting_classifier_selected.fit(X_train_selected, y_train.values.ravel())

    y_val_pred_voting_selected = voting_classifier_selected.predict(X_val_selected)

    balanced_accuracy_voting_selected = balanced_accuracy_score(y_val, y_val_pred_voting_selected)

    print(f"Num Features: {num_features}, Balanced Accuracy: {balanced_accuracy_voting_selected}")

    if balanced_accuracy_voting_selected > best_balanced_accuracy:
        best_balanced_accuracy = balanced_accuracy_voting_selected
        best_num_features = num_features

print(f"Najlepsza ilość cech: {best_num_features}")
print(f"Balanced Accuracy dla modelu z komitetu z najlepszymi cechami: {best_balanced_accuracy}")


Num Features: 10, Balanced Accuracy: 0.8202280912364945
Num Features: 20, Balanced Accuracy: 0.8401360544217688
Num Features: 30, Balanced Accuracy: 0.825530212084834
Num Features: 40, Balanced Accuracy: 0.8155262104841936
Num Features: 50, Balanced Accuracy: 0.8004201680672269
Num Features: 60, Balanced Accuracy: 0.8053221288515406
Num Features: 70, Balanced Accuracy: 0.7753101240496199
Num Features: 80, Balanced Accuracy: 0.78531412565026
Num Features: 90, Balanced Accuracy: 0.7804121648659463
Num Features: 100, Balanced Accuracy: 0.7903161264505802
Num Features: 110, Balanced Accuracy: 0.7655062024809924
Num Features: 120, Balanced Accuracy: 0.7731592637054823
Num Features: 130, Balanced Accuracy: 0.7560024009603842
Num Features: 140, Balanced Accuracy: 0.7431472589035615
Num Features: 150, Balanced Accuracy: 0.738345338135254
Num Features: 160, Balanced Accuracy: 0.7507002801120448
Num Features: 170, Balanced Accuracy: 0.7507002801120448
Num Features: 180, Balanced Accuracy: 0.7507

In [1]:
# 2.5 Rozszerzona optymalizacja cech

rf_classifier = RandomForestClassifier(n_estimators=100, random_state=54)
gb_classifier = GradientBoostingClassifier(n_estimators=100, random_state=54)
svm_classifier = SVC(probability=True, random_state=54)

assert(best_num_features > 5)

max_features_to_try_second = best_num_features + 5
min_feature_to_try_second = best_num_features - 5
step = 1

best_balanced_accuracy_second = 0
best_num_features_second = 0

for num_features in range(min_feature_to_try_second, max_features_to_try_second + 1, step):
    sfm = SelectFromModel(rf_classifier, max_features=num_features)
    X_train_selected = sfm.fit_transform(X_train, y_train.values.ravel())
    X_val_selected = sfm.transform(X_val)

    voting_classifier_selected = VotingClassifier(
        estimators=[('rf', rf_classifier), ('gb', gb_classifier), ('svm', svm_classifier)],
        voting='soft'
    )

    voting_classifier_selected.fit(X_train_selected, y_train.values.ravel())

    y_val_pred_voting_selected = voting_classifier_selected.predict(X_val_selected)

    balanced_accuracy_voting_selected = balanced_accuracy_score(y_val, y_val_pred_voting_selected)

    print(f"Num Features: {num_features}, Balanced Accuracy: {balanced_accuracy_voting_selected}")

    if balanced_accuracy_voting_selected > best_balanced_accuracy_second:
        best_balanced_accuracy_second = balanced_accuracy_voting_selected
        best_num_features_second = num_features

print(f"Najlepsza ilość cech: {best_num_features_second}")
print(f"Balanced Accuracy dla modelu z komitetu z najlepszymi cechami: {best_balanced_accuracy_second}")


Num Features: 15, Balanced Accuracy: 0.8326830732292917
Num Features: 16, Balanced Accuracy: 0.8301320528211285
Num Features: 17, Balanced Accuracy: 0.8401360544217688
Num Features: 18, Balanced Accuracy: 0.842687074829932
Num Features: 19, Balanced Accuracy: 0.8424869947979192
Num Features: 20, Balanced Accuracy: 0.8401360544217688
Num Features: 21, Balanced Accuracy: 0.8450380152060825
Num Features: 22, Balanced Accuracy: 0.8352340936374549
Num Features: 23, Balanced Accuracy: 0.8353341336534614
Num Features: 24, Balanced Accuracy: 0.8306322529011605
Num Features: 25, Balanced Accuracy: 0.8407362945178072
Najlepsza ilość cech: 21
Balanced Accuracy dla modelu z komitetu z najlepszymi cechami: 0.8450380152060825


In [2]:
# 2.6 Końcowy model

from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier

rf_classifier = RandomForestClassifier(n_estimators=100, random_state=54)
gb_classifier = GradientBoostingClassifier(n_estimators=100, random_state=54)
svm_classifier = SVC(probability=True, random_state=54)
knn_classifier = KNeighborsClassifier(n_neighbors=5)
et_classifier = ExtraTreesClassifier(n_estimators=100, random_state=54)

num_features = best_num_features_second
et_classifier.fit(X_train, y_train.values.ravel())
sfm = SelectFromModel(et_classifier, max_features=num_features)
X_train_selected = sfm.fit_transform(X_train, y_train.values.ravel())
X_val_selected = sfm.transform(X_val)

voting_classifier_selected = VotingClassifier(
    estimators=[
        ('rf', rf_classifier),
        ('gb', gb_classifier),
        ('svm', svm_classifier),
        ('knn', knn_classifier),
        ('et', et_classifier)
    ],
    voting='soft'
)

voting_classifier_selected.fit(X_train_selected, y_train.values.ravel())

y_val_pred_voting_selected = voting_classifier_selected.predict(X_val_selected)

balanced_accuracy_voting_selected = balanced_accuracy_score(y_val, y_val_pred_voting_selected)

print(f"Num Features: {num_features}, Balanced Accuracy: {balanced_accuracy_voting_selected}")


Num Features: 21, Balanced Accuracy: 0.8700480192076832


In [11]:
X_test_selected = sfm.transform(X_test_raw)

test_predictions = voting_classifier_selected.predict(X_test_selected)

test_probabilities = voting_classifier_selected.predict_proba(X_test_selected)
positive_class_probabilities = test_probabilities[:, 1]

with open('predictions.txt', 'w') as file:
    for prob in positive_class_probabilities:
        file.write(f"{prob}\n")
        