In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import warnings
import logging
import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(root, fname)
    for root, _subdirs, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for input_path in input_file_paths:
    print(input_path)

In [None]:
# Install a catch-all "ignore" filter so no warning of any category is shown.
# NOTE(review): this also hides pandas/scikit-learn diagnostics for the whole
# session; consider narrowing it to specific categories.
warnings.simplefilter("ignore")

# Raise the threshold of the 'lightgbm' logger so that only ERROR and above
# are emitted through it.
lightgbm_logger = logging.getLogger('lightgbm')
lightgbm_logger.setLevel(logging.ERROR)

In [None]:
# Read the competition's training and test tables into DataFrames.
train_csv_path = "/kaggle/input/dapprojekt24-1/train.csv"
test_csv_path = "/kaggle/input/dapprojekt24-1/test.csv"

train_data = pd.read_csv(train_csv_path)
test_data = pd.read_csv(test_csv_path)


I believe ensembles are a good way to get accurate predictions. I particularly like the voting principle, so I will set up voting-based ensembles, compare their F1 scores on a held-out validation split, and use the best one for the test set.

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.ensemble import ExtraTreesClassifier


In [None]:
# Candidate ensembles, keyed by the display name used when reporting scores.
# Each value is an unfitted estimator; the evaluation cell fits and scores
# every entry and keeps the best one.
ensembles = {
    'Voting': VotingClassifier(
        estimators=[
            ('lr', LogisticRegression()),
            # Seeded so repeated runs give the same tree (matches the
            # random_state used for the train/validation split).
            ('dt', DecisionTreeClassifier(random_state=42)),
            # Previously registered as 'gnb', which is the usual abbreviation
            # for GaussianNB; the estimator is XGBoost, so name it accordingly.
            ('xgb', XGBClassifier()),
        ],
        # 'hard' voting: the ensemble predicts the majority class label.
        voting='hard',
    ),
}

In [None]:
# Columns used for modelling: the six price/volume inputs plus the label.
all_features = ['Adj Close', 'Close', 'High', 'Low', 'Open', 'Volume', 'Target']
features = [column for column in all_features if column != 'Target']

# Keep only rows in which every selected column is >= 0.
# A comparison against NaN evaluates to False, so this filter also removes
# every row with a missing value. The former `fillna(-1, inplace=True)` that
# followed it was therefore a no-op (and mutated a filtered slice in place),
# so it has been dropped. `.copy()` makes the result an independent frame.
train_data1 = train_data[all_features]
train_data1 = train_data1[(train_data1 >= 0).all(axis=1)].copy()

X = train_data1[features]
y = train_data1['Target']

# Hold out 20% of the cleaned rows for validation; fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Fit every candidate ensemble, score it on the validation split with F1,
# and remember the best-scoring fitted model for the submission step.
results2 = []

best_model2 = None
best_f1_score2 = 0

for name, model in ensembles.items():
    model.fit(X_train, y_train)
    val_predictions = model.predict(X_test)
    # Threshold at 0.5 and cast to int before scoring.
    # NOTE(review): with hard voting the predictions are presumably already
    # 0/1 class labels, which would make this a no-op -- confirm and simplify.
    rounded_predictions = (val_predictions >= 0.5).astype(int)
    f1 = f1_score(y_test, rounded_predictions)
    print(f'{name} - F1 score: {f1}')

    # Always accept the first evaluated model: with the previous strict
    # `f1 > 0` test a model scoring exactly 0 was never selected, leaving
    # best_model2 as None and crashing the later prediction step.
    if best_model2 is None or f1 > best_f1_score2:
        best_model2 = model
        best_f1_score2 = f1

    results2.append({'Model': name, 'F1 Score': f1})

print(f'Best model: {best_model2}')
print(f'Best F1 score: {best_f1_score2}')

In [None]:
# Predict on the competition test set with the selected model and write the
# submission file.
if best_model2 is None:
    raise RuntimeError(
        "No fitted model is available: run the evaluation cell before predicting."
    )

# Use a dedicated name instead of overwriting the validation `X_test`, so the
# evaluation cell can still be re-run against the matching `y_test`.
# Missing values are replaced by the sentinel -1.
X_submission = test_data[features].fillna(-1)
IDs = test_data['Id']

predictions = best_model2.predict(X_submission)

# Rows containing any negative value (including the -1 sentinel for missing
# data) are not trusted to the model: their prediction is forced to 0.
negative_mask = (X_submission < 0).any(axis=1)
predictions[negative_mask] = 0

predictions_df = pd.DataFrame({'Id': IDs, 'TARGET': predictions})

predictions_df.to_csv('/kaggle/working/predictions.csv', index=False)

To show that the model does not simply output the same prediction for every row (apart from the rows with missing or negative values, which are forced to 0), here are some statistics:

In [None]:
def _label_counts(labels):
    """Return (number of labels, how many equal 0, how many equal 1)."""
    return len(labels), np.sum(labels == 0), np.sum(labels == 1)


# Counts over the complete prediction vector.
total_predictions, total_zeros, total_ones = _label_counts(predictions)

# Counts over the rows flagged by negative_mask.
na_predictions = predictions[negative_mask]
na_total, na_zeros, na_ones = _label_counts(na_predictions)

# Counts over the remaining (unflagged) rows.
non_na_predictions = predictions[~negative_mask]
non_na_total, non_na_zeros, non_na_ones = _label_counts(non_na_predictions)

In [None]:
# Summary table: one row per group of predictions, with the group size and
# the number of 0 and 1 labels in it.
stats_rows = [
    ('Total', total_predictions, total_zeros, total_ones),
    ('NA values', na_total, na_zeros, na_ones),
    ('Non-NA values', non_na_total, non_na_zeros, non_na_ones),
]
stats = pd.DataFrame(stats_rows, columns=['Category', 'Total', 'Zeros', 'Ones'])

stats