In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Load the training and test sets from the CSV files in the working directory
# (train first, then test).
data_train, data_test = (
    pd.read_csv(csv_path) for csv_path in ("./train.csv", "./test.csv")
)

In [3]:
financial_features = ['Adj Close', 'Close', 'High', 'Low', 'Open']

# First drop every row that has none of the financial features, then sort by
# symbol and date so that each symbol's rows are contiguous and ordered by Date.
data_train = (
    data_train
    .dropna(subset=financial_features, how='all')
    .sort_values(by=['Symbol', 'Date'])
)

# Fill the remaining gaps by linear interpolation *within each symbol*.
# Interpolating over the whole frame would blend the last rows of one symbol
# into the first rows of the next one. limit_direction='both' also fills gaps
# at the very start/end of a symbol's series from that symbol's nearest value.
data_train[financial_features] = (
    data_train
    .groupby('Symbol')[financial_features]
    .transform(lambda prices: prices.interpolate(method='linear', limit_direction='both'))
)

# For the Volume feature we use the per-symbol median over the whole series.
median_volume_by_symbol = data_train.groupby('Symbol')['Volume'].median()
data_train['Volume'] = data_train['Volume'].fillna(
    data_train['Symbol'].map(median_volume_by_symbol)
)

# Report how many missing values remain in each column.
print("Broj nedostajućih vrijednosti")
print(data_train.isnull().sum())


Broj nedostajućih vrijednosti
Date         0
Symbol       0
Adj Close    0
Close        0
High         0
Low          0
Open         0
Volume       0
Target       0
Id           0
dtype: int64


In [4]:
features = ['Adj Close', 'Close', 'High', 'Low', 'Open', 'Volume']
key_columns = ['Date', 'Symbol']

# Remove exact duplicates: same 'Date' and 'Symbol' and identical feature values.
data_train = data_train.drop_duplicates(subset=key_columns + features, keep='first')

# Rows that still share a ('Date', 'Symbol') key now disagree on at least one
# feature; there is no way to tell which one is right, so all of them are dropped.
duplicates_with_variations = data_train[data_train.duplicated(subset=key_columns, keep=False)]
duplicates_to_remove = duplicates_with_variations[
    ~duplicates_with_variations.duplicated(subset=key_columns + features, keep=False)
]
data_train = data_train.drop(duplicates_to_remove.index)

# Fill any remaining missing values by linear interpolation, one symbol at a
# time so values never leak from one symbol's series into another's.
data_train[features] = (
    data_train
    .groupby('Symbol')[features]
    .transform(lambda series: series.interpolate(method='linear', limit_direction='both'))
)


In [5]:
features = ['Adj Close', 'Close', 'High', 'Low', 'Open', 'Volume']

# Negative values are invalid for these features: blank them out so they can
# be re-estimated by interpolation.
for feature in features:
    data_train.loc[data_train[feature] < 0, feature] = float('nan')

# Interpolate within each symbol only; a frame-wide interpolation would fill a
# gap at the edge of one symbol's series using the neighbouring symbol's values.
data_train[features] = (
    data_train
    .groupby('Symbol')[features]
    .transform(lambda series: series.interpolate(method='linear', limit_direction='both'))
)

# A 'High' below the 'Low' of the same row is inconsistent: discard both values.
mask_high_less_low = data_train['High'] < data_train['Low']
data_train.loc[mask_high_less_low, ['High', 'Low']] = float('nan')

# Re-estimate the values blanked out above, again per symbol.
data_train[features] = (
    data_train
    .groupby('Symbol')[features]
    .transform(lambda series: series.interpolate(method='linear', limit_direction='both'))
)

In [6]:
# Persist the cleaned training data to 'data_clean.csv' in the working
# directory; index=False keeps the DataFrame index out of the file.
data_train.to_csv('data_clean.csv', index=False)


In [12]:
# Rows of the test set that contain at least one missing value.
nan_values = data_test[data_test.isna().any(axis=1)]
print("Nan values in test data: ", nan_values.shape[0])

# How many of those rows are missing their 'Date'.
print(nan_values['Date'].isna().sum())

# Select only the numeric columns; means are undefined for the others.
numeric_data = data_test.select_dtypes(include=[float, int])

# Column-wise means of the numeric columns.
mean_values = numeric_data.mean()

# Fill missing values in each numeric column with that column's mean.
numeric_data_filled = numeric_data.fillna(mean_values)

# Re-attach the non-numeric columns, then restore the original column order:
# concat alone would move every numeric column in front of the non-numeric
# ones, so the saved file would no longer match the layout of the input.
data_clean = pd.concat(
    [numeric_data_filled, data_test.select_dtypes(exclude=[float, int])],
    axis=1,
)[data_test.columns.tolist()]




Nan values in test data:  2530
0


In [13]:
# Persist the imputed test set; index=False keeps the DataFrame index out of the file.
data_clean.to_csv('test_clean.csv', index=False)

# Summary statistics of the imputed test set. This must be the last expression
# of the cell, otherwise the notebook discards the result instead of rendering it.
data_clean.describe()