In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [2]:
# Definition of the CustomRandomForest class
class CustomRandomForest:
    """Bagging ensemble of regression trees whose predictions are averaged.

    Every tree is a ``DecisionTreeRegressor`` fitted on a bootstrap sample
    (rows drawn with replacement) of the training data.

    Parameters
    ----------
    n_estimators : int
        Number of trees to fit.
    max_samples : int or float
        Size of each bootstrap sample. A float is interpreted as a fraction of
        the number of training rows (truncated with ``int``); any other value
        is used as an absolute row count.
    max_depth : int or None
        Passed through to every ``DecisionTreeRegressor``.
    random_state : int or None, default None
        Seed for the bootstrap sampling and for the trees. ``None`` keeps the
        previous behaviour: samples come from NumPy's global random stream and
        the trees are left unseeded.
    """

    def __init__(self, n_estimators, max_samples, max_depth, random_state=None):
        self.n_estimators = n_estimators
        self.max_samples = max_samples
        self.max_depth = max_depth
        self.random_state = random_state
        self.models = []

    @staticmethod
    def _take_rows(data, indices):
        """Select rows by position from a pandas object or an array-like."""
        if hasattr(data, "iloc"):
            return data.iloc[indices]
        return np.asarray(data)[indices]

    def fit(self, X, y):
        """Fit ``n_estimators`` trees on bootstrap samples of ``(X, y)``.

        Any previously fitted trees are discarded, so calling ``fit`` again
        retrains the ensemble instead of growing it.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If the resolved bootstrap sample size is smaller than 1.
        """
        n_samples = X.shape[0]

        # The sample size does not change between trees, so resolve it once.
        if isinstance(self.max_samples, float):
            sample_size = int(self.max_samples * n_samples)
        else:
            sample_size = self.max_samples
        if sample_size < 1:
            raise ValueError(
                f"max_samples={self.max_samples!r} gives a bootstrap sample of "
                f"{sample_size} rows; at least 1 is required."
            )

        # Unseeded: keep drawing from the global NumPy stream (np.random.seed
        # still applies). Seeded: use a private, reproducible generator.
        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)

        trees = []
        for _ in range(self.n_estimators):
            sample_indices = rng.choice(n_samples, sample_size, replace=True)
            tree_kwargs = {"max_depth": self.max_depth}
            if self.random_state is not None:
                # Seed each tree as well so tie-breaking between equally good
                # splits is reproducible.
                tree_kwargs["random_state"] = int(rng.randint(np.iinfo(np.int32).max))
            tree = DecisionTreeRegressor(**tree_kwargs)
            tree.fit(self._take_rows(X, sample_indices), self._take_rows(y, sample_indices))
            trees.append(tree)

        # Replace (not extend) the ensemble only after every tree was fitted.
        self.models = trees
        return self

    def predict(self, X):
        """Return the mean of the individual tree predictions for ``X``.

        Raises
        ------
        RuntimeError
            If the ensemble has not been fitted yet.
        """
        if not self.models:
            raise RuntimeError("CustomRandomForest must be fitted before calling predict().")
        predictions = [tree.predict(X) for tree in self.models]
        return np.mean(predictions, axis=0)

In [4]:
# Load the training dataset from disk and display it
df = pd.read_csv("data.csv")
df

Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,6.0,8.16,65.0,0.1,584.259210,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,64.0,430,...,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.470,9.9
3,Afghanistan,2012,Developing,59.5,272.0,69,0.01,78.184215,67.0,2787,...,67.0,8.52,67.0,0.1,669.959000,3696958.0,17.9,18.0,0.463,9.8
4,Afghanistan,2011,Developing,59.2,275.0,71,0.01,7.097109,68.0,3013,...,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2713,Zimbabwe,2004,Developing,44.3,723.0,27,4.36,0.000000,68.0,31,...,67.0,7.13,65.0,33.6,454.366654,12777511.0,9.4,9.4,0.407,9.2
2714,Zimbabwe,2003,Developing,44.5,715.0,26,4.06,0.000000,7.0,998,...,7.0,6.52,68.0,36.7,453.351155,12633897.0,9.8,9.9,0.418,9.5
2715,Zimbabwe,2002,Developing,44.8,73.0,25,4.43,0.000000,73.0,304,...,73.0,6.53,71.0,39.8,57.348340,125525.0,1.2,1.3,0.427,10.0
2716,Zimbabwe,2001,Developing,45.3,686.0,25,1.72,0.000000,76.0,529,...,76.0,6.16,75.0,42.1,548.587312,12366165.0,1.6,1.7,0.427,9.8


In [5]:
# Count the missing values in every column
df.isna().sum()

Unnamed: 0,0
Country,0
Year,0
Status,0
Life expectancy,0
Adult Mortality,0
infant deaths,0
Alcohol,160
percentage expenditure,0
Hepatitis B,531
Measles,0


In [None]:
# Find the numeric columns and replace their missing values with the column mean.
# NOTE(review): the means are computed over all rows, i.e. before the
# train/validation/test split below, so validation and test rows influence the
# imputed values.
numeric_columns = df.select_dtypes(include=['number']).columns
df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].mean())

# One-hot encode Status. get_dummies removes the original 'Status' column, so
# the guard keeps this cell re-runnable instead of raising KeyError the second time.
if 'Status' in df.columns:
    df = pd.get_dummies(df, columns=['Status'], drop_first=True)

In [9]:
# Convert every object-typed (categorical) feature to integer category codes
object_columns = df.select_dtypes(include=['object']).columns
for column_name in object_columns:
    as_category = df[column_name].astype('category')
    df[column_name] = as_category.cat.codes

# Fill missing values in the 'age' column with its median (only if the column exists)
if 'age' in df.columns:
    age_median = df['age'].median()
    df['age'] = df['age'].fillna(age_median)

In [10]:
# Replace any remaining missing values with the sentinel -1
df = df.replace(to_replace=np.nan, value=-1)

In [11]:
# Features: everything except the target, the identifier columns and any
# leftover CSV index column ("Unnamed: 0"). That column is only the row number
# of the source file, so it carries no information that transfers to other data.
index_artifacts = [col for col in df.columns if str(col).startswith('Unnamed')]
X = df.drop(columns=['Life expectancy', 'Country', 'Year'] + index_artifacts)
y = df['Life expectancy']

# Split into training (80 %) and a temporary hold-out set (20 %)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Split the hold-out set in half: validation and test
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [12]:
# Report the shape of each split
print("Rozměry dat:")
shapes_line = f"Trénovací: {X_train.shape}, Validační: {X_val.shape}, Testovací: {X_test.shape}"
print(shapes_line)

Rozměry dat:
Trénovací: (2174, 19), Validační: (272, 19), Testovací: (272, 19)


In [13]:
# Candidate models, keyed by the display name used in the report below
models = {}
models["Linear Regression"] = LinearRegression()
models["Ridge Regression"] = Ridge(alpha=1.0)
models["Custom Random Forest"] = CustomRandomForest(n_estimators=10, max_samples=0.8, max_depth=5)

# Fit every model on the training split and score it on the validation split;
# each entry of `results` is (name, validation RMSE, validation MAE).
results = []
for name in models:
    model = models[name]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    mae = mean_absolute_error(y_val, y_pred)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    results.append((name, rmse, mae))
    print(f"{name} -> RMSE: {rmse:.2f}, MAE: {mae:.2f}")

Linear Regression -> RMSE: 4.33, MAE: 3.19
Ridge Regression -> RMSE: 4.33, MAE: 3.19


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


Custom Random Forest -> RMSE: 2.74, MAE: 2.02


In [14]:
# Evaluate the best model on the held-out test split.
# The winner is the entry of `results` (name, validation RMSE, validation MAE)
# with the lowest validation RMSE, so the choice always follows the validation
# scores instead of being hard-coded.
best_name = min(results, key=lambda row: row[1])[0]
best_model = models[best_name]
y_test_pred = best_model.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
test_mae = mean_absolute_error(y_test, y_test_pred)
print(f"\nNejlepší model: {best_name} -> Testovací RMSE: {test_rmse:.2f}, MAE: {test_mae:.2f}")


Nejlepší model: Custom Random Forest -> Testovací RMSE: 2.34, MAE: 1.70


In [16]:
# Load the evaluation dataset (rows to predict)
eval_df = pd.read_csv("evaluation.csv")

In [17]:
# Preprocess the evaluation data with the same steps used for the training data.

# 1) Numeric gaps: the training frame was mean-imputed, so fill with the same
#    column means rather than -1.
#    NOTE(review): this assumes `df` went through the mean-imputation cell
#    above; imputing with the mean leaves a column's mean unchanged, so
#    df[...].mean() then reproduces the fill values used in training.
eval_numeric_columns = eval_df.select_dtypes(include=['number']).columns
shared_numeric = [
    col for col in eval_numeric_columns
    if col in df.columns and pd.api.types.is_numeric_dtype(df[col])
]
if shared_numeric:
    eval_df[shared_numeric] = eval_df[shared_numeric].fillna(df[shared_numeric].mean())

# 2) Status: rebuild the one-hot columns the model was trained on
#    (get_dummies names them 'Status_<value>'). Without this they would be
#    missing here and end up as a constant 0 for every row.
if 'Status' in eval_df.columns:
    for col in X_train.columns:
        if str(col).startswith('Status_'):
            status_value = col[len('Status_'):]
            eval_df[col] = (eval_df['Status'] == status_value).astype(X_train[col].dtype)

# 3) Remaining object columns -> integer category codes
object_columns = eval_df.select_dtypes(include=['object']).columns
for col in object_columns:
    eval_df[col] = eval_df[col].astype('category').cat.codes

# 4) Anything still missing -> sentinel -1
eval_df = eval_df.replace(np.nan, -1)

In [20]:
# Report training features that the evaluation data lacks: they are filled with
# a constant 0 below, which can hide a preprocessing mismatch.
missing_features = [col for col in X_train.columns if col not in eval_df.columns]
if missing_features:
    print(f"Warning: evaluation data lacks {missing_features}; filling with 0")

# Keep exactly the training features, in training order; this drops surplus
# columns and creates the missing ones filled with 0.
eval_df = eval_df.reindex(columns=X_train.columns, fill_value=0)

# Predict on the evaluation data
evaluation_X = eval_df
evaluation_predictions = best_model.predict(evaluation_X)

# The identifier columns were encoded/dropped above, so read them again from
# the file, once and only those two columns.
identifiers = pd.read_csv("evaluation.csv", usecols=["Country", "Year"])
results_df = pd.DataFrame({
    "Country": identifiers["Country"],
    "Year": identifiers["Year"],
    "Life expectancy": evaluation_predictions
})

# Save the predictions to CSV
results_df.to_csv("results.csv", index=False)
