In [1]:
# Імпортуємо необхідні бібліотеки
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler


In [9]:

# Generate a synthetic, military-themed test dataset.
# Seeding the stdlib RNG makes every draw below reproducible.
random.seed(42)

# Number of records to generate
n = 100


def _random_ints(low, high, size):
    """Return `size` integers drawn with random.randint(low, high), bounds inclusive."""
    return [random.randint(low, high) for _ in range(size)]


# Pool of categorical values for the 'rank' column
ranks = ['Private', 'Sergeant', 'Lieutenant', 'Captain', 'Major', 'Colonel']

# The columns are drawn one after another from the same seeded generator,
# so the order of these statements determines the generated values.
missions_completed = _random_ints(0, 30, n)
years_of_service = _random_ints(1, 25, n)
age = _random_ints(20, 55, n)
salary = _random_ints(25000, 80000, n)
rank = [random.choice(ranks) for _ in range(n)]

# Assemble the DataFrame (column order differs from the generation order)
columns = {
    'age': age,
    'years_of_service': years_of_service,
    'missions_completed': missions_completed,
    'salary': salary,
    'rank': rank,
}
military_data = pd.DataFrame(columns)

# Show the initial dataset
print("Початковий тестовий набір даних на військову тематику:")
display(military_data)


Початковий тестовий набір даних на військову тематику:


Unnamed: 0,age,years_of_service,missions_completed,salary,rank
0,31,7,20,28315,Sergeant
1,52,22,3,67738,Lieutenant
2,26,9,0,60427,Private
3,39,23,23,79789,Sergeant
4,52,22,8,25967,Lieutenant
...,...,...,...,...,...
95,35,24,26,29801,Captain
96,24,24,11,60234,Colonel
97,48,9,5,38969,Colonel
98,55,17,11,58153,Sergeant


In [10]:

# 1. Data transformation: handling missing values
# For demonstration purposes, blank out a few values in 'missions_completed'.
# The guard keeps the cell idempotent: the row selection below draws from the
# global `random` state, so without it every re-run of this cell would knock
# out five more rows of the same frame.
if not military_data['missions_completed'].isna().any():
    # Cast to float before writing NaN: assigning NaN into an int64 column
    # relies on silent upcasting, which recent pandas versions deprecate.
    military_data['missions_completed'] = military_data['missions_completed'].astype(float)
    military_data.loc[random.sample(range(n), 5), 'missions_completed'] = np.nan

display(military_data)


Unnamed: 0,age,years_of_service,missions_completed,salary,rank
0,31,7,20.0,28315,Sergeant
1,52,22,3.0,67738,Lieutenant
2,26,9,0.0,60427,Private
3,39,23,23.0,79789,Sergeant
4,52,22,,25967,Lieutenant
...,...,...,...,...,...
95,35,24,26.0,29801,Captain
96,24,24,11.0,60234,Colonel
97,48,9,5.0,38969,Colonel
98,55,17,11.0,58153,Sergeant


In [11]:

# Fill the gaps with the column mean.
# The fill is restricted to 'missions_completed' via a dict: a bare scalar
# passed to DataFrame.fillna would also be written into NaNs of every other
# column (e.g. 'salary' or 'rank'), where the missions mean is meaningless.
missions_mean = military_data['missions_completed'].mean()
military_data_filled = military_data.fillna({'missions_completed': missions_mean})

print("\nНабір даних після заповнення пропусків середніми значеннями:")
display(military_data_filled)



Набір даних після заповнення пропусків середніми значеннями:


Unnamed: 0,age,years_of_service,missions_completed,salary,rank
0,31,7,20.000000,28315,Sergeant
1,52,22,3.000000,67738,Lieutenant
2,26,9,0.000000,60427,Private
3,39,23,23.000000,79789,Sergeant
4,52,22,13.189474,25967,Lieutenant
...,...,...,...,...,...
95,35,24,26.000000,29801,Captain
96,24,24,11.000000,60234,Colonel
97,48,9,5.000000,38969,Colonel
98,55,17,11.000000,58153,Sergeant


In [12]:

# 2. Data normalization: add z-score columns for 'salary' and 'missions_completed'
# Each value is centred on the column mean and divided by the column's
# standard deviation (pandas' Series.std, i.e. the sample estimate by default).
for column in ('salary', 'missions_completed'):
    values = military_data_filled[column]
    centred = values - values.mean()
    military_data_filled[f'{column}_norm'] = centred / values.std()

print("\nНабір даних після нормалізації:")
display(military_data_filled)



Набір даних після нормалізації:


Unnamed: 0,age,years_of_service,missions_completed,salary,rank,salary_norm,missions_completed_norm
0,31,7,20.000000,28315,Sergeant,-1.411568,7.991274e-01
1,52,22,3.000000,67738,Lieutenant,1.107741,-1.195603e+00
2,26,9,0.000000,60427,Private,0.640534,-1.547615e+00
3,39,23,23.000000,79789,Sergeant,1.877854,1.151139e+00
4,52,22,13.189474,25967,Lieutenant,-1.561616,2.084326e-16
...,...,...,...,...,...,...,...
95,35,24,26.000000,29801,Captain,-1.316606,1.503150e+00
96,24,24,11.000000,60234,Colonel,0.628201,-2.569065e-01
97,48,9,5.000000,38969,Colonel,-0.730729,-9.609291e-01
98,55,17,11.000000,58153,Sergeant,0.495216,-2.569065e-01


In [13]:

# 3. Categorical encoding: one-hot encode the 'rank' column
# get_dummies replaces 'rank' with one indicator column per distinct rank
# value, named 'rank_<value>'.
categorical_columns = ['rank']
military_data_transformed = pd.get_dummies(
    military_data_filled,
    columns=categorical_columns,
)

print("\nНабір даних після перетворення категоріальних змінних (One-Hot Encoding):")
display(military_data_transformed)



Набір даних після перетворення категоріальних змінних (One-Hot Encoding):


Unnamed: 0,age,years_of_service,missions_completed,salary,salary_norm,missions_completed_norm,rank_Captain,rank_Colonel,rank_Lieutenant,rank_Major,rank_Private,rank_Sergeant
0,31,7,20.000000,28315,-1.411568,7.991274e-01,False,False,False,False,False,True
1,52,22,3.000000,67738,1.107741,-1.195603e+00,False,False,True,False,False,False
2,26,9,0.000000,60427,0.640534,-1.547615e+00,False,False,False,False,True,False
3,39,23,23.000000,79789,1.877854,1.151139e+00,False,False,False,False,False,True
4,52,22,13.189474,25967,-1.561616,2.084326e-16,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
95,35,24,26.000000,29801,-1.316606,1.503150e+00,True,False,False,False,False,False
96,24,24,11.000000,60234,0.628201,-2.569065e-01,False,True,False,False,False,False
97,48,9,5.000000,38969,-0.730729,-9.609291e-01,False,True,False,False,False,False
98,55,17,11.000000,58153,0.495216,-2.569065e-01,False,False,False,False,False,True


In [14]:

# 4. Feature scaling (Min-Max) for 'years_of_service' and 'salary'
# The scaler is fitted and applied in one step; the results are stored in new
# '*_scaled' columns next to the untouched originals.
scaler = MinMaxScaler()

source_columns = ['years_of_service', 'salary']
scaled_columns = ['years_of_service_scaled', 'salary_scaled']

scaled_values = scaler.fit_transform(military_data_transformed[source_columns])
military_data_transformed[scaled_columns] = scaled_values

print("\nНабір даних після масштабування:")
display(military_data_transformed)



Набір даних після масштабування:


Unnamed: 0,age,years_of_service,missions_completed,salary,salary_norm,missions_completed_norm,rank_Captain,rank_Colonel,rank_Lieutenant,rank_Major,rank_Private,rank_Sergeant,years_of_service_scaled,salary_scaled
0,31,7,20.000000,28315,-1.411568,7.991274e-01,False,False,False,False,False,True,0.250000,0.058081
1,52,22,3.000000,67738,1.107741,-1.195603e+00,False,False,True,False,False,False,0.875000,0.779480
2,26,9,0.000000,60427,0.640534,-1.547615e+00,False,False,False,False,True,False,0.333333,0.645696
3,39,23,23.000000,79789,1.877854,1.151139e+00,False,False,False,False,False,True,0.916667,1.000000
4,52,22,13.189474,25967,-1.561616,2.084326e-16,False,False,True,False,False,False,0.875000,0.015115
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,35,24,26.000000,29801,-1.316606,1.503150e+00,True,False,False,False,False,False,0.958333,0.085273
96,24,24,11.000000,60234,0.628201,-2.569065e-01,False,True,False,False,False,False,0.958333,0.642164
97,48,9,5.000000,38969,-0.730729,-9.609291e-01,False,True,False,False,False,False,0.333333,0.253038
98,55,17,11.000000,58153,0.495216,-2.569065e-01,False,False,False,False,False,True,0.666667,0.604084


In [16]:

# 5. Dataset decomposition: split into training and test subsets
# 20% of the rows go to the test set; random_state pins the shuffle.
# NOTE(review): the z-score and min-max columns were computed on the full
# frame before this split, so test rows contributed to those statistics.
# If these sets feed a model, consider fitting the scalers on train_set only.
train_set, test_set = train_test_split(
    military_data_transformed,
    test_size=0.2,
    random_state=42,
)

# Print a heading followed by the structural summary of each subset
for heading, subset in (("\nНавчальна вибірка:", train_set), ("\nТестова вибірка:", test_set)):
    print(heading)
    subset.info()



Навчальна вибірка:
<class 'pandas.core.frame.DataFrame'>
Index: 80 entries, 55 to 51
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   age                      80 non-null     int64  
 1   years_of_service         80 non-null     int64  
 2   missions_completed       80 non-null     float64
 3   salary                   80 non-null     int64  
 4   salary_norm              80 non-null     float64
 5   missions_completed_norm  80 non-null     float64
 6   rank_Captain             80 non-null     bool   
 7   rank_Colonel             80 non-null     bool   
 8   rank_Lieutenant          80 non-null     bool   
 9   rank_Major               80 non-null     bool   
 10  rank_Private             80 non-null     bool   
 11  rank_Sergeant            80 non-null     bool   
 12  years_of_service_scaled  80 non-null     float64
 13  salary_scaled            80 non-null     float64
dtypes: bool(6), 

In [17]:

# 6. Feature correlation analysis
# Pairwise Pearson correlation (the pandas default, spelled out here) across
# all columns of the transformed frame. The '*_norm' and '*_scaled' columns
# are linear rescalings of their sources, so they correlate at 1.0 with them.
correlation_matrix = military_data_transformed.corr(method='pearson')

print("\nМатриця кореляції:")
display(correlation_matrix)



Матриця кореляції:


Unnamed: 0,age,years_of_service,missions_completed,salary,salary_norm,missions_completed_norm,rank_Captain,rank_Colonel,rank_Lieutenant,rank_Major,rank_Private,rank_Sergeant,years_of_service_scaled,salary_scaled
age,1.0,-0.158676,0.109302,0.009517,0.009517,0.109302,-0.064462,-0.007916,0.06526,0.034201,-0.072535,0.025879,-0.158676,0.009517
years_of_service,-0.158676,1.0,-0.087853,-0.087003,-0.087003,-0.087853,0.083945,0.0246,0.063687,-0.014597,0.051565,-0.184797,1.0,-0.087003
missions_completed,0.109302,-0.087853,1.0,-0.057651,-0.057651,1.0,-0.039993,-0.070943,0.00061,0.229533,-0.019531,-0.083201,-0.087853,-0.057651
salary,0.009517,-0.087003,-0.057651,1.0,1.0,-0.057651,-0.116785,-0.051635,-0.161075,0.089621,-0.023754,0.242778,-0.087003,1.0
salary_norm,0.009517,-0.087003,-0.057651,1.0,1.0,-0.057651,-0.116785,-0.051635,-0.161075,0.089621,-0.023754,0.242778,-0.087003,1.0
missions_completed_norm,0.109302,-0.087853,1.0,-0.057651,-0.057651,1.0,-0.039993,-0.070943,0.00061,0.229533,-0.019531,-0.083201,-0.087853,-0.057651
rank_Captain,-0.064462,0.083945,-0.039993,-0.116785,-0.116785,-0.039993,1.0,-0.152312,-0.162142,-0.126886,-0.142327,-0.157243,0.083945,-0.116785
rank_Colonel,-0.007916,0.0246,-0.070943,-0.051635,-0.051635,-0.070943,-0.152312,1.0,-0.249707,-0.195411,-0.219189,-0.242161,0.0246,-0.051635
rank_Lieutenant,0.06526,0.063687,0.00061,-0.161075,-0.161075,0.00061,-0.162142,-0.249707,1.0,-0.208023,-0.233336,-0.25779,0.063687,-0.161075
rank_Major,0.034201,-0.014597,0.229533,0.089621,0.089621,0.229533,-0.126886,-0.195411,-0.208023,1.0,-0.1826,-0.201737,-0.014597,0.089621


In [18]:

# 7. Aggregation by years of service
# For every distinct 'years_of_service' value, compute the mean salary and
# the number of rows. The dict form of agg yields two-level column labels
# ('salary', 'mean') and ('salary', 'count').
salary_aggregations = {'salary': ['mean', 'count']}
by_years_of_service = military_data_transformed.groupby('years_of_service')
grouped_data = by_years_of_service.agg(salary_aggregations)

print("\nАгрегація даних за кількістю років служби:")
display(grouped_data)



Агрегація даних за кількістю років служби:


Unnamed: 0_level_0,salary,salary
Unnamed: 0_level_1,mean,count
years_of_service,Unnamed: 1_level_2,Unnamed: 2_level_2
1,38571.0,2
2,62708.0,3
3,56846.5,4
4,47877.333333,3
5,51762.5,4
6,58553.75,4
7,30763.666667,3
8,56200.6,5
9,49865.875,8
10,25610.0,1
