In [272]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn import over_sampling, under_sampling
import matplotlib.pyplot as plt
# import seaborn as sns

In [273]:
TRAIN_FILENAME = "train.csv"
TEST_FILENAME = "test.csv"
SAMPLE_SUBMISSION_FILENAME = "sample_submission.csv"

In [274]:
train_dataset = pd.read_csv(TRAIN_FILENAME)
test_dataset = pd.read_csv(TEST_FILENAME)

In [275]:
train_dataset

Unnamed: 0,Id,age,years_of_experience,lesson_price,qualification,physics,chemistry,biology,english,geography,history,mean_exam_points,choose
0,0,35.0,0.0,2150.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,74.0,0
1,1,52.0,2.0,1250.0,2.0,1.0,0.0,1.0,0.0,0.0,1.0,57.0,1
2,2,29.0,3.0,1750.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,66.0,0
3,3,33.0,3.0,1050.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,66.0,1
4,4,46.0,3.0,2250.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,73.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9995,55.0,2.0,2150.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,79.0,0
9996,9996,53.0,2.0,1350.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,59.0,0
9997,9997,44.0,5.0,1750.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,59.0,0
9998,9998,41.0,0.0,1700.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,72.0,0


**Проверим, нет ли выбросов/ошибок в данных**

In [276]:
# Посмотрим максимальный и минимальный возраст
max(train_dataset.age), min(train_dataset.age), max(test_dataset.age), min(test_dataset.age)

(68.0, 23.0, 68.0, 23.0)

In [277]:
# Посмотрим, что нет стажа, превышающего возраст
train_dataset[train_dataset['years_of_experience'] >= train_dataset['age']]

Unnamed: 0,Id,age,years_of_experience,lesson_price,qualification,physics,chemistry,biology,english,geography,history,mean_exam_points,choose


**Объявим переменные**

In [278]:
columns_to_drop = ['Id', 'physics', 'chemistry', 'biology', 'english', 'geography', 'history']
X_tr = pd.DataFrame(train_dataset).drop(columns=columns_to_drop + ['choose'])
Y_tr = train_dataset.choose.to_numpy()

X_test = pd.DataFrame(test_dataset).drop(columns=columns_to_drop)

In [279]:
X_tr

Unnamed: 0,age,years_of_experience,lesson_price,qualification,mean_exam_points
0,35.0,0.0,2150.0,2.0,74.0
1,52.0,2.0,1250.0,2.0,57.0
2,29.0,3.0,1750.0,1.0,66.0
3,33.0,3.0,1050.0,1.0,66.0
4,46.0,3.0,2250.0,2.0,73.0
...,...,...,...,...,...
9995,55.0,2.0,2150.0,2.0,79.0
9996,53.0,2.0,1350.0,1.0,59.0
9997,44.0,5.0,1750.0,1.0,59.0
9998,41.0,0.0,1700.0,2.0,72.0


In [280]:
X_test.cov()

Unnamed: 0,age,years_of_experience,lesson_price,qualification,mean_exam_points
age,64.512651,0.940214,40.65227,0.110835,1.559836
years_of_experience,0.940214,3.137209,240.8678,0.283371,5.120048
lesson_price,40.65227,240.8678,276949.686869,331.566364,5203.193765
qualification,0.110835,0.283371,331.566364,0.623537,8.148091
mean_exam_points,1.559836,5.120048,5203.193765,8.148091,184.147494


In [281]:
Y_tr

array([0, 1, 0, ..., 0, 0, 0], dtype=int64)

### Стандартизируем некоторые признаки

In [282]:
# Функция масштабирования выборки
def standard_scale(x):
    res = (x - x.mean(axis=0)) / x.std(axis=0)
    return res

In [283]:
def standardize_data(X):
    # Стандартизируем цену за занятие
    X['lesson_price'] = standard_scale(X['lesson_price'])
    
    # Стандартизируем среднее количество баллов за экзамен
    X['mean_exam_points'] = standard_scale(X['mean_exam_points'])
    
    # Стандартизируем возраст
    X['age'] = standard_scale(X['age'])
    
    # Стандартизируем стаж
    X['years_of_experience'] = standard_scale(X['years_of_experience'])
    
    return X

In [284]:
X_tr = standardize_data(X_tr)
X_test = standardize_data(X_test)

**Попробуем понизить размерность данных**

Найдем собственные векторы и собственные значения

In [285]:
covariance_matrix = X_tr.T @ X_tr

eig_values, eig_vectors = np.linalg.eig(covariance_matrix)

# сформируем список кортежей (собственное значение, собственный вектор)
eig_pairs = [(np.abs(eig_values[i]), eig_vectors[:, i]) for i in range(len(eig_values))]

# и отсортируем список по убыванию собственных значений
eig_pairs.sort(key=lambda x: x[0], reverse=True)

print('Собственные значения и собственные векторы в порядке убывания:')
for i in eig_pairs:
    print(i)

Собственные значения и собственные векторы в порядке убывания:
(39844.21250366247, array([-7.05164624e-04, -9.05277140e-02, -2.68319532e-01, -9.22992634e-01,
       -2.60563740e-01]))
(14972.543568147452, array([-0.0727892 , -0.4280895 , -0.58316009,  0.37409079, -0.57569219]))
(10223.575750437009, array([-0.92710743, -0.30197184,  0.14361282, -0.05647732,  0.15959478]))
(8378.736279178289, array([ 0.36756822, -0.84549298,  0.23030466, -0.06997771,  0.3034767 ]))
(2689.931898574729, array([-0.0081644 ,  0.04998739, -0.71712207,  0.00735589,  0.69506578]))


**Оценим долю дисперсии, описываемую каждой из компонент**

In [286]:
eig_sum = sum(eig_values)
var_exp = [(i / eig_sum) * 100 for i in sorted(eig_values, reverse=True)]
cum_var_exp = np.cumsum(var_exp)
print(f'Доля дисперсии, описываемая каждой из компонент \n{var_exp}')

# а теперь оценим кумулятивную (то есть накапливаемую) дисперсию при учитывании каждой из компонент
print(f'Кумулятивная доля дисперсии по компонентам \n{cum_var_exp}')

Доля дисперсии, описываемая каждой из компонент 
[52.351512309533035, 19.672500713644194, 13.432807881376728, 11.008863970329783, 3.5343151251162555]
Кумулятивная доля дисперсии по компонентам 
[ 52.35151231  72.02401302  85.4568209   96.46568487 100.        ]


**Наблюдение** \
Видим, что только часть компонент (4 из 11) вносит заметный вклад в итоговую дисперсию. \
Попробуем понизить размерность методом главных компонент

In [287]:
def pca_sdv(X, d=1):
    X_T_X = X.T @ X
    U, s, W = np.linalg.svd(X_T_X)

    # Транспонируем матрицу W
    V = W.T
    
    pairs = sorted([[idx, item] for idx, item in enumerate(s)], key=lambda item: item[1])
    pairs.reverse()
    
    pairs = np.array(pairs, dtype=int)
    
    max_pairs = pairs[:d]
    max_ids = max_pairs[:, 0]
    
    WEIGHTS = V[max_ids].T
    
    return X @ WEIGHTS

In [288]:
Z_tr = pca_sdv(X_tr, 4)
Z_test = pca_sdv(X_test, 4)

In [289]:
Z_tr

Unnamed: 0,0,1,2,3
0,-0.143196,-2.270545,-0.212098,1.476519
1,1.540915,-1.521265,0.529941,-0.813049
2,0.326157,-0.429366,1.060547,1.639762
3,1.564807,-0.070899,0.734967,1.255483
4,-0.196972,-1.729024,0.490671,-0.434305
...,...,...,...,...
9995,-0.065562,-1.993056,-0.484012,-1.243005
9996,0.995058,-0.737343,0.188143,-0.867710
9997,0.411436,-0.139634,1.588687,-0.511559
9998,0.653979,-2.086107,-0.430462,0.834324


**Обучим на получившихся данных модель методом kNN**

In [290]:
def e_metrics(x1, x2):

    distance = np.sum(np.square(x1 - x2))

    return np.sqrt(distance)

In [291]:
def knn(x_train, y_train, x_test, k, weights=None):
    answers = []
    y_train_set = set(y_train)
    for x in x_test:
        # расчет расстояний от классифицируемого объекта до
        # объектов обучающей выборки
        distances = np.sqrt(np.sum(np.square(x - x_train), axis=1))
            
        # создаем словарь со всеми возможными классами
        classes = {class_item: 0 for class_item in y_train_set}
            
        # больше оптимизации
        test_distances = np.c_[distances, y_train]
        sorted_test_distances = sorted(test_distances, key=lambda x: x[0])

        for i, d in enumerate(sorted_test_distances[0:k]):
            w = weights(i, d[0]) if weights else 1
            classes[d[1]] += w
        
        # Записываем в список ответов наиболее часто встречающийся класс
        answers.append(sorted(classes, key=classes.get)[-1])
        
    return answers

In [292]:
def accuracy(pred, y):
    return (sum(pred == y) / len(y))

In [293]:
def weights_n_neighbor(i, d):
    q = 0.6
    return q ** i

In [294]:
k = 100

y_pred_train = knn(Z_tr.to_numpy(), Y_tr, Z_tr.to_numpy(), k, weights_n_neighbor)
y_pred = knn(Z_tr.to_numpy(), Y_tr, Z_test.to_numpy(), k, weights_n_neighbor)

In [295]:
print(f'Точность алгоритма на трейне при k = {k}: {accuracy(y_pred_train, Y_tr):.3f}')

Точность алгоритма на трейне при k = 100: 0.932


In [296]:
result_data = {'Id': test_dataset['Id'].to_numpy(), 'choose': y_pred }
result_df = pd.DataFrame(result_data)

In [297]:
result_df.to_csv('result.csv', index=False)

In [298]:
result_df

Unnamed: 0,Id,choose
0,10000,0
1,10001,0
2,10002,0
3,10003,0
4,10004,0
...,...,...
9995,19995,0
9996,19996,0
9997,19997,0
9998,19998,1
