# Курсовой проект по курсу "Алгоритмы анализа данных"
_Е. Драгомирова_

_[Ссылка на страницу соревнования на kaggle](https://www.kaggle.com/c/gb-classification-choose-tutors)_

_**Ваша задача этом соревновании - предсказать вероятность того, подойдет ли репетитор для подготовки к экзамену по математике. Вам будут даны два датасета: train.csv (содержит признаки и целевую переменную) и test.csv (только признаки).**_

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Пути к файлам
TRAIN_DATA_PATH = 'train.csv'
TEST_DATA_PATH = 'test.csv'

In [3]:
# Загрузка данных
data = pd.read_csv(TRAIN_DATA_PATH, sep=",", engine='python')
data_test = pd.read_csv(TEST_DATA_PATH, sep=",", engine='python')

## Подготовка данных

In [4]:
feature_names = ['age', 'years_of_experience', 'lesson_price', 'qualification', 
                 'physics', 'chemistry', 'biology', 'english', 'geography', 'history',
                'mean_exam_points']

In [5]:
X = data[feature_names]
X.shape

(10000, 11)

In [6]:
y = data['choose']
y.shape

(10000,)

In [7]:
X_final = data_test[feature_names]
X_final.shape

(10000, 11)

In [8]:
def standard_scale(X):
    mean = X.mean(axis=0)
    std = X.std(axis=0)
    return (X - mean) / std

In [9]:
X_st = X.copy().astype(np.float64)
X_st = standard_scale(X_st)

In [10]:
X_final_st = X_final.copy().astype(np.float64)
X_final_st = standard_scale(X_final_st)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X_st, y, test_size=0.25, random_state=32)

## Обучение

In [12]:
def sigmoid(z):
    res = 1 / (1 + np.exp(-z))
    return res

In [13]:
def calc_logloss(y, y_pred):
    eps = 1e-8
    y_pred = np.clip(y_pred, eps, 1 - eps)
    err = - np.mean(y * np.log(y_pred) + (1.0 - y) * np.log(1.0 - y_pred))
    return err

In [14]:
def eval_model(X, y, iterations, eta=1e-4):
    np.random.seed(42)
    W = np.random.randn(X.shape[1])
    n = X.shape[0]
    
    for i in range(iterations):
        z = np.dot(X, W)
        y_pred = sigmoid(z)
        err = calc_logloss(y, y_pred)
        
        dQ = 1/n * X.T @ (y_pred - y)
        W -= eta * dQ
        if i % (iterations / 10) == 0:
            print(i, err)
            
    final_error = calc_logloss(y, y_pred)
    return W, final_error

In [15]:
W, error = eval_model(X_train, y_train, iterations=9000, eta=0.01)

0 1.2637958312041206
900 0.7107298731856446
1800 0.6696990567268916
2700 0.6622446046825584
3600 0.6596956480844083
4500 0.6587635689266265
5400 0.6584173774232048
6300 0.6582875495302885
7200 0.6582385260068684
8100 0.6582199175286336


In [16]:
def calc_pred_proba(W, X):
    y_pred_proba = sigmoid(np.dot(X, W))
    return y_pred_proba

In [17]:
y_pred = calc_pred_proba(W, X_test)
y_pred

array([0.51488924, 0.55219987, 0.71769506, ..., 0.43061205, 0.53101071,
       0.48349052])

## Задание

In [18]:
y_final_proba = calc_pred_proba(W, X_final_st)
y_final_proba

array([0.38113466, 0.69932752, 0.41028791, ..., 0.70135121, 0.85562779,
       0.52937488])

In [19]:
y_final = pd.DataFrame({'Id': data_test['Id'], 'choose': y_final_proba}, columns = ['Id', 'choose'])
y_final.head()

Unnamed: 0,Id,choose
0,10000,0.381135
1,10001,0.699328
2,10002,0.410288
3,10003,0.436602
4,10004,0.651316


## Сохранение результатов

In [20]:
y_final.to_csv('EDragomirova.csv', sep=',', index=False)