In [2]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import re
import sklearn
# Compare (major, minor) as integers. A plain string comparison is
# lexicographic, so e.g. "0.9" >= "0.20" would be True even though 0.9 is older.
# The regex only reads the leading "X.Y", so suffixes like "rc1"/".dev0" are ignored.
_sklearn_version = tuple(
    int(part) for part in re.match(r"(\d+)\.(\d+)", sklearn.__version__).groups()
)
assert _sklearn_version >= (0, 20), "Scikit-Learn >= 0.20 is required"

# Common imports
import numpy as np
import scipy as sp
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use("seaborn-dark")
sns.set_palette("bright")
b, o, g, r, p, brown, pink, grey, y, lb = np.array(sns.color_palette())

mpl.rc('axes', labelsize=14, titlesize=16)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

In [3]:
import os
import pandas as pd

# Directory holding the house-prices CSV files. The default is the path this
# notebook was originally written against; set the HOUSE_PRICES_DIR environment
# variable to run the notebook on another machine without editing the cell.
DATA_DIR = os.environ.get(
    "HOUSE_PRICES_DIR",
    "/home/claym0re/Projects/python3/machine-learning/ml-algorithms/datasets/house-prices",
)
data = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [None]:
# Search idx in 1..99 for the value at which the three ratios, scaled by idx,
# land closest to whole numbers (mean absolute rounding error is the loss).
# NOTE(review): idx presumably stands for a head count (a later cell labels it
# "NoStudents") - confirm.
RATIOS = (0.135, 0.296, 0.568)

best_loss, best_idx = 1, 0
for idx in range(1, 100):
    # Distance of each scaled ratio from its nearest integer.
    rounding_errors = [np.abs(np.rint(ratio * idx) - (ratio * idx)) for ratio in RATIOS]
    loss = np.mean(rounding_errors)
    # Strict "<" keeps the first idx in case of ties.
    if loss < best_loss:
        best_loss, best_idx = loss, idx

print(best_loss, best_idx, tuple(ratio * best_idx for ratio in RATIOS))

In [4]:
# Record the mean rounding error for every idx in 1..99, in order, as
# one-element rows so the result converts to a (99, 1) array later on.
max_n = 100
idxl = []  # NOTE(review): never filled in the visible cells
ratios = (0.135, 0.296, 0.568)

losses = []
for idx in range(1, max_n):
    # Distance of each scaled ratio from its nearest integer.
    rounding_errors = [np.abs(np.rint(ratio * idx) - (ratio * idx)) for ratio in ratios]
    loss = np.mean(rounding_errors)
    losses.append([loss])

In [122]:
# expit (the logistic sigmoid) is used by prob() below.
# NOTE(review): softmax is imported but not used in the visible cells.
from scipy.special import softmax, expit

# Turn the list of one-element rows built in the previous cell into a 2-D
# array (one row per idx, a single loss column). The name `losses` is rebound
# from a list to an ndarray here.
losses = np.array(losses)

def prob(loss, idx):
    """Build a three-column table: integer-cast idx, loss, and a score.

    The score is ``1 - expit(loss - 1)``, which decreases as the loss grows.

    Args:
        loss: array of loss values.
        idx: array of identifiers; cast with ``astype('int')`` before stacking.

    Returns:
        A 2-D array whose columns are, in order, the cast ``idx``, ``loss``
        and the score.
    """
    int_idx = idx.astype('int')
    score = 1-(expit(loss-1))
    return np.c_[int_idx, loss, score]

# Pair every idx in 1..99 with its loss and derived score via prob().
candidate_idx = np.arange(1, 100)
loss_column = losses[:, 0].ravel()  # flattened first column of the loss array
n_students_prob = prob(loss_column, candidate_idx)

In [123]:
# Wrap the stacked array in a labelled frame indexed by 1..99, then display it
# with the highest "Probability" rows first.
column_names = ["NoStudents", "Loss", "Probability"]
ldf = pd.DataFrame(
    n_students_prob,
    index=np.arange(1, 100),
    columns=column_names,
)
ldf.sort_values(by="Probability", ascending=False)

Unnamed: 0,NoStudents,Loss,Probability
37,37.0,0.023000,0.726513
44,44.0,0.030667,0.724987
81,81.0,0.032333,0.724654
74,74.0,0.046000,0.721919
7,7.0,0.050333,0.721048
...,...,...,...
66,66.0,0.347333,0.657611
49,49.0,0.349667,0.657086
56,56.0,0.352000,0.656560
63,63.0,0.354333,0.656033
