## Machine Learning<br>
<b>Splitting the data into train and test sets
</b>

In [6]:
import random
from typing import List, Optional, Tuple, TypeVar
x = TypeVar('x')  # generic type for the elements being split


def split_data(data: List[x], prob: float,
               seed: Optional[int] = None) -> Tuple[List[x], List[x]]:
    """Shuffle a copy of `data` and split it into fractions [prob, 1 - prob].

    Args:
        data: the items to split; the caller's list is not modified.
        prob: fraction of the items (between 0 and 1 inclusive) that goes
            into the first returned list.
        seed: optional seed for a private random generator, making the
            split reproducible. When omitted, the module-level `random`
            state is used (so `random.seed(...)` still controls the result).

    Returns:
        A `(first, second)` tuple of lists; `first` holds
        `int(len(data) * prob)` items and `second` holds the rest.

    Raises:
        ValueError: if `prob` is outside [0, 1]. Without this check a
            negative `prob` produces a negative slice index and silently
            returns a split of the wrong proportions.
    """
    if not 0 <= prob <= 1:
        raise ValueError(f"prob must be between 0 and 1, got {prob!r}")

    shuffled = data[:]  # shallow copy so shuffling leaves the input untouched
    if seed is None:
        random.shuffle(shuffled)
    else:
        random.Random(seed).shuffle(shuffled)

    cut = int(len(shuffled) * prob)  # index where the second part starts
    return shuffled[:cut], shuffled[cut:]
# Demo: split the integers 0..999 so that 75% land in the train set.
data = list(range(1000))
train, test = split_data(data, 0.75)
# Show both parts, each followed by a separator line.
print(train,
      "-------------------------------------------------------",
      test,
      "-------------------------------------------------------",
      sep="\n")
print("Lengths of the sets formed:")
print("Train Set:", len(train))
print("Test Set:", len(test))

[640, 945, 892, 946, 221, 759, 779, 357, 207, 190, 637, 419, 981, 216, 159, 474, 404, 40, 877, 384, 586, 957, 228, 615, 483, 269, 675, 774, 903, 143, 802, 960, 218, 813, 45, 61, 998, 874, 189, 630, 291, 336, 455, 891, 882, 795, 465, 796, 949, 972, 375, 13, 830, 374, 318, 4, 270, 310, 717, 411, 547, 977, 254, 607, 140, 91, 36, 397, 191, 668, 203, 729, 861, 416, 526, 758, 52, 390, 855, 731, 330, 762, 78, 309, 911, 714, 647, 353, 534, 94, 723, 700, 905, 785, 220, 211, 2, 772, 584, 391, 333, 596, 821, 575, 359, 617, 145, 458, 743, 887, 751, 987, 367, 783, 702, 573, 674, 663, 79, 18, 470, 307, 301, 518, 930, 964, 97, 978, 163, 862, 757, 664, 293, 468, 60, 611, 196, 517, 692, 181, 195, 351, 493, 406, 66, 95, 520, 12, 5, 447, 297, 996, 210, 423, 883, 955, 730, 935, 296, 906, 806, 865, 116, 224, 848, 975, 688, 536, 219, 132, 997, 315, 713, 412, 991, 256, 974, 950, 624, 893, 46, 413, 869, 529, 567, 127, 739, 819, 471, 588, 689, 39, 850, 928, 76, 646, 437, 687, 452, 639, 500, 886, 616, 504, 450,

<b>Splitting the dataset as before, but this time keeping each input variable paired with its output variable.</b>

In [14]:
Y = TypeVar('Y')  # generic type to represent output variables


def train_test_split(xs: List[x],
                     ys: List[Y],
                     test_pct: float) -> Tuple[List[x], List[x], List[Y], List[Y]]:
    """Split paired inputs and outputs into train and test parts.

    The split is done on shuffled indices (via `split_data`), so the i-th
    input always stays together with the i-th output.

    Args:
        xs: input values.
        ys: output values; `ys[i]` belongs to `xs[i]`.
        test_pct: fraction of the pairs that goes to the test part.

    Returns:
        `(x_train, x_test, y_train, y_test)`.

    Raises:
        ValueError: if `xs` and `ys` differ in length. Otherwise a shorter
            `ys` fails later with an IndexError and a longer `ys` silently
            loses its trailing elements.
    """
    if len(xs) != len(ys):
        raise ValueError(
            f"xs and ys must have the same length, got {len(xs)} and {len(ys)}"
        )

    idxs = list(range(len(xs)))
    train_idxs, test_idxs = split_data(idxs, 1 - test_pct)

    return ([xs[i] for i in train_idxs],
            [xs[i] for i in test_idxs],
            [ys[i] for i in train_idxs],
            [ys[i] for i in test_idxs])
# Demo: pair each input 0..999 with its double and hold out 25% for testing.
xs = list(range(1000))
ys = [2 * value for value in xs]
x_train, x_test, y_train, y_test = train_test_split(xs, ys, 0.25)
print("Lenghts of the splits:")
print("x_train Lenght:", len(x_train))
print("x_test Lenght:", len(x_test))
print("y_train Lenght:", len(y_train))
print("y_test Lenght:", len(y_test))

Lenghts of the splits:
x_train Lenght: 750
x_test Lenght: 250
y_train Lenght: 750
y_test Lenght: 250


#### Metrics
<ul>
    <li><b>Accuracy</b></li>
    <li><b>Precision</b></li>
    <li><b>Recall</b></li>
    <li><b>F1 Score (Harmonic Mean of Precision and Recall)</b></li>
    </ul>
    
#### Prediction outcomes (confusion matrix):
<ol>
    <li><b>TP</b>:True Positive</li>
    <li><b>FP</b>:False Positive</li>
    <li><b>TN</b>:True Negative</li>
    <li><b>FN</b>:False Negative</li>
    </ol>

In [15]:
def accuracy(tp: int, fp: int, fn: int, tn: int) -> float:
    """Return the fraction of all predictions that were correct.

    Computed as (tp + tn) divided by the sum of all four counts.
    """
    return (tp + tn) / (tp + fp + fn + tn)
def precision(tp: int, fp: int, fn: int, tn: int) -> float:
    """Return the fraction of positive predictions that were correct.

    `fn` and `tn` are accepted only so that all metrics share one signature.
    """
    predicted_positive = tp + fp
    return tp / predicted_positive
def recall(tp: int, fp: int, fn: int, tn: int) -> float:
    """Return the fraction of actual positives that were identified.

    `fp` and `tn` are accepted only so that all metrics share one signature.
    """
    actual_positive = tp + fn
    return tp / actual_positive
def f1_score(tp: int, fp: int, fn: int, tn: int) -> float:
    """Return the F1 score: the harmonic mean of precision and recall.

    With no true positives the score is 0.0. In that case precision and
    recall are either both zero or undefined, so the harmonic-mean formula
    below would always divide by zero. Returning 0.0 also covers the
    degenerate case tp == fp == fn == 0, by convention.
    """
    if tp == 0:
        return 0.0
    p = precision(tp, fp, fn, tn)
    r = recall(tp, fp, fn, tn)
    return 2 * p * r / (p + r)

<b>High Bias and Low Variance: </b>typically corresponds to <i>underfitting</i>   (probably the data doesn't have enough features).<br>
<b>Low Bias and High Variance: </b>typically corresponds to <i>overfitting</i>   (probably the data has too many features).<br>
<hr>
<ul>
    <li>High bias can be reduced by adding more features to the dataset; getting more data won't help.</li>
    <li>High variance can be tackled by removing some features from the dataset or by getting more data.</li>
</ul>
<hr>
<b>Some machine learning models used in later notebooks:</b>
<ol>
    <li><b>The Naive Bayes classifier</b> is suited to
yes-or-no features.</li>
    <li><b>Regression models</b>
require numeric features.</li>
    <li><b>Decision trees</b> deal
with numeric or categorical data.</li>
    </ol>