# Answers to DS Topics Chapter 1
[Instruction](https://drive.google.com/drive/u/1/folders/1BRMTUJ8wtJWVV69RBOLk5r2P8_10mqre)

## Imports

In [102]:
import pandas as pd
import numpy as np
import plotly.express as pe
import plotly.graph_objects as go
from sklearn import metrics

## Constants

In [77]:
DATA_PATH = "CANCER_TABLE.csv"
DATA_COL_NAMES = ['diameter', 'cancer_real']

beta = 1
THRESH_MIN = 0.0
THRESH_MAX = 1.00001
THRESH_STEP = 0.1

In [133]:
# Reading the data
cancer_df = pd.read_csv(DATA_PATH, header=0, names=DATA_COL_NAMES)
cancer_df['cancer_real'].replace({False: 0, True: 1}, inplace=True)
cancer_df.head()

Unnamed: 0,diameter,cancer_real
0,6.309685,0
1,4.580894,0
2,6.310071,0
3,7.495139,0
4,2.216181,0


In [134]:
# Visualization
df_grouped = cancer_df.groupby('cancer_real').count()
fig = pe.pie(df_grouped, values=df_grouped['diameter'], 
             names=df_grouped.index, title='Cancer Diagnosis percentage')
fig.show()

In [135]:
# Model prediction
cancer_df['cancer_predicted'] = cancer_df['diameter'] > 7
cancer_df['cancer_predicted'].replace({False: 0, True: 1}, inplace=True)
cancer_df.head()

Unnamed: 0,diameter,cancer_real,cancer_predicted
0,6.309685,0,0
1,4.580894,0,0
2,6.310071,0,0
3,7.495139,0,1
4,2.216181,0,0


In [136]:
# Task 1+2: confusion matrix & TP TN FP FN
tp = ((cancer_df['cancer_real'] == 1) & (cancer_df['cancer_predicted'] == 1)).sum()
tn = ((cancer_df['cancer_real'] == 0) & (cancer_df['cancer_predicted'] == 0)).sum()
fp = ((cancer_df['cancer_real'] == 0) & (cancer_df['cancer_predicted'] == 1)).sum()
fn = ((cancer_df['cancer_real'] == 1) & (cancer_df['cancer_predicted'] == 0)).sum()

print("""
TP = {}
TN = {}
FP = {}
FN = {}
""".format(tp, tn, fp, fn))

pd.DataFrame(data=[[tp, fp], 
                   [fn, tn]],
            index=['Predicted True', 'Predicted False'],
            columns=['Real True', 'Real False'])


TP = 193
TN = 689
FP = 118
FN = 0



Unnamed: 0,Real True,Real False
Predicted True,193,118
Predicted False,0,689


* TP מסמל את כמות האנשים שיש להם סרטן והמודל חשב שיש להם סרטן
* TN מסמל את כמות האנשים שאין להם סרטן והמודל חשב שאין להם סרטן
* FP מסמל את כמות האנשים שאין להם סרטן אבל המודל חשב שיש להם
* FN מסמל את כמות האנשים שיש להם סרטן אבל המודל חשב שאין להם

In [138]:
# Task 3: TPR, FPR
tpr = tp/(tp+fn)
fpr = fp/(fp+tn)

print("""
TPR = {}
FPR = {}
""".format(tpr, fpr))


TPR = 1.0
FPR = 0.14622057001239158



* TPR מראה את כמות האנשים שהמודל סיווג נכון כחולי סרטן מתוך כלל האנשים שחולים
* FPR מראה את כמות האנשים שהמושל סיווג לא נכון כחולים מתוך האנשים שלא חולים

In [139]:
# Task 4: accuracy, precision, recall
accuracy = (tp+tn)/(tp+tn+fp+fn)
precision = tp/(tp+fp)
recall = tp/(tp+fn)

print("""
accuracy = {}
precision = {}
recall = {}
""".format(accuracy, precision, recall))


accuracy = 0.882
precision = 0.6205787781350482
recall = 1.0



In [140]:
# Task 5: F1
f1_score = (1+beta**2)*(precision*recall)/(beta*precision+recall)
print("F1-score = {}".format(f1_score))

F1-score = 0.7658730158730159


Task 6
* המודל הזה יכול להיות טוב יותר כי הלקוח יוכל לבחור את ערך הסף בהתאם למה שיותר חשוב לו: לאבחן נכון חולי סרטן במחיר של יותר מקרים בהם מטופל בריא יתבשר כי הוא חולה או שהוא יעדיף פחות מקרים כאלה אבל לפספס כמות מסויימת של חולי סרטן אמיתיים.

In [185]:
# New model's prediction - ASK ABOUT THE LOGIC
cancer_df['cancer_prec'] = (cancer_df['diameter'] - cancer_df['diameter'].min())/(
cancer_df['diameter'].max() - cancer_df['diameter'].min())


# Task 7+8: ROC curve & AUC
possible_thresholds = np.arange(THRESH_MIN,THRESH_MAX, THRESH_STEP)
xs = []
ys = []

for t in possible_thresholds:
    ctp = ((cancer_df['cancer_real'] == 1) & (cancer_df['cancer_prec'] > t)).sum()
    ctn = ((cancer_df['cancer_real'] == 0) & (cancer_df['cancer_prec'] < t)).sum()
    cfp = ((cancer_df['cancer_real'] == 0) & (cancer_df['cancer_prec'] > t)).sum()
    cfn = ((cancer_df['cancer_real'] == 1) & (cancer_df['cancer_prec'] < t)).sum()
    ctpr = ctp/(ctp+cfn)
    cfpr = cfp/(cfp+ctn)
    xs.append(cfpr)
    ys.append(ctpr)

fig = go.Figure(data=go.Scatter(x=xs, y=ys))
fig.update_layout(title="New Model's ROC", 
                  xaxis_title="FPR",
                  yaxis_title="TPR",)
fig.show()


auc = 1 # Visible in graph
print("AUC = {}".format(auc))

AUC = 1


* בהצבת ערך סף נכון, המודל עובד בצורה אידיאלית על התצפיות הנתונות מבחינת הקריטריון הזה.

In [181]:
# Task 9: from ranking to classification
cancer_df['cancer_classification_2'] = cancer_df['cancer_prec'] > 0.8
cancer_df['cancer_classification_2'].replace({False: 0, True: 1}, inplace=True)
cancer_df.head()

Unnamed: 0,diameter,cancer_real,cancer_predicted,cancer_prec,cancer_classification_2
0,6.309685,0,0,0.629103,0
1,4.580894,0,0,0.4551,0
2,6.310071,0,0,0.629142,0
3,7.495139,0,1,0.74842,0
4,2.216181,0,0,0.217091,0


In [183]:
# Task 10+11: Calculate everything for the new model
tp = ((cancer_df['cancer_real'] == 1) & (cancer_df['cancer_classification_2'] == 1)).sum()
tn = ((cancer_df['cancer_real'] == 0) & (cancer_df['cancer_classification_2'] == 0)).sum()
fp = ((cancer_df['cancer_real'] == 0) & (cancer_df['cancer_classification_2'] == 1)).sum()
fn = ((cancer_df['cancer_real'] == 1) & (cancer_df['cancer_classification_2'] == 0)).sum()
tpr = tp/(tp+fn)
fpr = fp/(fp+tn)
accuracy = (tp+tn)/(tp+tn+fp+fn)
precision = tp/(tp+fp)
recall = tp/(tp+fn)

print("""
TP = {}
TN = {}
FP = {}
FN = {}
TPR = {}
FPR = {}
accuracy = {}
precision = {}
recall = {}
""".format(tp, tn, fp, fn, tpr, fpr, accuracy, precision, recall))

pd.DataFrame(data=[[tp, fp], 
                   [fn, tn]],
            index=['Predicted True', 'Predicted False'],
            columns=['Real True', 'Real False'])


TP = 193
TN = 807
FP = 0
FN = 0
TPR = 1.0
FPR = 0.0
accuracy = 1.0
precision = 1.0
recall = 1.0



Unnamed: 0,Real True,Real False
Predicted True,193,0
Predicted False,0,807


Task 12
* הייתי ממליץ על המודל השני כי כל הקריטריונים שלו טובים יותר מהמודל הראשון

In [189]:
# Task 13: ROC Curve with sklearn

fpr, tpr, thresholds = metrics.roc_curve(cancer_df['cancer_real'], cancer_df['cancer_prec'])
fig = go.Figure(data=go.Scatter(x=fpr, y=tpr))
fig.update_layout(title="New Model's ROC", 
                  xaxis_title="FPR",
                  yaxis_title="TPR",)
fig.show()