In [None]:
# python packages for calculation
import numpy as np
# python packages for database manipulation
import pandas as pd
# python visualization packages for statistics
import seaborn as sns
import os.path

# the main Python visualisation package
import matplotlib.pyplot as plt

# One of the best features of Jupyter notebook environment, output the visualisation frontend
%matplotlib inline

Data, Here we come
---
Data is usually packaged so that it can be distributed and shared. One popular format is "CSV": plain text, one record per line, with the columns separated by a common delimiter (here a comma). To load the data:
1. open it if already exists,
- otherwise download it.

In [None]:
# Create table of features
column_names = ['Sample code number', 'Clump Thickness', 'Uniformity of Cell Size',
                'Uniformity of Cell Shape', 'Marginal Adhesion', 'Single Epithelial Cell Size', 'Bare Nuclei',
                'Bland Chromatin', 'Normal Nucleoli', 'Mitoses', 'Class']
cvsfile = \
  'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data'

# Local copy of the cleaned table. It is written below on the first run so that
# later runs can skip the download.
cache_file = "c2.1.1.1.csv"

if os.path.isfile(cache_file):
    # The cache is saved with its row index in the first column and holds integers only.
    df = pd.read_csv(cache_file, index_col=0, dtype=int)
else:
    # Download the data from the remote database hosted at the University of California, Irvine.
    # The raw file marks missing entries with "?": read them as NaN, drop those rows
    # and store every remaining value as an integer.
    df = pd.read_csv(cvsfile, names=column_names, na_values="?").dropna().astype(int)
    # Save the cleaned table (index included) for the next use.
    df.to_csv(cache_file)


If data is the native download, drop the useless data and save it for next use:

**Let us observe the data which we just loaded**

In [None]:
# Summarize the table: column names, non-null counts and dtypes
df.info()

In [None]:
# Show the help page of a package, function or other object: put "?" right before or after its name
pd.read_csv?

In [None]:
# first five rows of the table
df.head(5)

In [None]:
# basic statistics for each feature
df.describe()

In [None]:
# (number of rows, number of columns): total number of samples and number of features per sample
df.shape

<big>Final feature</big>, **Class**: (2 for benign, 4 for malignant)

In [None]:
# Add a readable label column: class code 2 -> "B", class code 4 -> "M"
df["Class2"] = df.Class.map({2: "B", 4: "M"})

In [None]:
# Check the result: the table now ends with the Class2 column
df.head(5)

In [None]:
# Remove the Class2 label column again (axis=1 selects columns)
df=df.drop('Class2',axis=1)

In [None]:
# Check the result: Class2 is no longer listed
df.head(5)

IPywidgets
---
```
Awesome sub-project in Jupyter: enhance the interactivity between UI and Programming itself!
```
Although the latest version is 7.x.x, interested users are advised to install version 6.x to prevent conflicts between ipywidgets and other third-party packages.

Installation, by anaconda, a). UI installation b). by Python "pip" 

Data Display by IPyWidgets Design
---
There are two options available:

a). **Class** option: 3 categories, "All", "2" for B, "4" for M;<br>
b). **number** of items: a slider bar that determines how many rows are displayed at once:

In [None]:
import ipywidgets as widgets
from IPython.display import display

In [None]:
# Choices for the selector: a catch-all entry followed by the class codes found in the data
items = ['All'] + sorted(df['Class'].unique().tolist())


def view2(Class='', numbers=3):
    """Return the first `numbers` rows of `df`, optionally restricted to one class.

    Class   -- 'All' keeps every row; any other value keeps only the rows whose
               'Class' column equals it.
    numbers -- how many rows to return.
    """
    subset = df if Class == 'All' else df[df['Class'] == Class]
    return subset.head(numbers)


# Slider for the number of rows (from 3 up to the full table) and a list box for the class
a_slider = widgets.IntSlider(min=3, max=df.shape[0], step=1, value=5)
b_select = widgets.Select(options=items)
# Re-run view2 and redisplay its result whenever either widget changes
widgets.interact(view2, numbers=a_slider, Class=b_select)

Conversion of data
---
from "object" to "int"

In [None]:
# Convert the sixth feature from object to int.
# A plain astype(int) raises on any entry that is not a number (the raw download
# marks missing values with "?"), so parse leniently first: unparseable entries
# become NaN, those rows are dropped, and the remaining values are cast to int.
sixth_feature = df.columns[6]
df[sixth_feature] = pd.to_numeric(df[sixth_feature], errors="coerce")
df = df.dropna(subset=[sixth_feature]).astype({sixth_feature: int})

In [None]:
# Write the converted table to a CSV file in the working directory.
# NOTE(review): the loading cell looks for "c2.1.1.1.csv", a different name, so this
# file is not the cache that cell reads — confirm which consumer this file is for.
df.to_csv("chapter_2.1.1.1.csv")

In [None]:
# Rows at positions 10 to 14 (the end of the slice is exclusive)
df[10:15]

Note: **qgrid** does not work with ipywidgets-7.x.x

In [None]:
# qgrid is used below to show the DataFrame as an interactive grid
import qgrid
# One-time extension install, left disabled
#qgrid.nbinstall(overwrite=True)
# NOTE(review): presumably remote_js=True loads the grid's JavaScript from a remote host and
# precision=4 limits displayed decimals — confirm against the installed qgrid version
qgrid.set_defaults(remote_js=True, precision=4)

In [None]:
# Show the whole table as a grid with the toolbar enabled; the grid options turn off
# force-fitting of columns and set a default column width of 200
qgrid.show_grid(df,show_toolbar=True, grid_options={'forceFitColumns': False, 'defaultColumnWidth': 200})

In [None]:
# Same grid again, this time with qgrid's default settings
qgrid.show_grid(df)

In [None]:
# Explore correlations: set the plot defaults first
# (figure size of 12 x 8 inches, seaborn fonts scaled by 1.4)
plt.rcParams['figure.figsize']=(12,8)
sns.set(font_scale=1.4)


In [None]:
# Split the table with train_test_split from the sklearn.model_selection module
from sklearn.model_selection import train_test_split

# Entries 1..9 of column_names are the predictors, entry 10 is the label column
feature_columns = column_names[1:10]
label_column = column_names[10]

# 75% of the rows, chosen at random, go to training; the remaining 25% are kept for testing
X_train, X_test, y_train, y_test = train_test_split(
    df[feature_columns], df[label_column], test_size=0.25, random_state=33
)

Correlation of $X,Y$
---
$$\text{Correlation}(X,Y)=\frac{E\left[(X-\mu_X)(Y-\mu_Y)\right]}{\sigma_X\sigma_Y}$$

This measures the linear relation between $X$ and $Y$: 1 means "perfectly positively correlated", -1 means "perfectly negatively correlated", and 0 means "no linear relation at all".

In [None]:
# Figure defaults for the heatmap
plt.rcParams['figure.figsize'] = (12, 8)
sns.set(font_scale=1.4)

# Pairwise correlation of every column, drawn as a colour-coded matrix
corr_all = df.corr()
sns.heatmap(corr_all, cmap='coolwarm')

Obviously, ```Sample code number```, the first feature, should have nothing to do with the other features. Remake the correlation plot using the features with the first one excluded.

In [None]:
# Keep every column listed in column_names except the first one
df1=df[column_names[1:]]

In [None]:
# Figure defaults for the heatmap
plt.rcParams['figure.figsize'] = (12, 8)
sns.set(font_scale=1.4)

# Correlation matrix of the reduced table df1, drawn as a colour-coded matrix
corr_reduced = df1.corr()
sns.heatmap(corr_reduced, cmap='coolwarm')

<h4> Mean versions of the 10 Core Predictors </h4><br>
The below boxplots are of the "mean" value for the 10 core features in the dataset.  These are ranked as the most important features in the model we fit (see Feature Importances below) in terms of classifying the breast cancer mass as Malignant (4) or Benign (2). 

The charts reveal a tendency for the average value of a feature to be generally higher for malignant diagnoses vs. the benign class. This is true for every feature except for <b> Fractal Dimension Mean</b> which shows a flat difference between M and B diagnoses for the mean value of the feature.  <b>Radius Mean</b> on the other hand shows a more distinct distribution for (4) vs. (2) diagnoses, as is subsequently found to be the most important feature according to our fitted Random Forest model further below (see Feature Importances cell[9] &amp; [10] below).  

In [None]:
# Box plots of the first five columns, one panel per column, split by class.
plt.rcParams['figure.figsize'] = (10, 5)
diagnosis = 'Class'
f, axes = plt.subplots(1, 5)
for ax, feature in zip(axes, df.columns[0:5]):
    # x and y are passed by keyword: recent seaborn releases accept only `data`
    # positionally, so a positional column name there raises a TypeError.
    sns.boxplot(x=diagnosis, y=feature, data=df, ax=ax, palette='cubehelix')
f.tight_layout()



In [None]:
# Box plots of columns 5 to 9, one panel per column, split by the class column (column 10).
class_column = df.columns[10]
f, axes = plt.subplots(1, 5)
for ax, feature in zip(axes, df.columns[5:10]):
    # x and y are passed by keyword: recent seaborn releases accept only `data`
    # positionally, so a positional column name there raises a TypeError.
    sns.boxplot(x=class_column, y=feature, data=df, ax=ax)
f.tight_layout()

In [None]:
plt.rcParams['figure.figsize'] = (10, 12)
# For every column except the last one, draw its distribution in one panel per class.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; it is kept here so
# the output stays the same — consider histplot/displot once the seaborn version is confirmed.
class_column = df.columns[10]
for feature in df.columns[:-1]:
    g = sns.FacetGrid(df, col=class_column, hue=class_column)
    g.map(sns.distplot, feature, hist=True, rug=True)


In [None]:

# number of training samples in each class
y_train.value_counts()



In [None]:

# number of test samples in each class
y_test.value_counts()


Standardize data
---
$$ \mathbf{X\Rightarrow \frac{X-\mu_X}{\sigma_X} \sim Normal(\mu=0,\sigma^2=1)}$$
Theoretically, there is no effect in our calculation; but it does help to speed up the calculation!

Logistic Regression and SGD
---
1. Linear regression: $\mathbf{X, (y_X),W,b}=(X_i)_i,(W_i)_i ,(b_i)_i\to y=\mathbf{W^TX+b}$;
   - $\mathbf{(X,y_X)}$ real data, $y=\mathbf{W^TX+b}$ estimated data
   - $L=\sum(\mathbf{y_X-y})^2$, $L^2$ error
   - $ \min\limits_{W,b}L\Rightarrow \hat{\! W}, \hat{\! b}\Rightarrow \hat{\! y}=\mathbf{\hat{\!W}^TX+\hat{\!b}}$ ,
     the estimated data for $y_X$.
   - $y_X$ is in real range but not two-valued.  
- to reach a conclusion, 0 for "no cancer", 1 for "yes", introduce the following logistic function:
$$ p(\mathbf{X})=\frac{1}{1+e^{-y}}=\frac{1}{1+e^{-(\mathbf{W^TX+b})}}$$
which transforms
$$y\in(-\infty,\infty)\to p\in(0,1)\text{ and } p\in(0,1/2)\to 0,\; p\in(1/2,1)\to1$$
since the logistic curve rises quickly from almost 0 to almost 1 around $y=0$, where $p=1/2$.
3. Maximize the likelihood function of the training data $\mathbf{(X_i,y_i)}$ by the Stochastic Gradient Descent method (SGD):
$$\begin{array}{c}
     \mathbf{\text{argmax}}\\
     \mathbf{W}, b
   \end{array}
L({\mathbf{W},b})=
\begin{array}{c}
     \mathbf{\text{argmax}}\\
     \mathbf{W}, b
   \end{array}
\prod_\mathbf{(X_i,y_i)}p(\mathbf{X_i})^{y_i}
      (1-p(\mathbf{X_i}))^{1-y_i}$$

The best thing is that sklearn provides functions that do all of the calculation above.

<big style="font-size:2em;color:yellow;background-color:black;">
**🙁|😯**   <big style="font-size:0.7em;color:brown;background-color:white;padding:-2pt"> 
&nbsp;SGD &nbsp;
</big>
</big>
1. differential, difference: $\mathbf{df=f'(x)\,dx\approx\vartriangle f(x,h)\cdot h=\frac{f(x+h)-f(x)}{h}\cdot h}$
   - $f'(x)=0\Rightarrow f(x)$ attains maximum or minimum;
   - $f'(x)>0\nearrow$
   - $f'(x)<0\searrow$
- $\mathbf{\nabla f(X)=(f_1,f_2,\cdots,f_n)}$
  - along $\mathbf{\nabla f(X)}$, $\mathbf{f}$ increases fast;
  - along $\mathbf{-\nabla f(X)}$, $\mathbf{f}$ decreases fast;

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier

In [None]:
# Fit the scaler on the training features only, then apply that same scaling
# to both the training and the test features.
ss = StandardScaler()
ss.fit(X_train)
X_train = ss.transform(X_train)
X_test = ss.transform(X_test)

In [None]:
# Initialize LogisticRegression and SGDClassifier.
lr = LogisticRegression()
# SGD shuffles the training data between passes, so its result varies from run to run
# unless the seed is fixed; 33 matches the seed used for the train/test split.
# NOTE(review): SGDClassifier's default loss is the hinge loss (a linear SVM), not the
# logistic loss described in the text — confirm whether a logistic loss was intended.
sgdc = SGDClassifier(random_state=33)

# Fit the parameters with LogisticRegression
lr.fit(X_train, y_train)
# Predict the test labels with the fitted model
lr_y_predict = lr.predict(X_test)

# Fit the parameters with SGDClassifier
sgdc.fit(X_train, y_train)
# Predict the test labels with the SGD result
sgdc_y_predict = sgdc.predict(X_test)



In [None]:
# To take part in a competition a report has to be returned; sklearn builds one for us
from sklearn.metrics import classification_report

# Accuracy of the logistic regression model on the test data
lr_accuracy = lr.score(X_test, y_test)
print('Accuracy of LR Classifier:', lr_accuracy)

# Its three further indexes (precision, recall, f1-score), listed per class
lr_report = classification_report(y_test, lr_y_predict, target_names=['Benign', 'Malignant'])
print(lr_report)


In [None]:
 # SGD accuracy
# Accuracy of the SGD model on the test data (label spelling corrected from "Accuarcy")
print('Accuracy of SGD Classifier:', sgdc.score(X_test, y_test))
# Its three further indexes (precision, recall, f1-score), listed per class
print(classification_report(y_test, sgdc_y_predict, target_names=['Benign', 'Malignant']))

Self-Training
---
Duplicate the procedures for the [Breast Cancer Data from Kaggle](https://www.kaggle.com/uciml/breast-cancer-wisconsin-data)