Test
=======

**\* Please copy & rename me from `test_jupyterlab_template.ipynb`**

- Check that the primary tools work correctly.


ToDo:
* add test
    - `cython`, `numba`, `numexpr`, etc.
* add functions and fix bugs

----
## pandas & plot

In [None]:
# Load the 'iris' example dataset through seaborn. The plotting, dask and
# pyspark cells further down all reuse this `iris` object.
import seaborn as sns
iris = sns.load_dataset('iris')

In [None]:
# Show the loaded dataset as the cell output.
iris

In [None]:
# %matplotlib inline
# iris.plot()

### jupyterlab widget

In [None]:
# Select the interactive "widget" matplotlib backend, then plot `iris` with it.
%matplotlib widget
iris.plot()

### bokeh

In [None]:
# Route pandas_bokeh output to the notebook (needed before `plot_bokeh` below).
import pandas_bokeh
pandas_bokeh.output_notebook()

In [None]:
# Plot `iris` through the pandas_bokeh accessor.
iris.plot_bokeh()

### dask

In [None]:
import dask.dataframe as dd

# Wrap `iris` in a dask DataFrame split into chunks of 4 rows; the trailing
# expression displays the (lazy) dask object.
ddf = dd.from_pandas(iris, chunksize=4)
ddf

In [None]:
# Evaluate the dask DataFrame and display the result.
ddf.compute()

#### (Appendix) matplotlib日本語化

Ref: https://qiita.com/maroKanatani/items/3b080c639395bba7795a

In [None]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np

dummy_data = np.random.randn(300)
plt.hist(dummy_data)
# ひらがな、カタカナ、漢字、全角文字の全てが表示されることを確認（IPAexゴシックなど）
plt.xlabel("横方向の軸（X軸）")
plt.ylabel("縦方向の軸（X軸）")
plt.title("ヒストグラムのタイトル")
plt.show()

-----
## pyspark

In [None]:
# Get (or create) a SparkSession with Hive support enabled; the trailing
# expression displays the session object.
from pyspark.sql import SparkSession
spark = SparkSession.builder.enableHiveSupport().getOrCreate()
spark

In [None]:
# `iris` was loaded in the first code cell of this notebook.
# Convert it to a Spark DataFrame, print it with show(), then convert it back
# with toPandas() so the round trip is displayed as the cell output.
df = spark.createDataFrame(iris)
df.show()
df.toPandas()

In [None]:
# Row count per species, converted to pandas for display.
df.groupBy('species').count().toPandas()

In [None]:
# Per-species averages (avg() called without column arguments), converted to pandas.
df.groupBy('species').avg().toPandas()

### hive-db

In [None]:
# Write `df` to the Hive table `test.iris` as zlib-compressed ORC.
# `IF NOT EXISTS` and mode="overwrite" keep the cell re-runnable: without them
# a second execution fails because the database and the table already exist.
spark.sql("create database if not exists test")
df.write.saveAsTable("test.iris", format="orc", mode="overwrite", compression="zlib")

In [None]:
# Read the table back. show() only prints and returns None, so keep the
# DataFrame in `df2` and display it in a separate step.
df2 = spark.table("test.iris")
df2.show()

### GraphFrames

Ref: https://graphframes.github.io/graphframes/docs/_site/user-guide.html

In [None]:
# Build an SQLContext on top of the session's SparkContext.
# The public `sparkContext` property is used instead of the private `_sc`
# attribute; SQLContext expects a SparkContext, not the SparkSession itself.
from pyspark.sql import SQLContext
sqlContext = SQLContext(spark.sparkContext)
sqlContext

In [None]:
from graphframes import GraphFrame

# spark = SparkSession.builder.appName('sample').getOrCreate()
# sqlContext = SQLContext(spark)

# Vertices: one (id, name, age) row per person.
v = spark.createDataFrame(
    data=[
        ("a", "Alice", 34), ("b", "Bob", 36), ("c", "Charlie", 30),
        ("d", "David", 29), ("e", "Esther", 32), ("f", "Fanny", 36),
        ("g", "Gabby", 60),
    ],
    schema=["id", "name", "age"],
)

# Edges: one (src, dst, relationship) row per directed link between vertex ids.
e = spark.createDataFrame(
    data=[
        ("a", "b", "friend"), ("b", "c", "follow"), ("c", "b", "follow"),
        ("f", "c", "follow"), ("e", "f", "follow"), ("e", "d", "friend"),
        ("d", "a", "friend"), ("a", "e", "friend"),
    ],
    schema=["src", "dst", "relationship"],
)

# Combine both DataFrames into a GraphFrame and display it.
g = GraphFrame(v, e)

g

In [None]:
from functools import reduce

from pyspark.sql.functions import col, lit, when
from pyspark.sql.types import IntegerType
from graphframes.examples import Graphs
# Replace `g` with the example "friends" graph provided by graphframes.examples.
g = Graphs(spark).friends()

g.vertices.show()
g.edges.show()

# In-degree of each vertex.
vertexInDegrees = g.inDegrees
vertexInDegrees.show()

# Minimum age over all vertices.
g.vertices.groupBy().min("age").show()

# Number of edges whose relationship is 'follow'.
numFollows = g.edges.filter("relationship = 'follow'").count()
print(numFollows)

# motif: pairs of vertices connected by edges in both directions
motifs = g.find("(a)-[e]->(b); (b)-[e2]->(a)")
motifs.filter("b.age > 30").show()

# Chains of four vertices a -> b -> c -> d.
chain4 = g.find("(a)-[ab]->(b); (b)-[bc]->(c); (c)-[cd]->(d)")

# `sumFriends` adds 1 to the running count when an edge is a "friend" edge;
# `condition` folds it over the three edges of the chain, starting from 0.
sumFriends =\
  lambda cnt,relationship: when(relationship == "friend", cnt+1).otherwise(cnt)
condition =\
  reduce(lambda cnt,e: sumFriends(cnt, col(e).relationship), ["ab", "bc", "cd"], lit(0))
# Keep only the chains that contain at least two "friend" edges.
chainWith2Friends2 = chain4.where(condition >= 2)
chainWith2Friends2.show()

# subgraph: vertices older than 30 and "friend" edges, without isolated vertices
g1 = g.filterVertices("age > 30").filterEdges("relationship = 'friend'").dropIsolatedVertices()
g1.vertices.show()
g1.edges.show()

# Second subgraph: "follow" edges that go from a younger to an older vertex,
# combined with the full vertex set of `g`.
paths = g.find("(a)-[e]->(b)")\
  .filter("e.relationship = 'follow'")\
  .filter("a.age < b.age")
e2 = paths.select("e.src", "e.dst", "e.relationship")
g2 = GraphFrame(g.vertices, e2)
g2.vertices.show()
g2.edges.show()

The following cell may take a long time to execute, so it is commented out.

In [None]:
# page rank

# from graphframes.examples import Graphs
# g = Graphs(spark).friends()  # Get example graph

# # Run PageRank until convergence to tolerance "tol".
# results = g.pageRank(resetProbability=0.15, tol=0.01)
# # Display resulting pageranks and final edge weights
# # Note that the displayed pagerank may be truncated, e.g., missing the E notation.
# # In Spark 1.5+, you can use show(truncate=False) to avoid truncation.
# results.vertices.select("id", "pagerank").show()
# results.edges.select("src", "dst", "weight").show()

# # Run PageRank for a fixed number of iterations.
# results2 = g.pageRank(resetProbability=0.15, maxIter=10)

# # Run PageRank personalized for vertex "a"
# results3 = g.pageRank(resetProbability=0.15, maxIter=10, sourceId="a")

# # Run PageRank personalized for vertex ["a", "b", "c", "d"] in parallel
# results4 = g.parallelPersonalizedPageRank(resetProbability=0.15, sourceIds=["a", "b", "c", "d"], maxIter=10)

The cells below may not work...

(they cause an `AnalysisException` in `Scala`)

In [None]:
# shortest path
# sqlContext = SQLContext(spark)
# from graphframes.examples import Graphs
# g = Graphs(sqlContext).friends()  # Get example graph

# results = g.shortestPaths(landmarks=["a", "d"])
# results.select("id", "distances").show()

In [None]:
# triangle count
# sqlContext = SQLContext(spark)
# from graphframes.examples import Graphs
# g = Graphs(sqlContext).friends()  # Get example graph

# results = g.triangleCount()
# results.select("id", "count").show()

ToDo:
- investigate the cause of the `AnalysisException` above

----
## Geo

### folium

In [None]:
# Render a default folium map as the cell output.
import folium
folium.Map()

### osmnx

Ref: https://github.com/gboeing/osmnx-examples/blob/master/notebooks/00-osmnx-features-demo.ipynb

In [None]:
# Imports for the OSMnx demo. OSMnx is configured with caching and console
# logging turned on, and its version is displayed as the cell output.
import networkx as nx
import osmnx as ox
import requests
import matplotlib.cm as cm
import matplotlib.colors as colors
%matplotlib inline
ox.config(use_cache=True, log_console=True)
ox.__version__

In [None]:
# get a graph for some city
G = ox.graph_from_place('Piedmont, California, USA', network_type='drive')
fig, ax = ox.plot_graph(G)

### GeoPandas

In [None]:
# Tag filter for the points-of-interest query: any amenity, retail/commercial
# land use, and bus stops.
tags = {'amenity' : True,
        'landuse' : ['retail', 'commercial'],
        'highway' : 'bus_stop'}
# Query the POIs for the same place and display the shape of the result.
gdf = ox.pois_from_place('Piedmont, California, USA', tags)
gdf.shape

In [None]:
# Bus-stop rows only; drop every column that has a missing value, show the first rows.
gdf[gdf['highway']=='bus_stop'].dropna(axis=1, how='any').head()

In [None]:
# Plot the POI result.
gdf.plot()

### GeoViews

Ref: https://geoviews.org/

In [None]:
import geoviews as gv
import geoviews.feature as gf
import xarray as xr
from cartopy import crs

# Load the bokeh and matplotlib plotting extensions for GeoViews.
gv.extension('bokeh', 'matplotlib')

In [None]:
# Layout of three panels: ocean, land, and an overlay of ocean, land, coastline
# and borders, all drawn with a geostationary projection and arranged in 3 columns.
(gf.ocean + gf.land + gf.ocean * gf.land * gf.coastline * gf.borders).opts(
    'Feature', projection=crs.Geostationary(), global_extent=True, height=325).cols(3)

In [None]:
import geopandas as gpd
# Draw the 'naturalearth_lowres' dataset as polygons with `pop_est` and `name`
# (labelled "Country") as value dimensions, a hover tool, and a Robinson projection.
gv.Polygons(gpd.read_file(gpd.datasets.get_path('naturalearth_lowres')), vdims=['pop_est', ('name', 'Country')]).opts(
    tools=['hover'], width=600, projection=crs.Robinson()
)

### geoplot

Ref: https://github.com/ResidentMario/geoplot/blob/master/examples/plot_boston_airbnb_kde.py

In [None]:
import geopandas as gpd
import geoplot as gplt
import geoplot.crs as gcrs
import matplotlib.pyplot as plt
# import mplleaflet

# Read the 'boston_airbnb_listings' example dataset shipped with geoplot.
boston_airbnb_listings = gpd.read_file(gplt.datasets.get_path('boston_airbnb_listings'))

# Shaded KDE plot in Web Mercator; the point plot and the web map below are
# drawn on the same axes via `ax=ax`.
ax = gplt.kdeplot(
    boston_airbnb_listings, cmap='viridis', projection=gcrs.WebMercator(), figsize=(12, 12),
    shade=True
)
gplt.pointplot(boston_airbnb_listings, s=1, color='black', ax=ax)
gplt.webmap(boston_airbnb_listings, ax=ax)
plt.title('Boston AirBnB Locations, 2016', fontsize=18)

# Keep a handle on the current figure.
fig = plt.gcf()

------
## ML

### sklearn

Ref: https://scikit-learn.org/stable/auto_examples/release_highlights/plot_release_highlights_0_23_0.html#sphx-glr-auto-examples-release-highlights-plot-release-highlights-0-23-0-py

In [None]:
# Fit a PoissonRegressor and a HistGradientBoostingRegressor with Poisson loss
# on the same synthetic count data, then print each model's test-set score.
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PoissonRegressor
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor

n_samples, n_features = 1000, 20
rng = np.random.RandomState(0)
X = rng.randn(n_samples, n_features)
# positive integer target correlated with X[:, 5] with many zeros:
y = rng.poisson(lam=np.exp(X[:, 5]) / 2)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
glm = PoissonRegressor()
gbdt = HistGradientBoostingRegressor(loss='poisson', learning_rate=.01)
glm.fit(X_train, y_train)
gbdt.fit(X_train, y_train)
print(glm.score(X_test, y_test))
print(gbdt.score(X_test, y_test))

In [None]:
from sklearn import set_config
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer
from sklearn.linear_model import LogisticRegression
# Display estimators as diagrams instead of plain text.
set_config(display='diagram')

# Numeric columns: median imputation followed by standard scaling.
num_proc = make_pipeline(SimpleImputer(strategy='median'), StandardScaler())

# Categorical columns: fill missing values with the constant 'missing', then one-hot encode.
cat_proc = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='missing'),
    OneHotEncoder(handle_unknown='ignore'))

preprocessor = make_column_transformer((num_proc, ('feat1', 'feat3')),
                                       (cat_proc, ('feat0', 'feat2')))

# The pipeline is only built and displayed here; it is never fitted in this cell.
clf = make_pipeline(preprocessor, LogisticRegression())
clf

In [None]:
# Run KMeans with the 'elkan' algorithm on blob data stored as a sparse CSR
# matrix, then print a completeness score for the held-out split.
import scipy
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import completeness_score

rng = np.random.RandomState(0)
X, y = make_blobs(random_state=rng)
X = scipy.sparse.csr_matrix(X)
X_train, X_test, _, y_test = train_test_split(X, y, random_state=rng)
kmeans = KMeans(algorithm='elkan').fit(X_train)
# NOTE(review): scikit-learn documents completeness_score(labels_true, labels_pred);
# here the predictions are passed first and the true labels second - confirm
# whether the argument order is intended.
print(completeness_score(kmeans.predict(X_test), y_test))

In [None]:
# Compare the partial dependence on feature 0 of two gradient-boosting models:
# one unconstrained, one fitted with monotonic_cst=[1, 0].
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.inspection import plot_partial_dependence
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor

n_samples = 500
rng = np.random.RandomState(0)
X = rng.randn(n_samples, 2)
noise = rng.normal(loc=0.0, scale=0.01, size=n_samples)
# Target depends on feature 0 only: a linear trend plus a sine term, minus noise.
y = (5 * X[:, 0] + np.sin(10 * np.pi * X[:, 0]) - noise)

gbdt_no_cst = HistGradientBoostingRegressor().fit(X, y)
gbdt_cst = HistGradientBoostingRegressor(monotonic_cst=[1, 0]).fit(X, y)

# Both curves are drawn on the same axes (second call reuses `disp.axes_`).
disp = plot_partial_dependence(
    gbdt_no_cst, X, features=[0], feature_names=['feature 0'],
    line_kw={'linewidth': 4, 'label': 'unconstrained'})
plot_partial_dependence(gbdt_cst, X, features=[0],
    line_kw={'linewidth': 4, 'label': 'constrained'}, ax=disp.axes_)
# Overlay the raw samples behind the curves and zoom in on the central region.
disp.axes_[0, 0].plot(X[:, 0], y, 'o', alpha=.5, zorder=-1, label='samples')
disp.axes_[0, 0].set_ylim(-3, 3); disp.axes_[0, 0].set_xlim(-1, 1)
plt.legend()
plt.show()

In [None]:
# Fit a Lasso model with per-sample weights and print its weighted test score.
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_regression
from sklearn.linear_model import Lasso
import numpy as np

n_samples, n_features = 1000, 20
rng = np.random.RandomState(0)
X, y = make_regression(n_samples, n_features, random_state=rng)
sample_weight = rng.rand(n_samples)
# The weights are split together with X and y so each split keeps its own weights.
X_train, X_test, y_train, y_test, sw_train, sw_test = train_test_split(
    X, y, sample_weight, random_state=rng)
reg = Lasso()
reg.fit(X_train, y_train, sample_weight=sw_train)
print(reg.score(X_test, y_test, sw_test))

#### `pydotplus` -> png, PDF

Ref: https://mk-55.hatenablog.com/entry/2019/02/09/032646

In [None]:
from sklearn.datasets import load_iris
from sklearn import tree
import pydotplus
from IPython.display import Image
from graphviz import Digraph

# Fit a shallow decision tree on the scikit-learn iris data and render it with
# Japanese feature names, both as a PDF file and as an inline PNG.
clf = tree.DecisionTreeClassifier(max_depth=2)  # limit depth of tree
# Stored as `iris_bunch` so the notebook-level `iris` DataFrame loaded in the
# first cell is not overwritten (earlier cells stay re-runnable).
iris_bunch = load_iris()
clf.fit(iris_bunch.data, iris_bunch.target)

dot_data = tree.export_graphviz(
    clf,
    out_file=None,
    feature_names=['がく片の長さ','がく片の幅','花弁の長さ','花弁の幅'],
    class_names=iris_bunch.target_names,
    filled=True,
    proportion=True)
graph = pydotplus.graph_from_dot_data(dot_data)

# Set the font on the graph itself ...
graph.set_fontname('MS UI Gothic')

# ... on every node ...
for node in graph.get_nodes():
    node.set_fontname('MS UI Gothic')

# ... and on every edge. The loop variable is `edge` (not `e`) so the
# GraphFrames edge DataFrame `e` defined earlier is left untouched.
for edge in graph.get_edges():
    edge.set_fontname('MS UI Gothic')

graph.write_pdf("pydotplus_test.pdf")
Image(graph.create_png())

##### (Appendix)pygraphviz

Ref: http://april.fool.jp/blogs/2013/12/17/graphviz%E3%81%A7%E3%82%AB%E3%83%83%E3%82%AF%E3%81%84%E3%81%84%E3%82%B0%E3%83%A9%E3%83%95%E3%82%92%E6%8F%8F%E3%81%93%E3%81%86/

In [None]:
from IPython.display import Image
import pygraphviz as pgv
# Note: `G` here replaces the OSMnx graph `G` created earlier in the notebook.
G = pgv.AGraph()

# Two standalone nodes ...
G.add_node('a')
G.add_node('b')

# ... and two edges.
G.add_edge('c','d')
G.add_edge('e','f')

# Lay the graph out, write it as PNG, PDF and DOT, then display the PNG.
G.layout()
G.draw('sample01.png')
G.draw("sample01.pdf")
G.write("sample01.dot")
Image("sample01.png")

ToDo:
* ~`pydot` -> PDF~
* `sklearn.external.six`, `StringIO`
* ~`pygraphviz`~

### xgboost

Ref: https://blog.amedama.jp/entry/2019/01/29/235642

In [None]:
import xgboost as xgb

import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from matplotlib import pyplot as plt

"""XGBoost で early_stopping_rounds を使って学習ラウンド数を最適化するサンプルコード"""


def main():
    """Train XGBoost on the breast-cancer dataset with early stopping.

    Prints the test accuracy and plots the train/eval logloss per round.
    """
    dataset = datasets.load_breast_cancer()
    X, y = dataset.data, dataset.target

    # Stratified 70/30 split with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.3,
                                                        shuffle=True,
                                                        random_state=42,
                                                        stratify=y)

    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)

    xgb_params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
    }

    # Both sets are evaluated every round; the history is collected in `evals_result`.
    evals = [(dtrain, 'train'), (dtest, 'eval')]
    evals_result = {}
    bst = xgb.train(xgb_params,
                    dtrain,
                    num_boost_round=1000,
                    # stop training when no improvement is seen for this many rounds
                    early_stopping_rounds=10,
                    evals=evals,
                    evals_result=evals_result,
                    )

    # Threshold the predicted probabilities at 0.5 to get class labels.
    y_pred_proba = bst.predict(dtest)
    y_pred = np.where(y_pred_proba > 0.5, 1, 0)
    acc = accuracy_score(y_test, y_pred)
    print('Accuracy:', acc)

    # Learning curves for both evaluation sets.
    train_metric = evals_result['train']['logloss']
    plt.plot(train_metric, label='train logloss')
    eval_metric = evals_result['eval']['logloss']
    plt.plot(eval_metric, label='eval logloss')
    plt.grid()
    plt.legend()
    plt.xlabel('rounds')
    plt.ylabel('logloss')
    plt.show()


if __name__ == '__main__':
    main()

### lightgbm

Ref: https://blog.amedama.jp/entry/2018/05/01/081842

In [None]:
import lightgbm as lgb

from sklearn import datasets

import numpy as np

from matplotlib import pyplot as plt

"""LightGBM を使った多値分類のサンプルコード (CV)"""


def main():
    """Cross-validate a multiclass LightGBM model on Iris and plot the mean logloss per round."""
    # Load the Iris dataset
    iris = datasets.load_iris()
    X, y = iris.data, iris.target

    # Build the LightGBM dataset
    lgb_train = lgb.Dataset(X, y)

    # LightGBM hyperparameters
    lgbm_params = {
        # multiclass classification
        'objective': 'multiclass',
        # number of classes is 3
        'num_class': 3,
    }

    # Train and cross-validate the model with the parameters above
    cv_results = lgb.cv(lgbm_params, lgb_train, nfold=10)
    # Older LightGBM releases store the curve under 'multi_logloss-mean', newer
    # ones under 'valid multi_logloss-mean'; matching on the suffix supports both.
    cv_logloss = next(
        values for key, values in cv_results.items()
        if key.endswith('multi_logloss-mean'))
    round_n = np.arange(len(cv_logloss))

    plt.xlabel('round')
    plt.ylabel('logloss')
    plt.plot(round_n, cv_logloss)
    plt.show()


if __name__ == '__main__':
    main()

### catboost

Ref: https://catboost.ai/docs/concepts/python-usages-examples.html

In [None]:
from catboost import CatBoostRegressor

# Tiny in-memory dataset: three training rows with labels, two rows to predict.
train_data = [[1, 4, 5, 6], [4, 5, 6, 7], [30, 40, 50, 60]]
eval_data = [[2, 4, 6, 8], [1, 4, 50, 60]]
train_labels = [10, 20, 30]

# Minimal regressor: 2 iterations, learning rate 1, tree depth 2.
model = CatBoostRegressor(iterations=2, learning_rate=1, depth=2)

# Fit on the training rows, then predict for the evaluation rows.
model.fit(train_data, train_labels)
preds = model.predict(eval_data)

### optuna

In [None]:
import optuna

class Objective:
    """Callable objective that carries its own search bounds.

    Instances are passed to ``study.optimize``; each call samples ``x`` from
    the trial between ``min_x`` and ``max_x`` and returns ``(x - 2) ** 2``.
    """

    def __init__(self, min_x, max_x):
        # Search bounds are stored on the instance so __call__ needs only the trial.
        self.min_x, self.max_x = min_x, max_x

    def __call__(self, trial):
        lower, upper = self.min_x, self.max_x
        sampled = trial.suggest_uniform('x', lower, upper)
        offset = sampled - 2
        return offset ** 2

# Run 100 trials of an `Objective` instance whose search range is [-100, 100],
# using a study created with optuna's default settings.
study = optuna.create_study()
study.optimize(Objective(-100, 100), n_trials=100)

### somoclu

Ref: https://somoclu.readthedocs.io/en/stable/example.html

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import somoclu
%matplotlib inline

In [None]:
# Three synthetic groups of 50 random 3-D points each; the second and third
# groups are shifted by a fixed offset.
c1 = np.random.rand(50, 3)/5
c2 = (0.6, 0.1, 0.05) + np.random.rand(50, 3)/5
c3 = (0.4, 0.1, 0.7) + np.random.rand(50, 3)/5
data = np.float32(np.concatenate((c1, c2, c3)))
# One colour per point, in the same order as the rows of `data`.
colors = ["red"] * 50
colors.extend(["green"] * 50)
colors.extend(["blue"] * 50)
fig = plt.figure()
# Create the 3-D axes through the figure so they are always attached to it:
# recent Matplotlib releases no longer add a bare `Axes3D(fig)` to the figure
# automatically, which leaves the plot empty.
ax = fig.add_subplot(111, projection='3d')
ax.scatter(data[:, 0], data[:, 1], data[:, 2], c=colors)
# Integer label per point, used when drawing the U-matrix further down.
labels = range(150)

In [None]:
# Build a 160-column x 100-row map and train it on `data`, timing the call.
n_rows, n_columns = 100, 160
som = somoclu.Somoclu(n_columns, n_rows, compactsupport=False)
%time som.train(data)

In [None]:
# Show the component planes of the trained map.
som.view_component_planes()

In [None]:
# Show the U-matrix with best matching units, coloured and labelled per data point.
som.view_umatrix(bestmatches=True, bestmatchcolors=colors, labels=labels)

### bhtsne

ToDo:
* prepare example

### umap

Ref: https://qiita.com/cheerfularge/items/27a55ebde4a671880666

In [None]:
import umap
from sklearn.datasets import load_digits
from scipy.sparse.csgraph import connected_components
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from sklearn.manifold import TSNE
import time

def main():
    """Embed the digits dataset with UMAP and t-SNE, save both scatter plots, and print the timings.

    Writes 'umap.png' and 'tsne.png' to the current directory.
    """
    digits = load_digits()
    # Class labels as floats, used as colour values. A vectorised cast into a
    # local replaces the index loop that overwrote `digits.target`.
    labels = digits.target.astype(float)

    # UMAP
    start_time = time.time()
    embedding = umap.UMAP().fit_transform(digits.data)
    interval = time.time() - start_time
    plt.scatter(embedding[:,0],embedding[:,1],c=labels,cmap=cm.tab10)
    plt.colorbar()
    plt.savefig('umap.png')

    # t-SNE (the current figure is cleared first so the plots do not overlap)
    plt.clf()
    start_time2 = time.time()
    tsne_model = TSNE(n_components=2)
    tsne = tsne_model.fit_transform(digits.data)
    interval2 = time.time() - start_time2
    plt.scatter(tsne[:,0],tsne[:,1],c=labels,cmap=cm.tab10)
    plt.colorbar()
    plt.savefig('tsne.png')

    print('umap : {}s'.format(interval))
    print('tsne : {}s'.format(interval2))

if __name__ == "__main__":
    main()

### tensorflow2

https://www.tensorflow.org/tutorials/quickstart/beginner?hl=ja

WARNING:
* 原因は不明だが、上までの処理を行ってからtensorflowを読み込むと`tf.keras`を読み込めないなど意図しない挙動が起きている

In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf

# Display the installed TensorFlow version.
tf.__version__

In [None]:
mnist = tf.keras.datasets.mnist

# Load the train/test splits and divide the image arrays by 255.0.
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0

In [None]:
# Build the classifier layer by layer: flatten the 28x28 input, one hidden
# ReLU layer with dropout, then a 10-way softmax output.
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Flatten(input_shape=(28, 28)))
model.add(tf.keras.layers.Dense(128, activation='relu'))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(10, activation='softmax'))

# Adam optimizer, sparse categorical cross-entropy loss, accuracy metric.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train for 5 epochs, then evaluate on the test split.
model.fit(x_train, y_train, epochs=5)

model.evaluate(x_test,  y_test, verbose=2)

-----
## package versions

In [None]:
# Print the exported definition of the conda `base` environment.
! conda env export -n base

In [None]:
# ! conda env export -n base | tee conda_packages_freeze.yml

----
- to clear output

In [None]:
# from IPython.display import clear_output
# clear_output()