In [None]:
import matplotlib
import numpy as np
import pandas as pd
import pylab
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

%matplotlib inline

np.random.seed(44)

In [None]:
# Beer styles for which synthetic samples are generated.
beer_kinds = ["pils", "pale ale", "stout"]

# Features that are sampled directly, in the order used by the tuples below.
_sampled_features = ("alcohol_content", "bitterness", "darkness")

# Per-style centers (means) of the sampled features.
centers = dict(
    zip(
        beer_kinds,
        [
            (4.5, 30, 1),
            (5.5, 35, 2),
            (7, 25, 5),
        ],
    )
)

# Per-style standard deviations of the sampled features.
deviations = dict(
    zip(
        beer_kinds,
        [
            (0.2, 0.1, 0.5),
            (0.8, 0.2, 0.5),
            (1.0, 0.1, 0.7),
        ],
    )
)

# "fruitiness" is a redundant feature: it is derived from the others
# instead of being sampled on its own.
feature_names = [*_sampled_features, "fruitiness"]


def sample_features(kind):
    """Draw the feature values of one synthetic beer of the given kind.

    The first three features are drawn independently from normal distributions
    with the means in ``centers[kind]`` and the standard deviations in
    ``deviations[kind]``; negative draws are clipped to zero.

    The fourth feature, fruitiness, is computed from the sampled bitterness and
    alcohol content plus Gaussian noise and is clipped to zero as well. Darkness
    does not enter this formula.

    Returns a list with four values, in the order of ``feature_names``.
    """
    mus = 1.0 * np.array(centers[kind])
    sigmas = deviations[kind]

    features = []
    for mu, sigma in zip(mus, sigmas):
        draw = mu + sigma * np.random.randn()
        features.append(max(0.0, draw))

    alcohol, bitterness = features[0], features[1]
    fruitiness = 0.1 * bitterness + (
        bitterness * 0.3 * alcohol + 2.2 * np.random.randn()
    )
    features.append(max(0, fruitiness))
    return features

In [None]:
# Smoke test: draw the four feature values of a single "stout" sample.
sample_features("stout")

In [None]:
import random

# `random.shuffle` below uses the stdlib generator, which numpy's seed does not
# cover. Seed it explicitly so the row order, and with it the noise assigned to
# each row further down, is the same on every run.
random.seed(44)

# rows per beer kind:
N = 100
ns = (N,) * len(beer_kinds)

# Sample the beers kind by kind, then mix the kinds.
rows = []
for n, kind in zip(ns, beer_kinds):
    rows.extend(sample_features(kind) for _ in range(n))

# Shuffle while `rows` is still a plain list of per-beer lists.
random.shuffle(rows)

rows = np.array(rows)

features = pd.DataFrame(rows, columns=feature_names)

# Couple bitterness to the other features: it decreases with darkness,
# increases with alcohol content and receives additional unit-variance noise.
features["bitterness"] += (
    -2.0 * features["darkness"]
    + 1.0 * features["alcohol_content"] ** 1.2
    + 1 * np.random.randn(rows.shape[0])
)

print(len(features), len(rows))

features["darkness"].hist()
features.describe()

In [None]:
# Show the first generated rows (four feature columns, no label yet).
features.head()

In [None]:
# compute score which we use for assigning class label:

# Drop a label column left over from an earlier execution of this cell, so the
# weighted sum below only sees the four feature columns.
features = features.drop("is_yummy", errors="ignore", axis=1)
# print(features)

# One weight per feature column; the raw score is the weighted sum of the features.
weights_uwe = np.array((1.8, 0.2, -1.2, 0.1))
scores = np.array(features @ weights_uwe)

# add some non linear term to make svm work better than logistic regression:
# (adding a pandas Series here turns `scores` from an ndarray into a Series)
scores = (
    scores + 1 + 1 * 0.005 * features.iloc[:, 0] ** 1.2
)  # - 0.001 *  (features.iloc[:, 1]  * features.iloc[:, 3])


print(scores.shape)

# Histogram of the scores before noise is added.
pylab.hist(scores, bins=30)


# add some noise:
scores += 1.0 * np.random.randn(len(scores))

# threshold is median of scores, so we get a balanced data set:
# (computed before the low scores are shifted below)
thresh = np.median(scores)
print(scores)
print(thresh)

# move some low scored beers towards the "center":
# `lowlim` is the score at the 10% position of the sorted scores; everything
# below it is raised by a tenth of the median.
lowlim = sorted(scores)[len(scores) // 10]
scores[scores < lowlim] += 0.1 * np.median(scores)

# Boolean masks relative to the threshold; a score exactly equal to `thresh`
# is in neither mask.
good = scores > thresh
print(good)
bad = scores < thresh

print(sum(good), "good")
print(sum(bad), "bad")


# Class label: 1 for beers above the threshold, 0 otherwise.
labels = np.zeros(sum(ns), dtype=int)
labels[good] = 1

features["is_yummy"] = labels
# labels[:100] = 1

In [None]:
# Work on a copy so that rewriting the label column for plotting leaves
# `features` untouched.
for_plot = features.copy()

# fixes seaborn labels issue


def translate_label(value):
    """Turn a numeric class label into text: 0 becomes "no", anything else "yes"."""
    if value == 0:
        return "no"
    return "yes"


# Replace the 0/1 labels by "no"/"yes" so that the legend shows readable text.
for_plot["is_yummy"] = for_plot["is_yummy"].map(translate_label)

# Pairwise scatter plots of all features, colored by class, with histograms
# on the diagonal; both plot types are drawn semi-transparent.
sns.pairplot(
    data=for_plot,
    hue="is_yummy",
    diag_kind="hist",
    plot_kws={"alpha": 0.7},
    diag_kws={"alpha": 0.7},
);
# beer_data.describe()

In [None]:
# Number of rows used for learning; the remaining rows form the evaluation set.
N_LEARN = 225

# Draw the row order from numpy's generator, which is seeded at the top of the
# notebook, so the learn/eval split is identical on every Restart & Run All.
ix = np.random.permutation(len(features))

# Apply the same order to the frame and to the label array to keep them aligned.
features = features.iloc[ix]
labels = labels[ix]

# The last column of `features` is the label column and is excluded here.
features_learn = features.iloc[:N_LEARN, :-1]
labels_learn = labels[:N_LEARN]

features_eval = features.iloc[N_LEARN:, :-1]
labels_eval = labels[N_LEARN:]


def check(model):
    """Fit `model` on the learning split and print its accuracy on both splits.

    Reads the module-level `features_learn`, `labels_learn`, `features_eval`
    and `labels_eval`. The printed numbers are fractions of correct predictions.
    """
    print(model.__class__.__qualname__)

    model.fit(features_learn, labels_learn)

    splits = (
        ("on learning set:", features_learn, labels_learn),
        ("on eval set    :", features_eval, labels_eval),
    )
    for caption, split_features, split_labels in splits:
        hits = model.predict(split_features) == split_labels
        print(caption, np.sum(hits) / len(split_labels))
    print()


check(LogisticRegression(C=1.0))
check(SVC())

In [None]:
import seaborn as sns

sns.set(style="ticks")

# Feature columns only (the trailing label column is dropped), plus a string
# class label built from the numeric labels for plotting.
for_plot = features.iloc[:, :-1].assign(
    label=["class_" + li for li in labels.astype(str)]
)

for_plot.head()

# sns.pairplot(for_plot, hue="label", diag_kind="hist");

In [None]:
# Persist the shuffled data: the first 225 rows for learning, the rest for
# evaluation. Both files keep the label column and omit the index.
learn, for_eval = features.iloc[:225, :], features.iloc[225:, :]

learn.to_csv("../data/beers.csv", index=False)
for_eval.to_csv("../data/beers_eval.csv", index=False)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [None]:
# Read the learning set from the location the generation cell wrote it to
# ("../data/"), instead of a file of the same name in the working directory.
beers = pd.read_csv("../data/beers.csv")
beers.head()

In [None]:
# Split the learning data into predictors (all but the last column) and the
# target column "is_yummy".
# NOTE(review): this takes the in-memory `learn` frame, not the `beers` frame
# loaded from CSV in the previous cell — confirm that this is intended.
features = learn.iloc[:, :-1]
labels = learn["is_yummy"]
features.head()

# first use of classifiers

In [None]:
# Logistic regression, fitted and evaluated on the very same learning data.
model = LogisticRegression(C=1).fit(features, labels)
predicted = model.predict(features)

# Fraction of learning samples that are classified correctly.
percent_correct = np.sum(predicted == labels) / len(labels)

# Accuracy first, then the fitted weights and the intercept.
for value in (percent_correct, model.coef_, model.intercept_):
    print(value)

In [None]:
# Support vector classifier with default settings, again fitted and evaluated
# on the learning data only.
model = SVC().fit(features, labels)
predicted = model.predict(features)

# Fraction of learning samples that are classified correctly.
n_correct = np.sum(predicted == labels)
percent_correct = n_correct / len(labels)
print(percent_correct)

In [None]:
# Read the evaluation set from the location the generation cell wrote it to
# ("../data/"), instead of a file of the same name in the working directory.
beers_eval = pd.read_csv("../data/beers_eval.csv")
beers_eval.head()

In [None]:
# Evaluation data: target column first, then every column but the last one
# as predictors.
labels_eval = beers_eval["is_yummy"]
features_eval = beers_eval.iloc[:, :-1]

features_eval.head()

# apply classifiers to test data set

In [None]:
# train model and eval on learning and test data set:


def check(model):
    """Fit `model` on the learning data and print accuracy on learning and eval data.

    Reads the module-level `features`, `labels`, `features_eval` and
    `labels_eval`. The printed numbers are fractions of correct predictions.
    """
    print(model.__class__.__qualname__)
    model.fit(features, labels)

    for caption, inputs, expected in (
        ("on learning set:", features, labels),
        ("on eval set    :", features_eval, labels_eval),
    ):
        n_correct = np.sum(model.predict(inputs) == expected)
        print(caption, n_correct / len(expected))
    print()


check(LogisticRegression(C=1))
check(SVC())

# cross validation

In [None]:
# Cross-validation creates its own splits, so learning and evaluation data are
# stacked back into a single data set (learning rows first).
full_labels = pd.concat([labels, labels_eval])
full_features = pd.concat([features, features_eval])

In [None]:
from sklearn.model_selection import cross_val_score


def run_cross_val(model):
    """Print the 5-fold cross-validation scores of `model` on the merged data.

    Uses the module-level `full_features` / `full_labels` and prints the class
    name of the model, the mean score and the individual fold scores.
    """
    print(model.__class__.__qualname__)
    fold_scores = cross_val_score(model, full_features, full_labels, cv=5)
    report = (
        ("mean score:", fold_scores.mean()),
        ("scores    :", fold_scores),
    )
    for caption, value in report:
        print(caption, value)
    print()


run_cross_val(LogisticRegression())
run_cross_val(SVC())

## pipeline + crossvalidation

In [None]:
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn import preprocessing
from sklearn.decomposition import PCA

In [None]:
# PCA down to three components, then standardization, then an SVM with C=2.
clf = make_pipeline(
    PCA(n_components=3),
    preprocessing.StandardScaler(),
    SVC(C=2),
)
run_cross_val(clf)

In [None]:
# Same preprocessing chain as for the SVM, with logistic regression as classifier.
clf = make_pipeline(
    PCA(n_components=3),
    preprocessing.StandardScaler(),
    LogisticRegression(),
)
run_cross_val(clf)

## hyperparameter tuning

In [None]:
# The search below tries both the l1 and the l2 penalty. The default "lbfgs"
# solver of LogisticRegression only supports l2, so every l1 candidate would
# fail to fit; "liblinear" handles both penalties.
pipeline = Pipeline(
    [
        ("pca", PCA()),
        ("scaler", preprocessing.StandardScaler()),
        ("clf", LogisticRegression(solver="liblinear")),
    ]
)


# Search space; keys follow the "<step name>__<parameter>" convention.
parameters = {
    "pca__n_components": (
        2,
        3,
        4,
    ),
    "scaler__with_mean": (True, False),  # center the features or not
    "scaler__with_std": (True, False),  # scale to unit variance or not
    "clf__penalty": ("l1", "l2"),
    "clf__C": (0.01, 0.05, 0.1, 0.2, 1, 5),
}

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over all parameter combinations with 5-fold
# cross-validation on the merged data set, using all available CPU cores.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    cv=5,
    n_jobs=-1,
    verbose=1,
)

grid_search.fit(full_features, full_labels);

In [None]:
# Best mean cross-validation score and the parameter combination that reached it.
for caption, value in (
    ("best score:", grid_search.best_score_),
    ("optimal parameters:", grid_search.best_params_),
):
    print(caption, value)

In [None]:
# SVM variant of the pipeline: PCA, then standardization, then the classifier.
pipeline = Pipeline(
    steps=[
        ("pca", PCA()),
        ("scaler", preprocessing.StandardScaler()),
        ("clf", SVC()),
    ]
)

# Search space; keys follow the "<step name>__<parameter>" convention.
parameters = dict(
    pca__n_components=(2, 3, 4),
    scaler__with_mean=(True, False),  # center the features or not
    scaler__with_std=(True, False),  # scale to unit variance or not
    clf__C=(1, 2, 3, 4, 5, 6, 7, 7.5, 8, 9, 10),
)

# This search is fitted on the learning data (`features`, `labels`) only.
grid_search = GridSearchCV(
    estimator=pipeline, param_grid=parameters, cv=5, n_jobs=-1, verbose=1
)
grid_search.fit(features, labels);

In [None]:
# Best mean cross-validation score and the parameter combination that reached it.
for caption, value in (
    ("best score:", grid_search.best_score_),
    ("optimal parameters:", grid_search.best_params_),
):
    print(caption, value)

## using randomized search for large parameter space

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Same search space as before, but with a much finer grid for C
# (0.1, 0.2, ... up to but excluding 10).
parameters = dict(
    pca__n_components=(2, 3, 4),
    scaler__with_mean=(True, False),  # center the features or not
    scaler__with_std=(True, False),  # scale to unit variance or not
    clf__C=np.arange(0.1, 10, 0.1),
)

In [None]:
# Evaluate 100 randomly chosen parameter combinations instead of the full grid.
grid_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=parameters,
    n_iter=100,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(features, labels);

In [None]:
# Best mean cross-validation score and the parameter combination that reached it.
for caption, value in (
    ("best score:", grid_search.best_score_),
    ("optimal parameters:", grid_search.best_params_),
):
    print(caption, value)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

x0 = 1

y0 = 2
r0 = 5

np.random.seed(42)

N = 200

# Outer ring (green): random angle, radius between r0 and r0 + 3.
phis = 2 * np.random.random(N) * np.pi
rs = r0 + 3 * np.random.random(N)
xs, ys = rs * np.cos(phis), rs * np.sin(phis)

r0 = 3

# Inner disc (red): random angle, radius between 0 and 5.
phis = 2 * np.random.random(N) * np.pi
rs = 5 * np.random.random(N)
xs2, ys2 = rs * np.cos(phis), rs * np.sin(phis)

# Both point clouds in one square figure with fixed axis limits.
fig = plt.figure(figsize=(4, 4))
plt.xlim([-8, 8])
plt.ylim([-8, 8])
plt.scatter(xs, ys, color="g")
plt.scatter(xs2, ys2, color="r")

plt.show()

## circle dataset


In [None]:
np.random.seed(42)

# 300 points drawn uniformly from the square [-2, 2) x [-2, 2).
points = np.random.random((300, 2)) * 4 - 2

# Per-point jitter in [-0.5, 0.5) that blurs the class boundary.
r = np.random.random(len(points)) - 0.5
print(r.max())
print(r.min())

# True for points inside the ellipse 1.2 * x^2 + y^2 < 1.5, with the right-hand
# side shifted by the jitter.
labels = 1.2 * points[:, 0] ** 2 + points[:, 1] ** 2 < 1.5 + 0.3 * r

# "b" for points inside, "r" for points outside. np.where maps the boolean
# mask directly and avoids indexing a string with numpy booleans, which NumPy
# has deprecated.
colors = np.where(labels, "b", "r").tolist()
# Scatter plot of all points, colored by class.
plt.figure(figsize=(6, 6))
px, py = points[:, 0], points[:, 1]
plt.scatter(px, py, color=colors, marker=".")

In [None]:
# Collect coordinates and class label in one frame with columns x, y, label.
df = pd.DataFrame({"x": points[:, 0], "y": points[:, 1], "label": labels})

In [None]:
# Store the data set next to the other generated data sets in "../data/".
# The file name was previously misspelled ("cirle") and written one level up.
df.to_csv("../data/circle.csv", index=False)

In [None]:
# df = pd.DataFrame(dict(a = np.hstack((xs, xs2)), b = np.hstack((ys, ys2)), label = np.hstack((np.ones(N, int), np.zeros(N, int)))))

In [None]:
# df.to_csv("2d_points.csv", index=False)

In [None]:
# NOTE(review): the only cell that writes 2d_points.csv is commented out above,
# so this preview works only if the file already exists from an earlier run.
!head 2d_points.csv

## xor example

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

np.random.seed(43)
N = 500

# Coordinates in [-2, 2): uniform draws from [-0.5, 0.5), scaled by 4 below.
x = np.random.random(N) - 0.5
y = np.random.random(N) - 0.5

# Per-point threshold in [-0.1, 0.1) that blurs the class boundary.
thresh = 0.2 * (np.random.random(N) - 0.5)

x *= 4
y *= 4

# XOR-like label: True where x * y lies below the jittered threshold.
# The name `l` is kept because a later cell refers to it.
l = x * y < thresh

# "g" where the label is True, "r" otherwise. np.where maps the mask directly;
# np.select with string choices and its integer default has no common dtype.
colors = np.where(l, "g", "r")

# Scatter plot of the points, colored by class, on fixed axes.
fig = plt.figure(figsize=(4, 4))
plt.xlim([-2, 2])
plt.ylim([-2, 2])
plt.scatter(x, y, marker=".", color=colors)

# Save coordinates and boolean label as CSV.
df = pd.DataFrame({"x": x, "y": y, "label": l})
df.to_csv("../data/xor.csv", index=False)

## spiral example

In [None]:
# Spiral curve: 300 angles t between 5 and 15.5; the radius grows with t.
t = np.linspace(5, 15.5, 300)
r = (0.2 + t**1.5) / 20

# Polar to cartesian coordinates.
xs, ys = r * np.cos(t), r * np.sin(t)

fig = plt.figure(figsize=(4, 4))
plt.xlim([-3, 3])
plt.ylim([-3, 3])
plt.scatter(xs, ys, marker=".");

In [None]:
%matplotlib inline

np.random.seed(43)
N = 200


# Thresholds on the squared distance to the spiral: red points must be farther
# away than `max_`, green points closer than `min_`.
min_ = 0.1
max_ = 0.1


def dist(x0, y0):
    """Return the smallest squared distance between (x0, y0) and the points (xs, ys)."""
    dx = x0 - xs
    dy = y0 - ys
    return np.min(dx**2 + dy**2)


def _draw_candidate():
    """Draw one random candidate point: x from [-3, 3), y from [-2.6, 3.4)."""
    cx = np.random.random() * 6 - 3
    cy = np.random.random() * 6 - 2.6
    return cx, cy


# Rejection sampling, first for the points that lie away from the spiral ...
points_red = []
while len(points_red) < N:
    x, y = _draw_candidate()
    if dist(x, y) > max_:
        points_red.append((x, y))

# ... then for the points that lie close to it.
points_green = []
while len(points_green) < N:
    x, y = _draw_candidate()
    if dist(x, y) < min_:
        points_green.append((x, y))


# Plot both classes; the commented-out limits are kept for reference.
fig = plt.figure(figsize=(4, 4))
# plt.xlim([-3, 3])
# plt.ylim([-2.6, 3.5]);
plt.scatter(*zip(*points_red), color="red", marker=".")
plt.scatter(*zip(*points_green), color="blue", marker=".")

# Stack both classes: green points first with label 1, then red points with
# label 0. The labels are shaped as a column so they can be appended below.
points = np.vstack((np.array(points_green), np.array(points_red)))
labels = np.hstack((np.ones(N), np.zeros(N)))[:, None]

print(points.shape, labels.shape)

data = np.hstack((points, labels))
df = pd.DataFrame(data, columns=["x", "y", "label"])
df["label"] = df["label"].astype(int)

# Shuffle the rows so the two classes are not stored as contiguous blocks.
df = df.sample(frac=1)
df.to_csv("../data/spiral.csv", index=False)

## regression: salmon

In [None]:
import numpy as np

np.random.seed(42)
N = 50

# Not used by the generators below, but drawn here so the random stream that
# follows stays the same.
data1 = np.random.random((N, 4)) + 0.8


def _simulate_salmon(
    kind_code, length_mean, length_std, circ_factor, circ_power, circ_noise, weight_scale
):
    """Simulate N fish of one species.

    Returns an (N, 4) array with the columns circumference, length, kind code
    and weight. Length is drawn first, circumference is a power law of the
    length plus noise, and weight is proportional to circumference^2 * length
    with 5% multiplicative noise.
    """
    length = np.random.normal(length_mean, length_std, (N,))
    circumference = circ_factor * length**circ_power + np.random.normal(
        0, circ_noise, (N,)
    )
    weight = circumference**2 * length / 2
    weight = weight * (weight_scale * (1 + np.random.normal(0, 0.05, (N,))))

    table = np.zeros((N, 4))
    table[:, 0] = circumference
    table[:, 1] = length
    table[:, 2] = kind_code
    table[:, 3] = weight
    return table


def atlantic():
    """Simulate N atlantic salmon (kind code 0)."""
    return _simulate_salmon(0, 120, 4, 0.4, 1.02, 0.8, 0.00022)


def sockeye():
    """Simulate N sockeye salmon (kind code 1)."""
    return _simulate_salmon(1, 60, 3, 0.4, 1.01, 0.7, 0.0002)


def chinook():
    """Simulate N chinook salmon (kind code 2)."""
    return _simulate_salmon(2, 70, 5, 1.2, 1, 0.9, 0.0002)


import pandas as pd

# Stack the three species; the third column holds the numeric species code.
data = np.vstack([atlantic(), chinook(), sockeye()])

sns.set(style="ticks")

# Position in this list equals the numeric code written by the generator functions.
kinds = ["atlantic", "sockeye", "chinook"]

df = pd.DataFrame(data, columns=["circumference", "length", "kind", "weight"])
print(df.tail())

# Replace the numeric species code by the species name.
df["kind"] = df["kind"].map(lambda code: kinds[int(code)])
print(df.describe())
sns.pairplot(df, hue="kind", diag_kind="hist", diag_kws={"bins": 20})

# Shuffle the rows (two successive full shuffles) before saving.
df = df.sample(frac=1).sample(frac=1)
df.to_csv("../data/salmon.csv", index=False)

In [None]:
from sklearn.kernel_ridge import KernelRidge

from sklearn.preprocessing import LabelEncoder

# Encode the species names as integer codes. Assigning by column name replaces
# the whole column, so it takes the integer dtype of the codes. Positional
# `iloc` assignment would write the integers into the existing string column
# in place.
df["kind"] = LabelEncoder().fit_transform(df["kind"])
df.head()

In [None]:
# Predict the weight (last column) from all remaining columns.
features, values = df.iloc[:, :-1], df.iloc[:, -1]

# Kernel ridge regression with an RBF kernel and a small ridge penalty.
regressor = KernelRidge(kernel="rbf", alpha=0.01)

In [None]:
# Fit on the complete salmon data; no hold-out set is used here.
regressor.fit(features, values)

In [None]:
# Mean relative absolute error, measured on the same data the model was fitted on.
np.mean(np.abs(regressor.predict(features) - values) / values)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import explained_variance_score

# Grid search over the ridge penalty `alpha` and the RBF kernel parameter
# `gamma` (1e-2 ... 1e2 in decades), using 5-fold cross-validation and the
# explained variance as score.
kr = GridSearchCV(
    KernelRidge(kernel="rbf"),
    cv=5,
    param_grid={"alpha": [1e0, 0.1, 1e-2, 1e-3], "gamma": np.logspace(-2, 2, 5)},
    scoring="explained_variance",
)

In [None]:
# Run the grid search on the complete salmon data.
kr.fit(features, values)

In [None]:
# Show the best estimator found by the search and its cross-validation score.
print(kr.best_estimator_)
kr.best_score_

In [None]:
# Mean relative absolute error of the best estimator, measured on the same
# data that was passed to the search.
np.mean(np.abs(kr.best_estimator_.predict(features) - values) / values)

In [None]:
import matplotlib.pyplot as plt

# Distribution of the per-sample relative absolute errors of the best estimator.
plt.hist(np.abs(kr.best_estimator_.predict(features) - values) / values, bins=30);

In [None]:
import pandas as pd

# Small example frame: squares, squares scaled by 1.1, and number words that
# repeat cyclically.
words = "one two three four five".split()

a = [i * i for i in range(7)]
b = [ai * 1.1 for ai in a]
c = [words[i % len(words)] for i in range(len(a))]

df = pd.DataFrame(dict(a=a, b=b, c=c), columns=("a", "b", "c"))
print(df)

# Write the example frame to the current working directory, without the index column.
df.to_csv("example.csv", index=False)

In [None]:
# Column names, non-null counts and dtypes of the example frame.
df.info()

In [None]:
# Column labels of the example frame.
df.columns

In [None]:
# one hot encoding

In [None]:
import pandas as pd

features = pd.read_csv("../data/beers.csv")

# Assign an artificial style to every beer by cycling through the three names.
style = [["pilsener", "ale", "stout"][i % 3] for i in range(len(features))]
features["style"] = style

# Put the new column before the label column.
features = features[
    ["alcohol_content", "bitterness", "darkness", "fruitiness", "style", "is_yummy"]
]

features.to_csv("../data/beers_with_style.csv", index=False)

# One indicator column per style ("is_ale", "is_pilsener", "is_stout").
# dtype=int keeps the indicators as 0/1 in the CSV; since pandas 2.0 the
# default dtype is bool, which would be written as True/False.
y = pd.get_dummies(features["style"], prefix="is", dtype=int)
features = pd.concat([features.drop("style", axis=1), y], axis=1)

# Keep the label as the last column.
features = features[
    [
        "alcohol_content",
        "bitterness",
        "darkness",
        "fruitiness",
        "is_ale",
        "is_pilsener",
        "is_stout",
        "is_yummy",
    ]
]

features.to_csv("../data/beers_with_one_hot_encoding.csv", index=False)