In [None]:
# imports

import pandas as pd
import numpy as np
import tensorflow as tf
from matplotlib import __version__ as mpl_version
import matplotlib.pyplot as plt
from sklearn import __version__ as skl_version, exceptions
from sklearn import preprocessing, decomposition
from sklearn import metrics, model_selection, feature_extraction, feature_selection
from sklearn import naive_bayes, tree, ensemble, neighbors, linear_model, neural_network, svm, dummy
import nltk
import keras

from warnings import simplefilter
from sys import version as py_version
from collections import Counter
import re

# Several cells below fit networks for a single iteration at a time (max_iter=1),
# which makes scikit-learn emit a ConvergenceWarning on every fit; silence them.
simplefilter("ignore", category=exceptions.ConvergenceWarning)

# Fetch the NLTK stop-word corpus used when extracting common title words.
nltk.download('stopwords')

# Print the versions of the interpreter and main libraries for reproducibility.
print(f"python:     {py_version}")
print(f"pandas:     {pd.__version__}")
print(f"numpy:      {np.__version__}")
print(f"matplotlib: {mpl_version}")  # label previously misspelled "matplotlob"
print(f"tensorflow: {tf.__version__}")
print(f"keras:      {keras.__version__}")
print(f"sklearn:    {skl_version}")


In [None]:
# data loading
index_col = "movieId"

# Every file is a tab-separated table keyed by movieId. na_filter=False turns off
# pandas' missing-value detection, so empty text fields are read as "" strings.
_tsv_kwargs = dict(sep="\t", index_col=index_col, na_filter=False)

# Feature tables stay as DataFrames.
X_train = pd.read_csv("./data/train_features.tsv", **_tsv_kwargs)
X_valid = pd.read_csv("./data/valid_features.tsv", **_tsv_kwargs)
X_test = pd.read_csv("./data/test_features.tsv", **_tsv_kwargs)

# Label tables are flattened to 1-D arrays.
Y_train = pd.read_csv("./data/train_labels.tsv", **_tsv_kwargs).values.ravel()
Y_valid = pd.read_csv("./data/valid_labels.tsv", **_tsv_kwargs).values.ravel()


In [None]:
# basic property extraction
# Numeric feature columns are picked out by a substring of their header name.
cols = X_train.columns.values.tolist()
avf_cols = [col for col in cols if "avf" in col]
ivec_cols = [col for col in cols if "ivec" in col]
avf_ivec_cols = avf_cols + ivec_cols

# Build the stop-word set once; the original rebuilt it for every single word.
stop_words = set(nltk.corpus.stopwords.words("english"))

# Lower-case each title word BEFORE the stop-word test: the test previously ran on
# the raw word, so capitalised stop words such as "The" were never filtered out.
title_words = (
    word.lower()
    for title in X_train["title"]
    for word in re.split(r"\W+", title)
)
title_counts = Counter(
    word for word in title_words if word and word not in stop_words
)
# The 100 most frequent remaining title words.
titles = set(word for word, _ in title_counts.most_common(100))

# Every distinct lower-cased tag token; commas and underscores act as separators.
tags = set(tag.lower()
           for taglist in X_train["tag"]
           for tag in re.sub("[,_]", " ", taglist).split())

# Sorted so that each genre keeps the same position (and therefore the same one-hot
# column) on every run; iteration order of a plain set is not guaranteed to be stable.
genres = sorted(set(Y_train))


In [None]:
# data preprocessing
def split_comma(x):
    """Return the list of fields obtained by splitting string ``x`` on commas."""
    return x.split(",")

# TF-IDF vectorizer for titles, fitted on the training titles only; commas are
# replaced by spaces before vectorizing.
tf_title = feature_extraction.text.TfidfVectorizer(
    strip_accents="ascii"
).fit(X_train["title"].apply(lambda x: x.replace(",", " ")))
def get_titles(df, tf=tf_title):
    """Return the TF-IDF representation of ``df["title"]`` as a dense DataFrame.

    Args:
        df: frame with a string "title" column.
        tf: fitted vectorizer to apply (defaults to the one fitted on X_train).

    Returns:
        DataFrame indexed like ``df`` with one ``title_<term>`` column per vocabulary term.
    """
    docs = df["title"].apply(lambda x: x.replace(",", " "))
    # get_feature_names() was removed in scikit-learn 1.2; use its replacement when
    # available and fall back for older installations.
    if hasattr(tf, "get_feature_names_out"):
        vocab = tf.get_feature_names_out()
    else:
        vocab = tf.get_feature_names()
    return pd.DataFrame(tf.transform(docs).toarray(),
                        columns=[f"title_{f}" for f in vocab],
                        index=df.index)

# TF-IDF vectorizer for tags, fitted on the training tags only; commas and
# underscores are replaced by spaces before vectorizing.
tf_tag = feature_extraction.text.TfidfVectorizer(
    strip_accents="ascii"
).fit(X_train["tag"].apply(lambda x: re.sub("[,_]", " ", x)))
def get_tags(df, tf=tf_tag):
    """Return the TF-IDF representation of ``df["tag"]`` as a dense DataFrame.

    Args:
        df: frame with a string "tag" column.
        tf: fitted vectorizer to apply (defaults to the one fitted on X_train).

    Returns:
        DataFrame indexed like ``df`` with one ``tag_<term>`` column per vocabulary term.
    """
    docs = df["tag"].apply(lambda x: re.sub("[,_]", " ", x))
    # get_feature_names() was removed in scikit-learn 1.2; use its replacement when
    # available and fall back for older installations.
    if hasattr(tf, "get_feature_names_out"):
        vocab = tf.get_feature_names_out()
    else:
        vocab = tf.get_feature_names()
    return pd.DataFrame(tf.transform(docs).toarray(),
                        columns=[f"tag_{f}" for f in vocab],
                        index=df.index)

# Two alternative scalers for the numeric avf/ivec columns, both fitted on the
# training split only. get_ivec_avf uses the min-max scaler unless told otherwise.
_numeric_train = X_train[avf_ivec_cols]
sc_mm = preprocessing.MinMaxScaler()
sc_mm.fit(_numeric_train)
sc_s = preprocessing.StandardScaler()
sc_s.fit(_numeric_train)
def get_ivec_avf(df, sc=sc_mm):
    """Scale the avf/ivec columns of ``df`` with ``sc``.

    Returns a DataFrame that keeps ``df``'s index and the avf_ivec_cols column names.
    """
    scaled = sc.transform(df[avf_ivec_cols])
    return pd.DataFrame(scaled, index=df.index, columns=avf_ivec_cols)

def get_one_hot(data, label, features):
    """Build 0/1 indicator columns marking which features each entry of ``data`` contains.

    Args:
        data: pandas Series whose values support ``in`` (e.g. lists of tokens).
        label: prefix for the generated column names.
        features: iterable of values to look for; each yields a ``{label}_{feature}`` column.

    Returns:
        DataFrame indexed like ``data`` with one integer column per feature
        (1 when the feature is in the entry, otherwise 0).
    """
    # Membership is tested against the entries of `data`. The previous version
    # applied the test to the rows of the still-empty output frame instead, so the
    # result never depended on the input values.
    indicators = {
        f"{label}_{feature}": [int(feature in entry) for entry in data]
        for feature in features
    }
    # Building all columns at once also avoids inserting them one by one.
    return pd.DataFrame(indicators, index=data.index)

def preprocess(data):
    """Convert every raw feature frame in ``data`` into a model-ready frame.

    Each output is the scaled avf/ivec columns joined with the TF-IDF tag columns.
    Returns a list with one processed DataFrame per input frame, in input order.
    """
    processed = []
    for frame in data:
        numeric = get_ivec_avf(frame)
        extra_blocks = [
            get_tags(frame),
            # Other feature blocks tried during experimentation (disabled):
            #   get_titles(frame)
            #   get_one_hot(frame["tag"].apply(split_comma), "tag", tags)
            #   get_one_hot(frame["title"].apply(split_comma), "title", titles)
        ]
        processed.append(numeric.join(extra_blocks))
    return processed

X_tr, X_va, X_te = preprocess([X_train, X_valid, X_test])

X_size = X_tr.shape[1]
Y_size = len(genres)

# One-hot encoded targets (used by the Keras experiments). num_classes is pinned to
# Y_size: without it the width is inferred from the largest index present, so a
# validation split lacking the last genre would yield a narrower matrix and the
# concatenation into Y_ below would fail.
Y_tr = keras.utils.to_categorical(np.array([genres.index(x) for x in Y_train]),
                                  num_classes=Y_size)
Y_va = keras.utils.to_categorical(np.array([genres.index(x) for x in Y_valid]),
                                  num_classes=Y_size)

# joint training and validation data sets
X = pd.concat([X_tr, X_va])
Y = np.concatenate([Y_train, Y_valid])
Y_ = np.concatenate([Y_tr, Y_va])
# cross-validation split for when X, Y/Y_ must be passed as one set (GridSearchCV):
# -1 keeps a row in training for every split, 0 puts it in the single test fold.
# Labels are assigned by position (training rows come first in the concat above), so
# an id that occurs in both splits cannot move a validation row into training.
cv = model_selection.PredefinedSplit([-1] * len(X_tr) + [0] * len(X_va))

_ = X_tr.info(), X_va.info(), X_te.info()

In [None]:
# baseline model performance
# Zero-R: always predict the most frequent training label.
zero_rule = dummy.DummyClassifier(strategy='most_frequent')
# One-R: a decision tree restricted to a single split.
one_rule = tree.DecisionTreeClassifier(max_depth=1)

# Fit each baseline on the training split and report on the validation split.
for name, model in (("zero rule", zero_rule), ("one_rule", one_rule)):
    model.fit(X_tr, Y_train)
    print(name)
    valid_predictions = model.predict(X_va)
    print(metrics.classification_report(Y_valid, valid_predictions, zero_division=0))
    

In [None]:
# model playground

# Alternative estimators tried in this cell; exactly one `model = ...` is active.
# model = tree.DecisionTreeClassifier()
# model = ensemble.ExtraTreesClassifier(n_jobs=-1, n_estimators=100)
# model = naive_bayes.MultinomialNB()
# model = linear_model.Perceptron()
# model = svm.SVC(
#     C=10,
#     kernel="poly", 
#     degree=3,
#     gamma=0.05
# )
model = neural_network.MLPClassifier(max_iter=1500,
                                     hidden_layer_sizes=(80,),
                                     solver="sgd")
model.fit(X_tr, Y_train)
print(model)

# Performance on the validation split.
pred = model.predict(X_va)
print(metrics.classification_report(Y_valid, pred, zero_division=0))

# Predictions for the unlabeled test split, written as CSV keyed by the test index.
pred = pd.DataFrame(
    model.predict(X_te), 
    index=X_te.index, 
    columns=["genres"]
)
# newline="" disables newline translation on write. Without it, a platform that
# translates "\n" would put the carriage returns stripped below straight back.
with open("Y_test.csv", "w", newline="") as f:
    f.write(pred.to_csv().replace("\r", ""))


In [None]:
# hyperparameter tuning
def fit_grid(model, params, X=X, Y=Y, cv=cv):
    """Run a grid search for ``model`` over ``params`` and return the fitted search.

    Args:
        model: estimator to tune.
        params: parameter grid passed to GridSearchCV as ``param_grid``.
        X, Y: data to search on; the defaults are the joint train+validation sets
            as they existed when this function was defined.
        cv: cross-validation splitter; defaults to the predefined split defined
            alongside X and Y.

    Returns:
        The fitted GridSearchCV object.
    """
    search = model_selection.GridSearchCV(
        estimator=model,
        param_grid=params,
        cv=cv,
        n_jobs=-1,
    )
    search.fit(X, Y)
    return search

# Candidate values for tuning MLPClassifier. hidden_layer_sizes is enumerated
# depth-first (1..5 layers), and within each depth by width (20..100 nodes), giving
# uniform-width networks only.
mlp_params = dict(
    hidden_layer_sizes=[
        tuple([width] * depth)
        for depth in range(1, 6)
        for width in range(20, 101, 20)
    ],
    solver=["adam", "sgd", "lbfgs"],
    activation=["identity", "logistic", "tanh", "relu"],
    alpha=[0.1 ** power for power in range(1, 6)],
    learning_rate=["constant", "invscaling", "adaptive"],
    max_iter=[100 * step for step in range(1, 31)],
)

# Candidate values for tuning SVC: C runs over powers of ten from 10**3 down to 10**-3.
svc_params = dict(
    C=[10 ** exponent for exponent in reversed(range(-3, 4))],
    kernel=["linear", "poly", "sigmoid"],
    degree=[degree for degree in range(2, 9)],
    gamma=["auto", "scale"],
)


In [None]:
# data for below
# Search over the hidden-layer layouts only, keeping the solver fixed to SGD. The
# resulting `means` feed the heat map in the next cell.
layer_grid = {"hidden_layer_sizes": mlp_params["hidden_layer_sizes"]}
result_ = fit_grid(
    neural_network.MLPClassifier(max_iter=1500, solver="sgd"),
    layer_grid
)

cv_table = result_.cv_results_
means = cv_table["mean_test_score"]
params = cv_table["params"]
print(f"Best: {result_.best_score_:.4} {result_.best_params_}")
# One line per candidate: mean validation score followed by its parameters.
for mean, param in zip(means, params):
    print(f"{mean:.4} - {param}")


In [None]:
# node configuration heat map
# Axis values mirroring the hidden_layer_sizes grid searched above: the candidates
# were enumerated layer count first, so each consecutive run of len(node_counts)
# scores forms one row (one layer count) of the heat map.
node_counts = list(range(20, 101, 20))
layer_counts = list(range(1, 6))
row_len = len(node_counts)
arr = [means[i*row_len:(i+1)*row_len] for i in range(len(means) // row_len)]

heat_fig, ax = plt.subplots()
image = ax.imshow(arr, cmap="viridis")
# Fix the tick positions on the cell centres before labelling them. The labels were
# previously assigned to whatever ticks the automatic locator produced, and the
# label lists started at 0 to compensate for a tick outside the visible range.
ax.set_xticks(range(len(node_counts)))
ax.set_xticklabels(node_counts)
ax.set_yticks(range(len(layer_counts)))
ax.set_yticklabels([str(count) for count in layer_counts])
ax.set_xlabel("Nodes per Layer")
ax.set_ylabel("Layers")
heat_fig.colorbar(image, ax=ax)
plt.savefig("./figs/mlp_config_heatmap")
plt.show()

In [None]:

# Exhaustive search over every combination in mlp_params. With the grid as defined
# in this notebook that is 25 * 3 * 4 * 5 * 3 * 30 = 135,000 candidates, so expect
# a very long run. max_iter on the base estimator is overridden by the grid.
base_mlp = neural_network.MLPClassifier(max_iter=2000)
result_ = fit_grid(base_mlp, mlp_params)

cv_table = result_.cv_results_
means = cv_table["mean_test_score"]
params = cv_table["params"]
print(f"Best: {result_.best_score_:.4} {result_.best_params_}")
# One line per candidate: mean validation score followed by its parameters.
for mean, param in zip(means, params):
    print(f"{mean:.4} - {param}")

In [None]:

# Exhaustive search over every combination in svc_params. The kernel and C given to
# the base estimator are overridden by the grid.
base_svc = svm.SVC(kernel="poly", C=10)
result__ = fit_grid(base_svc, svc_params)

cv_table = result__.cv_results_
# NOTE: these are the scores for the whole svc_params grid, not just the degree
# axis; the name deg_means is reassigned by a later cell.
deg_means = cv_table["mean_test_score"]
params = cv_table["params"]
print(f"Best: {result__.best_score_:.4} {result__.best_params_}")
# One line per candidate: mean validation score followed by its parameters.
for mean, param in zip(deg_means, params):
    print(f"{mean:.4} - {param}")
    


In [None]:
# metrics
def iter_vals(X_tr, Y_tr, X_va, Y_va, models, n_iters, 
              metric=metrics.accuracy_score, 
              loss=metrics.log_loss):
    """Record train/validation score and loss after each of ``n_iters`` successive fits.

    Every model has ``max_iter`` set to 1 and is then fitted ``n_iters`` times, with
    all models measured after each fit. The models are modified in place.
    NOTE(review): the curves only show continued training if the models were built
    with warm_start=True (as the calls in this notebook do) - confirm for new callers.

    Args:
        X_tr, Y_tr: training features and labels.
        X_va, Y_va: validation features and labels.
        models: sequence of estimators providing fit, predict and predict_proba.
        n_iters: number of fit rounds.
        metric: ``metric(y_true, y_pred)`` applied to predicted labels.
        loss: ``loss(y_true, y_proba)`` applied to predicted probabilities.

    Returns:
        ``(iters, vals)``: ``iters`` is ``[1, ..., n_iters]`` and ``vals`` holds, per
        model, the four lists ``[train scores, valid scores, train losses, valid losses]``.
    """
    iters = list(range(1, n_iters + 1))
    vals = [[[], [], [], []] for _ in models]
    for model in models:
        model.set_params(max_iter=1)

    splits = ((X_tr, Y_tr), (X_va, Y_va))
    for _round in iters:
        for model, history in zip(models, vals):
            score_lists, loss_lists = history[:2], history[2:]
            model.fit(X_tr, Y_tr)
            # Scores first (train, then validation), then losses in the same order.
            for (features, labels), scores in zip(splits, score_lists):
                scores.append(metric(labels, model.predict(features)))
            for (features, labels), losses in zip(splits, loss_lists):
                losses.append(loss(labels, model.predict_proba(features)))

    return iters, vals

# Learning curves for two otherwise identical (60, 60) networks: 3000 single-iteration
# fits with the SGD solver and 800 with Adam. warm_start=True makes each fit continue
# from the previous one.
sgd_net = neural_network.MLPClassifier(max_iter=1, warm_start=True,
                                       solver="sgd",
                                       hidden_layer_sizes=(60,60))
iters_sgd, (vals_sgd,) = iter_vals(X_tr, Y_train, X_va, Y_valid, [sgd_net], 3000)

adam_net = neural_network.MLPClassifier(max_iter=1, warm_start=True,
                                        solver="adam",
                                        hidden_layer_sizes=(60,60))
iters_adam, (vals_adam,) = iter_vals(X_tr, Y_train, X_va, Y_valid, [adam_net], 800)


In [None]:
# accuracy and loss plots
fig, ax = plt.subplots(2, 2)

# Top row: SGD run - accuracy (left) and loss (right) on both splits.
tr_score, va_score, tr_loss, va_loss = vals_sgd
ax[0][0].plot(iters_sgd, tr_score)
ax[0][0].plot(iters_sgd, va_score)
ax[0][0].set_xlabel("Epoch")
ax[0][0].set_ylabel("Accuracy")
ax[0][1].plot(iters_sgd, tr_loss)
ax[0][1].plot(iters_sgd, va_loss)
ax[0][1].set_yticks([1.0, 1.5, 2.0, 2.5])
ax[0][1].set_xlabel("Epoch")
ax[0][1].set_ylabel("Loss")

# Bottom row: Adam run, first 300 iterations only. The x-axis must come from
# iters_adam: the bare name `iters` used before exists only inside iter_vals, so
# this cell raised NameError on a fresh kernel.
tr_score, va_score, tr_loss, va_loss = vals_adam
ax[1][0].plot(iters_adam[:300], tr_score[:300])
ax[1][0].plot(iters_adam[:300], va_score[:300])
ax[1][0].set_xlabel("Epoch")
ax[1][0].set_ylabel("Accuracy")
ax[1][1].plot(iters_adam[:300], tr_loss[:300])
ax[1][1].plot(iters_adam[:300], va_loss[:300])
ax[1][1].set_xlabel("Epoch")
ax[1][1].set_ylabel("Loss")

# In every panel the training curve is plotted first, then the validation curve.
fig.legend(["Training Set", "Validation Set"])
fig.tight_layout()

plt.savefig("./figs/loss_acc.png")
plt.show()

In [None]:
# data for below
def _svc_sweep(estimator, grid):
    """Grid-search ``estimator`` over ``grid`` via fit_grid and print the best result.

    Returns ``(search, mean test scores, parameter dicts)`` for the fitted search.
    """
    search = fit_grid(estimator, grid)
    scores = search.cv_results_["mean_test_score"]
    settings = search.cv_results_["params"]
    print(f"Best: {search.best_score_:.4} {search.best_params_}")
    return search, scores, settings


# Each sweep varies one SVC parameter while the others stay fixed; the scores and
# parameter dicts are kept for the bar charts in the next cell.
result__, C_means, C_params = _svc_sweep(
    svm.SVC(kernel="poly", degree=3),
    {"C": [0.01, 0.1, 1, 10, 100, 1000]}
)

result__, deg_means, deg_params = _svc_sweep(
    svm.SVC(kernel="poly", C=10),
    {"degree": list(range(1, 10))}
)

result__, ker_means, ker_params = _svc_sweep(
    svm.SVC(degree=3, C=10),
    {"kernel": ["poly", "sigmoid", "rbf"]}
)

result__, gam_means, gam_params = _svc_sweep(
    svm.SVC(degree=3, C=10, kernel="poly"),
    {"gamma": ["scale", "auto"]}
)

    

In [None]:
# svm param plots
fig, ax = plt.subplots(2, 2)

def get_labels(params):
    """Return the first value of each parameter dict in ``params`` as a string.

    Intended for single-parameter grids, where that value is the one being varied.
    """
    return [str(list(x.values())[0]) for x in params]

# One bar chart per swept parameter: mean validation score for each tried value.
ax[0][0].bar(get_labels(C_params), C_means)
ax[0][0].set_title("C")
ax[0][1].bar(get_labels(deg_params), deg_means)
ax[0][1].set_title("Degree")
ax[1][0].bar(get_labels(ker_params), ker_means)
ax[1][0].set_title("Kernel")
# The gamma bars use hand-written labels, in the order the two values were searched.
ax[1][1].bar(["scaled", "unscaled"], gam_means)
ax[1][1].set_title(u"\u03b3")

fig.tight_layout()
fig.savefig("./figs/svm_bars.png")
# plt.show() matches the other plotting cells; Figure.show() emits a warning on
# non-GUI backends such as the notebook's inline one.
plt.show()

In [None]:
# toying with keras
# kept for reference only

# # 0.37 
# model = keras.models.Sequential([
#     keras.layers.Dense(80, input_shape=(X_tr.shape[1],), activation="relu"),
#     keras.layers.Dense(80, activation="relu"),
#     keras.layers.Dense(len(genres), activation="softmax")
# ])

# # 0.37
# # model = keras.models.Sequential([
# #     keras.layers.Dense(X_tr.shape[1] - (X_tr.shape[1] + len(genres)) // 4, input_shape=(X_tr.shape[1],), activation="relu"),
# #     keras.layers.Dense((X_tr.shape[1] + len(genres)) // 2, activation="relu"),
# #     keras.layers.Dense(X_tr.shape[1] - 3 * (X_tr.shape[1] + len(genres)) // 4, activation="relu"),
# #     keras.layers.Dense(len(genres), activation="softmax")
# # ])

# model.compile(
#     loss='categorical_crossentropy', 
#     optimizer="sgd", 
#     metrics=[
#         "accuracy", 
#         keras.metrics.Precision(name="precision"),
#         keras.metrics.Recall(name="recall")
#     ]
# )

# history = model.fit(X_tr, Y_tr, epochs=500, batch_size=64, validation_data=(X_va, Y_va), verbose=False)

# pred = pd.DataFrame(
#     np.array([genres[np.argmax(x)] for x in model.predict(X_te)]), 
#     index=X_te.index, 
#     columns=["genres"]
# )
# with open("Y_test.csv", "w") as f:
#     f.write(pred.to_csv().replace("\r", ""))


In [None]:
# more keras playing
# kept for reference only

# def build_model(layers=(60,40), 
#                 activation="relu", 
#                 batch_size=64, 
#                 optimizer="adam",
#                 learning_rate=0.01,
#                 dropout=None):
    
#     model = keras.models.Sequential()
    
#     model.add(keras.layers.Dense(layers[0], input_shape=(X_size,), activation=activation))
#     for layer in layers[1:]:
#         if dropout is not None:
#             model.add(keras.layers.Dropout(dropout))
#         model.add(keras.layers.Dense(layer, activation=activation))
#     if dropout is not None:
#         model.add(keras.layers.Dropout(dropout))
#     model.add(keras.layers.Dense(Y_size, activation="softmax"))
    
#     if optimizer == "sgd":
#         optimizer = keras.optimizers.SGD
#     else: 
#         optimizer = keras.optimizers.Adam
#     optimizer = optimizer(learning_rate=learning_rate)
    
#     model.compile(
#         loss="categorical_crossentropy", 
#         optimizer=optimizer, 
#         metrics=[
#             "accuracy", 
#             keras.metrics.Precision(name="precision"),
#             keras.metrics.Recall(name="recall")
#         ]
#     )
    
#     return model

# model = keras.wrappers.scikit_learn.KerasClassifier(build_fn=build_model, verbose=False)

# layers = [
#     (100, 100, 100, 100), (80, 80, 80, 80), (60, 60, 60, 60), (40, 40, 40, 40),
#     (100, 100, 100), (80, 80, 80), (60, 60, 60), (40, 40, 40),
#     (100, 100), (80, 80), (60, 60), (40, 40),
#     (100,), (80,), (60,), (40,),
#     (100, 80, 60, 40), (100, 75, 50), (100, 50)
# ]
# grid_params = (
#     ("layers", layers),
#     ("activation", ["relu", "sigmoid", "tanh"]),
#     ("batch_size", [ 2 ** i for i in range(11) ]),
#     ("optimizer", ["adam", "sgd"]),
#     ("learning_rate", [ 0.1 ** i for i in range(1, 5) ]),
#     ("dropout", [ 0.1 * i for i in range(1, 10) ]),
#     ("epochs", [ i for i in range(25, 50, 5) ])
# )
# grid_params = { p: (p, r) for p, r in grid_params }

# grid = dict([
#     grid_params["epochs"]
# ])

# grid_cv = model_selection.GridSearchCV(
#     estimator=model, 
#     param_grid=grid, 
#     cv=cv,
#     n_jobs=-1
# )

# result = grid_cv.fit(X, Y_)

# means = result.cv_results_["mean_test_score"]
# params = result.cv_results_["params"]
# print(f"Test: {list(grid.keys())}")
# print(f"Best: {result.best_score_:.4} {result.best_params_}")
# for mean, param in zip(means, params):
#     print(f"{mean:.4} - {param}")