In [1]:
import pandas as pd
import numpy as np
from nltk import regexp_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import pdb
import re
import pickle
from lemma_tokenizer import LemmaTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.externals import joblib
from sklearn.svm import SVC
from helper_functions import (evaluate, get_best_tags, potential_tags, 
                              topic_name_attribution)
from sklearn.pipeline import Pipeline
from sklearn.calibration import CalibratedClassifierCV
from sklearn.preprocessing import MultiLabelBinarizer
from scipy.stats import entropy
import networkx as nx

In [9]:
# load data
X_train = pd.read_csv("X_train.csv", index_col = "Id")
X_train_all = pd.read_csv("X_train_nmfkl.csv", index_col = "Id")
X_train_nmfkl = X_train_all[X_train_all['0'].notnull()]
X_test = pd.read_csv("X_test.csv", index_col = "Id")
y_all = pd.read_csv("y.csv", index_col = "Id")
y_train = np.load("y_train.npy")
y_test = np.load("y_test.npy")

# load binarizer
lb = joblib.load("binarizer.pk")
mlb = joblib.load("new_binarizer.pk")

# file directory
file_dir = "/Users/pmlee/Documents/CAPGemini_OpenClassroom/" + \
           "OpenClassrooms_Patrick_Lee/Assignment5/question_categorizer/" + \
           "tags_recommender_app/TagsRecommenderApp/static/db/"

# load networkx graph
G_tags = nx.read_gpickle("G_tags.gpickle")

# Supervised learning 
## Best model

In [3]:
params_vectorizer = {
    "max_features": 5000,
    "ngram_range": (1, 2),
    'tokenizer': LemmaTokenizer(),
    'lowercase': False
}
best_params = {"kernel": "linear", "C": 0.01}
SVM_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(**params_vectorizer)),
    ('clf', OneVsRestClassifier(CalibratedClassifierCV(SVC(**best_params)))),
])

SVM_pipeline.fit(X_train['0'], y_train)

y_pred_svm = SVM_pipeline.predict(X_test['0'])
y_pred_proba_svm = SVM_pipeline.predict_proba(X_test['0'])
y_pred_new_svm = get_best_tags(y_pred_svm, y_pred_proba_svm)
score_svm = evaluate(
    y_test,
    y_pred_new_svm,
    binarizer=lb,
    G_tags=G_tags,
    l_print_errors=False,
    l_deduplication=True)

print('Test score: {0:.2f}'.format(score_svm))

Test score: 0.57


In [4]:
filename_vect = file_dir + "vectorizer_lemma2.pk"
filename_SVM = file_dir + "OVR_SVM_model.sav"
pickle.dump(SVM_pipeline.named_steps["tfidf"], open(filename_vect, 'wb'))
pickle.dump(SVM_pipeline.named_steps["clf"], open(filename_SVM, 'wb'))

# Unsupervised learning


In [None]:
from sklearn.decomposition import NMF

n_chosen_components = 5200
n_top_words = 20
n_topics = 300

params_vectorizer = {
    "max_features": n_chosen_components,
    "ngram_range": (1, 1),
    'tokenizer': LemmaTokenizer(),
    'lowercase': False
}

best_params_nmf = {
    "n_components": n_topics,
    "beta_loss": "kullback-leibler",
    "solver": 'mu',
    "max_iter": 1000,
    "alpha": .1,
    "l1_ratio": .5
}

NMFKL_pipeline = Pipeline([
    ('count', CountVectorizer(**params_vectorizer)),
    ('clf_nmf', NMF(**best_params_nmf)),
])

#Fit the NMF model
print("Fitting the NMF model (KL divergence) with "
      "tf-idf features, num_topics =%d..." % n_topics)
NMFKL_pipeline.fit(X_train_nmfkl['0'])

count_feature_names = NMFKL_pipeline.named_steps["count"].get_feature_names()

# Label attribution
df_top_words_nmf_kl = pd.DataFrame(
    NMFKL_pipeline.named_steps["clf_nmf"].components_,
    columns=count_feature_names)

tags_keys = lb.classes_
dict_topicnames_nmf_kl, topicnames_nmf_kl = topic_name_attribution(
    df_top_words_nmf_kl, tags_keys)

# Create Document - Topic Matrix
index_topicnames_kl = list(dict_topicnames_nmf_kl.keys())

nmf_kl_output = NMFKL_pipeline.transform(
    X_train_nmfkl['0'])[:, index_topicnames_kl]

# Make the pandas dataframe
df_document_topic_nmf_kl = pd.DataFrame(
    np.round(nmf_kl_output, 2),
    columns=topicnames_nmf_kl,
    index=X_train_nmfkl.index)

dominant_topic_nmf_kl = df_document_topic_nmf_kl.apply(potential_tags, axis=1)

# prepare the predicted labels
y_true = mlb.fit_transform(y_test)
y_pred_nmfkl = mlb.fit_transform(dominant_topic_nmf_kl.loc[X_test.index])

score_nmfkl = evaluate(
    y_true,
    y_pred_nmfkl,
    binarizer=mlb,
    G_tags=G_tags,
    l_print_errors=False,
    l_deduplication=True)

print('Test score: {0:.2f}'.format(score_nfmkl))

y_pred_text_nmfkl = mlb.fit_transform(dominant_topic_nmf_kl)
no_tag_score_nmfkl = no_tag_percentage_score(y_pred_text_nmfkl, mlb)

print('No tag score: {0:.2f}'.format(no_tag_score_nmfkl))

In [14]:
nmf_kl_output = NMFKL_pipeline.transform(
    X_train_nmfkl['0'])[:, index_topicnames_kl]

# Make the pandas dataframe
df_document_topic_nmf_kl = pd.DataFrame(
    np.round(nmf_kl_output, 2),
    columns=topicnames_nmf_kl,
    index=X_train_nmfkl.index)

dominant_topic_nmf_kl = df_document_topic_nmf_kl.apply(potential_tags, axis=1)

# prepare the predicted labels
y_true = mlb.fit_transform(y_all)
y_pred_nmfkl = mlb.fit_transform(dominant_topic_nmf_kl.loc[y_all.index])

score_nmfkl = evaluate(
    y_true,
    y_pred_nmfkl,
    binarizer=mlb,
    G_tags=G_tags,
    l_print_errors=False,
    l_deduplication=True)

print('Test score: {0:.2f}'.format(score_nfmkl))

y_pred_text_nmfkl = mlb.fit_transform(dominant_topic_nmf_kl)
no_tag_score_nmfkl = no_tag_percentage_score(y_pred_text_nmfkl, mlb)

print('No tag score: {0:.2f}'.format(no_tag_score_nmfkl))

KeyError: 0

In [34]:
# prepare the predicted labels
y_supervised_all = y_all[y_all.TAGS_MODIFIED.notnull()]
y_true = mlb.fit_transform(y_supervised_all.TAGS_MODIFIED)
y_pred_nmfkl = mlb.fit_transform(dominant_topic_nmf_kl.loc[y_supervised_all.index])

score_nmfkl = evaluate(
    y_true,
    y_pred_nmfkl,
    binarizer=mlb,
    G_tags=G_tags,
    l_print_errors=False,
    l_deduplication=True)

print('Test score: {0:.2f}'.format(score_nfmkl))

y_pred_text_nmfkl = mlb.fit_transform(dominant_topic_nmf_kl)
no_tag_score_nmfkl = no_tag_percentage_score(y_pred_text_nmfkl, mlb)

print('No tag score: {0:.2f}'.format(no_tag_score_nmfkl))

KeyError: '['

In [None]:
filename_vect_nmfkl = file_dir + "vectorizer_lemma_nmfkl.pk"
filename_nmfkl = file_dir + "NMFKL_model.sav"
filename_topicnames_nmfkl = file_dir + "topicnames.pk"

pickle.dump(NMFKL_pipeline.named_steps["count"], open(filename_vect_nmfkl, 'wb'))
pickle.dump(NMFKL_pipeline.named_steps["clf_nmf"], open(filename_nmfkl, 'wb'))
pickle.dump((dict_topicnames_nmf_kl, topicnames_nmf_kl), open(filename_topicnames_nmfkl, 'wb'))