In [42]:
import pandas as pd
import numpy as np
from nltk import regexp_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import pdb
import re
import pickle
from lemma_tokenizer import LemmaTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.externals import joblib
from sklearn.svm import SVC
from helper_functions import evaluate, get_best_tags
from sklearn.pipeline import Pipeline
from sklearn.calibration import CalibratedClassifierCV
from sklearn.preprocessing import MultiLabelBinarizer
import networkx as nx

In [43]:
# Read the persisted train/test splits: the X_* frames come from CSV files
# (via pandas) and the y_* arrays from .npy files (via numpy).
X_train, X_test = (pd.read_csv(csv_name) for csv_name in ("X_train.csv", "X_test.csv"))
y_train, y_test = (np.load(npy_name) for npy_name in ("y_train.npy", "y_test.npy"))

# Restore the two previously saved binarizer objects.
# NOTE(review): joblib.load unpickles its input, so these files must be trusted.
lb, mlb = (joblib.load(pk_name) for pk_name in ("binarizer.pk", "new_binarizer.pk"))

# Restore the tag graph stored as a networkx gpickle.
G_tags = nx.read_gpickle("G_tags.gpickle")

In [49]:
# Vectorizer settings: unigrams and bigrams, vocabulary capped at 2700 terms,
# tokenization delegated to the project's LemmaTokenizer, no lowercasing.
params_vectorizer = dict(
    max_features=2700,
    ngram_range=(1, 2),
    tokenizer=LemmaTokenizer(),
    lowercase=False,
)
# SVC hyper-parameters used for every one-vs-rest sub-classifier.
best_params = dict(kernel="linear", C=1)

# The SVC is wrapped in CalibratedClassifierCV so that the pipeline exposes
# predict_proba, which is called further down.
tfidf_step = TfidfVectorizer(**params_vectorizer)
calibrated_svc = CalibratedClassifierCV(SVC(**best_params))
SVM_pipeline = Pipeline([
    ('tfidf', tfidf_step),
    ('clf', OneVsRestClassifier(calibrated_svc)),
])

# Column '0' of each frame is the input passed to the TF-IDF step.
train_text = X_train['0']
test_text = X_test['0']

SVM_pipeline.fit(train_text, y_train)

# Hard predictions and per-label probabilities on the test split.
y_pred_svm = SVM_pipeline.predict(test_text)
y_pred_proba_svm = SVM_pipeline.predict_proba(test_text)

# NOTE(review): y_pred_new_svm is computed here but the evaluation below is
# still run on y_pred_svm — confirm which of the two was meant to be scored.
y_pred_new_svm = get_best_tags(y_pred_svm, y_pred_proba_svm)

score_svm = evaluate(
    y_test,
    y_pred_svm,
    binarizer=lb,
    G_tags=G_tags,
    l_print_errors=False,
    l_deduplication=True,
)

print('Test score: {0:.2f}'.format(score_svm))

  'precision', 'predicted', average, warn_for)


Test score: 0.41


In [48]:
# Persist the two fitted pipeline steps (TF-IDF vectorizer and one-vs-rest
# classifier) as separate pickle files.
# NOTE(review): this is an absolute, machine-specific path (it appears to be
# the TagsRecommenderApp static/db folder) — consider a configurable directory.
file_dir = "/Users/pmlee/Documents/CAPGemini_OpenClassroom/OpenClassrooms_Patrick_Lee/Assignment5/question_categorizer/tags_recommender_app/TagsRecommenderApp/static/db/"
filename_vect = file_dir + "vectorizer_lemma2.pk"
filename_SVM = file_dir + "OVR_SVM_model.sav"

# Open each target inside a `with` block so the handle is flushed and closed
# deterministically, even if pickling raises part-way through.
with open(filename_vect, 'wb') as vect_file:
    pickle.dump(SVM_pipeline.named_steps["tfidf"], vect_file)
with open(filename_SVM, 'wb') as svm_file:
    pickle.dump(SVM_pipeline.named_steps["clf"], svm_file)