# OC IML Projet 5 : Catégorisez automatiquement des questions

Stack Overflow est un site célèbre de questions-réponses liées au développement informatique.
Objectif : développer *un système de suggestion de tags* pour le site. Celui-ci prendra la forme d’un algorithme de machine learning qui assigne automatiquement plusieurs tags pertinents à une question.


Ce notebook contient : 
- API preparation

## import

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')
import seaborn as sns
sns.set(color_codes=True, font_scale=1.33)

import numpy as np
import pandas as pd
pd.set_option("display.max_columns", 25)

import string
from string import punctuation 

import re

from sklearn import metrics
from sklearn.model_selection import GridSearchCV
import scipy.stats as st

import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from collections import defaultdict
from nltk.stem.snowball import EnglishStemmer
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import NMF

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import hamming_loss
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn import model_selection
from sklearn.externals import joblib
from skmultilearn.problem_transform import BinaryRelevance

import time
import datetime

import pickle

import math
# import user module
from my_text_utils import myTokenizer 

[nltk_data] Downloading package punkt to /Users/gregory/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gregory/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/gregory/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Definitions

In [97]:
# Source data: CSV export of the questions (read with pandas further down)
PATH_SOURCE_QUESTIONS = '../../data/QueryResults.csv' 
# Folder holding the pickled models and preprocessing artifacts
#PATH_EXPORT_FOLDER = '../../data/'
PATH_EXPORT_FOLDER = 'data/'
# Supervised model (random forest, per the file name)
mdlFileName = 'model_RF_tags51_max_depthNone_max_features31_min_samples_split2_n_estimators25.pkl'
# Stop words used by cleaning_text()
stopWordsFileName = 'stop_words_sw.pkl'
# Count vectorizer fitted for the supervised model
countVectFileName = 'cvect_tags51.pkl'
# TF-IDF transformer fitted for the supervised model
tfidfFileName = 'tfidf_tags51.pkl'
# MultiLabelBinarizer used to decode the predicted tag vector
mlbFileName = 'mlb_tags51.pkl'
# Table linking tags to topics (used by the unsupervised fallback)
dfTopicsTagsFileName = 'df_topics_tags_top100.pkl'
# Unsupervised model (LDA, per the file name)
mdlUnsupFileName = 'model_LDA__learning_decay0.7_max_iter20_n_components100.pkl'
# Count vectorizer fitted for the unsupervised model
countVectUnsupFileName = "cvect_lda.pkl"
# Ids of the two test questions
id_supOK = 50000005 # supervised model OK (original note: "mdl sup ok")
id_supNOK = 50000074 # supervised model finds no tag => unsupervised fallback is used

## Useful functions

In [3]:
# Translation table that maps every ASCII punctuation character to a space
replace_punctuation = str.maketrans(string.punctuation,
                                    ' '*len(string.punctuation))
def cleaning_text(questions_curr):
    '''
    Normalise a raw question text before vectorisation.

    Steps, in this order:
    1. tokenize with nltk, lower-case every token and drop the tokens found
       in the module-level stop-word collection `sw`
    2. collapse any run of whitespace into a single space
    3. replace single quotes by a space
    4. replace anything that looks like a markup tag (<...>) by a space
    5. delete words made only of digits ("123" is removed, "a123" is kept)
    6. replace punctuation by spaces

    inputs :
    - questions_curr : raw text of the question (str)

    returns the cleaned text (str)

    `sw` must be loaded before the first call: it is read at call time.
    '''
    # Build the lookup once per call instead of one list per token.
    # NOTE(review): assumes `sw` holds plain (hashable) strings.
    stop_words = set(sw)

    # lower case + stop words removal
    lowered_tokens = (w.lower() for w in nltk.word_tokenize(questions_curr))
    questions_curr = ' '.join(w for w in lowered_tokens
                              if w not in stop_words)
    # delete newlines
    questions_curr = re.sub(r'\s+', ' ', questions_curr)
    # delete single quotes
    questions_curr = re.sub(r"\'", " ", questions_curr)
    # delete tags
    questions_curr = re.sub('<[^<]+?>',' ', questions_curr)
    # delete numbers (forming group = word with only numbers 
    # example : delete "123" but not "a123")
    questions_curr = re.sub(r'\b\d+\b','', questions_curr) 
    # delete punctuation (replace by space)
    questions_curr = questions_curr.translate(replace_punctuation)

    return questions_curr

In [59]:
def find_tags_from_text(text, tf_vectorizer, lda_model,
                        df_topics_tags, no_max=10):
    '''
    Suggest tags for a raw text through the topic model.

    The text is cleaned with cleaning_text(), vectorised with the fitted
    tf_vectorizer, projected on the topics with the fitted lda_model, and
    the topic distribution is converted to tags by find_tags_from_dtopics().

    inputs :
    - text : raw text of the question
    - tf_vectorizer : fitted vectorizer (its transform() is called)
    - lda_model : fitted topic model (its transform() is called)
    - df_topics_tags : table linking Tags to Topics
    - no_max : number of best Tags to output

    returns the value of find_tags_from_dtopics() for this text
    '''
    # text -> features -> topic distribution, each step on a 1-document batch
    features = tf_vectorizer.transform([cleaning_text(text)])
    topic_weights = lda_model.transform(features)

    # rank the tags against the topic distribution
    return find_tags_from_dtopics(topic_weights, df_topics_tags,
                                  no_max=no_max)

def find_tags_from_dtopics(d_topics, df_topics_tags, no_max=10):
    '''
    Rank the Tags of df_topics_tags against a topics distribution.

    Every column (Topic) of df_topics_tags is weighted by the matching value
    of d_topics, the weighted values are summed per row (Tag) and the rows
    with the highest sums are kept.

    inputs :
    - d_topics : topics distribution from LDA, broadcast against the
      columns of df_topics_tags
    - df_topics_tags : table linking Tags (rows) to Topics (columns)
    - no_max : number of best Tags to output (By default 10)

    returns the list of the no_max best index labels of df_topics_tags,
    best score first
    '''
    # weight each Topic column, then add up the weights of every Tag row
    weighted = df_topics_tags.values * d_topics
    tag_scores = weighted.sum(axis=1)

    # attach the scores to the Tag labels so they can be ranked
    ranking = pd.DataFrame({"d_sum": tag_scores},
                           index=df_topics_tags.index)
    ranking = ranking.sort_values(by="d_sum", ascending=False)

    best_tags = ranking.head(no_max)
    return list(best_tags.index)


## Supervised model

### Compress models for API

In [98]:
# Load the supervised model. The context manager closes the file handle once
# the model is deserialised (it used to stay open).
# NOTE: joblib.load unpickles the file; only load artifacts you trust.
with open(PATH_EXPORT_FOLDER + mdlFileName, 'rb') as myModel:
    clf = joblib.load(myModel)

In [5]:
# compress
#joblib.dump(clf, PATH_EXPORT_FOLDER + \
#    'mdl_cmp_RF_tags51_max_depthNone_max_features31_min_samples_split2_n_estimators25.pkl',
#    compress=True)

### Load from disk other useful tools

In [6]:
# Load, in this order: the count vectorizer, the TF-IDF transformer and the
# MultiLabelBinarizer that go with the supervised model.
tf_vectorizer_sup_1, tfidf_transformer_sup_1, mlb = (
    joblib.load(PATH_EXPORT_FOLDER + artifact_name)
    for artifact_name in (countVectFileName, tfidfFileName, mlbFileName)
)

In [7]:
#myTokenizer.__module__ = 'tagger_app'
#str_save_cvect = PATH_EXPORT_FOLDER + "cvect_tags51_ok.pkl"
#joblib.dump(tf_vectorizer_sup_1, str_save_cvect)
#print("CountVectorizer Saved here: {}".format(str_save_cvect))

### Load stopwords [TODO]

In [8]:
# Stop words removed by cleaning_text(), which reads this module-level `sw`
# at call time: this cell must run before any call to cleaning_text().
sw = joblib.load(PATH_EXPORT_FOLDER + stopWordsFileName)

### Predict tags

#### Input Test Question

In [9]:
# Load the exported questions; the next cell relies on the "Id", "Title"
# and "Body" columns.
df_quest = pd.read_csv(PATH_SOURCE_QUESTIONS, sep=',')

In [75]:
#id_question = id_supOK
id_question = id_supNOK

# Filter the table once and reuse the selection for both columns.
selected = df_quest.loc[df_quest["Id"] == id_question, ["Title", "Body"]]
if selected.empty:
    # Same exception type as the former bare `.values[0]`, with a message
    # that says which Id is missing.
    raise IndexError("No question with Id {} in {}".format(
        id_question, PATH_SOURCE_QUESTIONS))

# Title and body are concatenated; only the first matching row is used.
quest_text = (selected["Title"] + " " + selected["Body"]).values[0]
print("Question Text:\n", quest_text)

Question Text:
 freebsd newsyslog.conf.d set archive dir <p>On FreeBSD, I have a file named </p>

<p><em>my_site</em></p>

<p>in</p>

<p><code>/usr/local/etc/newsyslog.conf.d</code></p>

<p>The content i.e. the of file <code>my_site</code> looks like this:</p>

<pre><code>/path/to/site/log/site.access_log 644 7 1048576 * GCZ /var/run/nginx.pid  30
</code></pre>

<p>Now I'd need to know, how I can specify the archive target directory.</p>

<p>I haven't found anything in the docs how to set the archive dir in a config file.</p>



#### Clean Text

In [76]:
# Apply the cleaning used throughout this notebook (stop words, markup tags,
# numbers and punctuation removed), then show the result.
quest_text_cleaned = cleaning_text(quest_text)
print("Question cleaned:\n", quest_text_cleaned)

Question cleaned:
 freebsd newsyslog conf d archive dir   freebsd   named       my site              usr local etc newsyslog conf d       content i e     my site   looks          path to site log site access log      gcz  var run nginx pid         d need know   specify archive target directory      n t found anything docs archive dir config file   


#### CountVectorize

In [77]:
# Vectorise the cleaned question with the vectorizer loaded from disk;
# transform() takes a list of documents, hence the one-element list.
contVectValue = tf_vectorizer_sup_1.transform([quest_text_cleaned])
contVectValue

<1x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 18 stored elements in Compressed Sparse Row format>

#### TfidfTransform

In [78]:
# Re-weight the counts with the TF-IDF transformer loaded from disk.
tfIdfValue = tfidf_transformer_sup_1.transform(contVectValue)
tfIdfValue

<1x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 18 stored elements in Compressed Sparse Row format>

#### Predict

In [80]:
# Predict the encoded tags with the supervised model; the row is decoded
# back to tag labels with `mlb` in the next step.
encoded_y_pred = clf.predict(tfIdfValue)
encoded_y_pred

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0.]])

#### Decode tags

In [81]:
# Decode the prediction into tag labels; tu_tags[0] holds the tags of the
# single question and is empty when the model predicted none.
tu_tags = mlb.inverse_transform(encoded_y_pred)

#### Check if tags found

In [82]:
# True when the supervised model returned at least one tag for the question
flag_tag_found = bool(tu_tags[0])
flag_tag_found

False

#### If not, use unsupervised model

In [83]:
# Fallback: the supervised model found no tag, so derive tags from the topic
# model instead.
if not flag_tag_found:
    # table linking Tags to Topics
    df_topics_tags = joblib.load(PATH_EXPORT_FOLDER + dfTopicsTagsFileName)
    # unsupervised model; the context manager closes the file handle once the
    # model is deserialised (it used to stay open)
    with open(PATH_EXPORT_FOLDER + mdlUnsupFileName, 'rb') as myModelUnsup:
        model_lda = joblib.load(myModelUnsup)
    # count vectorizer fitted for the unsupervised model
    tf_vectorizer_1 = joblib.load(PATH_EXPORT_FOLDER + countVectUnsupFileName)
    # predict from the raw text (find_tags_from_text does its own cleaning)
    # and keep the 4 best tags
    tu_tags[0] = find_tags_from_text(text=quest_text,
                    tf_vectorizer=tf_vectorizer_1,
                    lda_model=model_lda, df_topics_tags=df_topics_tags,
                    no_max=4)
    

#### Display

In [84]:
# Build the display string in one pass: every tag is followed by one space,
# so the result keeps a trailing space (and is "" when there is no tag).
str_out = "".join(item + " " for item in tu_tags[0])
str_out

'python javascript java php '

In [None]:
# test
# test 2