In [1]:
from Header import *
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

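Here we present an automated sentence recommender: a classifier that ranks sentences by their predicted probability of being labelled 'Positive'.
The first part trains the classifier, and the second part applies it as it would be used in production.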

# Training data preparation

In [2]:
# Load the training DataFrame from a local pickle file.
# NOTE(review): `pd` is not imported in this cell — presumably it comes from
# `from Header import *`; confirm. Unpickling can execute arbitrary code, so
# only load pickle files from a trusted source.
df = pd.read_pickle("training_df.pkl")

In [3]:
def split_df(df, test_size = 0.2):
    """Randomly split ``df`` into a training and a test DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split. Rows are selected by position, so any index
        (non-contiguous, string-labelled, ...) is supported.
    test_size : float, default 0.2
        Fraction of the rows placed in the test set; the resulting row count
        is truncated towards zero.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train, test)``: disjoint row subsets that together contain every
        row of ``df``. The shuffle uses NumPy's global random state.
    """
    # Shuffle row *positions*, not index labels: ``iloc`` is positional, so
    # permuting ``df.index`` is only correct when the index is exactly 0..n-1.
    rand_pos = np.random.permutation(len(df))
    n_test = int(test_size * len(rand_pos))
    train_pos, test_pos = rand_pos[n_test:], rand_pos[:n_test]
    return df.iloc[train_pos], df.iloc[test_pos]

# Split with the default test_size=0.2. No random seed is set in the cells
# visible here, so the split may differ between runs.
df_train, df_test = split_df(df)

In [4]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer

from spacy.lang.en.stop_words import STOP_WORDS
import spacy
nlp = spacy.load("en_core_web_sm")

def lemmatize(text):
    """Return the lower-cased lemma of every token spaCy finds in ``text``.

    The "tagger" and "parser" pipeline components are disabled while the
    text is processed.
    """
    with nlp.disable_pipes("tagger", "parser"):
        return [token.lemma_.lower() for token in nlp(text)]

def removeStopWords(lemmas):
    """Drop every entry of ``lemmas`` found in ``STOP_WORDS``.

    Returns the remaining words, in their original order, joined by single
    spaces into one string.
    """
    return " ".join(word for word in lemmas if word not in STOP_WORDS)
    
def prepare_vector(sentences, labels):
    """Build the training arrays and fit the vectorizer and label encoder.

    Parameters
    ----------
    sentences : iterable of str
        Raw sentences; each one is lemmatized and stripped of stop words
        before vectorization.
    labels : array-like
        One label per sentence.

    Returns
    -------
    tuple
        ``(X, y, vectorizer, encoder)``: the dense binary bag-of-words
        matrix, the integer-encoded labels, and the two fitted transformers.
    """
    # Features: lemmatize -> remove stop words -> binary bag of words.
    cleaned = (removeStopWords(lemmatize(sentence)) for sentence in sentences)
    bag_of_words = CountVectorizer(
        binary=True,
        strip_accents='unicode',
        lowercase=True,
        stop_words=None,
        token_pattern='[a-z]{3,}',  # only tokens of 3+ lowercase letters, so no digits
        max_df=0.3,
    )
    X = bag_of_words.fit_transform(cleaned).toarray()
    # Flag set by hand after fitting, exactly as in the original code.
    bag_of_words.fixed_vocabulary_ = True

    # Targets: integer-encode the labels.
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(labels)

    return X, y, bag_of_words, label_encoder


# Model training

In [5]:
from sklearn import linear_model
from sklearn.feature_selection import RFECV

# Build the feature matrix and encoded labels from the training split; the
# fitted vectorizer and encoder are reused by the application cells below.
X, y, vectorizer, encoder = prepare_vector(df_train.Sentence, df_train.PositiveLabel)
# L2-penalised logistic regression with C=0.3, solved with liblinear.
maxent = linear_model.LogisticRegression(penalty='l2', C=0.3, solver='liblinear')
# Last expression of the cell: fits the model and displays the estimator repr.
maxent.fit(X, y)

LogisticRegression(C=0.3, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)

# Application

In [6]:
def rank_positive(sentences, vectorizer, encoder, maxent):
    """Rank sentences by their predicted probability of being 'Positive'.

    Parameters
    ----------
    sentences : pandas.Series or sequence of str
        Sentences to score.
    vectorizer : fitted vectorizer
        As returned by ``prepare_vector``; used for ``transform`` and
        ``inverse_transform``.
    encoder : fitted label encoder
        Its ``classes_`` must contain the label ``'Positive'``.
    maxent : fitted classifier
        Must expose ``predict_proba`` and ``coef_``.

    Returns
    -------
    pandas.DataFrame
        One row per sentence whose 'Positive' probability exceeds 0.5, sorted
        by decreasing probability, with columns ``Sentence``,
        ``PositiveProba``, ``PositiveWords`` and ``NegativeWords``. The two
        word columns hold up to three ``[word, coefficient]`` pairs each.

    Notes
    -----
    NOTE(review): word scores are read from ``maxent.coef_[0]``. Reading a
    positive coefficient as "pushes towards 'Positive'" presumably assumes a
    binary model whose second class is 'Positive' — confirm against the
    training labels.
    """
    # Score the sentences the caller passed in. (This parameter used to be
    # overwritten with the global ``df_test.Sentence``.)
    if isinstance(sentences, pd.Series):
        texts = sentences.values
    else:
        texts = np.asarray(sentences, dtype=object)

    # Prepare X
    cleaned = (removeStopWords(lemmatize(text)) for text in texts)
    X = vectorizer.transform(cleaned).toarray()

    # Sort by probability p and exclude p<=0.5
    positive_col = np.where(encoder.classes_ == 'Positive')[0][0]
    positive_proba = maxent.predict_proba(X)[:, positive_col]
    sort_idx = np.argsort(positive_proba)[::-1]
    sort_idx_cut = sort_idx[positive_proba[sort_idx] > 0.5]
    X = X[sort_idx_cut]

    # List 3 most important words
    eps = 0.1
    coef = maxent.coef_[0, :]
    important_words_idx = np.abs(coef * X) > eps
    pos_words_coeff = []
    neg_words_coeff = []
    for iw in important_words_idx:
        # inverse_transform documents a 2-D (n_samples, n_features) input, so
        # the per-sentence mask is passed as a single-row matrix.
        word_list = vectorizer.inverse_transform(iw.reshape(1, -1))[0]
        coeff_list = coef[iw]
        # Sort by coefficient, largest first (own name: no shadowing of sort_idx).
        order = np.argsort(coeff_list)[::-1]
        ranked = list(zip(word_list[order], coeff_list[order]))
        # Of the three largest coefficients keep the strictly positive ones ...
        pos_words_coeff.append([[word, c] for word, c in ranked[:3] if c > 0])
        # ... and of the three smallest keep the strictly negative ones.
        neg_words_coeff.append([[word, c] for word, c in ranked[::-1][:3] if c < 0])

    return pd.DataFrame({'Sentence' : texts[sort_idx_cut],
                       'PositiveProba' : positive_proba[sort_idx_cut],
                       'PositiveWords' : pos_words_coeff,
                       'NegativeWords' : neg_words_coeff
                      })

In [7]:
rp = rank_positive(df_test.Sentence, vectorizer, encoder, maxent)

# Print the top-ranked sentences (at most five). The bound is clamped to the
# number of returned rows so that a short result does not raise IndexError.
for i in range(min(5, len(rp))):
    print("Probability to be positive: {:.3f}".format(rp.PositiveProba.iloc[i]))
    print("Positive words and score: "+", "
          .join(["{} ({:.2f})".format(w, s) for w, s in rp.PositiveWords.iloc[i]]))
    print("Negative words and score: "+", "
          .join(["{} ({:.2f})".format(w, s) for w, s in rp.NegativeWords.iloc[i]]))
    print(rp.Sentence.iloc[i])
    print()

Probability to be positive: 0.896
Positive words and score: clinical (0.37), analysis (0.30), sma (0.30)
Negative words and score: randomize (-0.22), efficacy (-0.15), label (-0.14)
This was supported by favourable safety and efficacy data from the interim analysis of a randomized controlled clinical study, CS4 in later-onset subjects, open-label studies in pre-symptomatic subjects, and subjects with infantile-onset and later-onset SMA, where the attainment of motor milestones in subjects receiving treatment differed from that seen in the natural history of SMA

Probability to be positive: 0.883
Positive words and score: consider (0.38), bioequivalence (0.38), clinical (0.37)
Negative words and score: medicinal (-0.27), chmp (-0.23)
Therefore the absence of a bioequivalence study was considered justified and the CHMP concluded that no clinical data were needed to support the application for Lacosamide Accord 50 mg, 100 mg, 150 mg and 200 mg film-coated tablets as a generic medicinal pr

# Application with a check against the correct labels

In [8]:
def rank_positive_check(sentences, vectorizer, encoder, maxent, y):
    """Rank sentences by 'Positive' probability and attach their known labels.

    Follows the same steps as ``rank_positive`` and adds a ``CorrectLabel``
    column so that the ranking can be compared with the known labels.

    Parameters
    ----------
    sentences : pandas.Series or sequence of str
        Sentences to score.
    vectorizer : fitted vectorizer
        As returned by ``prepare_vector``; used for ``transform`` and
        ``inverse_transform``.
    encoder : fitted label encoder
        Its ``classes_`` must contain the label ``'Positive'``.
    maxent : fitted classifier
        Must expose ``predict_proba`` and ``coef_``.
    y : pandas.Series or sequence
        Known label of each sentence, aligned by position with ``sentences``.

    Returns
    -------
    pandas.DataFrame
        One row per sentence whose 'Positive' probability exceeds 0.5, sorted
        by decreasing probability, with columns ``Sentence``,
        ``PositiveProba``, ``PositiveWords``, ``NegativeWords`` and
        ``CorrectLabel``. The two word columns hold up to three
        ``[word, coefficient]`` pairs each.

    Raises
    ------
    ValueError
        If ``sentences`` and ``y`` differ in length.
    """
    # Score the sentences the caller passed in. (This parameter used to be
    # overwritten with the global ``df_test.Sentence``.)
    if isinstance(sentences, pd.Series):
        texts = sentences.values
    else:
        texts = np.asarray(sentences, dtype=object)
    labels = y.values if isinstance(y, pd.Series) else np.asarray(y)
    if len(labels) != len(texts):
        raise ValueError("sentences and y must have the same length")

    # Prepare X
    cleaned = (removeStopWords(lemmatize(text)) for text in texts)
    X = vectorizer.transform(cleaned).toarray()

    # Sort by probability p and exclude p<=0.5
    positive_col = np.where(encoder.classes_ == 'Positive')[0][0]
    positive_proba = maxent.predict_proba(X)[:, positive_col]
    sort_idx = np.argsort(positive_proba)[::-1]
    sort_idx_cut = sort_idx[positive_proba[sort_idx] > 0.5]
    X = X[sort_idx_cut]

    # List 3 most important words
    eps = 0.1
    coef = maxent.coef_[0, :]
    important_words_idx = np.abs(coef * X) > eps
    pos_words_coeff = []
    neg_words_coeff = []
    for iw in important_words_idx:
        # inverse_transform documents a 2-D (n_samples, n_features) input, so
        # the per-sentence mask is passed as a single-row matrix.
        word_list = vectorizer.inverse_transform(iw.reshape(1, -1))[0]
        coeff_list = coef[iw]
        # Sort by coefficient, largest first (own name: no shadowing of sort_idx).
        order = np.argsort(coeff_list)[::-1]
        ranked = list(zip(word_list[order], coeff_list[order]))
        # Of the three largest coefficients keep the strictly positive ones ...
        pos_words_coeff.append([[word, c] for word, c in ranked[:3] if c > 0])
        # ... and of the three smallest keep the strictly negative ones.
        neg_words_coeff.append([[word, c] for word, c in ranked[::-1][:3] if c < 0])

    return pd.DataFrame({'Sentence' : texts[sort_idx_cut],
                       'PositiveProba' : positive_proba[sort_idx_cut],
                       'PositiveWords' : pos_words_coeff,
                       'NegativeWords' : neg_words_coeff,
                       'CorrectLabel' : labels[sort_idx_cut]
                      })

In [9]:
rp = rank_positive_check(df_test.Sentence, vectorizer, encoder, maxent, df_test.Label)

# Print the top-ranked sentences (at most ten) next to their correct label.
# The bound is clamped to the number of returned rows so that a short result
# does not raise IndexError.
for i in range(min(10, len(rp))):
    print("Probability to be positive: {:.3f} and correct label: {}"
          .format(rp.PositiveProba.iloc[i], rp.CorrectLabel.iloc[i]))
    print("Positive words and score: "+", "
          .join(["{} ({:.2f})".format(w, s) for w, s in rp.PositiveWords.iloc[i]]))
    print("Negative words and score: "+", "
          .join(["{} ({:.2f})".format(w, s) for w, s in rp.NegativeWords.iloc[i]]))
    print(rp.Sentence.iloc[i])
    print()

Probability to be positive: 0.896 and correct label: Positive
Positive words and score: clinical (0.37), analysis (0.30), sma (0.30)
Negative words and score: randomize (-0.22), efficacy (-0.15), label (-0.14)
This was supported by favourable safety and efficacy data from the interim analysis of a randomized controlled clinical study, CS4 in later-onset subjects, open-label studies in pre-symptomatic subjects, and subjects with infantile-onset and later-onset SMA, where the attainment of motor milestones in subjects receiving treatment differed from that seen in the natural history of SMA

Probability to be positive: 0.883 and correct label: Positive
Positive words and score: consider (0.38), bioequivalence (0.38), clinical (0.37)
Negative words and score: medicinal (-0.27), chmp (-0.23)
Therefore the absence of a bioequivalence study was considered justified and the CHMP concluded that no clinical data were needed to support the application for Lacosamide Accord 50 mg, 100 mg, 150 mg 