In [42]:
import re

import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.stem.snowball import GermanStemmer
from nltk.tokenize import RegexpTokenizer
from sklearn import linear_model
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score as auc
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [2]:
# Load the question data; later cells read its `question` and `parent_id` columns.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
data = pd.read_hdf("/srv/smsguru/merged_questions.hdf5")

In [3]:
# Snowball stemmer for German; preprocess_doc applies it when called with stem=True.
stemmer = GermanStemmer()

In [4]:
# Tokenizer that extracts runs of word characters (\w+) from a string.
tokenizer = RegexpTokenizer(r"\w+")

In [5]:
# German stop words as a set; used for membership checks in preprocess_doc
# and handed to the vectorizer / TopicCluster in later cells.
stop = set(stopwords.words("german"))

In [6]:
# Pattern of one or more non-digit characters; preprocess_doc uses it with
# fullmatch to keep only tokens that contain no digit at all.
# Written as a raw string: "\D" in a normal string literal is an invalid escape sequence.
word_regex = re.compile(r"\D+")

In [7]:
def preprocess_doc(doc, stem=False):
    """Turn one document into a list of cleaned tokens.

    The text is lower-cased and tokenized with the module-level `tokenizer`.
    Tokens found in `stop` and tokens that do not fully match `word_regex`
    are dropped.

    Parameters
    ----------
    doc : str
        Raw text of a single document.
    stem : bool, optional
        When true, every kept token is passed through the module-level `stemmer`.

    Returns
    -------
    list of str
        The kept (optionally stemmed) tokens in document order.
    """
    kept = []
    for token in tokenizer.tokenize(doc.lower()):
        if token in stop:
            continue
        if not word_regex.fullmatch(token):
            continue
        kept.append(token)
    if not stem:
        return kept
    return [stemmer.stem(token) for token in kept]

## Bag of words

In [23]:
# Word-level TF-IDF features feeding a linear classifier trained with SGD.
pipeline = Pipeline([
    ("vect", TfidfVectorizer(
        strip_accents='unicode',
        tokenizer=preprocess_doc,
        # Passed as a list, the container type documented for `stop_words`.
        stop_words=sorted(stop))),
    ("evaluation", linear_model.SGDClassifier()),
])


def evaluate(pipeline, X, y):
    """Scoring callable for GridSearchCV: accuracy of `pipeline`'s predictions on (X, y)."""
    pred = pipeline.predict(X)
    return accuracy_score(y, pred)


params = dict(
    vect__use_idf=[True, False],
    vect__max_df=[0.1, 0.25, 0.5, 0.85, 1.0],
    vect__ngram_range=[(1, 1), (1, 2)],
)

# The search must be bound to `word_gs`, the name all following statements use;
# binding it only to `gs` raised a NameError on a fresh kernel.
word_gs = GridSearchCV(pipeline, params, scoring=evaluate, verbose=1, n_jobs=2)
gs = word_gs  # previous name of this object, kept as an alias

# The first 90 % of the rows are used for the grid search; the rest is held out.
n_train = int(len(data) * 0.9)

word_gs.fit(data.question.iloc[:n_train], y=data.parent_id.iloc[:n_train])

print("Best model params {}".format(word_gs.best_params_))

print("Best cv score: {}".format(word_gs.best_score_))

# Accuracy of the refitted best model on the held-out rows.
pred = word_gs.predict(data.question.iloc[n_train:])
final_score = accuracy_score(data.parent_id.iloc[n_train:], pred)
print("Evaluation score: {}".format(final_score))

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   18.0s
[Parallel(n_jobs=2)]: Done  60 out of  60 | elapsed:   23.6s finished


{'vect__ngram_range': (1, 2), 'vect__max_df': 0.5, 'vect__use_idf': True}
0.597125067261


## Bag of characters

In [31]:
# TF-IDF vectorizer using the character-level "char_wb" analyzer, otherwise default settings.
vectorizer = TfidfVectorizer(analyzer="char_wb")

In [32]:
# Fit the vectorizer on all questions. The grid-search cell below builds its own
# TfidfVectorizer, so this fitted instance is not reused there.
vectorizer.fit(data.question)

TfidfVectorizer(analyzer='char_wb', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [36]:
# Character-level ("char_wb") TF-IDF features feeding a linear classifier trained with SGD.
pipeline = Pipeline(steps=[
    ("vect", TfidfVectorizer(analyzer="char_wb")),
    ("evaluation", linear_model.SGDClassifier()),
])


def evaluate(pipeline, X, y):
    """Scoring callable for GridSearchCV: accuracy of `pipeline`'s predictions on (X, y)."""
    return accuracy_score(y, pipeline.predict(X))


# Search space: IDF on/off, document-frequency cut-off, and n-gram upper bound 1..9 (odd values).
params = {
    "vect__use_idf": [True, False],
    "vect__max_df": [0.1, 0.25, 0.5, 0.85, 1.0],
    "vect__ngram_range": [(1, upper) for upper in (1, 3, 5, 7, 9)],
}

char_gs = GridSearchCV(pipeline, params, scoring=evaluate, verbose=1, n_jobs=2)

# The first 90 % of the rows are used for the grid search; the rest is held out.
n_train = int(len(data) * 0.9)
train_questions = data.question.iloc[:n_train]
train_labels = data.parent_id.iloc[:n_train]
test_questions = data.question.iloc[n_train:]
test_labels = data.parent_id.iloc[n_train:]

char_gs.fit(train_questions, y=train_labels)

print("Best model params {}".format(char_gs.best_params_))
print("Best cv score: {}".format(char_gs.best_score_))

# Accuracy of the refitted best model on the held-out rows.
pred = char_gs.predict(test_questions)
final_score = accuracy_score(test_labels, pred)
print("Evaluation score: {}".format(final_score))

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:  1.6min
[Parallel(n_jobs=2)]: Done 150 out of 150 | elapsed:  6.2min finished


{'vect__ngram_range': (1, 7), 'vect__max_df': 0.5, 'vect__use_idf': True}
0.637712352986


## w2v topic cluster

In [75]:
# Word vectors read from "german.model" in the binary word2vec format.
# Loaded via KeyedVectors: Word2Vec.load_word2vec_format is deprecated in gensim
# and no longer usable in current releases. Lookups (`w2v[word]`) work the same way.
w2v = KeyedVectors.load_word2vec_format("german.model", binary=True)

In [128]:
class TopicCluster(object):
    """Rewrite documents as sequences of word-cluster ("topic") tokens.

    `fit` collects the word vectors of all known words in the training
    documents and clusters them with KMeans. `transform` then replaces every
    known word of a document by ``"TOPIC_<cluster label>"`` and joins the
    result with spaces, so the output can be fed to a text vectorizer.

    The class implements `get_params` / `set_params`, the parameter protocol
    scikit-learn uses to clone estimators and to apply parameter grids
    (e.g. ``topic__n_topics`` in a GridSearchCV over a Pipeline).

    Parameters
    ----------
    n_topics : int, optional
        Number of KMeans clusters.
    stop_words : collection of str or None, optional
        Words removed by `_clean_doc` when called with ``use_stopwords=True``.
    w2v_model : object, optional
        Word-vector lookup supporting ``w2v_model[word]`` and raising
        ``KeyError`` for unknown words. Stored as ``self.model``.
    """

    # Constructor arguments exposed through get_params / set_params.
    _param_names = ("n_topics", "stop_words", "w2v_model")

    def __init__(self, n_topics=100, stop_words=None, w2v_model=None):
        self.tokenizer = RegexpTokenizer(r"\w+")
        self.model = w2v_model
        self.n_topics = n_topics
        self.clustering = None  # fitted KMeans instance, set by fit()
        self.points = None      # word vectors the clustering was fitted on
        self.stop_words = stop_words

    def get_params(self, deep=True):
        """Return the constructor parameters as a dict (`deep` is accepted and ignored)."""
        return {
            "n_topics": self.n_topics,
            "stop_words": self.stop_words,
            "w2v_model": self.model,
        }

    def set_params(self, **params):
        """Set constructor parameters by name and return ``self``.

        Raises
        ------
        ValueError
            If a name is not one of the constructor parameters.
        """
        for name, value in params.items():
            if name not in self._param_names:
                raise ValueError("Invalid parameter {!r} for TopicCluster".format(name))
            if name == "w2v_model":
                # The model is stored under `self.model`.
                self.model = value
            else:
                setattr(self, name, value)
        return self

    def fit(self, X, *args, **kwargs):
        """Cluster the word vectors of all known words in `X`.

        Parameters
        ----------
        X : iterable of str
            Training documents. Extra positional/keyword arguments (such as
            the labels passed by a Pipeline) are ignored.

        Returns
        -------
        TopicCluster
            ``self``, so that ``fit(X).transform(X)`` works inside a Pipeline.

        Raises
        ------
        ValueError
            If no word of `X` is known to the word-vector model.
        """
        self.points = self._get_embedding(X)
        if not self.points:
            raise ValueError("No word of the training documents is known to the word-vector model")
        self.clustering = KMeans(n_clusters=self.n_topics).fit(self.points)
        return self

    def transform(self, X, *args, **kwargs):
        """Map every document to a space-separated string of ``TOPIC_<label>`` tokens.

        Words unknown to the word-vector model are skipped; a document without
        any known word becomes the empty string.

        Returns
        -------
        list of str
            One string per input document, in input order.
        """
        transformed = []
        for doc in X:
            vectors = self._embed_words(self._clean_doc(doc))
            if not vectors:
                # np.vstack([]) would raise; keep the row so X and y stay aligned.
                transformed.append("")
                continue
            labels = self.clustering.predict(np.vstack(vectors))
            transformed.append(" ".join("TOPIC_{}".format(label) for label in labels))
        return transformed

    def _embed_words(self, words):
        """Return the vectors of all `words` known to the model, in order."""
        vectors = []
        for w in words:
            try:
                vectors.append(self.model[w])
            except KeyError:
                # Out-of-vocabulary word: deliberately skipped.
                continue
        return vectors

    def _get_embedding(self, doc):
        """Return a flat list with the vectors of all known words of all documents in `doc`."""
        res = []
        for words in map(self._clean_doc, doc):
            res.extend(self._embed_words(words))
        return res

    def _clean_doc(self, sen, use_stopwords=False):
        """Tokenize `sen`; with ``use_stopwords=True`` also drop words in ``self.stop_words``."""
        res = self.tokenizer.tokenize(sen)
        if use_stopwords and self.stop_words:
            res = [w for w in res if w not in self.stop_words]
        return res

In [None]:
# Documents -> word-cluster tokens (TopicCluster) -> TF-IDF -> linear classifier trained with SGD.
pipeline = Pipeline(steps=[
    ("topic", TopicCluster(w2v_model=w2v, stop_words=stop)),
    ("vect", TfidfVectorizer()),
    ("evaluation", linear_model.SGDClassifier()),
])


def evaluate(pipeline, X, y):
    """Scoring callable for GridSearchCV: accuracy of `pipeline`'s predictions on (X, y)."""
    return accuracy_score(y, pipeline.predict(X))


# Search space: number of word clusters plus the TF-IDF options.
params = {
    "topic__n_topics": [25, 50, 100, 250, 500, 1000, 2000],
    "vect__use_idf": [True, False],
    "vect__max_df": [0.1, 0.25, 0.5, 0.85, 1.0],
    "vect__ngram_range": [(1, 1), (1, 2)],
}

topic_gs = GridSearchCV(pipeline, params, scoring=evaluate, verbose=1, n_jobs=2)

# The first 90 % of the rows are used for the grid search; the rest is held out.
n_train = int(len(data) * 0.9)
train_questions = data.question.iloc[:n_train]
train_labels = data.parent_id.iloc[:n_train]
test_questions = data.question.iloc[n_train:]
test_labels = data.parent_id.iloc[n_train:]

topic_gs.fit(train_questions, y=train_labels)

print("Best model params {}".format(topic_gs.best_params_))
print("Best cv score: {}".format(topic_gs.best_score_))

# Accuracy of the refitted best model on the held-out rows.
pred = topic_gs.predict(test_questions)
final_score = accuracy_score(test_labels, pred)
print("Evaluation score: {}".format(final_score))

Fitting 3 folds for each of 140 candidates, totalling 420 fits
