In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

In [6]:
train_path = 'https://kaggle2.blob.core.windows.net/competitions-data/inclass/6441/linear_train.txt?sv=2015-12-11&sr=b&sig=fTqXqYzEK3FTZ9U8Du7Hxamitpga7%2BZsuQnbm6tFFUw%3D&se=2017-04-12T15%3A51%3A59Z&sp=r'
test_path = 'https://kaggle2.blob.core.windows.net/competitions-data/inclass/6441/linear_test.txt?sv=2015-12-11&sr=b&sig=PCxO95%2Batezu0YUs%2FW97ebebbkJCJQL5UuP80YzPh6Q%3D&se=2017-04-12T15%3A53%3A53Z&sp=r'
train = pd.read_csv(train_path, names=["word", "target"])
test = pd.read_csv(test_path, names=["word"])

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.grid_search import GridSearchCV



In [14]:
pipeline = make_pipeline(CountVectorizer(min_df=5, ngram_range=(2, 5), analyzer='char_wb', lowercase=False, binary=True), 
                         LogisticRegression(penalty='l1', C=0.7))
arr = cross_val_score(pipeline, train.word, train.target, cv=5, scoring='roc_auc')
print arr
print np.mean(arr)

[ 0.54797772  0.34331718  0.246279    0.75904001  0.75530851]
0.530384485591


In [74]:
pipeline.fit(train.word, train.target)

Pipeline(steps=[('countvectorizer', CountVectorizer(analyzer='char_wb', binary=True, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=False, max_df=1.0, max_features=None, min_df=5,
        ngram_range=(2, 5), preprocessor=None, stop_words=Non...ty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [75]:
roc_auc_score(train.target, pipeline.predict(train.word))

0.78598900348365863

In [76]:
from sklearn.svm import LinearSVC

Посмотрим результат на кросс валидации для разных моделей:

In [77]:
pipeline = make_pipeline(CountVectorizer(min_df=5, ngram_range=(2, 5), analyzer='char_wb', lowercase=False, binary=True), 
                         SGDClassifier())
arr = cross_val_score(pipeline, train.word, train.target, cv=5, scoring='roc_auc')
print arr
print np.mean(arr)

[ 0.57778435  0.41324666  0.34933166  0.78071966  0.71910147]
0.56803676085


In [16]:
from sklearn.cross_validation import StratifiedShuffleSplit

In [17]:
cv = StratifiedShuffleSplit(train.target, n_iter=5, test_size=0.2)

In [20]:
pipeline = make_pipeline(CountVectorizer(min_df=5, ngram_range=(1, 4), analyzer='char_wb', lowercase=False), 
                         LogisticRegression(penalty='l1', C=1))
arr = cross_val_score(pipeline, train.word, train.target, cv=cv, scoring='roc_auc')
print arr
print np.mean(arr)

[ 0.90835669  0.91436489  0.91435337  0.91231753  0.91134158]
0.912146810889


In [19]:
pipeline.get_params().keys()

['countvectorizer__encoding',
 'logisticregression__n_jobs',
 'countvectorizer__decode_error',
 'logisticregression__dual',
 'countvectorizer__input',
 'logisticregression__warm_start',
 'countvectorizer__stop_words',
 'countvectorizer__tokenizer',
 'logisticregression__multi_class',
 'logisticregression',
 'logisticregression__random_state',
 'logisticregression__verbose',
 'countvectorizer__lowercase',
 'countvectorizer__min_df',
 'countvectorizer__strip_accents',
 'logisticregression__class_weight',
 'countvectorizer__preprocessor',
 'logisticregression__intercept_scaling',
 'countvectorizer__binary',
 'countvectorizer__token_pattern',
 'countvectorizer__max_df',
 'logisticregression__max_iter',
 'countvectorizer__max_features',
 'countvectorizer__ngram_range',
 'logisticregression__solver',
 'logisticregression__fit_intercept',
 'countvectorizer__analyzer',
 'countvectorizer',
 'logisticregression__penalty',
 'logisticregression__C',
 'steps',
 'countvectorizer__vocabulary',
 'coun

Подберём параметры count_vectorizer с помощью grid search:

'logisticregression__C':[0.001, 0.01, 0.1, 0.2, 0.3, 0.5, 1], 

In [22]:
parameters = {'countvectorizer__min_df':[1, 2, 3, 4, 5],
              'countvectorizer__ngram_range':[(2, 4), (2, 5), (1, 3), (2, 3), (1, 4), (1, 5)]
}

In [23]:
gs_clf = GridSearchCV(pipeline, parameters, n_jobs=1, scoring='roc_auc')

In [24]:
gs_clf = gs_clf.fit(train.word[:3000], train.target[:3000])

In [25]:
best_parameters, score, _ = max(gs_clf.grid_scores_, key=lambda x: x[1])
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))

countvectorizer__min_df: 1
countvectorizer__ngram_range: (2, 5)


In [49]:
pipeline = make_pipeline(CountVectorizer(min_df=2, ngram_range=(2, 5), analyzer='char_wb', lowercase=False, binary=False), 
                         LogisticRegression(penalty='l1', C=1.0))
arr = cross_val_score(pipeline, train.word, train.target, cv=cv, scoring='roc_auc')
print arr
print np.mean(arr)

[ 0.91675139  0.91849583  0.91380319  0.91597399  0.91955182]
0.916915243104


In [50]:
pipeline.fit(train.word, train.target)

Pipeline(steps=[('countvectorizer', CountVectorizer(analyzer='char_wb', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=False, max_df=1.0, max_features=None, min_df=2,
        ngram_range=(2, 5), preprocessor=None, stop_words=No...ty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [51]:
pipeline.predict_proba(train.word)[:, 1]

array([ 0.36121852,  0.19138054,  0.20404121, ...,  0.08519476,
        0.00899449,  0.01748734])

In [52]:
roc_auc_score(train.target, pipeline.predict_proba(train.word)[:, 1])

0.97172093612495769

In [53]:
pipeline.predict_proba(test.word)[:, 1]

array([ 0.39627602,  0.21694818,  0.18684353, ...,  0.02677278,
        0.00086459,  0.0006587 ])

In [54]:
pred = pipeline.predict_proba(test.word)[:, 1]
submit = pd.DataFrame()
submit['Id'] = xrange(len(test))
submit['Answer'] = pred
submit.to_csv('submit.csv', index=None)
submit[:5]

Unnamed: 0,Id,Answer
0,0,0.396276
1,1,0.216948
2,2,0.186844
3,3,0.100816
4,4,0.204041


0.83111882725974007