In [1]:
import eli5
import pandas as pd
import numpy as np
from sklearn.metrics import *
from sklearn.svm import LinearSVC
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import GridSearchCV, KFold
count_vect = CountVectorizer(max_features=1000, min_df=0.01, max_df=0.4)

Using TensorFlow backend.


In [2]:
categories = ['talk.religion.misc', 'sci.crypt', 'misc.forsale', 'comp.sys.mac.hardware']

In [3]:
twenty_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True)
twenty_test = fetch_20newsgroups(subset='test', categories=categories, shuffle=True)

In [4]:
twenty_train.data[0]

'From: tcmay@netcom.com (Timothy C. May)\nSubject: Re: Once tapped, your code is no good any more.\nOrganization: NETCOM On-line Communication Services (408 241-9760 guest)\nX-Newsreader: Tin 1.1 PL5\nDistribution: na\nLines: 51\n\nBrad Templeton (brad@clarinet.com) wrote:\n: It occurs to me that if they get a wiretap order on you, and the escrow\n: houses release your code to the cops, your code is now no longer secure.\n: \n: It\'s in the hands of cops, and while I am sure most of the time they are\n: good, their security will not be as good as the escrow houses.\n: \n: \n: What this effectively means is that if they perform a wiretap on you,\n: at the end of the wiretap, they should be obligated to inform you that\n: a tap was performed, and replace (for free) the clipper chip in your\n: cellular phone so that it is once again a code known only to the\n: escrow houses.\n\nGetting the court order to reveal the key *also* makes decipherable\nall *past* conversations (which may be on t

In [5]:
twenty_test.data[0]

"Distribution: world\nFrom: Tony_Sullivan@mcontent.apana.org.au\nOrganization: MacContent BBS, Doncaster, Victoria, Australia\nReturn-Receipt-To: Tony_Sullivan@mcontent.apana.org.au\nSubject: Re: DeskWriter Drivers 3.1 -- How to install ?\nLines: 16\n\nCan someone tell me which of the files that come with DW-3.1 go where\nand for what purpose?  What can be left out, for instance, if\nyou don't want to do background printing?\n\nAs far as I can remember, all you need to do to get your Deskwriter up and\nprinting using the 3.1 driver is to drag the driver itself (either serial or\nappletalk depending on your needs) to the system folder. You don't need the\nfonts or anything else if all you want is straight forward, bare bones, basic\nprinting....I don't have anything else installed and can still print on a\nDeskwriter using sys7.1 and HP driver 3.1\nTony\n***************************************************************************\n The views expressed in this posting those of the individ

In [6]:
X_train = count_vect.fit_transform(twenty_train.data)

In [7]:
X_train

<2135x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 134527 stored elements in Compressed Sparse Row format>

In [8]:
X_test = count_vect.transform(twenty_test.data)

In [9]:
def bestparams(model, grid, folds, data, classes):
    grid_search = GridSearchCV(model, param_grid=grid, cv=folds, scoring='f1_macro')
    grid_search.fit(data, classes) 
    return grid_search.best_score_, grid_search.best_params_, grid_search.best_estimator_

In [10]:
n_fold = 10
folds = KFold(n_splits=n_fold, shuffle=True, random_state=0)

In [11]:
models = [LogisticRegression(), LinearSVC(), SGDClassifier()]

In [12]:
grids = [{'class_weight' : ['balanced', None], 'C': [1, 2], 'max_iter': [100, 150]},
        {'loss' : ['hinge', 'squared_hinge'], 'intercept_scaling' : [1, 2]},
        {'alpha': [0.0001, 0.05, 0.1], 'max_iter': [100, 200]}]

In [13]:
chosenmodels = []

In [14]:
trainscores = []

In [15]:
 for i in range(3):
    best_score, best_params, best_estimator = bestparams(models[i], grids[i], folds, X_train, twenty_train.target)
    print('Best score is {}'.format(best_score))
    print('Best parameters are {}'.format(best_params))
    trainscores.append(best_score)
    chosenmodels.append(best_estimator)

Best score is 0.9354025292320112
Best parameters are {'C': 1, 'class_weight': 'balanced', 'max_iter': 100}




Best score is 0.920787440458526
Best parameters are {'intercept_scaling': 1, 'loss': 'hinge'}
Best score is 0.9468505220519082
Best parameters are {'alpha': 0.05, 'max_iter': 200}


In [16]:
lr = chosenmodels[0].fit(X_train, twenty_train.target)
train_preds_lr = lr.predict(X_train)
test_preds_lr = lr.predict(X_test)
print('logreg train: {}'.format(f1_score(twenty_train.target, train_preds_lr, average='macro')))
print('logreg test: {}'.format(f1_score(twenty_test.target,test_preds_lr, average='macro')))

logreg train: 1.0
logreg test: 0.9030155991223628


In [17]:
svc = chosenmodels[1].fit(X_train, twenty_train.target)
train_preds_svc = svc.predict(X_train)
test_preds_svc = svc.predict(X_test)
print('logreg train: {}'.format(f1_score(twenty_train.target, train_preds_svc, average='macro')))
print('logreg test: {}'.format(f1_score(twenty_test.target,test_preds_svc, average='macro')))

logreg train: 1.0
logreg test: 0.88654588952225


In [18]:
sgd = chosenmodels[2].fit(X_train, twenty_train.target)
train_preds_sgd = sgd.predict(X_train)
test_preds_sgd = sgd.predict(X_test)
print('logreg train: {}'.format(f1_score(twenty_train.target, train_preds_sgd, average='macro')))
print('logreg test: {}'.format(f1_score(twenty_test.target,test_preds_sgd, average='macro')))

logreg train: 0.9815896106223347
logreg test: 0.9183836710615192


In [27]:
def analyze_features(model, n):
    index_to_word = {v:k for k,v in count_vect.vocabulary_.items()}
    eli = eli5.formatters.as_dataframe.explain_weights_df(model)
    compsysmachardware = eli[eli['target']==0]
    miscforsale = eli[eli['target']==1]
    scicrypt = eli[eli['target']==2]
    talkreligionmisc = eli[eli['target']==3]
    print(n, 'most significant compsysmachardware features: ')
    for element in compsysmachardware.feature[:n]:
        if element != '<BIAS>':
            feature = element.strip('x')
            print(index_to_word[int(feature)])
    print(n, 'most significant miscforsale features: ')
    for element in miscforsale.feature[:n]:
        if element != '<BIAS>':
            feature = element.strip('x')
            print(index_to_word[int(feature)])
    print(n, 'most significant scicrypt features: ')
    for element in scicrypt.feature[:n]:
        if element != '<BIAS>':
            feature = element.strip('x')
            print(index_to_word[int(feature)])
    print(n, 'most significant talkreligionmisc features: ')
    for element in talkreligionmisc.feature[:n]:
        if element != '<BIAS>':
            feature = element.strip('x')
            print(index_to_word[int(feature)])

In [28]:
analyze_features(lr, 10)

10 most significant compsysmachardware features: 
mac
apple
quadra
comp
duo
macs
buy
hardware
powerbook
simms
10 most significant miscforsale features: 
sale
forsale
wanted
sell
condition
offer
trade
commercial
interested
10 most significant scicrypt features: 
clipper
encryption
key
security
crypto
netcom
nsa
gtoal
code
chip
10 most significant talkreligionmisc features: 
god
christian
koresh
bible
utexas
frank
religion
virginia
sandvik
morality


In [29]:
analyze_features(svc, 5)

5 most significant compsysmachardware features: 
comp
macs
duo
buy
quadra
5 most significant miscforsale features: 
forsale
sale
sell
wanted
trade
5 most significant scicrypt features: 
nsa
clipper
crypto
security
key
5 most significant talkreligionmisc features: 
god
koresh
christian
utexas
never


In [30]:
analyze_features(sgd, 5)

5 most significant compsysmachardware features: 
mac
apple
quadra
powerbook
thanks
5 most significant miscforsale features: 
sale
wanted
forsale
offer
sell
5 most significant scicrypt features: 
clipper
encryption
security
key
code
5 most significant talkreligionmisc features: 
god
christian
bible
religion
koresh


Каких-то явных ошибок не видно, из странностей можно отметить koresh в talkreligionmisc и thanks в compsysmachardware.

In [67]:
new_count_vect = CountVectorizer(max_features=1000, min_df=0.009, max_df=0.39)

In [68]:
X_train = new_count_vect.fit_transform(twenty_train.data)
X_test = new_count_vect.transform(twenty_test.data)

In [69]:
chosenmodels2 = []

In [70]:
trainscores2 = []

In [71]:
 for i in range(3):
    best_score, best_params, best_estimator = bestparams(models[i], grids[i], folds, X_train, twenty_train.target)
    print('Best score is {}'.format(best_score))
    print('Best parameters are {}'.format(best_params))
    trainscores2.append(best_score)
    chosenmodels2.append(best_estimator)

Best score is 0.9354025292320112
Best parameters are {'C': 1, 'class_weight': 'balanced', 'max_iter': 100}




Best score is 0.9214645940907962
Best parameters are {'intercept_scaling': 2, 'loss': 'hinge'}
Best score is 0.9466082056212308
Best parameters are {'alpha': 0.05, 'max_iter': 100}


In [72]:
lr = chosenmodels[0].fit(X_train, twenty_train.target)
train_preds_lr = lr.predict(X_train)
test_preds_lr = lr.predict(X_test)
print('logreg train: {}'.format(f1_score(twenty_train.target, train_preds_lr, average='macro')))
print('logreg test: {}'.format(f1_score(twenty_test.target,test_preds_lr, average='macro')))

logreg train: 1.0
logreg test: 0.9044516043814963


In [73]:
svc = chosenmodels[1].fit(X_train, twenty_train.target)
train_preds_svc = svc.predict(X_train)
test_preds_svc = svc.predict(X_test)
print('logreg train: {}'.format(f1_score(twenty_train.target, train_preds_svc, average='macro')))
print('logreg test: {}'.format(f1_score(twenty_test.target,test_preds_svc, average='macro')))

logreg train: 1.0
logreg test: 0.8871886686216324


In [74]:
sgd = chosenmodels[2].fit(X_train, twenty_train.target)
train_preds_sgd = sgd.predict(X_train)
test_preds_sgd = sgd.predict(X_test)
print('logreg train: {}'.format(f1_score(twenty_train.target, train_preds_sgd, average='macro')))
print('logreg test: {}'.format(f1_score(twenty_test.target,test_preds_sgd, average='macro')))

logreg train: 0.9825556241456646
logreg test: 0.9203212954013573


Сильно изменить параметры для улучшения результата не удалось. Но чуть-чуть удалось.