In [1]:
import ml
import nlp
import json_io
import pickle
from itertools import chain
from dvs import DictVectorizerPartial
import numpy as np

### Process comments

In [None]:
# Input locations: the Reddit JSON directory under ml.JSON_DIR, with one
# sub-directory per class.
path = ml.JSON_DIR + "reddit/"
sarcastic_path, serious_path = path + "sarcastic/", path + "serious/"

# Tag and output directory passed to the json_io helpers in later cells.
source, features_path = '-reddit-', 'features/'

# Passed as `n=` to json_io.processRandomizeJson.
# NOTE(review): presumably the number of output chunks -- confirm in json_io.
n = 10

In [None]:
# Arguments that are identical for the sarcastic and the serious pass.
shared_process_args = dict(features_path=features_path,
                           source=source,
                           n=n,
                           cleanTokens=nlp.cleanTokensReddit)

# Process each class from its own JSON directory, sarcastic first.
json_io.processRandomizeJson(sarcastic=True, json_path=sarcastic_path, **shared_process_args)
json_io.processRandomizeJson(sarcastic=False, json_path=serious_path, **shared_process_args)

### Load set of features

In [None]:
# Load the processed features for each class.
# NOTE(review): n differs between the classes (5 vs 3); its exact meaning
# is defined by json_io.loadProcessedFeatures -- confirm this is intended.
sarcasticFeats = json_io.loadProcessedFeatures(
    features_path, source, sarcastic=True, n=5, random=False
)
seriousFeats = json_io.loadProcessedFeatures(
    features_path, source, sarcastic=False, n=3, random=False
)

# Lazily concatenate both classes: sarcastic items first, then serious.
features = chain(sarcasticFeats, seriousFeats)

### Flatten feature dictionaries; if `leaveout` is a feature, that feature is omitted

In [None]:
# Vectorizer instance; it is fitted via partial_fit_transform and pickled
# in later cells.
dvp = DictVectorizerPartial()

In [None]:
# Split the combined feature stream into inputs X and labels y.
# NOTE(review): the meaning of the second argument (2) is defined by
# ml.split_feat -- confirm there.
X, y = ml.split_feat(features, 2)

In [None]:
# Flatten the feature dictionaries with ml.flatten, rebinding X and y.
X, y = ml.flatten(X, y)

In [None]:
# Vectorize X with the DictVectorizerPartial instance and materialize the
# labels into a NumPy array. Both right-hand sides are evaluated before
# X and y are rebound.
X, y = (
    dvp.partial_fit_transform(X),
    np.array(list(y)),
)

In [None]:
# Persist the vectorizer, the labels and the feature matrix so later cells
# can reload them without recomputing. Each file is opened in a `with`
# block so the handle is flushed and closed as soon as the dump finishes
# (the bare open(...) calls used previously were never closed).
with open('pickled/-reddit-dvp.pickle', 'wb') as dvp_file:
    pickle.dump(dvp, dvp_file)
with open('pickled/-reddit-y.pickle', 'wb') as y_file:
    pickle.dump(y, y_file)
with open('pickled/-reddit-X.pickle', 'wb') as X_file:
    pickle.dump(X, X_file)

In [2]:
# Reload the cached feature matrix and labels written by the dump cell.
# `with` closes each handle after loading (previously they were leaked).
# NOTE: pickle.load can execute arbitrary code -- only load files that this
# notebook (or another trusted source) produced.
with open('pickled/-reddit-X.pickle', 'rb') as X_file:
    X = pickle.load(X_file)
with open('pickled/-reddit-y.pickle', 'rb') as y_file:
    y = pickle.load(y_file)

### Train and test, reports results

In [4]:
from sklearn.naive_bayes import MultinomialNB

# Sweep over feature-reduction amounts and training-set fractions with a
# Multinomial Naive Bayes classifier. Each entry appended to `results` is
# (reduceamount, trainsize, <return value of ml.trainTest>).
# NOTE(review): reduce=0 presumably means "no feature reduction" -- confirm
# in ml.trainTest.
results = []
for reduceamount in [0, 2500000, 1000000, 500000, 100000]: #, 50000, 25000, 10000, 7500, 5000, 2500, 1500, 1000, 750, 500, 250, 100, 50, 10, 5]:
    print("\n\t\tReduction: "+str(reduceamount))
    for trainsize in [0.01, 0.05, 0.1, 0.2, 0.4, 0.6, 0.8]:
        print("\n\t\tTraining size: "+str(trainsize))
        run_output = ml.trainTest(X,
                                  y,
                                  classifiers=[MultinomialNB()],
                                  reduce=reduceamount,
                                  splits=2,
                                  trainsize=trainsize,
                                  testsize=0.2)
        results.append((reduceamount, trainsize, run_output))

# Save the whole sweep; the `with` block guarantees the file is flushed and
# closed (the previous bare open(...) handle was never closed).
with open('pickled/-reddit-trained-mnbayes.pickle', 'wb') as results_file:
    pickle.dump(results, results_file)
print(results)


		Reduction: 0

		Training size: 0.01
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.568462
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.619744

		Training size: 0.05
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.673811
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 1	Score:	0.691526

		Training size: 0.1
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 1	Score:	0.700810
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 1	Score:	0.702373

		Training size: 0.2
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 1	

  f = msb / msw


Features after reduction: (7453, 2500000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.639258
Features before reduction: (7453, 12490143)
Features after reduction: (7453, 2500000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.640526

		Training size: 0.05
Features before reduction: (37268, 12490143)
Features after reduction: (37268, 2500000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.695404
Features before reduction: (37268, 12490143)
Features after reduction: (37268, 2500000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.696115

		Training size: 0.1
Features before reduction: (74537, 12490143)
Features after reduction: (74537, 2500000)
Starting to train <class 'sklearn.naive_bay

Features before reduction: (298150, 12490143)
Features after reduction: (298150, 500000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.722906
Features before reduction: (298150, 12490143)
Features after reduction: (298150, 500000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 0	Score:	0.725992

		Training size: 0.6
Features before reduction: (447226, 12490143)
Features after reduction: (447226, 500000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 1	Score:	0.726401
Features before reduction: (447226, 12490143)
Features after reduction: (447226, 500000)
Starting to train <class 'sklearn.naive_bayes.MultinomialNB'>
<class 'sklearn.naive_bayes.MultinomialNB'>	Time: 1	Score:	0.729252

		Training size: 0.8
Features before reduction: (596301, 12490143)
Features after reduction: (596301,

In [5]:
# Collapse each (reduction, train size, runs) entry of `results` into one
# (reduction, train size, mean of element [2] of every run) triple.
# NOTE(review): element [2] of a run is presumably its accuracy score --
# confirm against ml.trainTest.
xyz = []
for reduction, train_fraction, runs in results:
    scores = [run[2] for run in runs]
    mean_score = sum(scores) / len(scores)
    xyz.append((reduction, train_fraction, mean_score))

# Export the triples as JSON.
json_io.list_to_json(
    xyz,
    "-reddit-reduction-trainsize-accuracy-mnbayes.json",
    old_format=True,
)

In [6]:
from sklearn.linear_model import LogisticRegression

# Same sweep as the Naive Bayes cell, but with Logistic Regression and a
# single training fraction. Each entry appended to `results` is
# (reduceamount, trainsize, <return value of ml.trainTest>).
# NOTE: this rebinds `results`, replacing the Naive Bayes sweep in memory.
results = []
for reduceamount in [0, 2500000, 1000000]: #, 50000, 25000, 10000, 7500, 5000, 2500, 1500, 1000, 750, 500, 250, 100, 50, 10, 5]:
    print("\n\t\tReduction: "+str(reduceamount))
    for trainsize in [0.8]:
        print("\n\t\tTraining size: "+str(trainsize))
        run_output = ml.trainTest(X,
                                  y,
                                  classifiers=[LogisticRegression(n_jobs=-1)],
                                  reduce=reduceamount,
                                  splits=2,
                                  trainsize=trainsize,
                                  testsize=0.2)
        results.append((reduceamount, trainsize, run_output))

# Save the whole sweep; the `with` block guarantees the file is flushed and
# closed (the previous bare open(...) handle was never closed).
with open('pickled/-reddit-trained-log.pickle', 'wb') as results_file:
    pickle.dump(results, results_file)
print(results)


		Reduction: 0

		Training size: 0.01
Starting to train <class 'sklearn.linear_model.logistic.LogisticRegression'>
<class 'sklearn.linear_model.logistic.LogisticRegression'>	Time: 64	Score:	0.660871
Starting to train <class 'sklearn.linear_model.logistic.LogisticRegression'>
<class 'sklearn.linear_model.logistic.LogisticRegression'>	Time: 65	Score:	0.662199

		Training size: 0.05
Starting to train <class 'sklearn.linear_model.logistic.LogisticRegression'>
<class 'sklearn.linear_model.logistic.LogisticRegression'>	Time: 187	Score:	0.706700
Starting to train <class 'sklearn.linear_model.logistic.LogisticRegression'>
<class 'sklearn.linear_model.logistic.LogisticRegression'>	Time: 186	Score:	0.706814

		Training size: 0.1
Starting to train <class 'sklearn.linear_model.logistic.LogisticRegression'>
<class 'sklearn.linear_model.logistic.LogisticRegression'>	Time: 314	Score:	0.718493
Starting to train <class 'sklearn.linear_model.logistic.LogisticRegression'>
<class 'sklearn.linear_model.lo

KeyboardInterrupt: 

In [None]:
# Collapse each (reduction, train size, runs) entry of `results` into one
# (reduction, train size, mean of element [2] of every run) triple.
# NOTE(review): element [2] of a run is presumably its accuracy score --
# confirm against ml.trainTest.
xyz = []
for reduction, train_fraction, runs in results:
    scores = [run[2] for run in runs]
    mean_score = sum(scores) / len(scores)
    xyz.append((reduction, train_fraction, mean_score))

# Export the triples as JSON.
json_io.list_to_json(
    xyz,
    "-reddit-reduction-trainsize-accuracy-log.json",
    old_format=True,
)