In [4]:
from ml import *
from json_io import *
import pickle
from itertools import chain
from dvs import DictVectorizerStreaming

### Process tweets, if n is ommitted it processes all of them, sets lable and saves the processed tweets

In [5]:
path = JSON_DIR+"reddit/"
sarcastic_path = path+"sarcastic/"
serious_path = path+"serious/"
source = '-reddit-'
features_path = 'features/'
n=10

In [3]:
processRandomizeJson(sarcastic=True,
                     json_path=sarcastic_path,
                     features_path=features_path,
                     source=source,
                     n=n,
                     cleanTokens=cleanTokensReddit)
processRandomizeJson(sarcastic=False,
                     json_path=serious_path,
                     features_path=features_path,
                     source=source,
                     n=n,
                     cleanTokens=cleanTokensReddit)

File sarcastic0.json	time:	0:15:16.696618
Processed 100000 json lines
File sarcastic100000.json	time:	0:14:57.047648
Processed 200000 json lines
File sarcastic200000.json	time:	0:14:41.658604
Processed 300000 json lines
File sarcastic300000.json	time:	0:13:32.233240
Processed 400000 json lines
File sarcastic400000.json	time:	0:13:15.141237
Processed 500000 json lines
File sarcastic500000.json	time:	0:13:06.832235
Processed 600000 json lines
File sarcastic600000.json	time:	0:13:07.965238
Processed 700000 json lines
File sarcastic700000.json	time:	0:06:18.604113
Processed 748124 json lines
File serious0.json	time:	0:13:34.035242
Processed 100000 json lines
File serious100000.json	time:	0:13:17.878239
Processed 200000 json lines
File serious1000000.json	time:	0:13:15.545238
Processed 300000 json lines
File serious1100000.json	time:	0:13:19.632237
Processed 400000 json lines
File serious1200000.json	time:	0:05:04.537094
Processed 438060 json lines
File serious200000.json	time:	0:13:14.2672

### Load random set of features

In [6]:
sarcasticFeats = loadProcessedFeatures(features_path,
                                       source,
                                       sarcastic=True,
                                       n=n,
                                       random=False)
seriousFeats = loadProcessedFeatures(features_path,
                                     source,
                                     sarcastic=False,
                                     n=n,
                                     random=False)
features = chain(sarcasticFeats, seriousFeats)

### Flatten feature dictionaries, if leaveout is a feature that feature is ommitted

In [7]:
def flattenDict(feature):
    d = {}
    for key, value in feature.items():
        if isinstance(value, dict):
            for subkey, subvalue in value.items():
                d[subkey] = subvalue
        else:
            d[key] = value
    return d

In [8]:
def fit(features):
    vocab = {}
    for feats in features:
        startTime = datetime.now()
        for word in flattenDict(feat):
            vocab.setdefault(word, len(vocab))
    feature_names = sorted(vocab, key=vocab.get)
    return vocab, feature_names

In [9]:
def transform(features, vocab={}):
    dtype = np.float32
    indptr = [0]
    indices = []
    X = []
    y = []
    for feats, sarc in features:
        startTime = datetime.now()
        for word, val in flattenDict(feats).items():
            if word in vocab:
                indices.append(vocab[f])
                X.append(dtype(v))
        indptr.append(len(indices))
        y.append(sarc)
        if len(y) % 100000 == 0:
            stopTime = datetime.now()
            print("Time:\t%s" % (stopTime - startTime))
            print("Processed %d vectors"%len(y))
            startTime = datetime.now()
    return sp.csr_matrix((X, indices, indptr), dtype=dtype), np.array(y)

In [10]:
def fit_transform(features, vocab={}):
    dtype = np.float32
    indptr = [0]
    indices = []
    X = []
    y = []
    for feats, sarc in features:
        startTime = datetime.now()
        for word, val in flattenDict(feats).items():
            if fit:
                index = vocab.setdefault(word, len(vocab))
                indices.append(index)
                X.append(dtype(val))
            elif word in vocab:
                indices.append(vocab[f])
                X.append(dtype(v))
        indptr.append(len(indices))
        y.append(sarc)
        if len(y) % 100000 == 0:
            stopTime = datetime.now()
            print("Time:\t%s" % (stopTime - startTime))
            print("Processed %d vectors"%len(y))
            startTime = datetime.now()
    feature_names = sorted(vocab, key=vocab.get)
    return sp.csr_matrix((X, indices, indptr), dtype=dtype), np.array(y), vocab, feature_names

In [11]:
(X,y,vocab, feature_names) = fit_transform(features)

Time:	0:00:00
Processed 100000 vectors
Time:	0:00:00
Processed 200000 vectors
Time:	0:00:00.000979
Processed 300000 vectors
Time:	0:00:00
Processed 400000 vectors
Time:	0:00:00
Processed 500000 vectors
Time:	0:00:00.001000
Processed 600000 vectors
Time:	0:00:00.001000
Processed 700000 vectors
Time:	0:00:00.001000
Processed 800000 vectors
Time:	0:00:00
Processed 900000 vectors
Time:	0:00:00
Processed 1000000 vectors
Time:	0:00:00.001000
Processed 1100000 vectors
Time:	0:00:00.001000
Processed 1200000 vectors
Time:	0:00:00.001001
Processed 1300000 vectors
Time:	0:00:00
Processed 1400000 vectors
Time:	0:00:00.000999
Processed 1500000 vectors
Time:	0:00:00
Processed 1600000 vectors
Time:	0:00:00
Processed 1700000 vectors
Time:	0:00:00
Processed 1800000 vectors
Time:	0:00:00
Processed 1900000 vectors


NameError: name 'sp' is not defined

In [None]:
pickle.dump(vocab, open('pickled/-reddit-vectorizer.pickle', 'wb'))
pickle.dump(y, open('pickled/-reddit-sarcasmVector.pickle', 'wb'))
pickle.dump(X, open('pickled/-reddit-featureVector.pickle', 'wb'))

In [3]:
X = pickle.load(open('pickled/-reddit-featureVector.pickle', 'rb'))
y = pickle.load(open('pickled/-reddit-sarcasmVector.pickle', 'rb'))

### Train and test, reports results

In [4]:
trainTest(X,
          y,
          classifiers=DEFAULT_CLASSIFIERS,
          reduce=0,
          splits=1,
          trainsize=0.2,
          testsize=0.2)

ValueError: setting an array element with a sequence.

### Testing saved classifier with data

In [6]:
listOfClassifiersDV = loadClassifiersDV('pickled/2017-04-25205540579589 2voting.pickle')
(classifier, dv) = listOfClassifiersDV[0]
testSavedClassifier(features, sarcasm, classifier[0], dv)

Score:	0.545430


{'score': 0.54542973180292997}