In [1]:
import os

In [2]:
# Both input files live side by side under data/twitter (relative to the
# notebook's working directory).
twitter_data_dir = os.path.join("data", "twitter")
input_filename = os.path.join(twitter_data_dir, "replicable_python_tweets.json")
classes_filename = os.path.join(twitter_data_dir, "replicable_python_classes.json")

In [3]:
import json
# The input file holds one JSON object per line; blank lines are skipped and
# only the 'text' field of each object is kept.
# The file is opened explicitly as UTF-8 so that decoding does not depend on
# the platform's default locale encoding.
with open(input_filename, encoding="utf-8") as inf:
    tweets = [json.loads(line)['text'] for line in inf if line.strip()]
print("Loaded {} tweets".format(len(tweets)))

Loaded 117 tweets


In [4]:
# The classes file is read as a single JSON document.
# Opened explicitly as UTF-8 so decoding does not depend on the platform's
# default locale encoding.
with open(classes_filename, encoding="utf-8") as inf:
    labels = json.load(inf)

In [5]:
# Number of usable samples: the length of the shorter of the two sequences,
# so every kept tweet has a label at the same position.
n_samples = min(map(len, (tweets, labels)))

In [6]:
# Keep the first n_samples tweets (lower-cased) and the first n_samples
# labels so the two sequences have equal length.
labelled_tweets = tweets[:n_samples]
sample_tweets = [tweet.lower() for tweet in labelled_tweets]
labels = labels[:n_samples]

In [7]:
import numpy as np
# Target vector as a NumPy array (labels is a plain Python list at this point,
# so this builds a fresh array from it).
y_true = np.asarray(labels)

In [8]:
# Fraction of samples whose label equals 1, reported as a percentage.
positive_share = np.mean(y_true == 1)
print("{:.1f}% have class 1".format(positive_share * 100))

52.1% have class 1


In [9]:
from sklearn.base import BaseEstimator, TransformerMixin
from nltk import word_tokenize

class NLTKBOW(TransformerMixin, BaseEstimator):
    """Stateless transformer that turns documents into bag-of-words dicts.

    Each document is tokenized with ``nltk.word_tokenize`` and mapped to a
    dict of ``{token: True}``; repeated tokens collapse to a single key, so
    only presence is recorded, not counts.

    Inherits from ``BaseEstimator`` (mixins listed first) so the transformer
    provides ``get_params``/``set_params`` like other scikit-learn estimators.
    """

    def fit(self, X, y=None):
        """Do nothing and return ``self``; there is nothing to learn.

        Parameters
        ----------
        X : iterable of documents (unused).
        y : ignored; accepted so the method matches the ``fit(X, y)`` form.
        """
        return self

    def transform(self, X):
        """Return one ``{token: True}`` dict per document in ``X``.

        Parameters
        ----------
        X : iterable of documents, each passed to ``word_tokenize``.

        Returns
        -------
        list of dict
            One dict per document, in input order.
        """
        return [{word: True for word in word_tokenize(document)}
                for document in X]

In [10]:
from sklearn.feature_extraction import DictVectorizer

In [11]:
from sklearn.naive_bayes import BernoulliNB

In [12]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer

In [13]:
# Tokenize -> vectorize the token dicts -> Bernoulli naive Bayes.
pipeline_steps = [
    ('bag-of-words', NLTKBOW()),
    ('vectorizer', DictVectorizer()),
    ('naive-bayes', BernoulliNB()),
]
pipeline = Pipeline(pipeline_steps)

# 10-fold cross-validation on the lower-cased tweets, scored by F1.
scores = cross_val_score(pipeline, sample_tweets, y_true, scoring='f1', cv=10)
mean_f1 = np.mean(scores)
print("Score: {:.3f}".format(mean_f1))

Score: 0.836


In [14]:
# Display the individual per-fold F1 scores returned by cross_val_score.
scores

array([0.83333333, 0.66666667, 0.92307692, 0.90909091, 0.90909091,
       0.8       , 0.92307692, 0.66666667, 0.72727273, 1.        ])

In [15]:
# Fit the final model on the same data that was cross-validated: the
# lower-cased tweets truncated to n_samples, paired with y_true. Fitting on
# the raw `tweets` list would train on differently-cased tokens than the ones
# the reported score was measured on, and would raise if there are more tweets
# than labels.
# NOTE: text passed to model.predict should be lower-cased the same way.
model = pipeline.fit(sample_tweets, y_true)

In [16]:
import joblib
# Destination of the serialized model, relative to the working directory.
model_dir = os.path.join("data", "twitter", "model")
model_filename = os.path.join(model_dir, "python_context.pkl")

In [17]:
# Make sure the target directory exists so the dump also works on a fresh
# checkout; exist_ok keeps re-runs of this cell harmless.
os.makedirs(os.path.dirname(model_filename), exist_ok=True)
# NOTE(review): the pipeline contains NLTKBOW, which is defined in this
# notebook; code that loads the pickle presumably needs the same class
# available under the same module path - confirm before deploying.
joblib.dump(model, model_filename)

['data/twitter/model/python_context.pkl']