# Writing Control

Here we use a basic version of active learning to select new data points and write them to our collection.

In [1]:
from __private import db

import pandas as pd 

import json
import os

In [2]:
# Keys kept on every record; `clean` discards any key that is not listed here.
cols = set([
    "_id",
    "created_at",
    "labels",
    "predict",
    "random_number",
    "text",
    "user",
])

Generate iterables over the records in the data files we need.

In [3]:
def clean(obj):
    """
    Normalise one raw record parsed from the export files.

    The ``{"$oid": ...}`` wrapper around ``_id`` is replaced by the plain id
    value, and only the keys listed in ``cols`` are kept. Every other key
    (``timestamp`` included, whether or not it is present) is dropped.
    No key is added: ``predict`` survives only if the record already has it.

    obj: dict for one record; ``obj["_id"]["$oid"]`` must exist.

    Returns a new dict. ``obj`` itself is left untouched, so the same record
    can safely be cleaned more than once.

    Raises KeyError if ``_id`` or its ``$oid`` entry is missing.
    """
    oid = obj["_id"]["$oid"]
    # Filtering on `cols` already removes "timestamp", so no explicit delete
    # is needed (the old `del` crashed on records without that key).
    return {k: (oid if k == "_id" else v) for k, v in obj.items() if k in cols}


def iter_dir_contents(fpath):
    """
    Lazily yield the cleaned record from every line of every file in ``fpath``.

    Each non-blank line is parsed with ``json.loads`` and passed through
    ``clean``. Files are visited in sorted name order so repeated runs see the
    records in the same sequence (``os.listdir`` order is arbitrary).

    fpath: directory path, with or without a trailing separator.

    Yields one cleaned dict per non-blank line.
    """
    for name in sorted(os.listdir(fpath)):
        # os.path.join works whether or not fpath ends with a separator;
        # plain string concatenation silently built a wrong path without one.
        full_path = os.path.join(fpath, name)
        with open(full_path, "r") as f:
            for line in f:
                if not line.strip():
                    # tolerate blank / trailing empty lines
                    continue
                yield clean(json.loads(line))
                
def data_control(path="/Users/JasonLiu/Downloads/research-data/drinking/split/"):
    """
    Return an iterator over the cleaned records of the control data set.

    path: directory holding the split export files. The default is the
        original author's local location; pass your own directory to run the
        notebook on another machine.

    Returns whatever ``iter_dir_contents(path)`` returns (a lazy iterator).
    """
    return iter_dir_contents(path)

In [4]:
from itpy import Itpy

Collect a sample of tweets: keep only the records whose `random_number` is below 0.1.

In [5]:
%%time
# Stream the control records and keep those whose random_number is below 0.1.
# NOTE(review): the trailing `._` presumably unwraps the Itpy pipeline into a
# plain collection — confirm against the itpy API.
x = (
    Itpy(data_control())
    .filter(lambda tweet: tweet["random_number"] < 0.1)
    ._
)

CPU times: user 1min 5s, sys: 6.5 s, total: 1min 12s
Wall time: 1min 13s


Convert the sampled records to a DataFrame.

In [6]:
# Working frame built from the sampled records; later cells add columns to it.
df = pd.DataFrame(x)

In [16]:
from __private import fs
from classification import dao

In [17]:
# Show what `fs` has stored. In the output, each entry reads
# "<label>|accuracy:<value>|f1:<value>|type:<model class>".
fs.list()

['alcohol|accuracy:0.8010471204188482|f1:0.82389289392379|type:SVC',
 'alcohol|accuracy:0.8144269924374636|f1:0.8344577062791905|type:SVC',
 'alcohol|accuracy:0.8237347294938918|f1:0.8478151682571572|type:SVC',
 'alcohol|accuracy:0.8289703315881326|f1:0.8509127789046653|type:SVC',
 'alcohol|accuracy:0.8301337987201861|f1:0.8513238289205702|type:RandomForestClassifier',
 'first_person_label|accuracy:0.52046783625731|f1:0.5009187878453114|type:SVC',
 'first_person_label|accuracy:0.5243664717348928|f1:0.508797109912656|type:RandomForestClassifier',
 'first_person_label|accuracy:0.530214424951267|f1:0.5136580488488517|type:SVC',
 'first_person_label|accuracy:0.5321637426900585|f1:0.5265047952844989|type:LogisticRegression',
 'first_person_label|accuracy:0.543859649122807|f1:0.5166687798266746|type:SVC',
 'first_person_label|accuracy:0.5536062378167641|f1:0.5349297830780407|type:SVC',
 'first_person_label|accuracy:0.5555555555555556|f1:0.53459052797563|type:RandomForestClassifier',
 'first_

In [18]:
%%time
# Fetch the "first_person" classifier by its stored name (label|accuracy|f1|type).
clf = dao.ClassifierAccess.get_byfile('first_person|accuracy:0.6947637292464879|f1:0.7941429801894918|type:SVC')

CPU times: user 360 ms, sys: 191 ms, total: 551 ms
Wall time: 3.76 s


In [19]:
%%time
# Score every row of df with the first-person classifier; column 1 of the
# result is stored later as df["i_first_person"].
y = clf.predict_proba(df)

CPU times: user 2min 15s, sys: 1.18 s, total: 2min 17s
Wall time: 2min 16s


In [20]:
%%time
# Fetch the "alcohol" classifier by its stored name (label|accuracy|f1|type).
clf2 = dao.ClassifierAccess.get_byfile('alcohol|accuracy:0.8289703315881326|f1:0.8509127789046653|type:SVC')

CPU times: user 39.3 ms, sys: 26.7 ms, total: 66 ms
Wall time: 1.06 s


In [21]:
%%time
# Score every row of df with the alcohol classifier; column 1 of the result
# is stored later as df["i_alcohol"].
y2 = clf2.predict_proba(df)

CPU times: user 15.7 s, sys: 93.9 ms, total: 15.8 s
Wall time: 15.7 s


In [54]:
# Second column of the first-person scores — presumably P(positive class);
# TODO confirm the classifier's class order.
df["i_first_person"] = y[:,1]

In [55]:
# Second column of the alcohol scores — presumably P(positive class);
# TODO confirm the classifier's class order.
df["i_alcohol"] = y2[:,1]

In [56]:
# Set control = 1 on every row. NOTE(review): presumably this marks the rows
# as control-set tweets once written to the collection — verify with readers
# of that field.
df["control"] = 1

In [103]:
# Rows that both classifiers score above 0.5 and whose text is longer than
# 40 characters, restricted to the columns that get written to the collection.
df_temp = df.loc[
    (df["i_alcohol"] > .5)
    & (df["i_first_person"] > .5)
    & (df["text"].str.len() > 40),
    ["_id", "created_at", "text", "user", "control", "i_alcohol", "i_first_person"],
]

In [104]:
# Preview only: rendering the whole selection bloats the saved notebook and
# exposes more user records than needed to sanity-check the filter.
df_temp.head(10)

Unnamed: 0,_id,created_at,text,user,control,i_alcohol,i_first_person
6,556ba147d6dfbb336fb8b5e3,Mon Jun 01 00:03:56 +0000 2015,Shut it down? I'm just getting started... (Und...,"{'lang': 'en', 'contributors_enabled': False, ...",1,0.558735,0.568471
7,556ba15ed6dfbb336fb8b6a8,Mon Jun 01 00:04:18 +0000 2015,&lt;Creedy&gt; fucking jesus\n&lt;Creedy&gt; i...,"{'lang': 'en', 'contributors_enabled': False, ...",1,0.647953,0.630342
9,556ba1afd6dfbb336fb8b9ff,Mon Jun 01 00:05:40 +0000 2015,"I think my cup is getting muddy, oh buddy\nIs ...","{'lang': 'en', 'contributors_enabled': False, ...",1,0.512692,0.841444
12,556ba1ced6dfbb336fb8bb17,Mon Jun 01 00:06:10 +0000 2015,HOLD US TIGHT\nWE'RE GETTING DRUNK TONIGHT #i\...,"{'lang': 'en', 'contributors_enabled': False, ...",1,0.989330,0.582232
19,556ba263d6dfbb336fb8c141,Mon Jun 01 00:08:40 +0000 2015,Drinking a Rams Head IPA by @FoDoBrewing @ Bar...,"{'lang': 'en', 'contributors_enabled': False, ...",1,0.959078,0.800684
23,556ba2c8d6dfbb336fb8c582,Mon Jun 01 00:10:21 +0000 2015,Drinking a Summerweisse by @BottleLogicBrew at...,"{'lang': 'en', 'contributors_enabled': False, ...",1,0.996679,0.622957
28,556ba2f8d6dfbb336fb8c77e,Mon Jun 01 00:11:08 +0000 2015,I'm clearly a few drinks in 😂 #houston #weddin...,"{'lang': 'en', 'contributors_enabled': False, ...",1,0.679625,0.527819
32,556ba35fd6dfbb336fb8cb9e,Mon Jun 01 00:12:51 +0000 2015,The last song you wanna hear when you are drin...,"{'lang': 'en', 'contributors_enabled': False, ...",1,0.955309,0.640730
39,556ba403d6dfbb336fb8d238,Mon Jun 01 00:15:37 +0000 2015,Drinking a Whitsun Ale by @ArcadiaAles @ Delbr...,"{'lang': 'en', 'contributors_enabled': False, ...",1,0.973367,0.754687
41,556ba434d6dfbb336fb8d42c,Mon Jun 01 00:16:24 +0000 2015,Yes I still want her in a whiskey kinda way!,"{'lang': 'en', 'contributors_enabled': False, ...",1,0.770647,0.754765


In [105]:
# One dict per selected row (column name -> value), produced lazily.
# The generator is single-use: re-create it before iterating a second time.
batch = (record for record in df_temp.T.to_dict().values())

In [106]:
# Best-effort upload: one failed insert (presumably a document whose _id is
# already stored) must not abort the rest of the batch. Failures are recorded
# instead of being silently discarded, and only Exception is caught so that
# Ctrl-C / kernel interrupt still stops the loop.
inserted = 0
failed = []  # (_id, error repr) for every record that could not be written
for record in batch:
    try:
        db.insert_one(record)
    except Exception as err:
        failed.append((record.get("_id"), repr(err)))
    else:
        inserted += 1

print("inserted {} records, {} failed".format(inserted, len(failed)))

In [49]:
# Borderline rows: c1 in (0.5, 0.7), c2 in (0.5, 0.6), text longer than 40
# characters; indexed by _id and ordered by random_number.
# NOTE(review): no cell visible in this notebook creates columns `c1`/`c2`.
# They look like earlier names of the classifier probability columns —
# confirm which is which before re-running on a fresh kernel.
# `DataFrame.sort` no longer exists in pandas; `sort_values` is its replacement.
X = (
    df[
        (df.c1 > .5) &
        (df.c1 < 0.7) &
        (df.c2 < .6) &
        (df.c2 > .5) &
        (df.text.str.len() > 40)
    ]
    .set_index("_id")
    .sort_values("random_number")[["text", "random_number"]]
)

In [93]:
import datetime

# Today's date as text; it becomes part of the exported file name.
date = str(datetime.date.today())

# The 500 rows with the smallest random_number. `sort_values` replaces the
# removed `DataFrame.sort`, and `.copy()` makes `candidate` an independent
# frame so the later column assignment cannot touch (or warn about) X.
candidate = X.sort_values("random_number").head(500).copy()

# Fingerprint of the selection (sum of the hashes of its random numbers);
# used together with the date to name the exported file.
key = candidate.random_number.apply(hash).sum()

In [94]:
# Encode every tweet text as UTF-8 before export.
# NOTE(review): on Python 3 this produces bytes objects, which would probably
# be written to CSV as b'...' — confirm the target interpreter.
candidate["text"] = candidate["text"].apply(
    lambda text: text.encode('UTF-8')
)

In [95]:
# Export text and random_number (the _id index is written too) to a CSV file
# whose name carries the date and the selection fingerprint.
candidate.loc[:, ["text", "random_number"]].to_csv("../amt/c_amt|{}|{}.csv".format(date, key))

In [97]:
# Count how many candidate ids are already present in the collection
# (one find_one lookup per id).
c = sum(1 for doc_id in candidate.index if db.find_one({"_id": doc_id}))

In [99]:
%%bash

# Open the current working directory in the desktop file browser
# (`open` is presumably the macOS launcher here — not portable to other systems).
open .