# Example of how you should be using the API

The first thing we assume is that you have a folder of split CSV files. Afterwards we'll load classifiers from MongoDB and use them for classification.

In [1]:
from __private import fs

import pandas as pd

from os import listdir
from classification import dao

First we should locate the data and all of the mini-batches.

In [2]:
data_directory = "./example_data/"
# os.listdir returns names in arbitrary order; sort them so that positional
# lookups further down (twitter_data[1], twitter_data[3]) select the same file
# on every machine and on every run.
twitter_data = sorted(data_directory + fn for fn in listdir(data_directory))

In [3]:
# Print one mini-batch path per line.  The loop variable has a real name
# because assigning to `_` at the top level of a cell overrides IPython's
# "last output" variable for the rest of the session.
for path in twitter_data:
    print(path)

./example_data/2015_12_29_05_00_activities.csv.gz
./example_data/2015_12_29_07_00_activities.csv.gz
./example_data/2015_12_29_20_30_activities.csv.gz
./example_data/2015_12_29_21_30_activities.csv.gz


Then we should look at our available classifiers

In [4]:
# Print the name of every stored classifier, one per line.  The loop variable
# has a real name because assigning to `_` at the top level of a cell overrides
# IPython's "last output" variable for the rest of the session.
for classifier_name in fs.list():
    print(classifier_name)

alcohol|accuracy:0.8143360752056404|f1:0.8192219679633866|type:LogisticRegression
alcohol|accuracy:0.8401880141010576|f1:0.8498896247240618|type:SVC
alcohol|accuracy:0.8425381903642774|f1:0.8562231759656651|type:RandomForestClassifier
first_person_label|accuracy:0.5637860082304527|f1:0.5574430033343769|type:SVC
first_person_label|accuracy:0.5637860082304527|f1:0.5643693591852614|type:LogisticRegression
first_person|accuracy:0.6951871657754011|f1:0.8034482758620688|type:RandomForestClassifier
first_person|accuracy:0.7005347593582888|f1:0.7751004016064257|type:LogisticRegression
first_person|accuracy:0.7032085561497327|f1:0.8062827225130889|type:RandomForestClassifier
first_person|accuracy:0.7112299465240641|f1:0.8021978021978021|type:SVC


The following are the three classifiers we will end up using, chosen primarily for speed and performance.

```
alcohol +- first person +- (present) casual
                        +- (future) intention/looking
                        +- (past) reflecting
```

In [5]:
# Names of the three stored classifiers picked from the listing above:
# one for alcohol, one for first person, one for the first-person level.
clf_fn_alc = "alcohol|accuracy:0.8143360752056404|f1:0.8192219679633866|type:LogisticRegression"
clf_fn_fpa = "first_person|accuracy:0.7112299465240641|f1:0.8021978021978021|type:SVC"
clf_fn_fpl = "first_person_label|accuracy:0.5637860082304527|f1:0.5643693591852614|type:LogisticRegression"

In [7]:
%%time

# Fetch each stored classifier by its name (the introduction says they are
# loaded from MongoDB); %%time reports how long the three lookups take.
clf_alc = dao.ClassifierAccess.get_byfile(clf_fn_alc)
clf_fpa = dao.ClassifierAccess.get_byfile(clf_fn_fpa)
clf_fpl = dao.ClassifierAccess.get_byfile(clf_fn_fpl)

CPU times: user 2.41 s, sys: 2.26 s, total: 4.67 s
Wall time: 27.7 s


Now you have all of the classifiers loaded in.

In [13]:
clf_fpl.get_params()

{'clf': LogisticRegression(C=139.67415702201885, class_weight=None, dual=False,
           fit_intercept=True, intercept_scaling=1, max_iter=100,
           multi_class='ovr', n_jobs=None, penalty='l2', random_state=None,
           solver='liblinear', tol=0.000449897709599141, verbose=0,
           warm_start=None),
 'clf__C': 139.67415702201885,
 'clf__class_weight': None,
 'clf__dual': False,
 'clf__fit_intercept': True,
 'clf__intercept_scaling': 1,
 'clf__max_iter': 100,
 'clf__multi_class': 'ovr',
 'clf__n_jobs': None,
 'clf__penalty': 'l2',
 'clf__random_state': None,
 'clf__solver': 'liblinear',
 'clf__tol': 0.000449897709599141,
 'clf__verbose': 0,
 'clf__warm_start': None,
 'features': FeatureUnion(n_jobs=1,
        transformer_list=[('text', Pipeline(steps=[('getter', ItemGetter(key='text')), ('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
         dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
         lowercase=False, max_df

However, these models may not have been trained on the most recent data, which is why we next load all of the training data and retrain the models.

In [14]:
%%time

from data import DataAccess, LabelGetter

# X holds the training data (presumably a DataFrame, going by the method
# name — confirm).  L wraps X and supplies the arguments for the .fit(...)
# calls in the cells that follow.
X = DataAccess.get_as_dataframe()
L = LabelGetter(X)

CPU times: user 2.3 s, sys: 1.57 s, total: 3.87 s
Wall time: 10.9 s


Also note that we are not going to refit the alcohol model, since we believe it is performing well enough. The first-person model also takes a whole 10 minutes to fit, so beware.

In [16]:
%%time
# Refit the first-person-level classifier.  The result of
# get_first_person_label() is unpacked straight into fit(), so it presumably
# returns an (X, y) pair — TODO confirm.
clf_fpl.fit(*L.get_first_person_label())

CPU times: user 9.29 s, sys: 3.24 s, total: 12.5 s
Wall time: 17.6 s


Pipeline(steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('text', Pipeline(steps=[('getter', ItemGetter(key='text')), ('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=F...one,
          solver='liblinear', tol=0.000449897709599141, verbose=0,
          warm_start=None))])

In [17]:
%%time
# Refit the first-person classifier (the slow one: about 9 minutes in the
# recorded run).  The result of get_first_person() is unpacked straight into
# fit(), so it presumably returns an (X, y) pair — TODO confirm.
clf_fpa.fit(*L.get_first_person())

CPU times: user 8min 54s, sys: 8.71 s, total: 9min 3s
Wall time: 9min 10s


Pipeline(steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('text', Pipeline(steps=[('getter', ItemGetter(key='text')), ('tfidf', TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=F...probability=True,
  random_state=None, shrinking=True, tol=0.0008753898561476732,
  verbose=False))])

### Note: you should probably save these retrained models to disk to save time in the future.

# Loading some DataFrames to classify

Note that there is a `text` column; that's all we care about.

In [24]:
# Read the mini-batch at position 1 of twitter_data into a DataFrame and
# report how many rows it contains.
df = pd.read_csv(twitter_data[1])
print(len(df))

5759


In [25]:
df.head()

Unnamed: 0.1,Unnamed: 0,first_name,id,place,text,time,user_id
0,0,Shawn,681731561901830144,"Joint Base Lewis-McChord, WA",...listening to Stick Talk for the first time....,2015-12-29T07:00:44.000Z,id:twitter.com:99682561
1,1,1Eyewitness,681731560232468480,"Burbank, CA",@debrcarter @3ChicsPolitico @LorettaLynch but ...,2015-12-29T07:00:44.000Z,id:twitter.com:1517261017
2,2,Evette,681731564338806785,"Nashville, TN",middle part 💁🏾 https://t.co/4Cuzip3KJ6,2015-12-29T07:00:45.000Z,id:twitter.com:2710958186
3,3,Guapaholics✨,681731561344122880,"Boston, MA",@frankieclermont @Luisfinessee @desix3__ u kno...,2015-12-29T07:00:44.000Z,id:twitter.com:1190480407
4,4,Im,681731564816879616,"Houston, TX",If I give you a nick name I'm the only person ...,2015-12-29T07:00:45.000Z,id:twitter.com:2227423789


In [26]:
# Class-probability estimates from the alcohol classifier, one row per tweet.
# The whole DataFrame is passed in; the pipeline picks out the column(s) it needs.
predictions_alc = clf_alc.predict_proba(df)

In [28]:
print(predictions_alc.shape)
predictions_alc

(5759, 2)


array([[ 0.95334503,  0.04665497],
       [ 0.97256387,  0.02743613],
       [ 0.79522246,  0.20477754],
       ..., 
       [ 0.97305573,  0.02694427],
       [ 0.96458816,  0.03541184],
       [ 0.95874137,  0.04125863]])

Notice that `predictions_alc` has shape (n, 2), since we are asking for class probabilities.
We only need one of the two columns in this case.

In [29]:
# Keep only column 1 of the (n, 2) probability array — presumably the
# probability of the "alcohol-related" class; confirm against clf_alc.classes_.
df["predict_alc"] = predictions_alc[:,1]

In [30]:
df.head()

Unnamed: 0.1,Unnamed: 0,first_name,id,place,text,time,user_id,predict_alc
0,0,Shawn,681731561901830144,"Joint Base Lewis-McChord, WA",...listening to Stick Talk for the first time....,2015-12-29T07:00:44.000Z,id:twitter.com:99682561,0.046655
1,1,1Eyewitness,681731560232468480,"Burbank, CA",@debrcarter @3ChicsPolitico @LorettaLynch but ...,2015-12-29T07:00:44.000Z,id:twitter.com:1517261017,0.027436
2,2,Evette,681731564338806785,"Nashville, TN",middle part 💁🏾 https://t.co/4Cuzip3KJ6,2015-12-29T07:00:45.000Z,id:twitter.com:2710958186,0.204778
3,3,Guapaholics✨,681731561344122880,"Boston, MA",@frankieclermont @Luisfinessee @desix3__ u kno...,2015-12-29T07:00:44.000Z,id:twitter.com:1190480407,0.046854
4,4,Im,681731564816879616,"Houston, TX",If I give you a nick name I'm the only person ...,2015-12-29T07:00:45.000Z,id:twitter.com:2227423789,0.507233


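After we make the probability predictions for alcohol-related tweets, we predict the conditional probability of a tweet being first person given that it is alcohol-related, then multiply it by the alcohol probability to get the joint probability of the tweet being both alcohol-related and first person (stored as `predict_fpa`).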

In [50]:
# Cut-off on the alcohol probability: only rows above it are handed to the
# first-person classifiers in the following cells.
thres = 0.75
# Boolean mask over df's rows, True where predict_alc exceeds the cut-off.
filter_alc = df.predict_alc > thres
# NOTE(review): the saved output of this cell (0.0948...) is not produced by
# the two statements above — presumably an expression such as
# filter_alc.mean() was removed after the cell was run; confirm.

0.09480812641083522

In [57]:
# Conditional probability of "first person" given "alcohol".  Rows that do not
# pass the alcohol threshold keep 0.  The column starts as float (0.0, not 0)
# so that writing float probabilities into it below does not change its dtype.
df["predict_fpa|alc"] = 0.0

# Score only the rows flagged by the alcohol filter rather than the whole frame.
predict_fpa = clf_fpa.predict_proba(df[filter_alc])
df.loc[filter_alc, "predict_fpa|alc"] = predict_fpa[:,1]

# Product rule: P(alcohol) * P(first person | alcohol).
df["predict_fpa"] = df["predict_alc"] * df["predict_fpa|alc"]

Then we will predict again on the levels of being first person.

In [63]:
# Level probabilities for the alcohol-flagged rows only, wrapped in a DataFrame
# that carries those rows' index so it can be joined back onto df later.
# NOTE(review): the column names assume clf_fpl emits its classes in the order
# present, future, past — confirm against clf_fpl.classes_.
predict_fpl = pd.DataFrame(
    clf_fpl.predict_proba(df.loc[filter_alc]),
    columns=["predict_present|fpa", "predict_future|fpa", "predict_past|fpa"],
    index=df.loc[filter_alc].index,
)

In [76]:
# For each conditional level column ("predict_x|fpa") add a companion column
# ("predict_x") holding the conditional multiplied by predict_fpa for the same
# rows.  Only the "|"-named columns are visited, so re-running this cell does
# not multiply the already-created companion columns a second time.
for col in [name for name in predict_fpl.columns if "|" in name]:
    predict_fpl[col.split("|")[0]] = predict_fpl[col] * df.loc[filter_alc, "predict_fpa"]

In [84]:
# Attach the level predictions to the full frame; rows that were never scored
# (below the alcohol threshold) get 0.  Only the newly joined columns are
# filled, so missing values in the original tweet columns (e.g. place) are
# left as they are instead of being turned into 0.
df = df.join(predict_fpl)
df[predict_fpl.columns] = df[predict_fpl.columns].fillna(0)

In [88]:
df[df.predict_alc > thres].head()

Unnamed: 0.1,Unnamed: 0,first_name,id,place,text,time,user_id,predict_alc,predict_fpa,predict_fpa|alc,predict_present|fpa,predict_future|fpa,predict_past|fpa,predict_present,predict_future,predict_past
9,9,Ryan,681731566649737218,"Rancho Cucamonga, CA",@achilles314 idk about that man lol thanks though,2015-12-29T07:00:46.000Z,id:twitter.com:2272481959,0.845886,0.446556,0.527915,0.246851,0.533771,0.219378,0.110233,0.238359,0.097965
47,47,Alyssa,681731581053153282,"Harmar, PA",My party at work tonight tipped me a wad of $2...,2015-12-29T07:00:49.000Z,id:twitter.com:366101161,0.839278,0.652919,0.777953,0.033023,0.939057,0.02792,0.021562,0.613128,0.018229
51,51,Kaylarae,681731586476216320,"Santa Clarita, CA",Frantically cleans room because I don't feel v...,2015-12-29T07:00:50.000Z,id:twitter.com:1113362977,0.786318,0.331858,0.422041,0.026238,0.003065,0.970697,0.008707,0.001017,0.322134
59,59,B,681731588502192128,"Brownsburg, IN",girls only say I hate you to the guys that the...,2015-12-29T07:00:51.000Z,id:twitter.com:1158223614,0.756622,0.149693,0.197844,0.058818,0.00919,0.931992,0.008805,0.001376,0.139512
66,66,•Kristina,681731590343409664,"Nuevo, CA",Your babe😍,2015-12-29T07:00:51.000Z,id:twitter.com:4502766632,0.900311,0.240806,0.26747,0.512077,0.069297,0.418625,0.123311,0.016687,0.100808


In [90]:
df.columns

Index(['Unnamed: 0', 'first_name', 'id', 'place', 'text', 'time', 'user_id',
       'predict_alc', 'predict_fpa', 'predict_fpa|alc', 'predict_present|fpa',
       'predict_future|fpa', 'predict_past|fpa', 'predict_present',
       'predict_future', 'predict_past'],
      dtype='object')

### Giving you a function

In [113]:
class PredictionTransformer:
    """Label a DataFrame with the chained alcohol / first-person predictions.

    Three classifiers are applied in sequence; each must expose
    ``predict_proba(dataframe)``:

    1. ``clf_alc`` scores every row; column 1 of its output is stored as
       ``predict_alc``.
    2. ``clf_fpa`` scores only the rows whose ``predict_alc`` exceeds the
       threshold; column 1 of its output is stored as ``predict_fpa|alc`` and
       ``predict_fpa = predict_alc * predict_fpa|alc``.
    3. ``clf_fpl`` scores the same rows; its three output columns are stored
       as the ``level_cols`` and, multiplied by ``predict_fpa``, as
       ``predict_present`` / ``predict_future`` / ``predict_past``.

    Rows at or below the threshold get 0 in every column produced by steps 2
    and 3.
    """

    cols = [
        'predict_alc',
        'predict_fpa',
        'predict_fpa|alc',
    ]

    # Names given to the three columns returned by clf_fpl.predict_proba.
    # NOTE(review): assumes the classifier emits present, future, past in that
    # order — confirm against clf_fpl.classes_.
    level_cols = [
        "predict_present|fpa",
        "predict_future|fpa",
        "predict_past|fpa",
    ]

    def __init__(self, clf_alc, clf_fpa, clf_fpl):
        """Store the alcohol, first-person and first-person-level classifiers."""
        self.clf_alc = clf_alc
        self.clf_fpa = clf_fpa
        self.clf_fpl = clf_fpl

    def __call__(self, df, thres=0.75):
        """Return a labelled copy of ``df``.

        Parameters
        ----------
        df : pandas.DataFrame
            Rows to classify.  It is not modified; the work happens on a copy.
        thres : float, default 0.75
            Rows whose ``predict_alc`` is strictly greater than this value are
            passed on to the first-person classifiers.

        Returns
        -------
        pandas.DataFrame
            ``df`` plus the columns in ``cols``, the columns in
            ``level_cols`` and one product column per level.
        """
        # Work on a copy so the caller's frame is not changed as a side effect.
        self.df = df.copy()
        self.thres = thres

        # An already-labelled frame carries the level columns from an earlier
        # run; remove them so the join further down does not fail on
        # overlapping column names.
        derived = self.level_cols + [c.split("|")[0] for c in self.level_cols]
        stale = [c for c in derived if c in self.df.columns]
        if stale:
            self.df = self.df.drop(columns=stale)

        # Float zeros: these columns later receive float probabilities.
        for col in self.cols:
            self.df[col] = 0.0

        self._make_alcohol_predictions()
        self._make_firstperson_predictions()
        self._make_firstpersonlevel_predictions()

        return self.df

    def _make_alcohol_predictions(self):
        """Fill ``predict_alc`` from column 1 of ``clf_alc.predict_proba``."""
        if len(self.df) == 0:
            # Nothing to score; the column stays as initialised.
            return
        predictions_alc = self.clf_alc.predict_proba(self.df)
        self.df["predict_alc"] = predictions_alc[:, 1]

    def _make_firstperson_predictions(self):
        """Fill ``predict_fpa|alc`` for rows above the threshold and derive ``predict_fpa``."""
        filter_alc = self.df["predict_alc"] > self.thres

        # predict only on subset of the data, makes things way faster;
        # skip the classifier entirely when no row passes the threshold
        if filter_alc.any():
            predict_fpa = self.clf_fpa.predict_proba(self.df[filter_alc])
            self.df.loc[filter_alc, "predict_fpa|alc"] = predict_fpa[:, 1]

        # product rule: P(alcohol) * P(first person | alcohol)
        self.df["predict_fpa"] = self.df["predict_alc"] * self.df["predict_fpa|alc"]

    def _make_firstpersonlevel_predictions(self):
        """Add the level columns and their products with ``predict_fpa``."""
        filter_alc = self.df["predict_alc"] > self.thres
        subset = self.df[filter_alc]

        if len(subset):
            # predict only on subset of the data, makes things way faster
            predict_fpl = pd.DataFrame(
                self.clf_fpl.predict_proba(subset),
                columns=self.level_cols,
                index=subset.index)
        else:
            # No row passed the threshold: use an empty frame with the same
            # columns so the output schema does not depend on the data.
            predict_fpl = pd.DataFrame(
                columns=self.level_cols, index=subset.index, dtype=float)

        marginal_firstperson = subset["predict_fpa"]

        # for each conditional level column add its product with predict_fpa
        for col in self.level_cols:
            col_marginal = col.split("|")[0]
            predict_fpl[col_marginal] = predict_fpl[col] * marginal_firstperson

        # Rows that were not scored get 0.  Only the joined columns are
        # filled, so missing values in the input columns are preserved.
        new_cols = list(predict_fpl.columns)
        self.df = self.df.join(predict_fpl)
        self.df[new_cols] = self.df[new_cols].fillna(0)

In [114]:
clf = PredictionTransformer(clf_alc, clf_fpa, clf_fpl)

In [148]:
labeld_dataframe = clf(pd.read_csv(twitter_data[3]))

In [160]:
# Rows where at least one of the first-person scores is high.
confident = (
    (labeld_dataframe.predict_fpa > .70)
    | (labeld_dataframe.predict_present > .60)
    | (labeld_dataframe.predict_past > .60)
    | (labeld_dataframe.predict_future > .60)
)
confident_rows = labeld_dataframe.loc[
    confident,
    ["predict_fpa", "predict_alc", "predict_present", "predict_future", "predict_past", "text"],
]

# Show at most 10 of them.  The fixed seed keeps the displayed sample the same
# across re-runs, and the min() guard avoids the ValueError that sample()
# raises when fewer than 10 rows match.
confident_rows.sample(n=min(10, len(confident_rows)), random_state=0)

Unnamed: 0,predict_fpa,predict_alc,predict_present,predict_future,predict_past,text
98,0.708982,0.911998,0.130708,0.56847,0.009805,I can't wait for the day to be cuddled up on t...
3543,0.656804,0.892279,0.639164,0.011552,0.006089,"@msremmos Nah, Im still happy"
149,0.784311,0.85995,0.099506,0.000288,0.684518,I think I've gained more weight being at home ...
1938,0.70894,0.978766,0.488762,0.207091,0.013087,French Toast &gt;&gt;&gt;
214,0.703618,0.994573,0.003848,0.23374,0.46603,Last semester of hs is coming up and I have ne...
1189,0.754629,0.886594,0.747106,1.4e-05,0.007509,"Thanks, swayz-dog!! Never had a #gbot before #..."
4087,0.628641,0.972433,0.627732,0.000429,0.000481,Drinking a Red Ale by @MarbleBrewery at @marbl...
3323,0.749898,0.981646,0.718458,0.004142,0.027298,Pinky toe 😐😩😩😩
5100,0.754613,0.989356,0.216126,0.529834,0.008653,ive been tryna go home n play xbox allll day
4326,0.715525,0.924749,0.475757,0.087332,0.152437,it's lowkey at the night show


### As you can tell, the classifications are not amazing for this small dataset. However, this is because I'm only taking a 10-minute slice of the data. It'll be better in practice, since this is only 60 tweets out of millions, and it will in fact catch more.