In [63]:
import os
import pandas as pd
import pickle

In [5]:
# Make the project checkout the working directory - presumably so that the
# project-local imports used later (e.g. `pipelines.helpers`) resolve.
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
os.chdir('/Users/tom/PycharmProjects/nyu-twipsy')

In [50]:
class PredictionTransformer:
    """Chain three probabilistic classifiers into one table of predictions.

    Each classifier must expose ``predict_proba(frame)`` returning a 2-D array
    with one row per input row.

    Stages, run in order by ``__call__``:

    1. ``clf_tob`` scores every row; column 1 of its output is stored as
       ``predict_tob``.
    2. ``clf_fp`` scores only rows with ``predict_tob > thres``; column 1 is
       stored as ``predict_fp|tob`` and the marginal is
       ``predict_fp = predict_tob * predict_fp|tob``.
    3. ``clf_fpl`` scores the same subset; its columns are stored under
       ``fpl_cols`` and each is multiplied by ``predict_fp`` to give the
       marginal column named by the part before ``"|"``.

    Rows that do not pass the threshold keep 0.0 in every stage 2/3 column.
    """

    # Columns produced by stages 1 and 2.
    cols = [
        'predict_tob',
        'predict_fp',
        'predict_fp|tob',
    ]

    # Names given to the columns of clf_fpl.predict_proba, in output order.
    # NOTE(review): confirm this ordering matches clf_fpl's class ordering.
    fpl_cols = [
        "predict_present|fp",
        "predict_not_present|fp",
    ]

    def __init__(self, clf_tob, clf_fp, clf_fpl):
        """Store the three fitted classifiers, one per stage."""
        self.clf_tob = clf_tob
        self.clf_fp = clf_fp
        self.clf_fpl = clf_fpl

    def __call__(self, df, thres=0.5):
        """Return a copy of ``df`` with all prediction columns appended.

        Parameters
        ----------
        df : pandas.DataFrame
            Rows to score. It is not modified; the work happens on a copy.
        thres : float, default 0.5
            Stages 2 and 3 only score rows whose ``predict_tob`` is strictly
            greater than this value.

        Returns
        -------
        pandas.DataFrame
            The input columns (values untouched, NaNs included) followed by
            ``cols``, ``fpl_cols`` and the two marginal level columns.

        Raises
        ------
        ValueError
            If ``clf_fpl.predict_proba`` does not return exactly
            ``len(fpl_cols)`` columns.
        """
        # Work on a copy so the caller's frame is not left half-annotated.
        self.df = df.copy()

        # Float zeros: probabilities are later written into these columns via
        # .loc, and writing floats into an integer column is a dtype clash.
        for col in self.cols:
            self.df[col] = 0.0

        self.thres = thres

        self._make_tob_predictions()
        self._make_firstperson_predictions()
        self._make_firstpersonlevel_predictions()

        return self.df

    def _tob_mask(self):
        """Positional boolean array of rows whose ``predict_tob`` exceeds ``thres``."""
        # A plain array (not an index-aligned Series) keeps the selection
        # correct even when the frame's index has duplicate labels.
        return (self.df["predict_tob"] > self.thres).to_numpy()

    def _make_tob_predictions(self):
        """Stage 1: score every row with ``clf_tob``."""
        if len(self.df) == 0:
            # Nothing to score; the zero-initialised column is already empty.
            return
        predictions_tob = self.clf_tob.predict_proba(self.df)
        self.df["predict_tob"] = predictions_tob[:, 1]

    def _make_firstperson_predictions(self):
        """Stage 2: conditional and marginal first-person probabilities."""
        filter_tob = self._tob_mask()

        # predict only on subset of the data, makes things way faster
        if filter_tob.any():
            predict_fp = self.clf_fp.predict_proba(self.df[filter_tob])
            self.df.loc[filter_tob, "predict_fp|tob"] = predict_fp[:, 1]

        # compute a marginal using the product rule
        self.df["predict_fp"] = self.df["predict_tob"] * self.df["predict_fp|tob"]

    def _make_firstpersonlevel_predictions(self):
        """Stage 3: conditional level probabilities and their marginals."""
        filter_tob = self._tob_mask()

        # Rows outside the subset stay at 0.0 for every level column.
        for col in self.fpl_cols:
            self.df[col] = 0.0

        # predict only on subset of the data, makes things way faster
        if filter_tob.any():
            predict_fpl = self.clf_fpl.predict_proba(self.df[filter_tob])
            if predict_fpl.shape[1] != len(self.fpl_cols):
                raise ValueError(
                    "clf_fpl.predict_proba returned %d columns, expected %d"
                    % (predict_fpl.shape[1], len(self.fpl_cols)))
            for position, col in enumerate(self.fpl_cols):
                self.df.loc[filter_tob, col] = predict_fpl[:, position]

        # for each conditional level generate a marginal
        for col in self.fpl_cols:
            col_marginal = col.split("|")[0]
            self.df[col_marginal] = self.df[col] * self.df["predict_fp"]

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import twokenize.twokenize as tokenizer

from pipelines.helpers import ItemGetter

def make_classifier():
    """Build an unfitted text-classification pipeline.

    The pipeline has three steps: ``ItemGetter("text")``, a
    ``TfidfVectorizer`` and a ``LogisticRegression``. Hyper-parameters are
    applied afterwards through ``set_params`` using the ``step__param`` naming.

    Returns
    -------
    sklearn.pipeline.Pipeline
        A new, unfitted pipeline on every call.
    """
    steps = [
        ("getter", ItemGetter("text")),
        ("tfidf", TfidfVectorizer()),
        ("clf", LogisticRegression()),
    ]

    # set_params returns the pipeline itself, so the configured object can be
    # handed straight back to the caller.
    return Pipeline(steps).set_params(
        # logistic regression settings
        clf__C=200,
        clf__dual=False,
        clf__max_iter=100,
        clf__multi_class='ovr',
        clf__penalty='l2',
        # tf-idf settings: custom tokenizer, uni- to tri-grams, capped vocabulary
        tfidf__tokenizer=tokenizer.tokenize,
        tfidf__ngram_range=(1, 3),
        tfidf__max_features=200000,
    )

In [7]:
# Three independent, unfitted pipelines - one per prediction stage.
clf_tob, clf_fp, clf_fpl = (make_classifier() for _ in range(3))

In [22]:
def fit_clf(clf, X):
    """Fit ``clf`` on the ``text`` column of ``X`` against its ``labels`` column.

    Parameters
    ----------
    clf : object with a ``fit(features, targets)`` method
    X : pandas.DataFrame
        Must contain ``'text'`` and ``'labels'`` columns.

    Returns
    -------
    The same ``clf`` object, after ``fit`` has been called on it.
    """
    # The estimator receives a fresh single-column frame (default integer
    # index) holding only the text, and the labels as a plain list.
    features = pd.DataFrame(list(X['text']), columns=['text'])
    targets = list(X['labels'])
    clf.fit(features, targets)
    return clf

In [31]:
# Each training CSV is read with explicit column names: label first, tweet
# text second.
X_tob = pd.read_csv('/Users/tom/downloads/MERGED2_normalized_classif1.csv', names = ['labels', 'text'])
clf_tob = fit_clf(clf_tob, X_tob)

X_fp = pd.read_csv('/Users/tom/downloads/MERGED2_normalized_classif2.csv', names = ['labels', 'text'])
clf_fp = fit_clf(clf_fp, X_fp)

# Train the level classifier on its own pipeline and its own data. Passing
# clf_fp / X_fp here would refit clf_fp and make clf_fpl an alias of it.
# NOTE(review): PredictionTransformer names exactly two output columns for
# this classifier - confirm classif3 has exactly two label values.
X_fpl = pd.read_csv('/Users/tom/downloads/MERGED2_normalized_classif3.csv', names = ['labels', 'text'])
clf_fpl = fit_clf(clf_fpl, X_fpl)

In [51]:
# Bundle the three fitted pipelines into a single callable scorer.
# Argument order is (clf_tob, clf_fp, clf_fpl), matching the constructor.
clf = PredictionTransformer(clf_tob, clf_fp, clf_fpl)

In [55]:
%%time 
# Load the tweets to score, using pandas' Python parsing engine.
test_df = pd.read_csv('/Users/tom/downloads/tweets_323.csv', engine='python')

CPU times: user 933 ms, sys: 21.3 ms, total: 955 ms
Wall time: 976 ms


In [56]:
# Preview only the first rows instead of rendering the whole frame.
test_df.head(10)

Unnamed: 0.1,Unnamed: 0,text,id,created_at,lat,lon,utc_offset,place
0,66736,Reppin that wine n gold till I die 💯💯,610877893003100160,Tue Jun 16 18:33:33 +0000 2015,41.640241,-81.391672,-18000.0,"Mentor, OH"
1,66737,😻👏🏻 http://t.co/As7Kpp8Khl,610877892680134658,Tue Jun 16 18:33:33 +0000 2015,25.702823,-100.337498,,"San Nicolás de los Garza, Nuevo León"
2,66738,I can't stand a social media ass bitch,610877893212770304,Tue Jun 16 18:33:33 +0000 2015,29.239602,-94.880809,,"Galveston, TX"
3,66739,Everyone sleeps on me tbh😴,610877893330214912,Tue Jun 16 18:33:33 +0000 2015,35.433034,-97.424091,-18000.0,"Midwest City, OK"
4,66740,@HannahKucharski so sorry I couldn't make it t...,610877893338603521,Tue Jun 16 18:33:33 +0000 2015,40.959260,-85.336872,,"Fort Wayne, IN"
5,66741,@2lewdcrew @ChickenDig I have yet to regain th...,610877893166657536,Tue Jun 16 18:33:33 +0000 2015,30.115967,-92.158649,,"Lafayette, LA"
6,66742,http://t.co/h3ErwDw4Be,610877893107912706,Tue Jun 16 18:33:33 +0000 2015,28.193916,-82.739856,-14400.0,"Elfers, FL"
7,66743,"Man meeting new people is the best , the peopl...",610877893485404161,Tue Jun 16 18:33:34 +0000 2015,35.255821,-119.172179,,"Bakersfield, CA"
8,66744,@PrimaPuntaNG they connected really well basic...,610877893607038976,Tue Jun 16 18:33:34 +0000 2015,33.704538,-118.668404,-25200.0,"Los Angeles, CA"
9,66745,rachel dolezal is fucking white jesus christ,610877893531537408,Tue Jun 16 18:33:34 +0000 2015,38.801826,-77.119401,-14400.0,"Washington, DC"


In [57]:
%%time
# Score every tweet with the default threshold (thres=0.5); the returned frame
# carries the prediction columns alongside the input columns.
test_prob = clf(test_df)

CPU times: user 20.3 s, sys: 89.7 ms, total: 20.4 s
Wall time: 20.5 s


In [61]:
# Persist the scored tweets. No `index` argument is passed, so pandas' default
# applies and the frame index is written as an extra leading column.
test_prob.to_csv('/Users/tom/documents/323_predict.csv')

In [64]:
# Serialize each fitted pipeline to its own file. Opening through a context
# manager guarantees the handle is flushed and closed as soon as the dump
# finishes, even if pickling raises.
for fitted_clf, pickle_path in (
    (clf_tob, "/Users/tom/documents/clf-tob/tob.p"),
    (clf_fp, "/Users/tom/documents/clf-tob/tob_fp.p"),
    (clf_fpl, "/Users/tom/documents/clf-tob/tob_fpl.p"),
):
    with open(pickle_path, "wb") as pickle_file:
        pickle.dump(fitted_clf, pickle_file)