In [1]:
import os
import pandas as pd
import pickle

In [3]:
os.chdir('/Users/tom/PycharmProjects/nyu-twipsy')

In [4]:
class PredictionTransformer:
    cols = [
        'predict_tob',
        'predict_fp',
        'predict_fp|tob',
    ]

    def __init__(self, clf_tob, clf_fp, clf_fpl):
        self.clf_tob = clf_tob
        self.clf_fp = clf_fp
        self.clf_fpl = clf_fpl

    def __call__(self, df, thres=0.2):
        self.df = df

        for col in self.cols:
            self.df[col] = 0

        self.thres = thres

        self._make_tob_predictions()
        self._make_firstperson_predictions()
        self._make_firstpersonlevel_predictions()

        return self.df

    def _make_tob_predictions(self):
        predictions_tob = self.clf_tob.predict_proba(self.df)
        self.df["predict_tob"] = predictions_tob[:,1]

    def _make_firstperson_predictions(self):
        filter_tob = self.df.predict_tob > self.thres

        # predict only on subset of the data, makes things way faster
        predict_fp = self.clf_tob.predict_proba(self.df[filter_tob])
        self.df.loc[filter_tob, "predict_fp|tob"] = predict_fp[:,1]

        # compute a marginal using the product rule
        self.df["predict_fp"] = self.df["predict_tob"] * self.df["predict_fp|tob"]

    def _make_firstpersonlevel_predictions(self):
        filter_tob = self.df.predict_tob > self.thres

        # predict only on subset of the data, makes things way faster
        predict_fpl = self.clf_fpl.predict_proba(self.df[filter_tob])

        # convert it to a named dataframe
        predict_fpl = pd.DataFrame(
            predict_fpl,
                 columns=[
                "predict_present|fp",
                "predict_not_present|fp"],
            index=self.df[filter_tob].index)

        marginal_firstperson = self.df[filter_tob]["predict_fp"]

        # for each conditional level generate a marginal
        for col in predict_fpl.columns:
            col_marginal = col.split("|")[0]
            predict_fpl[col_marginal] = predict_fpl[col] * marginal_firstperson

        self.df = self.df.join(predict_fpl).fillna(0)

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import twokenize.twokenize as tokenizer

from pipelines.helpers import ItemGetter

def make_classifier():
    clf = Pipeline([
        ("getter", ItemGetter("text")),
        ("tfidf", TfidfVectorizer()),
        ("clf", LogisticRegression())])

    clf_params = {
        'clf__C': 200,
        'clf__dual': False,
        'clf__max_iter': 100,
        'clf__multi_class': 'ovr',
        'clf__penalty': 'l2',
        'tfidf__tokenizer':tokenizer.tokenize,
        'tfidf__ngram_range':(1, 3),
        'tfidf__max_features':200000
    }

    clf.set_params(**clf_params)
    return clf

In [6]:
clf_tob = make_classifier()
clf_fp = make_classifier()
clf_fpl = make_classifier()

In [7]:
def fit_clf(clf, X):
    X_tweets = list(X['text'])
    X_labels = list(X['labels'])
    X_pairs = []
    for p in zip(X_labels, X_tweets):
        X_pairs.append(p)
    X_all_training = pd.DataFrame([e[1] for e in X_pairs], columns=['text'])
    y_all_training = [e[0] for e in X_pairs]
    clf.fit(X_all_training, y_all_training)
    return clf

In [8]:
base_dir = '/Users/tom/PycharmProjects/nyu-research/tobacco/training_data/'
X_tob = pd.read_csv(base_dir + 'tob.csv', names = ['labels', 'text'])
clf_tob = fit_clf(clf_tob, X_tob)

X_fp = pd.read_csv(base_dir + 'fp.csv', names = ['labels', 'text'])
clf_fp = fit_clf(clf_fp, X_fp)

X_fpl = pd.read_csv(base_dir + 'fpl.csv', names = ['labels', 'text'])
clf_fpl = fit_clf(clf_fpl, X_fpl)

In [67]:
clf = PredictionTransformer(clf_tob, clf_fp, clf_fpl)

In [71]:
%%time 
test_df = pd.read_csv('/Users/tom/downloads/tweets_323.csv', engine='python')

CPU times: user 1.09 s, sys: 47.1 ms, total: 1.14 s
Wall time: 1.19 s


In [72]:
test_df

Unnamed: 0.1,Unnamed: 0,text,id,created_at,lat,lon,utc_offset,place
0,66736,Reppin that wine n gold till I die 💯💯,610877893003100160,Tue Jun 16 18:33:33 +0000 2015,41.640241,-81.391672,-18000.0,"Mentor, OH"
1,66737,😻👏🏻 http://t.co/As7Kpp8Khl,610877892680134658,Tue Jun 16 18:33:33 +0000 2015,25.702823,-100.337498,,"San Nicolás de los Garza, Nuevo León"
2,66738,I can't stand a social media ass bitch,610877893212770304,Tue Jun 16 18:33:33 +0000 2015,29.239602,-94.880809,,"Galveston, TX"
3,66739,Everyone sleeps on me tbh😴,610877893330214912,Tue Jun 16 18:33:33 +0000 2015,35.433034,-97.424091,-18000.0,"Midwest City, OK"
4,66740,@HannahKucharski so sorry I couldn't make it t...,610877893338603521,Tue Jun 16 18:33:33 +0000 2015,40.959260,-85.336872,,"Fort Wayne, IN"
5,66741,@2lewdcrew @ChickenDig I have yet to regain th...,610877893166657536,Tue Jun 16 18:33:33 +0000 2015,30.115967,-92.158649,,"Lafayette, LA"
6,66742,http://t.co/h3ErwDw4Be,610877893107912706,Tue Jun 16 18:33:33 +0000 2015,28.193916,-82.739856,-14400.0,"Elfers, FL"
7,66743,"Man meeting new people is the best , the peopl...",610877893485404161,Tue Jun 16 18:33:34 +0000 2015,35.255821,-119.172179,,"Bakersfield, CA"
8,66744,@PrimaPuntaNG they connected really well basic...,610877893607038976,Tue Jun 16 18:33:34 +0000 2015,33.704538,-118.668404,-25200.0,"Los Angeles, CA"
9,66745,rachel dolezal is fucking white jesus christ,610877893531537408,Tue Jun 16 18:33:34 +0000 2015,38.801826,-77.119401,-14400.0,"Washington, DC"


In [73]:
%%time
test_prob = clf(test_df)

CPU times: user 23.6 s, sys: 236 ms, total: 23.9 s
Wall time: 25.7 s


In [74]:
test_prob

Unnamed: 0.1,Unnamed: 0,text,id,created_at,lat,lon,utc_offset,place,predict_tob,predict_fp,predict_fp|tob,predict_present|fp,predict_not_present|fp,predict_present,predict_not_present
0,66736,Reppin that wine n gold till I die 💯💯,610877893003100160,Tue Jun 16 18:33:33 +0000 2015,41.640241,-81.391672,-18000.0,"Mentor, OH",0.016856,0.0,0.0,0.0,0.0,0.0,0.0
1,66737,😻👏🏻 http://t.co/As7Kpp8Khl,610877892680134658,Tue Jun 16 18:33:33 +0000 2015,25.702823,-100.337498,0,"San Nicolás de los Garza, Nuevo León",0.003733,0.0,0.0,0.0,0.0,0.0,0.0
2,66738,I can't stand a social media ass bitch,610877893212770304,Tue Jun 16 18:33:33 +0000 2015,29.239602,-94.880809,0,"Galveston, TX",0.000798,0.0,0.0,0.0,0.0,0.0,0.0
3,66739,Everyone sleeps on me tbh😴,610877893330214912,Tue Jun 16 18:33:33 +0000 2015,35.433034,-97.424091,-18000.0,"Midwest City, OK",0.002391,0.0,0.0,0.0,0.0,0.0,0.0
4,66740,@HannahKucharski so sorry I couldn't make it t...,610877893338603521,Tue Jun 16 18:33:33 +0000 2015,40.959260,-85.336872,0,"Fort Wayne, IN",0.000148,0.0,0.0,0.0,0.0,0.0,0.0
5,66741,@2lewdcrew @ChickenDig I have yet to regain th...,610877893166657536,Tue Jun 16 18:33:33 +0000 2015,30.115967,-92.158649,0,"Lafayette, LA",0.000882,0.0,0.0,0.0,0.0,0.0,0.0
6,66742,http://t.co/h3ErwDw4Be,610877893107912706,Tue Jun 16 18:33:33 +0000 2015,28.193916,-82.739856,-14400.0,"Elfers, FL",0.003733,0.0,0.0,0.0,0.0,0.0,0.0
7,66743,"Man meeting new people is the best , the peopl...",610877893485404161,Tue Jun 16 18:33:34 +0000 2015,35.255821,-119.172179,0,"Bakersfield, CA",0.001198,0.0,0.0,0.0,0.0,0.0,0.0
8,66744,@PrimaPuntaNG they connected really well basic...,610877893607038976,Tue Jun 16 18:33:34 +0000 2015,33.704538,-118.668404,-25200.0,"Los Angeles, CA",0.000188,0.0,0.0,0.0,0.0,0.0,0.0
9,66745,rachel dolezal is fucking white jesus christ,610877893531537408,Tue Jun 16 18:33:34 +0000 2015,38.801826,-77.119401,-14400.0,"Washington, DC",0.006724,0.0,0.0,0.0,0.0,0.0,0.0


In [61]:
test_prob.to_csv('/Users/tom/documents/323_predict.csv')

In [9]:
def feature_analysis(clf):
    tfidf = clf.steps[1][1]
    logreg = clf.steps[2][1]
    features = tfidf.get_feature_names()
    idf = tfidf.idf_
    coef = logreg.coef_
    weights = [(features[i], coef[0][i]*idf[i]) for i in range(len(coef[0]))]
    logreg_weights = [(features[i], coef[0][i]) for i in range(len(coef[0]))]
    logreg_weights.sort(key=lambda x: -x[1])
    weights.sort(key=lambda x: -x[1])
    
    print('\n High idf*logreg coef: ', weights[:10])
    print('\n Low idf*logreg coef: ', weights[-10:])
    print('\n High logreg coef: ', logreg_weights[:10])
    print('\n Low logreg coef: ', logreg_weights[-10:])
    

In [10]:
feature_analysis(clf_tob)


 High idf*logreg coef:  [('vaping', 221.07978469654725), ('tobacco', 213.39543250756185), ('cigarettes', 210.6227001720776), ('vaped', 193.79017363402068), ('cigs', 193.23111375387083), ('ciggs', 190.00272688755243), ('cigarette', 186.95643786339869), ('cigar', 183.73243390814082), ('juul', 178.71400838878969), ('cig', 174.44548494000259)]

 Low idf*logreg coef:  [('myselfffff', -87.177066626944395), ('r ugly', -88.588690513255315), ('herbal', -90.267910138458575), ('- cigarette', -92.052286262856114), ('vaping is punk', -95.879111089522823), ('i vape', -106.50126456643004), ('company', -117.96334453958787), ('colorado', -124.31917911984799), ('on cigarettes', -166.66269886523389), ('cigarette on', -166.88803882599836)]

 High logreg coef:  [('cigarettes', 41.745570758682163), ('vaping', 36.695491525673113), ('cigarette', 35.606706449032309), ('tobacco', 34.909492235706381), ('smoking', 34.836866606346582), ('cigs', 31.878292731586896), ('cig', 29.801928576956634), ('cigar', 29.343184

In [11]:
feature_analysis(clf_fp)


 High idf*logreg coef:  [('hookah �', 48.1301186888412), ('i love hookah', 37.653869552098584), ('love hookah', 37.653869552098584), ('you', 36.158383883006714), ('"', 33.650208842875877), ('old', 32.791777200311763), ("y'all", 32.657915044673018), ('smoking love', 32.394041122216144), ('smell', 32.268380646523958), ('alone ...', 32.012616386815843)]

 Low idf*logreg coef:  [('cush #merky', -40.307520408503777), ('me', -40.724219029052229), ('i smoke', -41.934083068784169), ('am', -44.97988965185305), ('my', -45.476071268658963), ('i', -46.467673096793291), ('need', -47.65155247543936), ('vaped*', -49.373344252339287), ('im', -49.989522145517341), ("i'm", -54.366807150126391)]

 High logreg coef:  [('you', 9.174995120349557), ('"', 8.2434603632987535), (':', 7.511860783333975), ('hookah �', 6.9901608273191798), ('?', 6.4608381490642284), ('tobacco', 6.2700329139426172), ('people', 5.8462051053160264), ('smell', 5.7647276918722312), ('old', 5.7461343510626168), ('that', 5.5315899633373

In [12]:
feature_analysis(clf_fpl)


 High idf*logreg coef:  [('but i need', 40.265756246635796), ('i smoke', 32.31278906196809), ("i'm tryna go", 30.782093831614425), ('cigarets xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx', 30.726790973069466), ('for cigarets xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx', 30.726790973069466), ('xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx', 30.726790973069466), ('vaping', 30.658411459104975), ('smoking', 30.604358774601579), ('love', 30.251731327819623), ('but i', 26.030015779417589)]

 Low idf*logreg coef:  [('vaped', -27.97535885763611), ('was', -28.099744044794214), ('feel like smoking', -28.21050752900825), ('someone', -30.260625607000382), ('bring', -31.763592242701755), ('done smoking', -32.606864016472741), ('vaped*', -34.472525531171144), ('tryna smoke lol', -38.632298561943799), ('smoked', -38.707816532739251), ('cigarette time', -40.600192693285969)]

 High logreg coef:  [('smoking', 12.500210100991215), ('vaping', 6.9644606251414443), ('i smoke', 6.83500

In [13]:
pickle.dump(clf_tob, open("/Users/tom/documents/clf-tob/tob.p", "wb"))
pickle.dump(clf_fp, open("/Users/tom/documents/clf-tob/tob_fp.p", "wb"))
pickle.dump(clf_fpl, open("/Users/tom/documents/clf-tob/tob_fpl.p", "wb"))