In [1]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import twokenize.twokenize as tokenizer

from pipelines.helpers import ItemGetter

def make_classifier():
    """Build the (unfitted) text-classification pipeline used in this notebook.

    The pipeline chains three steps, in order:

    * ``getter`` -- ``ItemGetter("text")`` (presumably selects the ``text``
      field of each input record; confirm in ``pipelines.helpers``),
    * ``tfidf``  -- a ``TfidfVectorizer`` tokenised with ``twokenize``,
    * ``clf``    -- a ``LogisticRegression`` model.

    Hyper-parameters are applied through ``Pipeline.set_params`` using the
    ``<step>__<parameter>`` naming scheme.

    Returns:
        The configured ``Pipeline`` instance; it still has to be fitted.
    """
    steps = [
        ("getter", ItemGetter("text")),
        ("tfidf", TfidfVectorizer()),
        ("clf", LogisticRegression()),
    ]
    pipeline = Pipeline(steps)

    # set_params returns the estimator it was called on, so the configured
    # pipeline can be handed back directly.
    return pipeline.set_params(
        # Logistic-regression settings.
        clf__C=200,
        clf__dual=False,
        clf__max_iter=100,
        clf__multi_class='ovr',
        clf__penalty='l2',
        # TF-IDF settings: twokenize tokens, 1- to 3-grams, at most 200000 features.
        tfidf__tokenizer=tokenizer.tokenize,
        tfidf__ngram_range=(1, 3),
        tfidf__max_features=200000,
    )

In [1]:
import pickle

In [13]:
# Load the previously trained alcohol classifier from disk.
# NOTE: unpickling runs arbitrary code stored in the file -- only load pickles
# that were produced by this project.
# The context manager closes the file handle, which the bare open() call leaked.
with open('pickles/clf_alc_UPDATED.p', 'rb') as pickle_file:
    clf_alc = pickle.load(pickle_file)

In [2]:
# Create a fresh, unfitted alcohol classifier.
# NOTE(review): run top-to-bottom, this replaces the `clf_alc` unpickled in the
# cell above. The repr printed after `clf_alc.fit(...)` further down shows a
# FeatureUnion / analyzer='char' pipeline, which make_classifier() does not
# build -- so the recorded results appear to come from the pickled model and
# out-of-order execution. Confirm which classifier is intended and drop the other.
clf_alc = make_classifier()

In [2]:
%%time

from data import DataAccess, LabelGetter

X_all = DataAccess.get_as_dataframe()
L = LabelGetter(X_all)

Wall time: 44 s


In [6]:
# Draw a 5000-row subsample for quick experiments.
# A fixed seed keeps the subsample identical across kernel restarts; without it
# every run would see different rows.
X_s = X_all.sample(5000, random_state=23)
L_s = LabelGetter(X_s)

In [7]:
X, y = L.get_alcohol()

In [3]:
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; `sklearn.model_selection` is its replacement. Try the current location
# first and fall back to the legacy module on old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

In [17]:
X, y = L.get_alcohol()

In [18]:
X.shape

(15650, 11)

In [19]:
X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=23)

In [20]:
clf_alc.fit(X_train, y_train)

Pipeline(steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('text', Pipeline(steps=[('getter', ItemGetter(key='text')), ('tfidf', TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=T...one,
          solver='liblinear', tol=0.000655077907893521, verbose=0,
          warm_start=None))])

In [4]:
from classification.reporting import ClassificationReporting

In [21]:
reporting = ClassificationReporting(clf_alc, X_train, X_test, y_train, y_test, 2)

In [22]:
%%time
report = reporting.set_name("Test Classifier").set_level('alc').create_report(1)



Training Results
~~~~~~~~~~~~~~~~
classification_report
             precision    recall  f1-score   support

          0       1.00      1.00      1.00      3632
          1       1.00      1.00      1.00      6853

avg / total       1.00      1.00      1.00     10485



f1_score
0.9998540785057639


accuracy_score
0.9998092513113972


confusion_matrix
[[3631, 1], [1, 6852]]




Testing Results Results
~~~~~~~~~~~~~~~~~~~~~~~
classification_report
             precision    recall  f1-score   support

          0       0.77      0.71      0.74      1723
          1       0.86      0.89      0.88      3442

avg / total       0.83      0.83      0.83      5165



f1_score
0.876993166287016


accuracy_score
0.8327202323330106


confusion_matrix
[[1221, 502], [362, 3080]]




Wall time: 20.5 s


In [14]:
%%time
report = reporting.set_name("Test Classifier").set_level('alc').create_report(1)



Training Results
~~~~~~~~~~~~~~~~
accuracy_score
0.9998479318734793


classification_report
             precision    recall  f1-score   support

          0       1.00      1.00      1.00      2308
          1       1.00      1.00      1.00      4268

avg / total       1.00      1.00      1.00      6576



confusion_matrix
[[2307, 1], [0, 4268]]


f1_score
0.9998828628323767




Testing Results Results
~~~~~~~~~~~~~~~~~~~~~~~
accuracy_score
0.6657407407407407


classification_report
             precision    recall  f1-score   support

          0       0.53      0.47      0.50      1151
          1       0.73      0.78      0.75      2089

avg / total       0.66      0.67      0.66      3240



confusion_matrix
[[537, 614], [469, 1620]]


f1_score
0.7494795281054822




Wall time: 8.78 s


In [6]:
# Load the previously trained first-person classifier from disk.
# NOTE: unpickling runs arbitrary code stored in the file -- only load pickles
# that were produced by this project.
# The context manager closes the file handle, which the bare open() call leaked.
with open('pickles/clf_fpa_UPDATED.p', 'rb') as pickle_file:
    clf_fpa = pickle.load(pickle_file)

In [5]:
from classification import dao

In [12]:
import pickle

In [6]:
# Create a fresh, unfitted first-person classifier.
# NOTE(review): run top-to-bottom, this replaces the `clf_fpa` unpickled a few
# cells above. The repr shown for `clf_fpa` below (FeatureUnion, analyzer='char',
# probability=True, shrinking=True) does not match what make_classifier()
# builds, so the displayed object appears to be the pickled model. Confirm
# which classifier is intended and drop the other.
clf_fpa = make_classifier()

In [15]:
clf_fpa

Pipeline(steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('text', Pipeline(steps=[('getter', ItemGetter(key='text')), ('tfidf', TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=F...probability=True,
  random_state=None, shrinking=True, tol=0.0008753898561476732,
  verbose=False))])

In [7]:
X, y = L.get_first_person()

In [28]:
X.head()

Unnamed: 0_level_0,created_at,labels,predict,text,user,user.created_at,user.favourites_count,user.followers_count,user.friends_count,user.statuses_count,user.verified
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
556e128ad6dfbb46288111e4,Tue Jun 02 20:31:44 +0000 2015,"{'alcohol_related': 1, 'first_person': 0, 'alc...",0.516649,Beer fans need their @ColumbusBrewing Bodhi. I...,"{'verified': False, 'followers_count': 1006, '...",Mon Oct 06 21:00:38 +0000 2008,806,1006,960,10442,False
556e12e6d6dfbb462881153e,Tue Jun 02 20:33:15 +0000 2015,"{'first_person_level': 0, 'first_person': 1, '...",0.523513,Beer Olympics with my #1 fan and babe😍❤️ #Team...,"{'verified': False, 'followers_count': 563, 'f...",Tue Dec 20 02:46:19 +0000 2011,11662,563,356,13940,False
556e1464d6dfbb4628812330,Tue Jun 02 20:39:37 +0000 2015,"{'alcohol': 1, 'first_person': 0, 'alcohol_rel...",0.502633,Stone Cold use to be the baddest MF in my book...,"{'verified': False, 'followers_count': 703, 'f...",Sun Mar 11 08:22:56 +0000 2012,860,703,684,89573,False
556e15f1d6dfbb4628813236,Tue Jun 02 20:46:14 +0000 2015,"{'alcohol': 1, 'first_person': 0, 'alcohol_rel...",0.535758,Now @iamjohnoliver has to drink a Bud Light Li...,"{'verified': True, 'followers_count': 9414, 'f...",Thu Jan 14 03:03:33 +0000 2010,3473,9414,1486,16435,True
556e1660d6dfbb462881361f,Tue Jun 02 20:48:04 +0000 2015,"{'first_person_level': 0, 'first_person': 1, '...",0.536733,Drinking a Soft Parade by @ShortsBrewing @ Lak...,"{'verified': False, 'followers_count': 543, 'f...",Sun Mar 22 18:14:34 +0000 2009,13256,543,626,20051,False


In [21]:
X.shape

(9816, 11)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=24)

In [9]:
%%time
clf_fpa.fit(X_train, y_train)

Wall time: 9min 55s


Pipeline(steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('text', Pipeline(steps=[('getter', ItemGetter(key='text')), ('tfidf', TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=F...probability=True,
  random_state=None, shrinking=True, tol=0.0008753898561476732,
  verbose=False))])

In [10]:
reporting_fpa = ClassificationReporting(clf_fpa, X_train, X_test, y_train, y_test, 2)

In [16]:
sum(y_test)

2122

In [11]:
%%time
report = reporting_fpa.set_name("Test Classifier").set_level('fpa').create_report(1)



Training Results
~~~~~~~~~~~~~~~~
classification_report
             precision    recall  f1-score   support

          0       0.87      0.43      0.58      2341
          1       0.75      0.96      0.85      4235

avg / total       0.80      0.77      0.75      6576



f1_score
0.8464807712242147


accuracy_score
0.774787104622871


confusion_matrix
[[1012, 1329], [152, 4083]]




Testing Results Results
~~~~~~~~~~~~~~~~~~~~~~~
classification_report
             precision    recall  f1-score   support

          0       0.63      0.27      0.38      1118
          1       0.70      0.91      0.80      2122

avg / total       0.68      0.69      0.65      3240



f1_score
0.7960607304062373


accuracy_score
0.6932098765432099


confusion_matrix
[[306, 812], [182, 1940]]




Wall time: 3min 53s


In [13]:
%%time
report = reporting_fpa.set_name("Test Classifier").set_level('fpa').create_report(1)



Training Results
~~~~~~~~~~~~~~~~
f1_score
0.9996459341437508


confusion_matrix
[[2338, 3], [0, 4235]]


accuracy_score
0.9995437956204379


classification_report
             precision    recall  f1-score   support

          0       1.00      1.00      1.00      2341
          1       1.00      1.00      1.00      4235

avg / total       1.00      1.00      1.00      6576





Testing Results Results
~~~~~~~~~~~~~~~~~~~~~~~
f1_score
0.786350793305803


confusion_matrix
[[448, 670], [313, 1809]]


accuracy_score
0.696604938271605


classification_report
             precision    recall  f1-score   support

          0       0.59      0.40      0.48      1118
          1       0.73      0.85      0.79      2122

avg / total       0.68      0.70      0.68      3240





Wall time: 8.9 s


In [46]:
%%time
# NOTE(review): `reporting_fpl` is not defined in any cell visible in this
# notebook (only `reporting` and `reporting_fpa` are), so this cell raises
# NameError on a fresh kernel. The three-class output below suggests a separate
# classifier/split was built in a cell that has since been removed -- TODO
# restore that setup before this call.
report = reporting_fpl.set_name("Test Classifier").set_level('fpl').create_report(1)

Training Results
~~~~~~~~~~~~~~~~
classification_report
             precision    recall  f1-score   support

          0       1.00      1.00      1.00      2220
          1       1.00      1.00      1.00      1112
          2       1.00      1.00      1.00       927

avg / total       1.00      1.00      1.00      4259



confusion_matrix
[[2216, 1, 3], [1, 1111, 0], [3, 0, 924]]


f1_score
0.9981216247945527


accuracy_score
0.9981216247945527




Testing Results Results
~~~~~~~~~~~~~~~~~~~~~~~
classification_report
             precision    recall  f1-score   support

          0       0.71      0.74      0.72      1067
          1       0.58      0.60      0.59       564
          2       0.51      0.44      0.47       467

avg / total       0.63      0.64      0.63      2098



confusion_matrix
[[787, 147, 133], [158, 341, 65], [162, 99, 206]]


f1_score
0.6327959100580349


accuracy_score
0.6358436606291706




Wall time: 2.09 s
