In [1]:
import pandas as pd
pd.set_option('display.max_rows', 1000)

In [2]:
#TWTS = ['PREZIDENTmluvci', 'stary_mrzout', 'JaromirBosak', 'zufanek', 'tangero', 'zdrojak', 'PavelUngr', 'Tomio_Okamura']
TWTS = ['PREZIDENTmluvci', 'PavelUngr', 'JaromirBosak', 'zufanek']

SUFFIX = '_dump.csv'

datasets = []
for who in TWTS:
    frame = pd.DataFrame.from_csv(who+SUFFIX, encoding='utf-8')
    frame['cnt'] = frame.loc[:, 'fav_cnt'] + frame.loc[:, 'rt_cnt']
    
    best_thr = frame['cnt'].quantile(q=0.75)
    frame['good'] = frame['cnt'] >= best_thr

    frame['who'] = who
    print who, ' good if cnt >=', best_thr, 'total', len(frame)
    

    datasets.append(frame)

PREZIDENTmluvci  good if cnt >= 10.0 total 1145
PavelUngr  good if cnt >= 5.0 total 3227
JaromirBosak  good if cnt >= 22.0 total 3245
zufanek  good if cnt >= 3.0 total 3230


In [3]:
data = pd.concat(datasets)

In [4]:
data['text'].fillna("", inplace=True)
data['text'] = data['who'] +' ' + data['text'].str.replace('\s+', ' ')

In [5]:
data.describe()

Unnamed: 0,fav_cnt,rt_cnt,cnt,good
count,10847.0,10847.0,10847.0,10847
mean,6.751175,28.495437,35.246612,0.2641283
std,18.710005,1364.29328,1364.326409,0.4408883
min,0.0,0.0,0.0,False
25%,0.0,0.0,0.0,0
50%,1.0,0.0,1.0,0
75%,5.0,1.0,7.0,1
max,383.0,123006.0,123006.0,True


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import SGDClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score, f1_score

import numpy as np

In [7]:
TEST_SIZE = len(data) // 20
train_data, test_data= train_test_split(data, test_size=TEST_SIZE, random_state=42)
#test_data = data[:N]

In [8]:
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1,5), lowercase=False)
vectorizer.fit(train_data['text'].values)

train_X = vectorizer.transform(train_data['text'].values)
train_Y = train_data['good'].values.astype(np.int)

test_X = vectorizer.transform(test_data['text'].values)
test_Y = test_data['good'].values.astype(np.int)

In [9]:
clsf = SGDClassifier(loss='modified_huber',
                     alpha=1e-6, n_iter=200,
                     class_weight='balanced',
                     learning_rate='constant', eta0=0.001)
clsf.fit(train_X, train_Y)

SGDClassifier(alpha=1e-06, average=False, class_weight='balanced',
       epsilon=0.1, eta0=0.001, fit_intercept=True, l1_ratio=0.15,
       learning_rate='constant', loss='modified_huber', n_iter=200,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, verbose=0, warm_start=False)

In [10]:
pred_Y = clsf.predict(test_X)
prob_Y = clsf.predict_proba(test_X)[:, 1]
test_data.loc[:, 'pred'] = pd.Series(index=test_data.index, data=pred_Y.astype(np.bool))
test_data.loc[:, 'score'] = pd.Series(index=test_data.index, data=prob_Y)
print 'Acc:', accuracy_score(test_Y, pred_Y)
print 'F1: ', f1_score(test_Y, pred_Y)

for who in test_data['who'].unique():
    sel = test_data['who'] == who
    print who, 'F1: ', f1_score(test_data[sel]['good'], test_data[sel]['pred'])

Acc: 0.813653136531
F1:  0.707246376812
JaromirBosak F1:  0.770833333333
zufanek F1:  0.718446601942
PREZIDENTmluvci F1:  0.777777777778
PavelUngr F1:  0.618181818182


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [11]:
test_data

Unnamed: 0_level_0,fav_cnt,rt_cnt,text,cnt,good,who,pred,score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
698143220551778304,46,2,JaromirBosak Už byl nejvyšší čas...,48,True,JaromirBosak,True,0.564522
696623677761265664,19,0,"zufanek @stary_mrzout @masinko Martin slyší, M...",19,True,zufanek,False,0.436826
507301936396521472,1,0,"JaromirBosak @jedenpes chtěl jsem napsat, že t...",1,False,JaromirBosak,False,0.0
662272935772688384,1,0,"zufanek @zahn0 drogy, to chce kvalitní otupova...",1,False,zufanek,False,0.368522
704976022828142593,6,1,"PREZIDENTmluvci 1/2 Prvoinstanční soud v ""Kauz...",7,False,PREZIDENTmluvci,False,0.35187
344859291066449921,1,0,zufanek @davidsmehlik metáme od vzteku kotrmel...,1,False,zufanek,False,0.477312
648794158539407360,2,0,PavelUngr 2 poznatky: 1. Když spouštíte #seoux...,2,False,PavelUngr,False,0.343909
24178773011,0,0,zufanek @jakub12 jsem pro!,0,False,zufanek,False,0.0
572089412672233473,21,0,"JaromirBosak Aha, Wolfsburg už vede 5:3, tento...",21,False,JaromirBosak,True,0.634571
483338088035659776,2,0,JaromirBosak @kalous37 @JanKaliba Jak zpíval M...,2,False,JaromirBosak,False,0.214341


In [12]:
import lime
import lime.lime_text
from lime.lime_text import ScikitClassifier, LimeTextExplainer
explainer = LimeTextExplainer(class_names=['avg', 'good'], split_expression='(?u)\W+',)

c = ScikitClassifier(clsf, vectorizer)




In [13]:
%matplotlib inline
tid = 532162079806193664
print test_data.loc[tid, 'text']
exp = explainer.explain_instance(test_data.loc[tid, 'text'], c.predict_proba, num_features=10)
exp.show_in_notebook(text=False)

KeyError: 'the label [532162079806193664] is not in the [index]'

In [None]:
print ''.join(sorted(chars))