In [58]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.externals import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from sklearn.naive_bayes import MultinomialNB

corpus = pd.read_csv('corpus.csv.gz', compression='gzip')
posts = pd.read_csv('posts_count_me_pol.csv.gz', compression='gzip', error_bad_lines=False)
stopwords = stopwords.words("portuguese")
feature_names = pd.read_csv('feature_names.csv')

posts = posts[posts['me'] > 2]
posts = posts[posts['polarity'] > 2]
posts = posts.reset_index()
posts.shape

(37746, 20)

In [59]:
corpus = pd.read_csv('corpus.csv.gz', compression='gzip')
#corpus = corpus[corpus['_golden'] == False]
corpus = corpus[corpus['qual_a_melhor_classificao_para_esse_texto:confidence'] == 1]
corpus = corpus[corpus['_trusted_judgments'] == 3]
corpus = corpus.reset_index()
corpus.shape

(496, 28)

In [60]:
# fix labels to binary
lb = preprocessing.LabelBinarizer(neg_label=1, pos_label=2)
target = lb.fit_transform(corpus['qual_a_melhor_classificao_para_esse_texto'].values)
c, r = target.shape
target = target.reshape(c,)
corpus['class'] = target

In [61]:
corpus[['qual_a_melhor_classificao_para_esse_texto','class']].sample(5)

Unnamed: 0,qual_a_melhor_classificao_para_esse_texto,class
176,outro,2
470,diario,1
83,outro,2
217,outro,2
198,diario,1


In [62]:
posts['class'] = 0
column_idx = posts.columns.get_loc('class')

for idx_corpus in corpus.index:
    postid = corpus.iloc[idx_corpus].postid
    idx_post = posts[posts['postID']==postid].index
    posts.iloc[idx_post,column_idx] = corpus.iloc[idx_corpus]['class']

In [63]:
vectorizer = TfidfVectorizer(ngram_range=(1,1), strip_accents='unicode', 
                             stop_words=stopwords, vocabulary=feature_names['0'].values)
data = vectorizer.fit_transform(posts.content)
data.shape

(37746, 800)

In [64]:
matrixPD = pd.DataFrame(data.toarray())
matrixPD['class'] = posts['class']
matrixPD = matrixPD[matrixPD['class'] > 0]
matrixPD.shape

label = matrixPD['class'].values
features = matrixPD.drop('class',axis=1)

In [65]:
from sklearn.model_selection import cross_val_score
model = MultinomialNB()
model.fit(features, label)
cross_val_score(model, features, label, cv=10, scoring='accuracy').mean()

0.76210324129651863

In [66]:
posts['prediction'] = model.predict(data)

In [67]:
storyPosts = posts[posts['prediction']==1]
blogs = storyPosts[['posts.csv']].groupby(['posts.csv'])
autores = storyPosts[['authorID']].groupby(['authorID'])
print('Postagens: %i' %len(storyPosts))
print('Blogs: %i' %len(blogs))
print('Autores: %i' %len(autores))

Postagens: 33712
Blogs: 15290
Autores: 15637


In [68]:
storyPosts['blogID'] = storyPosts['posts.csv'].astype(int).astype('str')
groupReflex = storyPosts[['blogID','contentCount']].groupby(['blogID']).agg(['count','mean'])
groupReflex['contentCount'].sort_values('count',ascending=False).head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


Unnamed: 0_level_0,count,mean
blogID,Unnamed: 1_level_1,Unnamed: 2_level_1
2074759621198619904,10,637.6
2450457143799924224,10,623.3
1435403649444620800,10,674.4
1298259038914305792,10,472.6
1922045236865661952,10,280.1
2421741074400311808,10,227.6
2237344542579056128,10,464.5
1505707847476654336,10,466.6
2060068719634800128,10,318.6
1679079252531209216,10,358.3


In [70]:
len(groupReflex[groupReflex['contentCount']['count'] >= 6])

881

In [75]:
groupReflex[groupReflex['contentCount']['count'] >= 6].sum()

contentCount  count      5985.000000
              mean     371094.764286
dtype: float64