In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.externals import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from sklearn.naive_bayes import MultinomialNB

corpus = pd.read_csv('corpus.csv.gz', compression='gzip')
posts = pd.read_csv('posts_count_me_pol.csv.gz', compression='gzip', error_bad_lines=False)
stopwords = stopwords.words("portuguese")
feature_names = pd.read_csv('feature_names.csv')

posts = posts[posts['me'] > 2]
posts = posts[posts['polarity'] > 2]
posts = posts[posts.contentCount.apply(lambda x: str(x).isnumeric())]
posts = posts.reset_index()
posts.shape

(37746, 20)

In [2]:
corpus = corpus[corpus['qual_a_melhor_classificao_para_esse_texto:confidence'] == 1]
corpus = corpus.reset_index()

# fix labels to binary
def classFit(x):
    if x['qual_a_melhor_classificao_para_esse_texto'] == "diario":
        return 1
    else:
        return -1
    
corpus['class'] = corpus.apply(classFit,axis=1)
target = corpus['class'].values

print(corpus['qual_a_melhor_classificao_para_esse_texto'].values[:2])
print(corpus['class'][:2])

['diario' 'diario']
0    1
1    1
Name: class, dtype: int64


In [3]:
posts['class'] = 0
column_idx = posts.columns.get_loc('class')

for idx_corpus in corpus.index:
    postid = corpus.iloc[idx_corpus].postid
    idx_post = posts[posts['postID']==postid].index
    posts.iloc[idx_post,column_idx] = corpus.iloc[idx_corpus]['class']

In [4]:
vectorizer = TfidfVectorizer(strip_accents='unicode', 
                             stop_words=stopwords, vocabulary=feature_names['0'].values)
data = vectorizer.fit_transform(posts.content)
data.shape

(37746, 2300)

In [5]:
matrixPD = pd.DataFrame(data.toarray())
matrixPD['class'] = posts['class']
matrixPD = matrixPD[matrixPD['class'] != 0]
print(matrixPD.shape)

label = matrixPD['class'].values
features = matrixPD.drop('class',axis=1).values

(534, 2301)


In [6]:
from sklearn.model_selection import cross_val_score

modelNB = MultinomialNB(alpha=0.01)
modelNB.fit(features, label)
acc = cross_val_score(modelNB, features, label, cv=10, scoring='accuracy').mean()
prec = cross_val_score(modelNB, features, label, cv=10, scoring='precision').mean()
rec = cross_val_score(modelNB, features, label, cv=10, scoring='recall').mean()

print('NB: acc(' + str(acc) 
          + '), prec(' + str(prec)
          + '), rec(' + str(rec) + ')'
         )

NB: acc(0.786727947105), prec(0.785005776291), rec(0.92781512605)


In [7]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score
import oll

accuracy = []
precision = []
recall = []
kf = KFold(n_splits=2, random_state=None, shuffle=False)

for train_index, test_index in kf.split(features):

    modelCW = oll.oll("CW", C=2)
    
    X_train, X_test = features[train_index], features[test_index]
    y_train, y_test = label[train_index], label[test_index]
    modelCW.fit(X_train, y_train)
    predicted = modelCW.predict(X_test)

    accuracy.append(accuracy_score(y_test, predicted))
    precision.append(precision_score(y_test, predicted))
    recall.append(recall_score(y_test, predicted))

#1 - 779,788,901
#2 - 782,789,907
#3 - 780,782,916
#4 - 782,781,921
#5 - 779,778,921
#10- 777,772,930
    
print('CW: acc(' + str(np.mean(accuracy)) 
          + '), prec(' + str(np.mean(precision))
          + '), rec(' + str(np.mean(recall)) + ')'
         )

CW: acc(0.782771535581), prec(0.789214199029), rec(0.907514450867)


In [8]:
posts['prediction'] = modelCW.predict(data)
posts['confidence'] = modelCW.scores(data)

In [9]:
storyPosts = posts[posts['confidence'] > 0] #positive class
blogs = storyPosts[['posts.csv']].groupby(['posts.csv'])
autores = storyPosts[['authorID']].groupby(['authorID'])
print('Postagens: %i' %len(storyPosts))
print('Blogs: %i' %len(blogs))
print('Autores: %i' %len(autores))

Postagens: 30542
Blogs: 14090
Autores: 14401


In [10]:
storyPosts['blogID'] = storyPosts['posts.csv'].astype(int).astype('str')
groupReflex = storyPosts[['blogID','contentCount']].groupby(['blogID']).agg(['count','mean'])
groupReflex['contentCount'].sort_values('count',ascending=False).head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


Unnamed: 0_level_0,count,mean
blogID,Unnamed: 1_level_1,Unnamed: 2_level_1
1423861485554136832,10,403.4
1775307651847132928,10,730.3
1298259038914305792,10,472.6
2171485080582464512,10,511.1
2060068719634800128,10,318.6
2359837956202366464,10,321.8
2421741074400311808,10,227.6
1679079252531209216,10,358.3
1505707847476654336,10,466.6
197663378953815200,10,470.3


In [11]:
len(groupReflex[groupReflex['contentCount']['count'] >= 6])

769

In [12]:
groupReflex[groupReflex['contentCount']['count'] >= 6].sum()

contentCount  count      5215.00000
              mean     321383.39246
dtype: float64

In [26]:
storyBlogsIDs = groupReflex[groupReflex['contentCount']['count'] >= 6].index.values
storyBlogPosts = storyPosts[storyPosts['blogID'].isin(storyBlogsIDs)]
len(storyBlogPosts)

5215

In [27]:
storyBlogPosts.to_csv('story_blog_posts.csv.gz', compression='gzip')