In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import oll

# Load the gzip-compressed CSV corpus into a DataFrame.
corpus = pd.read_csv('corpus.csv.gz', compression='gzip')
# Portuguese stop-word list from NLTK.
# NOTE: this rebinds the name `stopwords`, shadowing the `nltk.corpus.stopwords`
# object imported above; the import has to be re-run before this line can be
# executed a second time.
stopwords = stopwords.words("portuguese")

In [2]:
# Disabled alternative filter, kept for reference:
#   corpus = corpus[corpus['_golden'] == False]
#
# Keep only the rows whose label confidence is exactly 1, then rebuild a fresh
# positional index (reset_index() keeps the previous index as a column).
# NOTE(review): this rebinds `corpus` in place, so re-running the cell inserts
# one more index column each time.
corpus = (
    corpus
    .loc[corpus['qual_a_melhor_classificao_para_esse_texto:confidence'] == 1]
    .reset_index()
)
corpus.shape

(534, 23)

In [3]:
# fix labels to binary
def classFit(x):
    """Map one corpus row to a binary label.

    Parameters
    ----------
    x : mapping-like row (e.g. a pandas Series) exposing the key
        'qual_a_melhor_classificao_para_esse_texto'.

    Returns
    -------
    int
        1 when that field equals "diario", -1 for any other value.
    """
    is_diary = x['qual_a_melhor_classificao_para_esse_texto'] == "diario"
    return 1 if is_diary else -1
    
# Encode every row's label with classFit (1 for "diario", -1 otherwise) and
# store the result in a new 'class' column.
corpus['class'] = corpus.apply(classFit,axis=1)
# Plain NumPy array of the encoded labels; the cross-validation cells index it
# with the positional train/test indices produced by the splitter.
target = corpus['class'].values

# Sanity check: the first two raw labels followed by their encoded values.
print(corpus['qual_a_melhor_classificao_para_esse_texto'].values[:2])
print(corpus['class'][:2])

['diario' 'outro']
0    1
1   -1
Name: class, dtype: int64


In [None]:
import nltk.stem
# Single shared RSLP stemmer instance, reused for every token.
portuguese_stemmer = nltk.stem.RSLPStemmer()


class StemmedTfidfVectorizer(TfidfVectorizer):
    """TfidfVectorizer variant whose analyzer stems each token.

    The regular analyzer built by the parent class is wrapped so that every
    token it yields is passed through ``portuguese_stemmer.stem``.
    """

    def build_analyzer(self):
        """Return a callable mapping a document to a generator of stemmed tokens."""
        # super() must name *this* class so that lookup starts at the direct
        # parent. Naming TfidfVectorizer here would skip any build_analyzer
        # defined on TfidfVectorizer itself.
        analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()
        return lambda doc: (portuguese_stemmer.stem(w) for w in analyzer(doc))

In [4]:
## manual k-fold cross-validation splitter
# NOTE(review): this heading used to say "10-fold", but n_splits=2 builds a
# 2-fold split -- confirm which one was intended before changing either.
# shuffle=False: rows are not shuffled before being split into folds.
kf = KFold(n_splits=2, random_state=None, shuffle=False)

In [5]:
# Sweep the TF-IDF vocabulary size (max_features) from 800 to 1900 in steps of
# 100 and print the mean accuracy / precision / recall over the folds of `kf`.
# NOTE(review): the vectorizer is fitted on the full corpus before splitting,
# so the texts of each test fold also contribute to the vocabulary and weights.
for i in range(800,2000,100):
    # Vectorise the whole corpus with at most `i` terms.
    data = TfidfVectorizer(
        max_features=i,
        strip_accents='unicode',
        stop_words=stopwords
    ).fit_transform(corpus.content)

    # Per-fold scores for this vocabulary size.
    accuracy, precision, recall = [], [], []

    for train_index, test_index in kf.split(data):
        X_train, y_train = data[train_index], target[train_index]
        X_test, y_test = data[test_index], target[test_index]

        # A fresh model is created for every fold.
        model = oll.oll("CW", C=2)
        model.fit(X_train, y_train)
        predicted = model.predict(X_test)

        accuracy.append(accuracy_score(y_test, predicted))
        precision.append(precision_score(y_test, predicted))
        recall.append(recall_score(y_test, predicted))

    # One summary line per vocabulary size: "<i>: acc(..), prec(..), rec(..)".
    print(''.join([
        str(i), ': acc(', str(np.mean(accuracy)),
        '), prec(', str(np.mean(precision)),
        '), rec(', str(np.mean(recall)), ')'
    ]))

800: acc(0.758426966292), prec(0.752552778975), rec(0.936540327167)
900: acc(0.76404494382), prec(0.755231560892), rec(0.942528351119)
1000: acc(0.765917602996), prec(0.74983692107), rec(0.959488843542)
1100: acc(0.762172284644), prec(0.747248379601), rec(0.956494831566)
1200: acc(0.769662921348), prec(0.750909090909), rec(0.965476867494)
1300: acc(0.769662921348), prec(0.750909090909), rec(0.965476867494)
1400: acc(0.765917602996), prec(0.746596411844), rec(0.96847087947)
1500: acc(0.762172284644), prec(0.743106364566), rec(0.96847087947)
1600: acc(0.756554307116), prec(0.736842105263), rec(0.971264175559)
1700: acc(0.758426966292), prec(0.736329979115), rec(0.976850767738)
1800: acc(0.756554307116), prec(0.734683075294), rec(0.976850767738)
1900: acc(0.760299625468), prec(0.737991266376), rec(0.976850767738)


In [7]:
# Fix the vocabulary size at 1200 terms and sweep the model's C parameter over
# 1, 6, ..., 46, printing the mean accuracy / precision / recall over the folds.
# NOTE(review): the vectorizer is fitted on the full corpus before splitting,
# so the texts of each test fold also contribute to the vocabulary and weights.
vectorizer = TfidfVectorizer(
    max_features=1200,
    strip_accents='unicode',
    stop_words=stopwords
)
data = vectorizer.fit_transform(corpus.content)

for c in range(1,51,5):
    # Per-fold scores for this value of C.
    accuracy, precision, recall = [], [], []

    for train_index, test_index in kf.split(data):
        X_train, y_train = data[train_index], target[train_index]
        X_test, y_test = data[test_index], target[test_index]

        # A fresh model is created for every fold.
        model = oll.oll("CW", C=c)
        model.fit(X_train, y_train)
        predicted = model.predict(X_test)

        accuracy.append(accuracy_score(y_test, predicted))
        precision.append(precision_score(y_test, predicted))
        recall.append(recall_score(y_test, predicted))

    # One summary line per C value: "<c>: acc(..), prec(..), rec(..)".
    print(''.join([
        str(c), ': acc(', str(np.mean(accuracy)),
        '), prec(', str(np.mean(precision)),
        '), rec(', str(np.mean(recall)), ')'
    ]))

1: acc(0.769662921348), prec(0.755509468745), rec(0.953902251363)
6: acc(0.771535580524), prec(0.753542178542), rec(0.962282139631)
11: acc(0.767790262172), prec(0.753424657534), rec(0.95350081959)
16: acc(0.769662921348), prec(0.755079385028), rec(0.95350081959)
21: acc(0.775280898876), prec(0.760135924392), rec(0.95350081959)
26: acc(0.771535580524), prec(0.760249670063), rec(0.944920215435)
31: acc(0.767790262172), prec(0.759242275358), rec(0.939333623256)
36: acc(0.769662921348), prec(0.763536170974), rec(0.933546315191)
41: acc(0.762172284644), prec(0.760144436922), rec(0.925166426923)
46: acc(0.756554307116), prec(0.757347605225), rec(0.919579834744)


In [16]:
# Single evaluation run: TF-IDF capped at 1200 terms, model "CW" with C=2,
# scored over the folds of `kf`. `vectorizer` is kept as a named object so the
# fitted vocabulary can be inspected afterwards.
# NOTE(review): the vectorizer is fitted on the full corpus before splitting,
# so the texts of each test fold also contribute to the vocabulary and weights.
vectorizer = TfidfVectorizer(max_features=1200, strip_accents='unicode', stop_words=stopwords)
data = vectorizer.fit_transform(corpus.content)

# Per-fold scores.
accuracy = []
precision = []
recall = []

for train_index, test_index in kf.split(data):

    # A fresh model is created for every fold.
    model = oll.oll("CW", C=2)

    X_train, X_test = data[train_index], data[test_index]
    y_train, y_test = target[train_index], target[test_index]
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)

    accuracy.append(accuracy_score(y_test, predicted))
    precision.append(precision_score(y_test, predicted))
    recall.append(recall_score(y_test, predicted))

# The document-term matrix has one column per vocabulary term, so its width is
# the number of features actually kept. Reading it from the matrix avoids
# get_feature_names(), which newer scikit-learn releases no longer provide.
print(str(data.shape[1]) + ': acc(' + str(np.mean(accuracy))
          + '), prec(' + str(np.mean(precision))
          + '), rec(' + str(np.mean(recall)) + ')'
         )

900: acc(0.76404494382), prec(0.755231560892), rec(0.942528351119)


In [19]:
# Export the vocabulary of the fitted vectorizer to CSV.
# get_feature_names() was removed from recent scikit-learn releases in favour
# of get_feature_names_out(); use the new accessor when it exists and fall back
# to the old one otherwise so the cell runs on both.
if hasattr(vectorizer, 'get_feature_names_out'):
    # The new accessor returns an array; convert it so `feature_names` stays a list.
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()
pd.DataFrame(feature_names).to_csv('feature_names_cw.csv')