In [0]:
#importações das bibliotecas
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import balanced_accuracy_score, f1_score
import pandas as pd
from pathlib import Path

In [38]:
# Load the pt-BR stopwords (the file is read as a single headerless column).
# dtype=str and keep_default_na=False keep every entry as a literal string; with the
# defaults, read_csv would silently turn tokens such as 'null', 'nan' or 'NA' into
# float NaN, which would then end up inside the stop-word list.
stopwords = pd.read_csv('https://raw.githubusercontent.com/stopwords-iso/stopwords-pt/master/stopwords-pt.txt', header=None, dtype=str, keep_default_na=False)[0].values.tolist()
stopwords

['a',
 'acerca',
 'adeus',
 'agora',
 'ainda',
 'alem',
 'algmas',
 'algo',
 'algumas',
 'alguns',
 'ali',
 'além',
 'ambas',
 'ambos',
 'ano',
 'anos',
 'antes',
 'ao',
 'aonde',
 'aos',
 'apenas',
 'apoio',
 'apontar',
 'apos',
 'após',
 'aquela',
 'aquelas',
 'aquele',
 'aqueles',
 'aqui',
 'aquilo',
 'as',
 'assim',
 'através',
 'atrás',
 'até',
 'aí',
 'baixo',
 'bastante',
 'bem',
 'boa',
 'boas',
 'bom',
 'bons',
 'breve',
 'cada',
 'caminho',
 'catorze',
 'cedo',
 'cento',
 'certamente',
 'certeza',
 'cima',
 'cinco',
 'coisa',
 'com',
 'como',
 'comprido',
 'conhecido',
 'conselho',
 'contra',
 'contudo',
 'corrente',
 'cuja',
 'cujas',
 'cujo',
 'cujos',
 'custa',
 'cá',
 'da',
 'daquela',
 'daquelas',
 'daquele',
 'daqueles',
 'dar',
 'das',
 'de',
 'debaixo',
 'dela',
 'delas',
 'dele',
 'deles',
 'demais',
 'dentro',
 'depois',
 'desde',
 'desligado',
 'dessa',
 'dessas',
 'desse',
 'desses',
 'desta',
 'destas',
 'deste',
 'destes',
 'deve',
 'devem',
 'deverá',
 'dez',
 

In [0]:
# Concatenate the title with the text.
def append_title_text(row):
    """Join a row's title and text into a single string.

    Both fields are stripped of surrounding whitespace *before* deciding on the
    connector, so a title such as ``'Foo. '`` is recognised as already ending
    with a period and does not produce ``'Foo.. text'``.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) holding string values
            under the keys ``'title'`` and ``'text'``.

    Returns:
        str: ``title + '. ' + text``, or ``title + ' ' + text`` when the title
        already ends with a period. If either part is empty after stripping,
        the other part is returned alone, without a dangling connector.
    """
    title = row['title'].strip()
    text = row['text'].strip()
    # Nothing to join: avoid emitting a leading or trailing '. '.
    if not title:
        return text
    if not text:
        return title
    connector = ' ' if title.endswith('.') else '. '
    return title + connector + text

# Load the dataset file.
def load_dataset(filename, train=True):
    """Read a dataset CSV and merge each row's title into its text.

    The 'text' column is overwritten with the output of ``append_title_text``
    and the 'title' column is removed.

    Args:
        filename: Path (or anything ``pd.read_csv`` accepts) of the CSV file.
        train: When true, the 'category' column is renamed to 'label';
            otherwise the 'id' column is dropped.

    Returns:
        pandas.DataFrame: the processed frame.
    """
    frame = pd.read_csv(filename)
    frame['text'] = frame.apply(append_title_text, axis=1)
    frame = frame.drop(columns='title')
    if not train:
        return frame.drop(columns='id')
    return frame.rename(columns={'category': 'label'})

# Save the submission file.
def write_predictions(predictions, out_path):
    """Write predictions to a submission CSV with the header ``id,category``.

    Each prediction is written on its own row together with its zero-based
    position in ``predictions``. Progress is printed every 100 rows.

    Args:
        predictions: Iterable of predicted labels. Labels are serialised by
            the csv writer, so non-string labels are accepted and labels
            containing commas or quotes are quoted correctly.
        out_path: Destination file path; an existing file is overwritten.
    """
    import csv  # local import: the notebook's import cell does not provide it

    count = 0
    # newline='' hands line-ending control to the csv writer; the file is
    # closed by the context manager, so no explicit close() is needed.
    with open(out_path, mode='w', encoding='utf-8', newline='') as out_file:
        print('Saving predictions to %s' % out_path)
        writer = csv.writer(out_file, lineterminator='\n')
        writer.writerow(['id', 'category'])
        for idx, result in enumerate(predictions):
            writer.writerow([idx, result])
            count = idx + 1
            if count % 100 == 0:
                print('Predicted %d sentences' % count)
    print('Finished predicting %d sentences' % count)
    print('Results saved in %s' % Path(out_path).absolute())

# Definition of the model used.
def train_svm_model(train_df, dev_df):
    """Build and fit the bag-of-words -> TF-IDF -> linear SVM pipeline.

    The vectorizer removes the words in the module-level ``stopwords`` list;
    the classifier is an ``SGDClassifier`` with hinge loss and a fixed
    ``random_state``.

    Args:
        train_df: Frame with a 'text' column (inputs) and a 'label' column
            (targets) used for fitting.
        dev_df: Accepted for interface compatibility; not used here.

    Returns:
        The fitted ``Pipeline``.
    """
    steps = [
        ('vect', CountVectorizer(stop_words=stopwords)),
        ('tfidf', TfidfTransformer()),
        ('clf-svm', SGDClassifier(loss='hinge', max_iter=2000, tol=1e-5, random_state=42)),
    ]
    pipeline = Pipeline(steps)
    # fit() returns the pipeline itself, so returning `pipeline` is equivalent.
    pipeline.fit(train_df['text'], train_df['label'])
    return pipeline

In [40]:
#download do dataset
!wget https://github.com/jacksonsavitraz/kaggle-nlp-competition/raw/master/df.zip
!unzip df.zip

--2019-07-17 13:20:34--  https://github.com/jacksonsavitraz/kaggle-nlp-competition/raw/master/df.zip
Resolving github.com (github.com)... 140.82.118.4
Connecting to github.com (github.com)|140.82.118.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/jacksonsavitraz/kaggle-nlp-competition/master/df.zip [following]
--2019-07-17 13:20:34--  https://raw.githubusercontent.com/jacksonsavitraz/kaggle-nlp-competition/master/df.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15145423 (14M) [application/zip]
Saving to: ‘df.zip.1’


2019-07-17 13:20:35 (194 MB/s) - ‘df.zip.1’ saved [15145423/15145423]

Archive:  df.zip
replace df_train.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: df_train.csv 

In [41]:
# Load the training dataset (train=True renames the 'category' column to 'label')
# and preview the first rows.
train_df = load_dataset('df_train.csv', train=True)
train_df.head()

Unnamed: 0,text,label
0,Casa da Barra Funda tem clima roceiro e receit...,comida
1,Professores de SP decidem manter greve; grupo ...,educacao
2,"Em segunda edição, concurso paga R$ 35 mil par...",empreendedorsocial
3,Usar maconha por anos não faz tão mal para a s...,equilibrioesaude
4,Baleia-azul percorre 5.200 km e revela a cient...,ciencia


In [42]:
# Load the validation dataset (train=False drops the 'id' column and performs no
# label rename) and preview the first rows.
dev_df = load_dataset('df_valid.csv', train=False)
dev_df.head()

Unnamed: 0,text
0,"Vitrine de Dilma, Pronatec terá orçamento 65% ..."
1,"Por direitos autorais e publicidade, 'youtuber..."
2,Rótulos de alimentos terão que alertar sobre l...
3,Sociedade britânica de compositores processa S...
4,"Por Fies, aluna madruga na porta da FMU, mas s..."


In [0]:
# Train the model. dev_df is passed along, but train_svm_model fits on train_df only.
svm_model = train_svm_model(train_df, dev_df)

In [44]:
# Predict on the training data and print the balanced accuracy.
# NOTE(review): the score is computed on train_df, the same rows the model was
# fitted on, so it reflects fit to the training set rather than held-out performance.
predicted_svm = svm_model.predict(train_df['text'])
print(balanced_accuracy_score(train_df['label'], predicted_svm))

0.9943942884351809


In [45]:
# Predict on dev_df and save the results to the submission file.
write_predictions(svm_model.predict(dev_df['text']), 'submission.csv')

Saving predictions to submission.csv
Predicted 100 sentences
Predicted 200 sentences
Predicted 300 sentences
Predicted 400 sentences
Predicted 500 sentences
Predicted 600 sentences
Predicted 700 sentences
Predicted 800 sentences
Predicted 900 sentences
Predicted 1000 sentences
Predicted 1100 sentences
Predicted 1200 sentences
Predicted 1300 sentences
Predicted 1400 sentences
Predicted 1500 sentences
Predicted 1600 sentences
Predicted 1700 sentences
Predicted 1800 sentences
Predicted 1900 sentences
Predicted 2000 sentences
Predicted 2100 sentences
Predicted 2200 sentences
Predicted 2300 sentences
Predicted 2400 sentences
Predicted 2500 sentences
Predicted 2600 sentences
Predicted 2700 sentences
Predicted 2800 sentences
Predicted 2900 sentences
Predicted 3000 sentences
Predicted 3100 sentences
Predicted 3200 sentences
Predicted 3300 sentences
Predicted 3400 sentences
Predicted 3500 sentences
Predicted 3600 sentences
Predicted 3700 sentences
Predicted 3800 sentences
Predicted 3900 sentenc

In [0]:
# Download the file for submission on Kaggle.
# NOTE(review): google.colab is presumably only importable inside a Google Colab
# runtime, so this cell is expected to fail elsewhere — confirm before running locally.
from google.colab import files
files.download('submission.csv')