In [2]:
import pandas as pd
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
import re
import string
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.externals import joblib
from sklearn.ensemble import GradientBoostingClassifier


Using TensorFlow backend.
  return f(*args, **kwds)


In [3]:
#Lê os dados do disco
data = pd.read_csv('./dataset_twitter.csv', encoding='utf-8', sep=',')

#Tira as linhas com NaN
data = data.dropna() 

data.count()

Unnamed: 0       10658
Text             10658
Classificacao    10658
dtype: int64

In [3]:
data.isnull().any()
#A coluna text possui nan values

Unnamed: 0       False
Text             False
Classificacao    False
dtype: bool

In [6]:
data.shape

(10658, 3)

In [4]:
data[data['Text'].isnull()]

Unnamed: 0.1,Unnamed: 0,Text,Classificacao


In [8]:
tweets = data['Text'].values

tweets

array(['rt ja pensou presentear sua tia querida no amigo secreto com um desses ecoracao https',
       'ja pensou presentear sua tia querida no amigo secreto com um desses https t co ezbwznwwv',
       'rt nam hoje no shopping com o aniversariante jimin i me desmarca disso https t co gpr',
       ..., 'meu nene ta bravo',
       'rt o discurso de na saude e na doenca e td mentira na chance cai fora',
       'parece https t co qrzjfdll'], dtype=object)

In [10]:
y = data['Classificacao'].values
y

array(['Positivo', 'Positivo', 'Positivo', ..., 'Negativo', 'Negativo',
       'Negativo'], dtype=object)

In [11]:
def clean_doc(doc):
    ''' Clean documents '''
    
    tokens = doc.split()
    
    #Regex para extraçao de links
    re_http = re.compile('http\S+')
    
    tokens = [re_http.sub(' ', w) for w in tokens]

    #Regex para filtro de caracteres
    re_puc = re.compile('[%s]' % re.escape(string.punctuation))

    #Remove pontucao
    tokens = [re_puc.sub('', w ) for w in tokens]

    #Remove tokens que nao sao alfabeticos
    tokens = [word for word in tokens if word.isalpha()]

    #Filtra stopwords
    stop_words = set(stopwords.words('portuguese'))
    tokens = [w for w in tokens if not w in stop_words]

    tokens = [word.lower() for word in tokens if len(word) > 1] #deixa tudo minusculo 

    tokens = ' '.join(tokens)

    #tokens = ''.join(tokens)

    return tokens

In [12]:
tokens = [clean_doc(tweet) for tweet in tweets]

In [13]:
print(tokens[0], y[0])

rt ja pensou presentear tia querida amigo secreto desses ecoracao Positivo


In [15]:
#Separando os dados em teste e treinamento
x_train, x_test, y_train, y_test = train_test_split(tokens, y, test_size = 0.25, random_state = 42)

print(x_train[100],'-->', y_train[100])

acredito semana stranger things saudade filhos --> Positivo


In [16]:
#Criando unigrams e bigrams

ngram_vectorizer = CountVectorizer(ngram_range=(1, 2))
counts = ngram_vectorizer.fit_transform(x_train[0:10])

In [17]:
ngram_vectorizer.get_feature_names()

['adoram',
 'adoram fazer',
 'aha',
 'aha morram',
 'aleatorias',
 'aleatorias uns',
 'amigas',
 'amigas sempre',
 'amor',
 'amor vida',
 'andar',
 'andar moro',
 'anos',
 'anos colocaram',
 'barata',
 'barata segunda',
 'bendito',
 'bendito onde',
 'bpudr',
 'brasil',
 'chamado',
 'chamado amigas',
 'checagem',
 'checagem limpeza',
 'co',
 'co bpudr',
 'co hcbpb',
 'co jomfury',
 'co khmeplfra',
 'co rsrgesrlzd',
 'co tzturpsdy',
 'co wnrcnef',
 'colocaram',
 'colocaram grupo',
 'compartilhando',
 'compartilhando resultado',
 'corredor',
 'corredor andar',
 'deu',
 'deu presente',
 'echorando',
 'est',
 'est co',
 'facebook',
 'facebook chamado',
 'fazer',
 'fazer trouxa',
 'feira',
 'feira co',
 'fico',
 'fico saber',
 'grupo',
 'grupo facebook',
 'guardar',
 'guardar bendito',
 'hcbpb',
 'hcbpb co',
 'implementei',
 'implementei sistema',
 'jomfury',
 'khmeplfra',
 'limpeza',
 'limpeza corredor',
 'mandamentos',
 'mandamentos guardar',
 'matando',
 'matando saudade',
 'matei',
 'mat

In [18]:
#Construindo a Pipeline com Sklearn
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC())])
    #('clf', GradientBoostingClassifier())

In [19]:
#Parametros que serão testados durante o grid_search
parameters = {
    #'vect__max_df': (0.5, 0.75, 1.0),
    #'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((2, 3), (1, 2)),  # unigrams or bigrams
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__C': ([1e3, 5e3]),
    #'clf__n_estimators':([50, 100, 150]),
    #'clf__learning_rate':([0.5, 1])
}

In [20]:
grid_search = GridSearchCV(pipeline, parameters, n_jobs=3, verbose=1)

In [21]:
grid_search.fit(x_train, y_train)

Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=3)]: Done  48 out of  48 | elapsed:   19.8s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))]),
       fit_params={}, iid=True, n_jobs=3,
       param_grid={'vect__ngram_range': ((2, 3), (1, 2)), 'tfidf__use_idf': (True, False), 'tfidf__norm': ('l1', 'l2'), 'clf__C': [1000.0, 5000.0]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=1)

In [22]:
print(grid_search.best_estimator_)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])


In [23]:
y_pred = grid_search.predict(x_test)

In [24]:
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

   Negativo       0.88      0.84      0.86      1489
   Positivo       0.81      0.85      0.83      1176

avg / total       0.85      0.85      0.85      2665



In [25]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_pred)

0.8457786116322702

In [149]:
#SALVANDO O MODELO PARA UM ARQUIVO PICKLE
joblib.dump(grid_search, 'classificador.pkl', compress = 1)

['classificador.pkl']

In [150]:
tst = joblib.load('classificador.pkl')

In [156]:
tst.predict(['O Lula é um bosta #ladrão'])

array(['Negativo'], dtype=object)