In [1]:
import time
import datetime
import pandas as pd
import utils.functions as f
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

pd.set_option('display.max_colwidth', -1)

In [2]:
#processa os dados antes de utilizá-los para gerar o
#modelo ou antes de utilizar o modelo gerado

def clean_tweet(tweet):
    
    cleaned = f.changeLink(tweet)
    cleaned = f.changePatterns(cleaned)
    cleaned = f.removeMention(cleaned)
    cleaned = f.toFixPunctuation(cleaned)
    cleaned = f.remove_punctuation(cleaned)
    cleaned = f.toFixBlankSpaces(cleaned)
    cleaned = f.remove_spaces_between_words(cleaned)
    cleaned = [f.removeSpecialCharactere(word) for word in cleaned.split()]
    cleaned = [word.lower() for word in cleaned 
               if word.isalpha() and len(word) > 1]
    cleaned = f.get_radicals(cleaned)
    cleaned = " ".join(cleaned)
    return cleaned

In [20]:
#funções para verificar o tempo de execução de um código selecionado

def start_at():
    now = datetime.datetime.now()
    return "Start at: {0}:{1}:{2}".format(now.hour, now.minute, now.second)

def end_at():
    now = datetime.datetime.now()
    return "End at: {0}:{1}:{2}".format(now.hour, now.minute, now.second)

def duration(start_at):
    now = datetime.datetime.now()
    return "Duration: {0:6.2f} seconds".format(time.time() - start_time)
    

In [21]:
start_time = time.time()
print(start_at())

df_classified_tweets = f.get_data("classified_auto/all.xlsx")
df_classified_tweets = df_classified_tweets.dropna(subset=['description'])

print(end_at())
print(duration(start_time))

Start at: 16:47:13
End at: 16:47:14
Duration:   1.67 seconds


In [22]:
df_classified_tweets["class"].value_counts()

negative    11007
positive    10271
Name: class, dtype: int64

In [23]:
start_time = time.time()
print(start_at())

df_description_cleaned = df_classified_tweets["description"].apply(clean_tweet)
train_X, test_X, train_y, test_y = train_test_split(
                                    df_description_cleaned,
                                    df_classified_tweets['class'], 
                                    test_size=0.10,
                                    random_state=42,
                                    stratify=df_classified_tweets['class']
                                   )

print(end_at())
print(duration(start_time))

Start at: 16:47:16
End at: 16:48:5
Duration:  49.26 seconds


In [24]:
start_time = time.time()
print(start_at())

vectorizer = CountVectorizer(analyzer='word', ngram_range=(1, 2))

print(end_at())
print(duration(start_time))

Start at: 16:48:5
End at: 16:48:5
Duration:   0.01 seconds


In [25]:
start_time = time.time()
print(start_at())

train_X = vectorizer.fit_transform(train_X)
test_X = vectorizer.transform(test_X)

clf = MultinomialNB()
clf.fit(train_X, train_y)

print(end_at())
print(duration(start_time))

print("The accuracy on the test set is %.2f." % (clf.score(test_X, test_y)))
print("The ngram representation had %i features." % (train_X.shape[1]))

Start at: 16:48:5
End at: 16:48:7
Duration:   1.48 seconds
The accuracy on the test set is 0.96.
The ngram representation had 118077 features.


In [26]:
start_time = time.time()
print(start_at())

df_all = f.get_data(
'data/07_repeated_descriptions_deleted/without_repeated_descriptions.xlsx')

print(end_at())
print(duration(start_time))

Start at: 16:48:7
End at: 16:49:5
Duration:  58.26 seconds


In [27]:
df_all.dropna(subset=['description'],inplace=True)

In [28]:
def preditor(x):
    prediction = clf.predict(vectorizer.transform([clean_tweet(x)]))
    return prediction[0]

In [30]:
start_time = time.time()
print(start_at())

y = df_all["description"].apply(preditor)
df_all["predicted"] = y

print(end_at())
print(duration(start_time))

Start at: 17:2:3
End at: 17:17:16
Duration: 913.48 seconds


In [31]:
df_all["predicted"].value_counts()

positive    129959
negative    96813 
Name: predicted, dtype: int64