In [None]:
import json
import os
import re

import numpy as np
import pandas as pd
import tweepy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from textblob import TextBlob
from yellowbrick.classifier.rocauc import roc_auc

# Read the Twitter API credentials from the environment instead of storing
# them in the notebook, so they never end up in version control or shared
# copies. A missing variable fails immediately with a KeyError naming it.
consumer_key = os.environ['TWITTER_CONSUMER_KEY']
consumer_secret = os.environ['TWITTER_CONSUMER_SECRET']
access_token = os.environ['TWITTER_ACCESS_TOKEN']
access_token_secret = os.environ['TWITTER_ACCESS_TOKEN_SECRET']

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
# NOTE(review): `wait_on_rate_limit_notify` and `API.me()` look like Tweepy 3.x
# features that are gone in Tweepy 4 -- confirm the pinned Tweepy version.
api = tweepy.API(auth,
                 wait_on_rate_limit=True,
                 wait_on_rate_limit_notify=True)
# Get information about the authenticated user
data = api.me()
    
    

In [None]:
# Authenticate to Twitter.
# The return value is checked as well as exceptions: depending on the Tweepy
# version, invalid credentials are reported either by a falsy return value or
# by a raised exception.
try:
    verified_user = api.verify_credentials()
except Exception as exc:
    # Keep the notebook running, but show what actually went wrong instead of
    # swallowing every error (a bare `except:` would also hide Ctrl-C).
    print("Error during authentication")
    print(repr(exc))
else:
    if verified_user:
        print("Authentication OK")
    else:
        print("Error during authentication")

In [None]:
# The search endpoint returns at most 100 tweets per request, so asking for
# count=1000 in a single call silently yields ~100 results. Page through the
# results with a Cursor until MAX_TWEETS have been collected (fewer if the
# search runs dry).
SEARCH_QUERY = "Olympic Games"
TWEETS_PER_REQUEST = 100
MAX_TWEETS = 1000

search_results = list(
    tweepy.Cursor(api.search, q=SEARCH_QUERY, count=TWEETS_PER_REQUEST).items(MAX_TWEETS)
)

In [None]:
def analyze_sentiment(tweet):
    """Label a tweet as "positive", "neutral" or "negative".

    The text is first passed through ``clean_tweet``; the label is then chosen
    from the TextBlob polarity of the cleaned text: greater than zero is
    positive, exactly zero is neutral, anything else is negative.
    """
    polarity = TextBlob(clean_tweet(tweet)).sentiment.polarity

    if polarity > 0:
        return "positive"
    if polarity == 0:
        return "neutral"
    return "negative"


def tweets_to_data_frame(tweets):
    """Convert an iterable of tweet objects into a pandas DataFrame.

    Parameters
    ----------
    tweets : iterable
        Objects exposing ``text``, ``id``, ``created_at``, ``source``,
        ``favorite_count`` and ``retweet_count`` attributes. Any iterable is
        accepted, including one-shot iterators such as generators.

    Returns
    -------
    pandas.DataFrame
        One row per tweet with the columns ``Tweets``, ``id``, ``len``
        (number of characters in the text), ``date``, ``source``, ``likes``
        and ``retweets``.
    """
    # Materialize once: the columns below each need a full pass over the
    # tweets, which would exhaust a generator after the first column.
    tweets = list(tweets)

    df = pd.DataFrame(data=[tweet.text for tweet in tweets], columns=['Tweets'])

    # Column name -> how to read that value from a single tweet.
    extractors = {
        'id': lambda tweet: tweet.id,
        'len': lambda tweet: len(tweet.text),
        'date': lambda tweet: tweet.created_at,
        'source': lambda tweet: tweet.source,
        'likes': lambda tweet: tweet.favorite_count,
        'retweets': lambda tweet: tweet.retweet_count,
    }
    for column, extract in extractors.items():
        df[column] = np.array([extract(tweet) for tweet in tweets])

    return df
# Everything matched by this pattern is replaced by a space in clean_tweet().
# Written as raw strings so that \w, \S and \b reach the regex engine intact.
_TWEET_NOISE_RE = re.compile(
    r"(@\w+)"                 # @mentions (handles may contain underscores)
    r"|([^0-9A-Za-z \t])"     # anything but ASCII letters, digits, space, tab
    r"|(\w+://\S+)"           # URLs such as https://t.co/...
    r"|(\bRT\b)"              # the stand-alone retweet marker, not "RT" inside a word
)


def clean_tweet(tweet):
    """Strip mentions, URLs, the "RT" marker and punctuation from a tweet.

    Every match of the noise pattern is replaced by a space, and runs of
    whitespace are then collapsed to single spaces.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The remaining ASCII words and digits, separated by single spaces.
    """
    return ' '.join(_TWEET_NOISE_RE.sub(" ", tweet).split())
def lower_tweet(tweet):
    """Return the tweet text converted to lower case."""
    lowered = tweet.lower()
    return lowered
# Build the tweet table, clean the text, label each tweet with its sentiment,
# and only then lower-case the text (the sentiment is computed on the cleaned
# text before lower-casing).
twi = tweets_to_data_frame(search_results)
twi['Tweets'] = twi['Tweets'].map(clean_tweet)
twi['sentiment'] = np.array([analyze_sentiment(text) for text in twi['Tweets']])
twi['Tweets'] = twi['Tweets'].map(lower_tweet)
twi.head(10)



In [None]:
class multinomialNB:
    """Multinomial naive Bayes classifier with Laplace (add-one) smoothing.

    Parameters
    ----------
    X_train : array-like or SciPy sparse matrix, shape (n_train_docs, n_features)
        Term counts of the training documents.
    y_train : array-like, shape (n_train_docs,)
        Class label of each training document.
    X_test : array-like or SciPy sparse matrix, shape (n_test_docs, n_features)
        Term counts of the documents to classify.
    """

    def __init__(self, X_train, y_train, X_test):
        self.D = X_train
        self.C = y_train
        self.test = X_test

    def _encode_labels(self):
        """Return ``(classes, codes)`` for the training labels.

        ``classes`` holds the sorted unique labels and ``codes`` the position
        of each training label within ``classes``.
        """
        classes, codes = np.unique(np.asarray(self.C), return_inverse=True)
        return classes, codes.ravel()

    def fit(self):
        """Estimate class priors and per-class word likelihoods.

        Returns
        -------
        priors : numpy.ndarray, shape (n_classes,)
            Fraction of training documents belonging to each class.
        likelihood : numpy.ndarray, shape (n_classes, n_features)
            Smoothed p(w_t | C): (count of word t in class + 1) divided by
            (total word count in class + n_features).
        """
        classes, y = self._encode_labels()
        priors = np.bincount(y) / y.shape[0]
        class_nums = classes.shape[0]
        feature_nums = self.D.shape[1]
        likelihood = np.zeros((class_nums, feature_nums))

        for index in range(class_nums):
            # Use the training matrix handed to the constructor, not a
            # notebook-level global.
            subset = self.D[y == index]
            # Sparse matrices return a 1 x n_features np.matrix here; flatten
            # so dense and sparse inputs are handled the same way.
            word_counts = np.asarray(subset.sum(axis=0)).ravel()
            # Laplace smoothing keeps unseen words from getting probability 0.
            likelihood[index, :] = (word_counts + 1) / (word_counts.sum() + feature_nums)
        return priors, likelihood

    def predict(self, priors, likelihood):
        """Predict the class label of every test document.

        Parameters
        ----------
        priors : array-like, shape (n_classes,)
            Class priors, as returned by ``fit``.
        likelihood : array-like, shape (n_classes, n_features)
            Word likelihoods, as returned by ``fit``.

        Returns
        -------
        numpy.ndarray, shape (n_test_docs,)
            Predicted labels, expressed in the original label values.
        """
        classes, _ = self._encode_labels()

        # log p(C = k | D) up to a constant:
        #   log p(C = k) + sum_t count_t * log p(w_t | C = k)
        # computed for all documents and classes with one matrix product.
        log_likelihood = np.log(np.asarray(likelihood))
        log_posteriors = np.asarray(self.test.dot(log_likelihood.T)) + np.log(priors)

        # Pick the most probable class per document and map the class index
        # back to the original label.
        predictions = np.argmax(log_posteriors, axis=1)
        return classes[predictions]

In [None]:
# Features: the tweet texts. Targets: the sentiment labels.
X = twi['Tweets']
y = twi['sentiment']

In [None]:
# Split by position, in the order the tweets were returned (no shuffling).
# The cut point is proportional to the number of tweets instead of a fixed
# index, so the test set cannot end up empty when few tweets were collected
# and the split scales when more are fetched. With 100 tweets this is the
# same 50/50 split as a fixed cut at 50.
TRAIN_FRACTION = 0.5
split_at = int(len(X) * TRAIN_FRACTION)

X_train, X_test = X.iloc[:split_at], X.iloc[split_at:]
y_train, y_test = y.iloc[:split_at], y.iloc[split_at:]

In [None]:
# Learn the vocabulary from the training tweets only, then turn both sets
# into document-term count matrices over that vocabulary.
vect = CountVectorizer()
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_test_dtm = vect.transform(X_test)


In [None]:
# Train the hand-written classifier; fit() returns (priors, likelihood).
NB = multinomialNB(X_train_dtm, y_train, X_test_dtm)
NBprob = NB.fit()
prior, likelihood = NBprob


In [None]:
# Predict the test tweets, then convert the label containers to plain NumPy
# arrays for the metric functions used later.
y_pred = NB.predict(prior, likelihood)
y_train, y_test = np.array(y_train), np.array(y_test)

In [None]:
# Performance measures for the hand-written classifier.
# (classification_report, confusion_matrix and accuracy_score are imported
# in the notebook's import cell.)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

In [None]:
# scikit-learn reference classifier, trained on integer-encoded labels.
# (MultinomialNB is imported in the notebook's import cell.)
encoder = LabelEncoder()
y_train_NB = encoder.fit_transform(y_train)
classifier = MultinomialNB()
classifier.fit(X_train_dtm, y_train_NB)

In [None]:
y_pred_NB = classifier.predict(X_test_dtm)
# Encode the test labels with the mapping learned from the training labels.
# Re-fitting the encoder on the test labels would assign different integer
# codes whenever the two sets do not contain exactly the same classes, which
# silently misaligns the true labels with the predictions. transform() raises
# a ValueError instead if the test set contains a label never seen in training.
y_test_NB = encoder.transform(y_test)

In [None]:
# Performance measures for the scikit-learn model.
# (The metric functions are imported in the notebook's import cell.)
print(confusion_matrix(y_test_NB, y_pred_NB))
print(classification_report(y_test_NB, y_pred_NB))
print(accuracy_score(y_test_NB, y_pred_NB))

In [None]:
# ROC curve for each class using the scikit-learn model.
# (roc_auc is imported in the notebook's import cell.)
# NOTE(review): the class names are hard-coded and assume all three
# sentiments occur in the training labels, in this sorted order -- confirm.
roc_auc(classifier, X_train_dtm, y_train_NB, X_test=X_test_dtm, y_test=y_test_NB, classes=['negative','neutral','positive'])