# Part I: Tweet Extraction and Sentiment Analysis

## Libraries

In [4]:
import pandas as pd
import numpy as np
import tweepy
import re
import string
import spacy
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import naive_bayes
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import learning_curve
from datetime import datetime, timedelta

## Vectorizer & Model

In [5]:
# Load the trained classifier; its `.predict` is used by classify_tweets/get_labels.
# The `with` block closes the file handle as soon as unpickling finishes.
# SECURITY: pickle.load can execute arbitrary code -- only load files you trust.
with open('modelo_final.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [11]:
# Load the fitted text vectorizer; its `.transform` turns raw tweets into the
# feature matrix passed to `loaded_model.predict`.
# NOTE(review): the execution counts suggest this cell was run after the
# "Text Preprocessing" cells. If the pickled vectorizer references
# `custom_tokenizer`, that function must be defined before this load on a
# fresh kernel -- confirm.
# SECURITY: pickle.load can execute arbitrary code -- only load files you trust.
with open('modelo_lemm.sav', 'rb') as vectorizer_file:
    loaded_class = pickle.load(vectorizer_file)

In [75]:
def classify_tweets(tweet):
    """Print the share of negative and positive predictions for a batch of tweets.

    Parameters
    ----------
    tweet : iterable of str
        Preprocessed tweet texts. They are vectorized with ``loaded_class`` and
        classified with ``loaded_model``.

    Returns
    -------
    None
        The summary is printed, not returned.

    Notes
    -----
    Label 0 is counted as negative and label 4 as positive. An empty batch
    prints a short notice instead of dividing by zero.
    """
    # Materialise iterables so the batch can be sized; a bare string is passed
    # through unchanged so the vectorizer can reject it as before.
    documents = tweet if isinstance(tweet, str) else list(tweet)
    if len(documents) == 0:
        print('No tweets to classify.')
        return None

    tweets_vector = loaded_class.transform(documents)
    result_labels = loaded_model.predict(tweets_vector)

    total = len(result_labels)
    negative_share = np.count_nonzero(result_labels == 0) / total
    positive_share = np.count_nonzero(result_labels == 4) / total

    print(
        'Overall negative sentiment: {:.1%}'.format(negative_share),
        'Overall positive sentiment: {:.1%}'.format(positive_share),
        sep='\n',
    )
    return None

In [44]:
def get_labels(tweet):
    """Return the labels predicted by ``loaded_model`` for the given tweets.

    The texts are first converted to features with ``loaded_class.transform``;
    the result of ``loaded_model.predict`` on those features is returned as is.
    """
    return loaded_model.predict(loaded_class.transform(tweet))

## Text Preprocessing

In [8]:
# Token pattern: runs of two or more word characters.
regexp = re.compile('(?u)\\b\\w\\w+\\b')
# Parser and NER are disabled; only lemmas are read from the pipeline.
# NOTE(review): the 'en' shortcut name only resolves on spaCy v2; on v3 an
# explicit package name such as 'en_core_web_sm' is required -- confirm the
# installed version before changing it.
en_nlp = spacy.load('en', disable=['parser', 'ner'])
old_tokenizer = en_nlp.tokenizer


def _regexp_tokenizer(text):
    """Split ``text`` with ``regexp`` and wrap the tokens in a spaCy ``Doc``.

    Builds the ``Doc`` directly from the word list instead of calling the
    deprecated ``Tokenizer.tokens_from_list``, which did the same thing.
    """
    return spacy.tokens.Doc(old_tokenizer.vocab, words=regexp.findall(text))


# Replace spaCy's tokenizer with the regexp-based one for the rest of the pipeline.
en_nlp.tokenizer = _regexp_tokenizer

In [9]:
def custom_tokenizer(document):
    """Run ``document`` through ``en_nlp`` and return the lemma of every token."""
    lemmas = []
    for token in en_nlp(document):
        lemmas.append(token.lemma_)
    return lemmas

In [10]:
def preprocess_tweet_text(tweet):
    """Remove URLs, mentions, hash signs, a leading "RT", punctuation and digits.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text. Whitespace is left untouched, so removed pieces may
        leave leading or doubled spaces behind. Only ASCII punctuation
        (``string.punctuation``) is stripped.
    """
    # `http\S+` already covers https links, so no separate alternative is needed.
    tweet = re.sub(r"http\S+|www\S+", '', tweet)
    # Drop whole @mentions, but only the '#' of a hashtag (the word is kept).
    tweet = re.sub(r'@\w+|#', '', tweet)
    # The word boundary keeps words that merely start with "RT" (e.g. "RTX") intact.
    tweet = re.sub(r'^RT\b', '', tweet)
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))
    tweet = re.sub(r'[0-9]+', '', tweet)

    return tweet

## Tweet Extraction

In [145]:
def get_tweets_labels(keyword,
                      credentials_path='/Users/joaootaviomeirellesratton/desktop/credentials.csv'):
    """Search recent English tweets for ``keyword`` and label each one.

    Parameters
    ----------
    keyword : str
        Search query sent to the Twitter search endpoint.
    credentials_path : str, optional
        CSV file with a ``key`` column. Rows 0 and 1 are passed to
        ``tweepy.OAuthHandler`` and rows 2 and 3 to ``set_access_token``.
        Defaults to the path originally hard-coded here.

    Returns
    -------
    pandas.DataFrame or None
        One row per fetched tweet with columns ``Tweet`` (cleaned text) and
        ``Sentiment`` (label 0 shown as 'Negative', 4 as 'Positive'). ``None``
        if the search or classification fails; the error is printed.

    Notes
    -----
    Sets the pandas display option ``display.max_colwidth`` to 200 as a side
    effect.
    """
    credentials = pd.read_csv(credentials_path)
    keys = credentials['key']

    authenticate = tweepy.OAuthHandler(keys[0], keys[1])
    authenticate.set_access_token(keys[2], keys[3])
    api = tweepy.API(authenticate, wait_on_rate_limit=True)

    # Read the clock once so both dates come from the same instant.
    now = datetime.now()
    end_date = now.strftime('%Y-%m-%d')
    start_date = (now - timedelta(days=7)).strftime('%Y-%m-%d')

    columns = ['Tweet', 'Sentiment']

    try:
        fetched_tweets = api.search(keyword, lang='en', count=15, since=start_date, until=end_date)

        print('Fetched ' + str(len(fetched_tweets)) + ' tweets for the string: ' + keyword)
        print('From ' + start_date + ' To ' + end_date)

        tweet_list = [result.text for result in fetched_tweets]
        if not tweet_list:
            # Nothing to classify: return an empty table.
            return pd.DataFrame(columns=columns)

        results = list(map(preprocess_tweet_text, tweet_list))
        labels = get_labels(results)

        df = pd.DataFrame(list(zip(results, labels)), columns=columns)

        map_labels = {0: 'Negative', 4: 'Positive'}

        pd.set_option('display.max_colwidth', 200)

        return df.replace({'Sentiment': map_labels})

    except Exception as exc:
        # Best effort: report the failure but do not hide its cause.
        # KeyboardInterrupt/SystemExit are not caught by `Exception`.
        print('Something went wrong...')
        print('{}: {}'.format(type(exc).__name__, exc))
        return None

In [71]:
def get_tweets(keyword,
               credentials_path='/Users/joaootaviomeirellesratton/desktop/credentials.csv'):
    """Search recent English tweets for ``keyword`` and print the sentiment split.

    Parameters
    ----------
    keyword : str
        Search query sent to the Twitter search endpoint.
    credentials_path : str, optional
        CSV file with a ``key`` column. Rows 0 and 1 are passed to
        ``tweepy.OAuthHandler`` and rows 2 and 3 to ``set_access_token``.
        Defaults to the path originally hard-coded here.

    Returns
    -------
    None
        The fetch summary and the output of ``classify_tweets`` are printed.
        On failure the error is printed instead.
    """
    credentials = pd.read_csv(credentials_path)
    keys = credentials['key']

    authenticate = tweepy.OAuthHandler(keys[0], keys[1])
    authenticate.set_access_token(keys[2], keys[3])
    api = tweepy.API(authenticate, wait_on_rate_limit=True)

    # Read the clock once so both dates come from the same instant.
    now = datetime.now()
    end_date = now.strftime('%Y-%m-%d')
    start_date = (now - timedelta(days=7)).strftime('%Y-%m-%d')

    try:
        fetched_tweets = api.search(keyword, lang='en', count=15, since=start_date, until=end_date)

        print('Fetched ' + str(len(fetched_tweets)) + ' tweets for the string: ' + keyword)
        print('From ' + start_date + ' To ' + end_date)

        tweet_list = [result.text for result in fetched_tweets]
        if not tweet_list:
            # Nothing to classify; the "Fetched 0 tweets" line above already says so.
            return None

        results = list(map(preprocess_tweet_text, tweet_list))

        return classify_tweets(results)

    except Exception as exc:
        # Best effort: report the failure but do not hide its cause.
        # KeyboardInterrupt/SystemExit are not caught by `Exception`.
        print('Something went wrong...')
        print('{}: {}'.format(type(exc).__name__, exc))
        return None

## User Input

In [149]:
# Ask for a search keyword, then print the overall negative/positive
# sentiment percentages for the tweets fetched with it.
user_input = input('Enter a search keyword: ')
get_tweets(user_input)

Enter a search keyword: soccer
Fetched 15 tweets for the string: soccer
From 2020-05-15 To 2020-05-22
Overall negative sentiment: 46.7%
Overall positive sentiment: 53.3%


In [150]:
# Ask for a search keyword, then show the per-tweet table of cleaned text
# and predicted sentiment (the returned DataFrame is the cell output).
user_input = input('Enter a search keyword: ')
get_tweets_labels(user_input)

Enter a search keyword: soccer
Fetched 15 tweets for the string: soccer
From 2020-05-15 To 2020-05-22


Unnamed: 0,Tweet,Sentiment
0,When I was younger my dad didn’t let me play soccer cause it’s a “male” sport and I had to raise my a…,Negative
1,I ain’t no soccer player but ain’t that extremely risky Lmaooo,Negative
2,See what our freshmen have been up to lately\n\n🗞️ \n\nGoDuke 🔵😈⚽️,Positive
3,Soccer,Negative
4,Randomly remembered that time we got busted by the cops after the vigil because we paused for Su to ta…,Negative
5,put soccer skins out every one wants them,Negative
6,Another profile for you on this Thursday this from GV\n\nShe’s a Lady Jaguars senior who says that she is very thankful fo…,Positive
7,Congrats and good luck to Kyle Patterson on continuing his soccer career at ICC this fall …,Positive
8,I mean youre not a girl growing up in Ireland if you havent done this And its…,Negative
9,I won the It Is Now achievement in Sensible World of Soccer for points on TrueAchievements,Positive
