# Twitter Sentiment Analysis
## Environment

In [12]:
import tweepy
from tweepy import OAuthHandler
from textblob import TextBlob
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from collections import Counter
import pandas as pd
import numpy as np
import re 
import ipywidgets as widgets
import matplotlib.pyplot as plt
from IPython.display import display
import io
plt.style.use('fivethirtyeight')

In [13]:
# API keys and tokens
consumer_key = "1oXtkBpLdErIe1brgMT2Gvn6b"
consumer_secret = "LT9GdsIhDl9luNE5lG1KDuITyerBMkJyJljSybZ41GYCr6U5g3"
access_token = "1526155687608594432-HNI9l5Ej8Vc5kbnFt8SgZH6c2Ycm9N"
access_token_secret = "KXFjY6R3HCNM7OfklXYuCxImPEBg5GfeTOPFef64H4xYz"

## Access Twitter API

In [14]:
# Authenticate to twitter API
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

client = tweepy.API(auth, wait_on_rate_limit = True)

## Data Cleaning Function

In [15]:
# Function for removing unnecessary details from tweets
def clean_data(text):
    text = text.lower()
    text = re.sub("@[\w]*", "", text)
    text = re.sub("http\S+", "", text)
    text = re.sub("[^a-zA-Z#]", " ", text)
    text = re.sub("#", "", text)
    text = re.sub("rt", "", text)
    text = re.sub("\s+", " ", text)
    
    return text

## Analysis Functions

In [16]:
# Function for calculating polarity
def get_polarity(text):
    return TextBlob(text).sentiment.polarity

# Function for calculating subjectivity
def get_subjectivity(text):
    return TextBlob(text).sentiment.subjectivity

In [17]:
# Function for categorizing sentiment of tweet
def get_analysis(score):
    if score < 0:
        return "Negative"
    if score == 0:
        return "Neutral"
    else:
        return "Positive"

In [18]:
# Widgets for input
keyword=widgets.Text(
    placeholder='Type something',
    description='Keyword:',
    disabled=False
)

number=widgets.Text(
    placeholder='Number of tweets',
    description='Count:',
    disabled=False
)

In [19]:
# Widgets for output
generate_scores = widgets.Button(description="Generate")
output = widgets.Output()
pie_output = widgets.Output()
scatter_output = widgets.Output()
word_output = widgets.Output()

## Model Data

In [20]:
@output.capture()
# When user clicks the search button, it grabs tweets matching the keyword
# and then generates a list containing their polarity score and sentiment
def on_button_clicked(b):
    tweets = client.search_tweets(keyword.value, count=number.value, lang="en")
    df = pd.DataFrame( [tweet.text for tweet in tweets] , columns=['Tweets'])
    df['Tweets'] = df['Tweets'].apply(clean_data)
    df['Polarity'] = df['Tweets'].apply(get_polarity) 
    df['Analysis'] = df['Polarity'].apply(get_analysis)
    
    # Calculate percentage of positive tweets
    pos_tweets = df[df.Analysis == 'Positive']
    
    pos_percentage = round( (pos_tweets.shape[0]/df.shape[0])*100, 1)
    
    # Calculate percentage of negative tweets
    neg_tweets = df[df.Analysis == 'Negative']
    
    neg_percentage = round( (neg_tweets.shape[0]/df.shape[0])*100, 1)
    
    print("Positive percentage = ", pos_percentage, '%')
    print("Negative percentage = ", neg_percentage, '%')
    print(df)
    
    # Plot pie chart
    with pie_output:
        df['Analysis'].value_counts().plot(kind='pie')
        plt.title("Sentiment Analysis")
        plt.show()
        
    # Plot scatterplot
    with scatter_output:
        plt.figure(figsize=(8,6))
        df['Subjectivity'] = df['Tweets'].apply(get_subjectivity) 
        for i in range(0, df.shape[0]):
            plt.scatter(df['Polarity'][i], df['Subjectivity'][i], color='Blue')
        plt.title("Sentiment Analysis")
        plt.xlabel('Polarity')
        plt.ylabel('Subjectivity')
        plt.show()
        
    # Plot word frequency bar graph
    with word_output:
        # Split tweets
        split_tweets = str(df['Tweets']).split()
        
        cleaned_tweets = []
        
        filtered_tweets = []
        stop_words = set(stopwords.words('english'))
    
        # Remove stopwords
        for w in split_tweets: 
            if not w in stop_words and w != '...': 
                filtered_tweets.append(w)
                
        # Get 10 most common words
        counter = Counter(filtered_tweets)
        most_occur = counter.most_common(10) 
        
        x1 = [x[0] for x in most_occur]
        y1 = [x[1] for x in most_occur]
        
        # Plot bar graph
        plt.figure(figsize=(10,6)) 
        plt.barh(x1, y1)
        plt.title("Word Frequency Analysis")
        plt.xlabel('Frequency')
        plt.ylabel('Words')
        plt.show()
    
generate_scores.on_click(on_button_clicked)

## Interactive Query

In [21]:
# Display dashboard and widgets
search_widgets = widgets.HBox( [keyword, number] )

tabs = widgets.Tab([output, pie_output, scatter_output, word_output])
tabs.set_title(0, 'Dataset')
tabs.set_title(1, 'Pie Chart')
tabs.set_title(2, 'Scatterplot')
tabs.set_title(3, 'Word Frequency')
dashboard = widgets.VBox([search_widgets, tabs])
display(dashboard)
display(generate_scores)

VBox(children=(HBox(children=(Text(value='', description='Keyword:', placeholder='Type something'), Text(value…

Button(description='Generate', style=ButtonStyle())

## Model Accuracy

In [31]:
pos_count = 0
pos_correct = 0

with io.open('positive.txt', encoding='latin-1') as f:
    for sample in f.read().split('\n'):
        analysis = TextBlob(sample)
        if analysis.sentiment.polarity > 0:
            pos_correct += 1
        pos_count +=1


neg_count = 0
neg_correct = 0

with io.open('negative.txt', encoding='latin-1') as f:
    for sample in f.read().split('\n'):
        analysis = TextBlob(sample)
        if analysis.sentiment.polarity <= 0:
            neg_correct += 1
        neg_count +=1

print("Positive accuracy = {}% via {} samples".format(round((pos_correct/pos_count*100.0), 1), pos_count))
print("Negative accuracy = {}% via {} samples".format(round((neg_correct/neg_count*100.0), 1), neg_count))

Positive accuracy = 71.1% via 5332 samples
Negative accuracy = 55.9% via 5332 samples
