# Sentiment Analysis of Twitter Data

In [27]:
import tweepy
from tweepy import OAuthHandler 

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import tokenize

import config
import pandas as pd
import re

## Fetch Twitter Data

In [3]:
def authorize_twitter_api():
    auth = OAuthHandler(config.keys['consumer_api_key'], config.keys['consumer_api_secret_key']) 
    auth.set_access_token(config.keys['access_token'], config.keys['access_token_secret']) 
    return tweepy.API(auth)

twitter_api = authorize_twitter_api()
twitter_api

<tweepy.api.API at 0x144ddad0>

In [11]:
def fetch_tweets(keyword, no_of_tweets=10):
    return twitter_api.search(keyword, count = no_of_tweets) 

In [22]:
# Use these below variables to try out with different keywords
keyword = 'india'
no_of_tweets = 20

In [24]:
tweets = [tweet._json for tweet in fetch_tweets(keyword, no_of_tweets)]
tweets_df = pd.DataFrame(tweets)
print(tweets_df.shape)
tweets_df.head()

(20, 26)


Unnamed: 0,created_at,id,id_str,text,truncated,entities,metadata,source,in_reply_to_status_id,in_reply_to_status_id_str,...,place,contributors,retweeted_status,is_quote_status,retweet_count,favorite_count,favorited,retweeted,lang,possibly_sensitive
0,Fri Sep 20 19:59:09 +0000 2019,1175137340551024641,1175137340551024641,RT @TSUpdatesNYBU: 🎧 | Most streamed internati...,False,"{'hashtags': [], 'symbols': [], 'user_mentions...","{'iso_language_code': 'en', 'result_type': 're...","<a href=""http://twitter.com/download/iphone"" r...",,,...,,,{'created_at': 'Fri Sep 20 18:51:50 +0000 2019...,False,68,0,False,False,en,
1,Fri Sep 20 19:59:09 +0000 2019,1175137339523256320,1175137339523256320,RT @VatsalyaIndia: નથી માતાપિતા ખોટા કે નથી બા...,False,"{'hashtags': [], 'symbols': [], 'user_mentions...","{'iso_language_code': 'gu', 'result_type': 're...","<a href=""http://twitter.com/download/android"" ...",,,...,,,{'created_at': 'Fri Sep 20 19:35:50 +0000 2019...,False,1,0,False,False,gu,
2,Fri Sep 20 19:59:09 +0000 2019,1175137339175129088,1175137339175129088,@MuktaSh50437862 @Ananth25955752 @WhatsThereIn...,True,"{'hashtags': [], 'symbols': [], 'user_mentions...","{'iso_language_code': 'en', 'result_type': 're...","<a href=""http://twitter.com/download/android"" ...",1.175103e+18,1.1751026313893435e+18,...,,,,False,0,0,False,False,en,
3,Fri Sep 20 19:59:09 +0000 2019,1175137338910855168,1175137338910855168,RT @upma23: Odisha IAS Replaces Plastic Plates...,False,"{'hashtags': [], 'symbols': [], 'user_mentions...","{'iso_language_code': 'en', 'result_type': 're...","<a href=""http://twitter.com/download/android"" ...",,,...,,,{'created_at': 'Fri Sep 20 07:00:54 +0000 2019...,False,109,0,False,False,en,False
4,Fri Sep 20 19:59:08 +0000 2019,1175137337010835457,1175137337010835457,RT @iamarunjain: @AmitAroraNJ @AnsuMynation @N...,False,"{'hashtags': [], 'symbols': [], 'user_mentions...","{'iso_language_code': 'en', 'result_type': 're...","<a href=""https://mobile.twitter.com"" rel=""nofo...",,,...,,,{'created_at': 'Fri Sep 20 18:25:16 +0000 2019...,False,5,0,False,False,en,


In [25]:
# Retaining only useful columns
tweets_df = tweets_df[['id', 'text', 'entities']]
tweets_df.head()

Unnamed: 0,id,text,entities
0,1175137340551024641,RT @TSUpdatesNYBU: 🎧 | Most streamed internati...,"{'hashtags': [], 'symbols': [], 'user_mentions..."
1,1175137339523256320,RT @VatsalyaIndia: નથી માતાપિતા ખોટા કે નથી બા...,"{'hashtags': [], 'symbols': [], 'user_mentions..."
2,1175137339175129088,@MuktaSh50437862 @Ananth25955752 @WhatsThereIn...,"{'hashtags': [], 'symbols': [], 'user_mentions..."
3,1175137338910855168,RT @upma23: Odisha IAS Replaces Plastic Plates...,"{'hashtags': [], 'symbols': [], 'user_mentions..."
4,1175137337010835457,RT @iamarunjain: @AmitAroraNJ @AnsuMynation @N...,"{'hashtags': [], 'symbols': [], 'user_mentions..."


## Pre-processing

In [31]:
# Remove unnecessary characters and also non-english characters
def pre_process_tweet(tweet): 
        return ' '.join(re.sub("(RT)?\s?(@[A-Za-z0-9]+)|([^0-9A-Za-z \t]) |(\w+:\/\/\S+)", " ", tweet).split())

def remove_non_ascii(tweet): 
    return "".join(ch for ch in tweet if ord(ch)<128)

tweets_df['clean_text'] = tweets_df['text'].apply(pre_process_tweet)
tweets_df['clean_text'] = tweets_df['clean_text'].apply(remove_non_ascii)
tweets_df.head()

Unnamed: 0,id,text,entities,clean_text
0,1175137340551024641,RT @TSUpdatesNYBU: 🎧 | Most streamed internati...,"{'hashtags': [], 'symbols': [], 'user_mentions...",Most streamed international female artists in ...
1,1175137339523256320,RT @VatsalyaIndia: નથી માતાપિતા ખોટા કે નથી બા...,"{'hashtags': [], 'symbols': [], 'user_mentions...",/
2,1175137339175129088,@MuktaSh50437862 @Ananth25955752 @WhatsThereIn...,"{'hashtags': [], 'symbols': [], 'user_mentions...",
3,1175137338910855168,RT @upma23: Odisha IAS Replaces Plastic Plates...,"{'hashtags': [], 'symbols': [], 'user_mentions...",Odisha IAS Replaces Plastic Plates With Sal Le...
4,1175137337010835457,RT @iamarunjain: @AmitAroraNJ @AnsuMynation @N...,"{'hashtags': [], 'symbols': [], 'user_mentions...",_Mynation _MP _Delhi _net


## Assign Sentiments to the tweets

In [3]:
paragraph = "It was one of the worst movies I've seen, despite good reviews. Unbelievably bad acting!! Poor direction. VERY poor production. The movie was bad. Very bad movie. VERY bad movie. VERY BAD movie. VERY BAD movie!"
sentences = tokenize.sent_tokenize(paragraph)

sid = SentimentIntensityAnalyzer()
for sentence in sentences:
    print(sentence)
    ss = sid.polarity_scores(sentence)
    for k in sorted(ss):
        print('{0}: {1}, '.format(k, ss[k]), end='')

It was one of the worst movies I've seen, despite good reviews.
compound: -0.7584, neg: 0.394, neu: 0.606, pos: 0.0, Unbelievably bad acting!!
compound: -0.6572, neg: 0.686, neu: 0.314, pos: 0.0, Poor direction.
compound: -0.4767, neg: 0.756, neu: 0.244, pos: 0.0, VERY poor production.
compound: -0.6281, neg: 0.674, neu: 0.326, pos: 0.0, The movie was bad.
compound: -0.5423, neg: 0.538, neu: 0.462, pos: 0.0, Very bad movie.
compound: -0.5849, neg: 0.655, neu: 0.345, pos: 0.0, VERY bad movie.
compound: -0.6732, neg: 0.694, neu: 0.306, pos: 0.0, VERY BAD movie.
compound: -0.7398, neg: 0.724, neu: 0.276, pos: 0.0, VERY BAD movie!
compound: -0.7616, neg: 0.735, neu: 0.265, pos: 0.0, 

## Model Performance