# Sentiment Analysis of Twitter Data

In [None]:
import tweepy
from tweepy import OAuthHandler 

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import tokenize

import config
import pandas as pd
import re

## Fetch Twitter Data

In [None]:
def authorize_twitter_api():
    """Build a tweepy API client from the credentials stored in ``config.keys``.

    Reads the consumer key/secret and the access token/secret from the
    ``config.keys`` mapping, attaches them to an ``OAuthHandler`` and returns
    ``tweepy.API`` wrapped around that handler.
    """
    credentials = config.keys
    handler = OAuthHandler(
        credentials['consumer_api_key'],
        credentials['consumer_api_secret_key'],
    )
    handler.set_access_token(
        credentials['access_token'],
        credentials['access_token_secret'],
    )
    return tweepy.API(handler)

# Create the API client once at module level; fetch_tweets() reads this name.
twitter_api = authorize_twitter_api()
# Bare expression: in a notebook this shows the client object as the cell output.
twitter_api

In [None]:
def fetch_tweets(keyword, no_of_tweets=10):
    """Search Twitter for ``keyword`` using the module-level ``twitter_api``.

    Args:
        keyword: Search query, passed as the first positional argument of the
            search call.
        no_of_tweets: Value forwarded as the ``count`` argument of the search
            call (defaults to 10).

    Returns:
        Whatever the tweepy search call returns (an iterable of statuses, as
        consumed by the caller).
    """
    # Tweepy 4.0 renamed API.search to API.search_tweets. Prefer the new name
    # and fall back to the old one so this works on either tweepy version.
    search = getattr(twitter_api, 'search_tweets', None)
    if search is None:
        search = twitter_api.search
    return search(keyword, count=no_of_tweets)

In [None]:
# Search parameters: change these to run the analysis for a different query.
no_of_tweets = 20  # number of tweets requested from the search
keyword = 'india'  # search term

In [None]:
# Collect the `_json` attribute of every returned status (presumably the raw
# API payload as a dict -- confirm against tweepy) so pandas can build one
# row per tweet.
tweets = [tweet._json for tweet in fetch_tweets(keyword, no_of_tweets)]
tweets_df = pd.DataFrame(tweets)
# Print the (rows, columns) shape of the fetched data.
print(tweets_df.shape)
# Last expression of the cell: displays the first rows in the notebook.
tweets_df.head()

In [None]:
# Retain only the id, text and entities columns.
# .copy() makes the result an independent frame: new columns are assigned to
# tweets_df afterwards, and assigning into a column subset of another frame
# triggers pandas' SettingWithCopyWarning.
tweets_df = tweets_df[['id', 'text', 'entities']].copy()
tweets_df.head()

## Pre-processing

In [None]:
# Remove unnecessary characters and non-English (non-ASCII) characters
def pre_process_tweet(tweet):
    """Strip mentions, URLs and some punctuation from a tweet's text.

    Every match of the pattern below is replaced by a single space, then runs
    of whitespace are collapsed and the ends trimmed. The pattern matches:

    * an optional ``RT`` and optional whitespace followed by an ``@mention``
      made of ASCII letters/digits,
    * any character that is not an ASCII letter, digit, space or tab when it
      is immediately followed by a space,
    * anything of the form ``scheme://...`` up to the next whitespace.

    Args:
        tweet: The tweet text to clean.

    Returns:
        The cleaned text with single spaces between the remaining tokens.
    """
    # Raw string: "\s", "\w" and "\/" are invalid escapes in a normal string
    # literal and raise a SyntaxWarning on recent Python versions.
    # NOTE(review): the space after the second group means punctuation is only
    # removed when followed by a space -- confirm that this is intended.
    noise = r"(RT)?\s?(@[A-Za-z0-9]+)|([^0-9A-Za-z \t]) |(\w+:\/\/\S+)"
    stripped = re.sub(noise, " ", tweet)
    return ' '.join(stripped.split())

def remove_non_ascii(tweet):
    """Return ``tweet`` with every character outside the ASCII range dropped.

    Characters whose code point is 128 or higher are removed; all other
    characters are kept in their original order.
    """
    ascii_only = [symbol for symbol in tweet if ord(symbol) <= 127]
    return "".join(ascii_only)

In [None]:
# Clean every tweet in two passes: regex-based noise removal first, then
# dropping whatever non-ASCII characters remain.
tweets_df['clean_text'] = (
    tweets_df['text']
    .apply(pre_process_tweet)
    .apply(remove_non_ascii)
)
# Split each cleaned tweet into a list of sentences.
tweets_df['sentences'] = tweets_df['clean_text'].apply(tokenize.sent_tokenize)
tweets_df.head()

## Assign Sentiments to the tweets

In [None]:
# Score every tweet with VADER and label it from its compound score.
sid = SentimentIntensityAnalyzer()
sentiment_scores = []
sentiments = []
for sentences in tweets_df.sentences:
    # Each row holds the list of sentences produced by sent_tokenize, while
    # polarity_scores() takes a single text, so score the sentences one by
    # one and use the mean compound score for the whole tweet.
    compounds = [sid.polarity_scores(sentence)['compound'] for sentence in sentences]
    # A tweet that is empty after cleaning has no sentences: score it 0.0.
    score = sum(compounds) / len(compounds) if compounds else 0.0
    sentiment_scores.append(score)
    # Scores within [-0.1, 0.1] are treated as neutral.
    sentiments.append("Positive" if score > 0.1 else "negative" if score < -0.1 else "neutral")
tweets_df['sentiment_scores'] = sentiment_scores
tweets_df['sentiment'] = sentiments
tweets_df.head()