## Importing python libraries
1. To clean up the data. ex. removing stopwords, lemmentizing, tokenizing of tweet etc.
2. Geocoder API to extract the formatted address and (longitude, latitude) for the tweet.

In [1]:
import nltk
nltk.download('wordnet')
from nltk.tokenize import TweetTokenizer
import json
import math
import re
import string
import pickle
from nltk.corpus import stopwords
import csv
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from autocorrect import spell
import itertools
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
import argparse

[nltk_data] Downloading package wordnet to /home/hadoop/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Code to preprocess the tweet text

In [2]:
def isNaN(num):
    return num != num

def preprocessTweet(tweet):
    try:
        tokenizer = TweetTokenizer()
        stemmer = PorterStemmer()
        lemmatizer = WordNetLemmatizer()
        stop_words_english = set(stopwords.words('english'))
        stop_words_english.remove("not") 
        if isNaN(tweet):
            return tweet
        tweet = tweet.encode('utf-16', 'surrogatepass').decode('utf-16')

        #Remove Urls 

        tweet.replace("don't","do not")
        tweet.replace("can't","can not")
        tweet.replace("cant","can not")
        tweet.replace("dont","do not")
        tweet.replace("isn't","is not")
        tweet.replace("won't","will not")
        tweet.replace("shouldn't","should not")
        tweet.replace("wouldn't","would not")

        tweet = ''.join(''.join(s)[:2] for _, s in itertools.groupby(tweet))
        tweet = re.sub(r"http\S+", "", tweet) 

        tokens = tokenizer.tokenize(tweet)

        #Capture retweets
        if tokens[0] == "RT":
            retweet = tokens[1][1:]
            tokens.remove("RT")
            tokens = tokens[1:]

        elif tokens[0] != "RT":
            retweet = ""

        #Remove the final ellipses in a tweet
        tokens = tokens[:-1]

        #Remove words with numbers in them
        tweet = ' '.join(s for s in tokens if not any(c.isdigit() for c in s))
        tweet = re.sub(r'([^a-zA-Z0-9])\1{3,}',"", tweet)

        tokens = tokenizer.tokenize(tweet)
        remove_tokens = []

        for token in tokens:
            if token in string.punctuation:
                remove_tokens.append(token)
            if token[0] == "#":
                remove_tokens.append(token)
            if token[0] == "@":
                remove_tokens.append(token)
            if not token.isalpha():
                remove_tokens.append(token)

        cleaned_tokens = [token for token in tokens if token not in remove_tokens]

        cleaned_tokens = [lemmatizer.lemmatize(token) for token in cleaned_tokens]
        cleaned_tokens = [spell(token) for token in cleaned_tokens]
        cleaned_tokens = [w.lower() for w in cleaned_tokens if not w.lower() in stop_words_english]
        #cleaned_tokens = [stemmer.stem(token) for token in cleaned_tokens]
        return ' '.join(cleaned_tokens)
    except:
        return tweet


### Code to extract geolocation
1. preprocess the user location by removing leading and trailing white space, converting it to lower case, add `india` if the text doesn't contain it.
2. get_coordinate function returns the formatted address string, (longitude, latitude) for the preprocessed address.

In [3]:
geolocator = Nominatim(user_agent="m-app")

def preprocessLocation(location, geo_location):
    loc = ''
    try: 
        coordinates = []
        if isNaN(geo_location) == False:
            loc = location + ',' + geo_location
        else:
            loc = location
        if isNaN(loc):
            return loc

        if 'india' not in loc.lower():
            loc = loc + ', ' + 'India'
        return loc.strip().lower()
    except:
        return loc

def get_coordinate(loc):
    try:
        c = geolocator.geocode(loc)
        if c != None:
            return c
    except :
        None

### Reading the scraped input data
Process the input location from the scraped input data.

In [4]:
import pandas as pd
df = pd.read_csv('election_march.csv').head(500)
df['processed_geolocation'] = df.apply(lambda x: preprocessLocation(x['location'], x['geo_location']), axis=1)

### Convert the pre-processed input address into a set

In [5]:
geoloc_set = set(df['processed_geolocation'])

### Loop through pre-processed input address set to extract [formated address, (latitude, longitude)] and append the values in a dictionary.

In [None]:
itr = 0
geocode = {}
for i in geoloc_set:
    var = get_coordinate(i)
#     print("iter {}: {}".format(itr, var))
    geocode[i] = var
    itr += 1

### get_state function extracts the state name from the input formated address.

In [7]:
states = ['Andaman and Nicobar', 'Andhra Pradesh', 'Arunachal Pradesh', \
          'Assam', 'Bihar', 'Chandigarh', 'Chhattisgarh', 'Dadra and Nagar Haveli', \
          'Daman and Diu', 'Delhi', 'Goa', 'Gujarat', 'Haryana', 'Himachal Pradesh', \
          'Jammu and Kashmir', 'Jharkhand', 'Karnataka', 'Kerala', 'Lakshadweep', 'Madhya Pradesh', \
          'Maharashtra', 'Manipur', 'Meghalaya', 'Mizoram', 'Nagaland', 'Orissa', 'Puducherry', 'Punjab', \
          'Rajasthan', 'Sikkim', 'Tamil Nadu', 'Telangana', 'Tripura', 'Uttar Pradesh', 'Uttaranchal', \
          'West Bengal']

from string import digits
import re
remove_digits = str.maketrans('', '', digits)

def get_state(address):
    tmp = ''
    tmp_address = address.translate(remove_digits).strip('India')
    for i in reversed(tmp_address.split(',')):
        if len(i.strip(' ')) > 1:
            tmp = i.strip(' ')
            break
    if tmp != '':
        for state in states:
            if (tmp.lower().replace(' ', '') in state.lower().replace(' ', '')) or (state.lower().replace(' ', '') in tmp.lower().replace(' ', '')):
                return state
    return None

### Function returns the Geo-location extracted through geocoder API for input pre-processed user address

In [8]:
def func(geolocation):
    try:
        return geocode[geolocation]
    except:
        return 'False'

In [9]:
df1 = df[isNaN(df['processed_geolocation']) == False]
df1['func'] = df1.apply(lambda x: func(x['processed_geolocation']), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [10]:
df2 = df1[df1.apply(lambda x: not(x['func'] is None), axis=1)].reset_index()

### Function to extract the retweet username from the tweet text.

In [11]:
import re

def get_retweet_user(text_retweet):
    try:
        user = re.match(r"RT @(.*?):.*", text_retweet.strip()).group(1)
        if len(user) > 2:
            return user
        else:
            return 'DUMMY'
    except:
        return  'DUMMY'

def get_tweet(tweet, retweet):
    if isNaN(tweet):
        return retweet.strip('\n').strip('"');
    else:
        return tweet.strip('\n').strip('"');


### Extract the setiment for the tweet using TextBlob python library

In [12]:
#Sentiment Analysis
from textblob import TextBlob

def sentiment(text):
    try:
        return TextBlob(text).sentiment
    except:
        return None

df2['tweet'] = df2.apply(lambda x: preprocessTweet(get_tweet(x['text'], x['text_retweet'])), axis=1)
df2['Polarity'] = df2.apply(lambda x: sentiment(x['tweet'])[0], axis=1)
#label
sentiment_scores_tb = [round(TextBlob(text).sentiment.polarity, 3) for text in df2['tweet']]
sentiment_category_tb = ['positive' if score > 0 
                             else 'negative' if score < 0 
                                 else 'neutral' 
                                     for score in sentiment_scores_tb]
df2['sentiment'] = pd.DataFrame({'sentiment': sentiment_category_tb})

In [13]:
df2['address'] =  df2.apply(lambda x: x['func'].address, axis=1)
df2['state'] =  df2.apply(lambda x: get_state(x['func'].address), axis=1)
df2['lat'] =  df2.apply(lambda x: x['func'].latitude, axis=1)
df2['longi'] =  df2.apply(lambda x: x['func'].longitude, axis=1)
df2['retweet_user'] = df2.apply(lambda x: get_retweet_user(x['text_retweet']), axis=1)
df2['weight'] = 1

df2['party'] = df2.apply(lambda x: 'Congress' if ('congress' in str(x['tweet']).lower() or 'congress' in str(x['hashtags']).lower()) else 'BJP', axis=1)
df2['congress'] = df2.apply(lambda x: 1 if x['party'] == 'Congress' else 0, axis=1)
df2['bjp'] = df2.apply(lambda x: 1 if x['party'] == 'BJP' else 0, axis=1)
df2['positive'] = df2.apply(lambda x: 1 if x['sentiment'] == 'positive' else 0, axis=1)
df2['negative'] = df2.apply(lambda x: 1 if x['sentiment'] == 'negative' else 0, axis=1)
df2['neutral'] = df2.apply(lambda x: 1 if x['sentiment'] == 'neutral' else 0, axis=1)

In [14]:
df3 = df2[df2.apply(lambda x: not(x['state'] is None), axis=1)][['created_date','screen_name','retweet_user','weight','state','lat','longi','party','sentiment','congress','bjp','positive','negative','neutral','tweet']]

In [44]:
df3.to_csv('gData1.csv', header=True, index=False)