### Tasks in part1: 
    1. collect the tweets data from crate.io
    2. prepare the data for sentiment analysis
    3. using naive bayes analyzer for the sentiment analysis
    
**- proposal for next step: **
    1. mining the tweets with frequency of term along with visualization
    2. put the tweet into geolocation and interactive maps
    3. search for specific keyword and filter out for the topic I want to explore more

In [1]:
import pandas as pd
import json
import numpy as np
from textblob import TextBlob

In [2]:
rawdata = pd.read_json('tweets_0_.json', lines=True)
rawdata.head(3)

Unnamed: 0,created_at,id,retweeted,source,text,user
0,2017-08-01 00:46:11,892184651892224000,False,"<a href=""http://www.facebook.com/twitter"" rel=...",https://t.co/yb7EZLUdCf,"{u'verified': False, u'description': u'desde 1..."
1,2017-08-01 00:46:11,892184651896410112,False,"<a href=""http://twitter.com/download/iphone"" r...",RT @horrorcuIt: infernoooooo que horas vai lib...,"{u'verified': False, u'description': u'entrei ..."
2,2017-08-01 00:46:11,892184651929796608,False,"<a href=""https://about.twitter.com/products/tw...",RT @Sammyy02K: Urban jungle🏙🌴👑 https://t.co...,"{u'verified': False, u'description': u'🅰️pple..."


### Most of the tweets consist of different languages, thus I need to filter out only English tweets. I use TextBlob functionality to solve this problem. 

In [3]:
def find_lang(doc): 
    tb = TextBlob(doc)
    if len(tb.words)>5 and type(tb): 
        return tb.detect_language()
    else: 
        return 'none'

def doc_polarity(doc): 
    tb = TextBlob(doc) 
    return tb.sentiment.polarity

def doc_sub(doc): 
    tb  = TextBlob(doc)
    return tb.sentiment.subjectivity

### Due to lack of computation power, I use only 200 observations for now just to test the code. 

In [4]:
rawdata['lang']=rawdata['text'][:200].apply(find_lang)
rawdata['lang'].head(3)

0    none
1      pt
2      en
Name: lang, dtype: object

In [5]:
df= rawdata[rawdata['lang']=='en']
size= str(len(rawdata[rawdata['lang']=='en']))
print ("English Tweets have " + size + " out of 200.")
df = df.reset_index()
df = df[['text', "retweeted",'user', 'source']]
df.head(3)

English Tweets have 70 out of 200.


Unnamed: 0,text,retweeted,user,source
0,RT @Sammyy02K: Urban jungle🏙🌴👑 https://t.co...,False,"{u'verified': False, u'description': u'🅰️pple...","<a href=""https://about.twitter.com/products/tw..."
1,@vinsmokesanjis I'm so tempted to look but I'm...,False,"{u'verified': False, u'description': u'One tim...","<a href=""http://twitter.com/download/iphone"" r..."
2,Devers is 3-3 tonight. Pretty good but also ma...,False,"{u'verified': False, u'description': u'IG @sta...","<a href=""http://twitter.com/download/iphone"" r..."


In [6]:
# the encode_doc give out the same text as pd.read_json
# it might solve issue with windows
import codecs

def encode_doc(doc): 
    encoded_doc = doc.encode('utf-8', 'ignore')
    return encoded_doc

def decode_doc(doc): 
    decoded_doc = doc.decode('utf-8', 'ignore')
    return decoded_doc

In [7]:
#Return the polarity score as a float within the range [-1.0, 1.0]
df['polarity']= df['text'].apply(doc_polarity)

#Return the subjectivity score as a float within the range [0.0, 1.0] where 0.0 is very objective and 1.0 is very subjective.
df['sub']= df['text'].apply(doc_sub)
df.head(3)

Unnamed: 0,text,retweeted,user,source,polarity,sub
0,RT @Sammyy02K: Urban jungle🏙🌴👑 https://t.co...,False,"{u'verified': False, u'description': u'🅰️pple...","<a href=""https://about.twitter.com/products/tw...",0.0,0.0
1,@vinsmokesanjis I'm so tempted to look but I'm...,False,"{u'verified': False, u'description': u'One tim...","<a href=""http://twitter.com/download/iphone"" r...",0.0,0.0
2,Devers is 3-3 tonight. Pretty good but also ma...,False,"{u'verified': False, u'description': u'IG @sta...","<a href=""http://twitter.com/download/iphone"" r...",0.316667,0.566667


In [8]:
#categorizing the range to subjectivity or objectivity
def categorize_sub(doc): 
    if doc == 1: 
        return "sub"
    elif doc <=0.99 and doc >=0.51: 
        return "highly_sub"
    elif doc == 0.5: 
        return "Sub&Obj"
    elif doc <= 0.49 and doc >= 0.1: 
        return "highly_obj"
    elif doc ==0: 
        return "obj"
    else: 
        return "none"

#categorizing the range to sentiment opinion
def categorize_polarity(doc): 
    if doc <= 1 and doc >=0.75: 
        return "positive"
    elif doc <=0.74 and doc >=0.10: 
        return "sightly_positive"
    elif doc == 0: 
        return "neutral"
    elif doc <= -0.1 and doc >= -0.74: 
        return "sightly_negative"
    elif doc <= -0.75 and doc >=-1: 
        return "negative"
    else: 
        return "none"

In [9]:
df['sentiment']= df['polarity'].apply(categorize_polarity)
df['subjectivity']= df['sub'].apply(categorize_sub)

In [10]:
df.head(3)

Unnamed: 0,text,retweeted,user,source,polarity,sub,sentiment,subjectivity
0,RT @Sammyy02K: Urban jungle🏙🌴👑 https://t.co...,False,"{u'verified': False, u'description': u'🅰️pple...","<a href=""https://about.twitter.com/products/tw...",0.0,0.0,neutral,obj
1,@vinsmokesanjis I'm so tempted to look but I'm...,False,"{u'verified': False, u'description': u'One tim...","<a href=""http://twitter.com/download/iphone"" r...",0.0,0.0,neutral,obj
2,Devers is 3-3 tonight. Pretty good but also ma...,False,"{u'verified': False, u'description': u'IG @sta...","<a href=""http://twitter.com/download/iphone"" r...",0.316667,0.566667,sightly_positive,highly_sub


In [65]:
def encoding(text):   
    t = text.encode('utf-8')
    return t

def decoding(text):   
    t = text.decode('utf-8')
    return t

df['textline'] = df.text.apply(encoding)

In [12]:
from textblob.sentiments import NaiveBayesAnalyzer

def naive(doc): 
    tb = TextBlob(doc, analyzer=NaiveBayesAnalyzer())
    return tb.sentiment

#runing naive bayes analyzer on the tweets
#caution: this will take awhile to finish running
df["naive"]= df['textline'].apply(naive)
df.head(4)

Unnamed: 0,text,retweeted,user,source,polarity,sub,sentiment,subjectivity,textline,naive
0,RT @Sammyy02K: Urban jungle🏙🌴👑 https://t.co...,False,"{u'verified': False, u'description': u'🅰️pple...","<a href=""https://about.twitter.com/products/tw...",0.0,0.0,neutral,obj,RT @Sammyy02K: Urban jungle🏙🌴👑 https://t.co...,"(pos, 0.5, 0.5)"
1,@vinsmokesanjis I'm so tempted to look but I'm...,False,"{u'verified': False, u'description': u'One tim...","<a href=""http://twitter.com/download/iphone"" r...",0.0,0.0,neutral,obj,@vinsmokesanjis I'm so tempted to look but I'm...,"(neg, 0.305610938099, 0.694389061901)"
2,Devers is 3-3 tonight. Pretty good but also ma...,False,"{u'verified': False, u'description': u'IG @sta...","<a href=""http://twitter.com/download/iphone"" r...",0.316667,0.566667,sightly_positive,highly_sub,Devers is 3-3 tonight. Pretty good but also ma...,"(neg, 0.224207937713, 0.775792062287)"
3,RT @Sammyy02K: Urban jungle🏙🌴👑 https://t.co...,False,"{u'verified': False, u'description': u'20| Lov...","<a href=""https://about.twitter.com/products/tw...",0.0,0.0,neutral,obj,RT @Sammyy02K: Urban jungle🏙🌴👑 https://t.co...,"(pos, 0.5, 0.5)"


In [13]:
#getting naive results to three columns of data frames
df['naive_sentiment']=df['naive'].str[0]
df['naive_ppos']= df['naive'].str[1].astype(float).round(2)
df['naive_pneg']= df['naive'].str[2].astype(float).round(2)

In [14]:
df.head(3)

Unnamed: 0,text,retweeted,user,source,polarity,sub,sentiment,subjectivity,textline,naive,naive_sentiment,naive_ppos,naive_pneg
0,RT @Sammyy02K: Urban jungle🏙🌴👑 https://t.co...,False,"{u'verified': False, u'description': u'🅰️pple...","<a href=""https://about.twitter.com/products/tw...",0.0,0.0,neutral,obj,RT @Sammyy02K: Urban jungle🏙🌴👑 https://t.co...,"(pos, 0.5, 0.5)",pos,0.5,0.5
1,@vinsmokesanjis I'm so tempted to look but I'm...,False,"{u'verified': False, u'description': u'One tim...","<a href=""http://twitter.com/download/iphone"" r...",0.0,0.0,neutral,obj,@vinsmokesanjis I'm so tempted to look but I'm...,"(neg, 0.305610938099, 0.694389061901)",neg,0.31,0.69
2,Devers is 3-3 tonight. Pretty good but also ma...,False,"{u'verified': False, u'description': u'IG @sta...","<a href=""http://twitter.com/download/iphone"" r...",0.316667,0.566667,sightly_positive,highly_sub,Devers is 3-3 tonight. Pretty good but also ma...,"(neg, 0.224207937713, 0.775792062287)",neg,0.22,0.78


In [15]:
df.columns

Index([           u'text',       u'retweeted',            u'user',
                u'source',        u'polarity',             u'sub',
             u'sentiment',    u'subjectivity',        u'textline',
                 u'naive', u'naive_sentiment',      u'naive_ppos',
            u'naive_pneg'],
      dtype='object')

In [16]:
#create a new dataframe that contains only necessary features
new_df = df[['textline','retweeted','user','source', 'sentiment', 'subjectivity', 'naive_sentiment', 'naive_ppos', 'naive_pneg']]

In [17]:
new_df.head(3)

Unnamed: 0,textline,retweeted,user,source,sentiment,subjectivity,naive_sentiment,naive_ppos,naive_pneg
0,RT @Sammyy02K: Urban jungle🏙🌴👑 https://t.co...,False,"{u'verified': False, u'description': u'🅰️pple...","<a href=""https://about.twitter.com/products/tw...",neutral,obj,pos,0.5,0.5
1,@vinsmokesanjis I'm so tempted to look but I'm...,False,"{u'verified': False, u'description': u'One tim...","<a href=""http://twitter.com/download/iphone"" r...",neutral,obj,neg,0.31,0.69
2,Devers is 3-3 tonight. Pretty good but also ma...,False,"{u'verified': False, u'description': u'IG @sta...","<a href=""http://twitter.com/download/iphone"" r...",sightly_positive,highly_sub,neg,0.22,0.78


In [18]:
#split the text into tokens
from nltk.tokenize import word_tokenize

def tokenize_text(text): 
    token = word_tokenize(text)
    return token

list_token = new_df.textline.apply(tokenize_text)
list_token[:3]

0    [RT, @, Sammyy02K, :, Urban, jungle🏙🌴👑, htt...
1    [@, vinsmokesanjis, I, 'm, so, tempted, to, lo...
2    [Devers, is, 3-3, tonight, ., Pretty, good, bu...
Name: textline, dtype: object

In [22]:
#remove emoticon, URL, hash-tag from the text
import re
 
emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\]/\\OpP] # Mouth
    )"""
 
regex_str = [
    emoticons_str,
    r'<[^>]+>', # HTML tags
    r'(?:@[\w_]+)', # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&amp;+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs
 
    r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
    r'(?:[\w_]+)', # other words
    r'(?:\S)' # anything else
]
    
tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)
 
def tokenize(s):
    return tokens_re.findall(s)    
    
def preprocess(s, lowercase=False):
    tokens = tokenize(s)
    if lowercase:
        tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
    return tokens

In [23]:
list_token = new_df.textline.apply(preprocess)
list_token[:3]

0    [RT, @Sammyy02K, :, Urban, jungle, �, �, �, �,...
1    [@vinsmokesanjis, I'm, so, tempted, to, look, ...
2    [Devers, is, 3, -, 3, tonight, ., Pretty, good...
Name: textline, dtype: object

In [24]:
from nltk.corpus import stopwords
import string
 
punctuation = list(string.punctuation)
stop = stopwords.words('english') + punctuation + ['rt', 'via', 'RT']

In [25]:
from collections import Counter
from nltk.stem.porter import PorterStemmer
p_stemmer = PorterStemmer()

def term_stop(text): 
    for term in text: 
        if term not in stop:
            return term
        
#Doesn't work
def hastag(text): 
    for term in text: 
        if term.startWith('#', '@'): 
            Counter(text)

In [26]:
termstop_list_token = list_token.apply(term_stop)
print Counter(termstop_list_token)

Counter({'@Sammyy02K': 6, '@becoupIes': 4, '@spacerelapse': 2, 'Even': 1, 'Everybody': 1, '@MoeenAli': 1, '@KatrinaPierson': 1, '@FemaleKnows': 1, 'Please': 1, '@ubiquitousurn': 1, '@Iwtpoison': 1, '@tjchambersLA': 1, '@stagmetanoia': 1, '@kengarex': 1, '@PoIoOverHoes': 1, '@walidffs': 1, '@rembert': 1, '@wakandaho': 1, 'Recently': 1, '@kxxxyxng_': 1, '@ReliableSources': 1, '@vanessaxtran': 1, 'Omoooo': 1, '@bitchxiety': 1, '@Hita_eighthsun': 1, 'https://t.co/n8QbWxrASU': 1, '@qweenpush': 1, '@DearAuntCrabby': 1, '@Jeremy_Hunt': 1, 'exgirl': 1, '@Mendu_mza': 1, '@Biapmad': 1, '@kaiyah_baiyah': 1, '@theartfulman': 1, '@malik12_styles': 1, 'someone': 1, '#MTVHottest': 1, 'Secrets': 1, '@ChloeAngyal': 1, '@HXRLEEN': 1, '@Zendaya': 1, '@TheSoneSource': 1, '@avega7077': 1, '@realDonaldTrump': 1, 'Can': 1, '@neilpatel': 1, 'The': 1, '@BleepThePolice': 1, '@luxurydolans': 1, '@MikeKellyofEM': 1, 'Devers': 1, '@SourDiesBabe': 1, '@planetepics': 1, '@RealJheneAiko_': 1, 'ONLY': 1, '@vinsmokesan

In [70]:
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim

tokenizer = RegexpTokenizer(r'\w+')

# create English stop words list
en_stop = get_stop_words('en')

# Create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()

In [1]:
# list for tokenized documents in loop
texts = []

# loop through document list
for i in new_df['textline']:
    
    # clean and tokenize document string
    raw = i.lower()
    tokens = tokenizer.tokenize(raw)
    
    # remove stop words from tokens
    stopped_tokens = [i for i in tokens if not i in en_stop]
    stopped_tokens = [i.decode("utf8","ignore") for i in stopped_tokens]
    
    # stem tokens
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    
    # add tokens to list
    texts.append(stemmed_tokens)

NameError: name 'new_df' is not defined

In [90]:
dictionary = corpora.Dictionary(texts)

In [91]:
# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(text) for text in texts]

In [92]:
# generate LDA model
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=3, id2word = dictionary, passes=20)

In [97]:
print(ldamodel.print_topics(num_topics=2, num_words=3))

In [103]:
ldamodel.print_topics(num_topics=5)

[(0,
  u'0.029*"rt" + 0.011*"like" + 0.008*"back" + 0.008*"peopl" + 0.008*"vision" + 0.008*"get" + 0.004*"look" + 0.004*"high" + 0.004*"gonna" + 0.004*"time"'),
 (1,
  u'0.051*"t" + 0.043*"http" + 0.043*"co" + 0.033*"rt" + 0.011*"want" + 0.009*"s" + 0.006*"time" + 0.006*"sex" + 0.006*"girl" + 0.006*"get"'),
 (2,
  u'0.188*"" + 0.058*"t" + 0.054*"http" + 0.052*"co" + 0.036*"rt" + 0.009*"sammyy02k" + 0.009*"jungl" + 0.009*"68oaa7uit2" + 0.009*"urban" + 0.007*"m"')]