### Tasks in part 1:
    1. collect the tweet data from crate.io
    2. prepare the data for sentiment analysis
    3. use a naive Bayes analyzer for the sentiment analysis
    
**Proposal for next steps:**
    1. mine the tweets by term frequency, along with visualization
    2. plot the tweets by geolocation on interactive maps
    3. search for specific keywords and filter for the topics I want to explore further

In [105]:
import pandas as pd
import json
import numpy as np
from textblob import TextBlob

In [106]:
# Line-delimited JSON files holding the collected tweets.
# NOTE(review): the same file name is listed four times, so the same rows are
# loaded four times -- confirm whether distinct file names were intended.
tweet_files = ['tweets_0_.json', 'tweets_0_.json', 'tweets_0_.json', 'tweets_0_.json']

# pd.read_json reads a single path/buffer, not a list of paths, so each file is
# read on its own and the resulting frames are stacked with a fresh index.
rawdata = pd.concat(
    [pd.read_json(path, lines=True) for path in tweet_files],
    ignore_index=True,
)
rawdata.head(3)

Unnamed: 0,created_at,id,retweeted,source,text,user
0,2017-08-01 00:46:11,892184651892224000,False,"<a href=""http://www.facebook.com/twitter"" rel=...",https://t.co/yb7EZLUdCf,"{u'verified': False, u'description': u'desde 1..."
1,2017-08-01 00:46:11,892184651896410112,False,"<a href=""http://twitter.com/download/iphone"" r...",RT @horrorcuIt: infernoooooo que horas vai lib...,"{u'verified': False, u'description': u'entrei ..."
2,2017-08-01 00:46:11,892184651929796608,False,"<a href=""https://about.twitter.com/products/tw...",RT @Sammyy02K: Urban jungle🏙🌴👑 https://t.co...,"{u'verified': False, u'description': u'🅰️pple..."


### The tweets are written in many different languages, so I need to keep only the English ones. I use TextBlob's language detection to do this.

In [107]:
def find_lang(doc):
    """Return the language code TextBlob detects for ``doc``, or ``'none'``.

    Detection is only attempted when the text contains more than five words.
    Shorter texts and missing values (``None`` / NaN) yield the sentinel
    string ``'none'``.
    """
    # Missing cells in a pandas column arrive as None or float NaN; TextBlob
    # only accepts text, so short-circuit instead of raising.
    if doc is None or isinstance(doc, float):
        return 'none'
    tb = TextBlob(doc)
    # The former extra `and type(tb)` test was always truthy and has been dropped.
    if len(tb.words) > 5:
        # NOTE(review): detect_language() appears to be deprecated/removed in
        # newer TextBlob releases -- confirm against the installed version.
        return tb.detect_language()
    return 'none'

def doc_polarity(doc):
    """Return the polarity component of TextBlob's sentiment for ``doc``."""
    return TextBlob(doc).sentiment.polarity

def doc_sub(doc):
    """Return the subjectivity component of TextBlob's sentiment for ``doc``."""
    return TextBlob(doc).sentiment.subjectivity

### Because of limited computing power, I use only the first 200 observations for now, just to test the code.

In [108]:
# Run language detection on the first 200 tweets only and store the result
# in a new 'lang' column of rawdata.
first_texts = rawdata['text'].iloc[:200]
rawdata['lang'] = first_texts.apply(find_lang)
rawdata['lang'].head(3)

0    none
1      pt
2      en
Name: lang, dtype: object

In [145]:
# Keep only the rows detected as English, renumber them, and retain the
# four columns used further on.
keep_cols = ['text', "retweeted", 'user', 'source']
is_english = rawdata['lang'] == 'en'
df = rawdata[is_english].reset_index()[keep_cols]
df.head(3)

Unnamed: 0,text,retweeted,user,source
0,RT @Sammyy02K: Urban jungle🏙🌴👑 https://t.co...,False,"{u'verified': False, u'description': u'🅰️pple...","<a href=""https://about.twitter.com/products/tw..."
1,@vinsmokesanjis I'm so tempted to look but I'm...,False,"{u'verified': False, u'description': u'One tim...","<a href=""http://twitter.com/download/iphone"" r..."
2,Devers is 3-3 tonight. Pretty good but also ma...,False,"{u'verified': False, u'description': u'IG @sta...","<a href=""http://twitter.com/download/iphone"" r..."


In [146]:
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim
import string
 
# every character of string.punctuation as its own list element
punctuation = [char for char in string.punctuation]

# tokenizer that extracts runs of word characters
tokenizer = RegexpTokenizer(r'\w+')

# English stop words, extended with punctuation and a few Twitter/URL tokens
en_stop = list(get_stop_words('en'))
en_stop.extend(punctuation)
en_stop.extend(['rt', 'via', 'RT', 'http'])

# Porter stemmer instance used when stemming tokens
p_stemmer = PorterStemmer()

In [147]:
# Regex-based tokenizer that treats emoticons, URLs, @-mentions and hash-tags as single tokens
import re
 
# Verbose-mode pattern for simple western emoticons such as :-) ;P =[
# The mouth class previously listed `\]` twice and never `\[`, so emoticons
# with an opening square bracket (e.g. ":[") were not recognised.
emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\[/\\OpP] # Mouth
    )"""

# Alternatives are tried in this order, so the more specific patterns
# (emoticons, tags, mentions, hash-tags, URLs) come before the generic ones.
regex_str = [
    emoticons_str,
    r'<[^>]+>', # HTML tags
    r'(?:@[\w_]+)', # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
    # The character class used to contain the HTML-escaped form "&amp;";
    # the intended literal is a plain ampersand.
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs

    r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
    r'(?:[\w_]+)', # other words
    r'(?:\S)' # anything else
]

# One capturing group around the whole alternation, so findall() returns a
# flat list of matched token strings.
tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
# Anchored variant: matches only when the whole string is a single emoticon.
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)
 
def tokenize(s):
    """Return the list of every token that ``tokens_re`` finds in ``s``."""
    matched = tokens_re.findall(s)
    return matched
    
def preprocess(s, lowercase=False):
    """Tokenize ``s``; with ``lowercase=True`` lower-case all non-emoticon tokens.

    Tokens that ``emoticon_re`` matches are left unchanged so that
    case-sensitive emoticons (e.g. ``:D``) survive.
    """
    tokens = tokenize(s)
    if not lowercase:
        return tokens
    lowered = []
    for token in tokens:
        if emoticon_re.search(token):
            lowered.append(token)
        else:
            lowered.append(token.lower())
    return lowered

def encoding(text):
    """Return ``text`` encoded as UTF-8."""
    return text.encode('utf-8')

In [182]:
# list for tokenized documents in loop
texts = []

# NOTE(review): `lower_text` is not defined in any visible cell; it is presumably
# an iterable of tweet strings derived from df['text'] -- confirm and define it
# in an earlier cell so the notebook survives Restart & Run All.
for doc in lower_text:

    # split the document string into word tokens
    tokens = tokenizer.tokenize(doc)

    # remove stop words from tokens
    stopped_tokens = [tok for tok in tokens if tok not in en_stop]

    # Only byte strings need decoding; text tokens are kept as they are, so the
    # cell no longer fails where tokens have no usable .decode().
    stopped_tokens = [
        tok.decode("utf8", "ignore") if isinstance(tok, bytes) else tok
        for tok in stopped_tokens
    ]

    # NOTE: emoticons, URLs and hash-tags are NOT filtered here; the former
    # `final_stopped_tokens` step was a plain copy of `stopped_tokens`.

    # stem tokens
    stemmed_tokens = [p_stemmer.stem(tok) for tok in stopped_tokens]

    # add tokens to list
    texts.append(stemmed_tokens)

In [183]:
# build the gensim dictionary from the tokenized documents
dictionary = corpora.Dictionary(documents=texts)

In [184]:
# build the document-term matrix: one doc2bow result per tokenized document
corpus = list(map(dictionary.doc2bow, texts))

In [185]:
# generate LDA model (3 topics, 20 passes over the corpus)
# random_state is fixed so that the stochastic training gives the same topics
# every time the notebook is re-run.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=3,
    id2word=dictionary,
    passes=20,
    random_state=42,
)

In [186]:
# print a short summary: 2 topics, 3 terms each
topic_summary = ldamodel.print_topics(num_topics=2, num_words=3)
print(topic_summary)

In [197]:
# Show the weighted terms of 3 topics as the cell output.
ldamodel.print_topics(num_topics=3)

[(0,
  u'0.015*"t" + 0.012*"s" + 0.012*"3" + 0.008*"don" + 0.008*"give" + 0.008*"go" + 0.005*"someth" + 0.005*"peopl" + 0.005*"Thi" + 0.005*"wear"'),
 (1,
  u'0.181*"" + 0.062*"t" + 0.058*"http" + 0.058*"co" + 0.008*"The" + 0.007*"jungl" + 0.007*"68OaA7uIt2" + 0.007*"Urban" + 0.007*"Sammyy02K" + 0.006*"gt"'),
 (2,
  u'0.035*"I" + 0.015*"m" + 0.012*"" + 0.011*"get" + 0.011*"want" + 0.008*"like" + 0.008*"Are" + 0.008*"video" + 0.008*"What" + 0.007*"sex"')]

In [200]:
# Import the pyLDAvis adapter for gensim models
import pyLDAvis.gensim

In [201]:
# Prepare the pyLDAvis visualization data from the LDA model trained above
# (num_topics=3, passes=20), its corpus and dictionary, then display it.
followers_data =  pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
pyLDAvis.display(followers_data)