In [99]:
# import plotly.plotly as py
from plotly.graph_objs import *
import pandas as pd
import numpy as np
import scipy as sp
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from nltk.corpus import wordnet as wn
import plotly.tools as tls
import cufflinks as cf
import scipy.stats as stats
from plotly import __version__
import plotly.offline as py
from collections import Counter
import os

In [100]:
init_notebook_mode(connected=True)
cf.go_offline()


# Let's read Trump Twitter data

In [101]:
tweets = [pd.read_json('./data/trump/condensed_20%s.json'%str(i).zfill(2)).set_index('created_at') 
          for i in range(9,18)]

df = pd.concat(tweets)
df.head()

Unnamed: 0_level_0,favorite_count,id_str,in_reply_to_user_id_str,is_retweet,retweet_count,source,text
created_at,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2009-12-23 17:38:18,12,6971079756,,False,28,Twitter Web Client,From Donald Trump: Wishing everyone a wonderfu...
2009-12-03 19:39:09,6,6312794445,,False,33,Twitter Web Client,Trump International Tower in Chicago ranked 6t...
2009-11-26 19:55:38,11,6090839867,,False,13,Twitter Web Client,Wishing you and yours a very Happy and Bountif...
2009-11-16 21:06:10,3,5775731054,,False,5,Twitter Web Client,Donald Trump Partners with TV1 on New Reality ...
2009-11-02 14:57:56,6,5364614040,,False,7,Twitter Web Client,"--Work has begun, ahead of schedule, to build ..."


# Let's build Empath Concepts as dictionary

In [102]:
concepts = {}
with open('./data/categories.tsv','r') as f:
    for l in f:
        cols = l.strip().split('\t')
        name = cols[0]
        terms = cols[1:]
        concepts[name] = {c:True for c in terms}

# Let's apply Concepts to Tweets
1. remove stopwords
2. lemmatize tweets
3. lowercase
3. count the length

In [103]:
# print stopwords.words('english')
lemmatizer = WordNetLemmatizer()
print lemmatizer.lemmatize('I').lower()

i


In [104]:
stops = set(stopwords.words('english')+['.',',','!','?',';',':','[',']','\"'])
def lemmatize_tweets(t):
    tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True)
    tokens = tokenizer.tokenize(t)
    result = []
    for w in tokens:
        lemma = lemmatizer.lemmatize(w)
        if lemma not in stops:
            result.append(lemma)
    return result
            
test = 'Hello, I am a boy.'
print lemmatize_tweets(test)

[u'hello', u'boy']


In [105]:
df['lemmatized'] = df.text.map(lambda x: lemmatize_tweets(x))
df['length'] = df.apply(lambda x: len(x.lemmatized), axis=1)
df.head()

Unnamed: 0_level_0,favorite_count,id_str,in_reply_to_user_id_str,is_retweet,retweet_count,source,text,lemmatized,length
created_at,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2009-12-23 17:38:18,12,6971079756,,False,28,Twitter Web Client,From Donald Trump: Wishing everyone a wonderfu...,"[donald, trump, wishing, everyone, wonderful, ...",18
2009-12-03 19:39:09,6,6312794445,,False,33,Twitter Web Client,Trump International Tower in Chicago ranked 6t...,"[trump, international, tower, chicago, ranked,...",16
2009-11-26 19:55:38,11,6090839867,,False,13,Twitter Web Client,Wishing you and yours a very Happy and Bountif...,"[wishing, happy, bountiful, thanksgiving]",4
2009-11-16 21:06:10,3,5775731054,,False,5,Twitter Web Client,Donald Trump Partners with TV1 on New Reality ...,"[donald, trump, partner, tv1, new, reality, se...",12
2009-11-02 14:57:56,6,5364614040,,False,7,Twitter Web Client,"--Work has begun, ahead of schedule, to build ...","[-, -, work, ha, begun, ahead, schedule, build...",16


In [106]:
def ratio_concept(t, concept_words):
    count = 0
    for w in t:
        if w in concept_words:
            count += 1
    return count*1.0/len(t)

for concept in concepts:
    df[concept] = df.lemmatized.map(lambda x: ratio_concept(x, concepts[concept]))

In [107]:
df.head()

Unnamed: 0_level_0,favorite_count,id_str,in_reply_to_user_id_str,is_retweet,retweet_count,source,text,lemmatized,length,help,...,negative_emotion,weapon,children,ocean,giving,contentment,writing,rural,positive_emotion,order
created_at,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2009-12-23 17:38:18,12,6971079756,,False,28,Twitter Web Client,From Donald Trump: Wishing everyone a wonderfu...,"[donald, trump, wishing, everyone, wonderful, ...",18,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055556,0.0
2009-12-03 19:39:09,6,6312794445,,False,33,Twitter Web Client,Trump International Tower in Chicago ranked 6t...,"[trump, international, tower, chicago, ranked,...",16,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2009-11-26 19:55:38,11,6090839867,,False,13,Twitter Web Client,Wishing you and yours a very Happy and Bountif...,"[wishing, happy, bountiful, thanksgiving]",4,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0
2009-11-16 21:06:10,3,5775731054,,False,5,Twitter Web Client,Donald Trump Partners with TV1 on New Reality ...,"[donald, trump, partner, tv1, new, reality, se...",12,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2009-11-02 14:57:56,6,5364614040,,False,7,Twitter Web Client,"--Work has begun, ahead of schedule, to build ...","[-, -, work, ha, begun, ahead, schedule, build...",16,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [108]:
df2 = df.copy()
df2  = df.drop(df.columns[:9], axis=1)
df2.head()

Unnamed: 0_level_0,help,office,dance,money,wedding,valuable,domestic_work,sleep,medical_emergency,cold,...,negative_emotion,weapon,children,ocean,giving,contentment,writing,rural,positive_emotion,order
created_at,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2009-12-23 17:38:18,0.0,0.0,0.0,0.0,0.055556,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055556,0.0
2009-12-03 19:39:09,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2009-11-26 19:55:38,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0
2009-11-16 21:06:10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2009-11-02 14:57:56,0.0,0.0625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [109]:
means = [(col, mean) for col, mean in zip(df2.columns, df2.mean())]
means.sort(key=lambda x: x[1], reverse=True )

data = [Bar(x=[col[0] for col in means], 
           y=[col[1] for col in means ])]
py.iplot(data)

In [110]:
top10_cat= [x[0] for x in means[:10]]
converted = df2[top10_cat].resample('A').mean()

In [111]:
converted.iplot(kind='bar', barmode='stack')

In [112]:
converted.iplot(kind='scatter')

In [113]:
converted.iplot(kind='area', fill=True, barmode='stack')

In [114]:
top_cat= [x[0] for x in means[:12]]
converted = df2[top_cat].resample('M').mean()
converted.iplot(subplots=True, shape=(4,3), shared_xaxes=True, legend=False, subplot_titles=True)

In [115]:
top_cat= [x[0] for x in means[:20]]
converted = df2[top_cat].resample('M').mean()
converted.iplot(subplots=True, shape=(5,4), shared_xaxes=True, legend=False, subplot_titles=True)

# Findings 

## Things he talks more
 1. giving
 2. leader
 3. politics

## Things he talks less
 1. business
 1. economics
 1. work
 1. trust
 1. friends
 1. money

## To Do
- Need to have statistical backup


In [116]:
counts = df2.resample('M').size()
counts.iplot()

# Trump vs. Hillary

In [117]:
df = pd.read_csv('./data/trump/tweets.csv')
df.columns
df['lemmatized'] = df.text.map(lambda x: lemmatize_tweets(x))
df['length'] = df.apply(lambda x: len(x.lemmatized), axis=1)
for concept in concepts:
    df[concept] = df.lemmatized.map(lambda x: ratio_concept(x, concepts[concept]))
df.head(10)

Unnamed: 0,id,handle,text,is_retweet,original_author,time,in_reply_to_screen_name,in_reply_to_status_id,in_reply_to_user_id,is_quote_status,...,negative_emotion,weapon,children,ocean,giving,contentment,writing,rural,positive_emotion,order
0,780925634159796224,HillaryClinton,The question in this election: Who can put the...,False,,2016-09-28T00:22:34,,,,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.0
1,780916180899037184,HillaryClinton,"Last night, Donald Trump said not paying taxes...",True,timkaine,2016-09-27T23:45:00,,,,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,780911564857761793,HillaryClinton,Couldn't be more proud of @HillaryClinton. Her...,True,POTUS,2016-09-27T23:26:40,,,,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.076923,0.076923
3,780907038650068994,HillaryClinton,"If we stand together, there's nothing we can't...",False,,2016-09-27T23:08:41,,,,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,780897419462602752,HillaryClinton,Both candidates were asked about how they'd co...,False,,2016-09-27T22:30:27,,,,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,780893126605037568,realDonaldTrump,Join me for a 3pm rally - tomorrow at the Mid-...,False,,2016-09-27T22:13:24,,,,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,780883582051123200,HillaryClinton,This election is too important to sit out. Go ...,False,,2016-09-27T21:35:28,,,,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,780881075891531776,HillaryClinton,When Donald Trump goes low...register to vote:...,False,,2016-09-27T21:25:31,,,,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,780876760086880256,realDonaldTrump,"Once again, we will have a government of, by a...",False,,2016-09-27T21:08:22,,,,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,780874710049550336,HillaryClinton,3) Has Trump offered a single proposal to redu...,True,mcuban,2016-09-27T21:00:13,,,,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [118]:

grouped = df.groupby('handle')
grouped_mean =  grouped[df.columns[30:]].mean()



In [119]:
grouped_mean.T.iplot(kind='bar')

In [120]:
grouped_mean.T.iplot(kind='bar', barmode='stack')

In [121]:
tweets = {}
for name, group in grouped:
    tweets[name] = [s for s in group.text]
    tweets[name] = ' '.join(tweets[name])

In [122]:
lemma_tweets = {}
for h in tweets:
    lemma_tweets[h] = lemmatize_tweets(tweets[h])

In [123]:
lemma_tweets.keys()

['HillaryClinton', 'realDonaldTrump']

In [124]:
def count_concept(tweets, concept_words, k):
    counter = Counter()
    count = 0
    for w in tweets:
        if w in concept_words:
            count += 1
            counter[w] += 1
    most_common = counter.most_common()
    
    top_k = most_common[:k-1]
    top_k.append(('etc',sum(d[1] for d in most_common[k-1:])))
    while len(top_k) < k:
        top_k.append(('n/a',0))
    return count, len(tweets) - count, top_k

def odd_ratio(concept_words):
    hillary_count, hillary_not_count, hillary_counter = count_concept(lemma_tweets['HillaryClinton'], concept_words)
    trump_count, trump_not_count, trump_counter = count_concept(lemma_tweets['realDonaldTrump'], concept_words)
    
    odd_ratio = (trump_count*1.0/hillary_count)/(trump_not_count*1.0/hillary_not_count)
    if odd_ratio >= 0 :
        return odd_ratio, trump_counter
    else:
        return -1/odd_ratio, hillary_counter

def odd_ratio2(concept_words):
    hillary_count, hillary_not_count, hillary_counter = count_concept(lemma_tweets['HillaryClinton'], concept_words,5)
    trump_count, trump_not_count, trump_counter = count_concept(lemma_tweets['realDonaldTrump'], concept_words,5)
    
    odd_ratio, pvalue = stats.fisher_exact([[trump_count, trump_not_count], [hillary_count, hillary_not_count]])
    if odd_ratio >= 1 :
        return odd_ratio, pvalue, trump_counter
    else:
        return -1/odd_ratio, pvalue, hillary_counter
    
trump_odds_ratio = []
for c in concepts:
    odd_ratio, pvalue, most_common = odd_ratio2(concepts[c])
    if abs(pvalue) < 0.05 and abs(odd_ratio)>2: 
        trump_odds_ratio.append( (c, odd_ratio, pvalue, most_common ))
trump_odds_ratio.sort(key=lambda x: x[1], reverse=False)      

In [125]:
data = [Bar( x=[x[1] for x in trump_odds_ratio],
           y=[x[0] for x in trump_odds_ratio],
           orientation='h')]
layout = Layout( autosize=False,
                height=1200,
               width=800)
fig = Figure(data=data, layout=layout)
iplot(fig)

In [126]:
concepts_names = [d[0] for d in trump_odds_ratio]
topk = 5
data = []
for i in range(topk):
    trace = Bar( x=[d[3][i][1]*d[1]/sum(c[1] for c in d[3]) for d in trump_odds_ratio],
               y=[d[0] for d in trump_odds_ratio],
               text=[d[3][i][0] for d in trump_odds_ratio],
               orientation='h')
    data.append(trace)
layout = Layout( autosize=False,
                height=1200,
                barmode='stack',
               width=800)
fig = Figure(data=data, layout=layout)
iplot(fig)

# Findings 

unigram 에 의한 분석을 할때는 조심해야한다. 
- sexual 에서 'violence'
- medical_emergency에서 'health'
- clothing 'white'
- appearance 'woman'
- alcohol 'lightweight'
- hipster 'looking'
- plant 'bush'

# NASDAQ 100 

1. 각각의 회사들의 twitter를 합쳐서
2. conceptvector로 만들어서 
3. clustering 할 것이다. 

In [131]:
nasdaq_files = os.listdir('./data/nasdaq/')
nasdaq_names = [n.split('_')[2] for n in nasdaq_files]

nasdaq = {}
for fname, stockname in zip(nasdaq_files, nasdaq_names):
    df = pd.read_excel('./data/nasdaq/%s' %fname, sheetname='Stream')
    df['time'] = pd.to_datetime(df['Date']+ ' ' + df['Hour'])
    df.set_index('time')
    nasdaq[stockname] = df

In [132]:
nasdaq_tweets = {}
for stock in nasdaq:
    nasdaq_tweets[stock] = ' '.join(nasdaq['disck']['Tweet content'])

In [134]:
nasdaq_lemma_tweets = {}
for h in nasdaq_tweets:
    nasdaq_lemma_tweets[h] = lemmatize_tweets(nasdaq_tweets[h])

In [139]:
topk = 5
nasdaq_vectors = {}
nasdaq_topk_words = {}
for company in nasdaq_tweets:
    nasdaq_vectors[company] = []
    nasdaq_topk_words[company] = {}
    for c in concepts:
        count, not_count, topk_words = count_concept(nasdaq_lemma_tweets[company], concepts[c], topk)
        nasdaq_vectors[company].append(count*1.0/(count+not_count))
        nasdaq_topk_words[company][c] = topk_words

In [142]:
nasdaq_vectors

{'aal': [0.005077983315197679,
  0.002176278563656148,
  0.0014508523757707653,
  0.029379760609358,
  0.0003627130939426913,
  0.018135654697134566,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.001088139281828074,
  0.00544069640914037,
  0.0,
  0.0007254261878853826,
  0.0003627130939426913,
  0.002176278563656148,
  0.0,
  0.001088139281828074,
  0.0007254261878853826,
  0.0003627130939426913,
  0.0003627130939426913,
  0.0003627130939426913,
  0.0,
  0.006891548784911135,
  0.0003627130939426913,
  0.0,
  0.0,
  0.002176278563656148,
  0.0,
  0.0003627130939426913,
  0.017410228509249184,
  0.001088139281828074,
  0.0003627130939426913,
  0.005077983315197679,
  0.0003627130939426913,
  0.0,
  0.00544069640914037,
  0.0,
  0.024301777294160318,
  0.003264417845484222,
  0.0014508523757707653,
  0.0,
  0.0,
  0.0,
  0.001088139281828074,
  0.0014508523757707653,
  0.0029017047515415306,
  0.001088139281828074,
  0.003264417845484222,
  0.0014508523757707653,
  0.01088139281828074,
