### Analyzing the World Bank's Twitter Feed, Judy Yang, DAT10 Project
### Part 4. Text Analysis

#### Tokenization, word counts
#### Prediction Linear Regression
#### Topic Modelling
#### Predict High/Low Popular Tweets
#### Term Frequency
#### Sentiment Analysis



In [203]:
# Show the notebook's working directory; the relative './data/...' paths used below resolve against it.
pwd

u'/Users/judyyang/Documents/GA_DS_course/Final_Project/notebooks'

In [204]:
from datetime import datetime
import time
import json
import operator 
import preprocess
from collections import Counter
#from textblob import TextBlob

import pandas as pd
from pandas import ExcelWriter
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
#% sign 

import numpy as np
import scipy as sp
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from gensim import corpora, models, similarities
from collections import defaultdict
from sklearn import metrics
from textblob import TextBlob, Word
from nltk.stem.snowball import SnowballStemmer

from ttp import ttp

# Notebook display limits: up to 50 columns and 50 rows, 120 characters wide.
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 50)
pd.set_option('display.width', 120)

In [205]:
# Create the Excel workbook that collects the outputs of this notebook.
# NOTE(review): no writer.save()/close() call is visible in this notebook; sheets written
# through `writer` only reach disk once the writer is saved -- TODO confirm this happens.
writer = ExcelWriter('./data/Project04_outputs.xlsx')

In [206]:
# Load the tweet DataFrame (presumably produced by an earlier part of this project -- TODO confirm).
# NOTE: unpickling can execute arbitrary code; only load pickle files you created yourself.
wb = pd.read_pickle('./data/WorldBank_all_processed_17feb_2016')

In [207]:
# Keep only the rows whose is_RT flag is 0 (presumably: drop retweets), then show the size.
not_retweet = wb['is_RT'] == 0
wb = wb[not_retweet]
wb.shape

(71287, 21)

In [208]:
#wb=wb[wb.user_screen_name=="WorldBank"]

In [209]:
# Binary "popular tweet" flags: 1 when a tweet has at least 49 favorites / 70 retweets.
# NOTE(review): the "75" suffix suggests a 75th-percentile cut-off, but the describe()
# outputs below show that only ~1.4% of the tweets are flagged -- TODO confirm where the
# thresholds 49 and 70 come from.
wb['favorite75'] = (wb.favorite_count >= 49).astype(int)
wb['retweet75'] = (wb.retweet_count >= 70).astype(int)

In [210]:
# Summary statistics of the favorite flag; the mean is the share of flagged tweets.
wb['favorite75'].describe()

count    71287.000000
mean         0.014238
std          0.118472
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: favorite75, dtype: float64

In [211]:
# Summary statistics of the retweet flag; the mean is the share of flagged tweets.
wb['retweet75'].describe()

count    71287.000000
mean         0.014308
std          0.118760
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: retweet75, dtype: float64

In [212]:
# Renumber the rows after the filtering above. reset_index() without drop=True also keeps
# the previous index as a new column, so re-running this cell adds another index column.
wb = wb.reset_index()

### Tokenization

In [236]:
# Save the tweets with at least 300 retweets / 300 favorites, highest count first,
# as two sheets of the output workbook.
popular_rt = wb.retweet_count >= 300
top_retweets = wb[popular_rt].sort_values("retweet_count", ascending=False)
top_retweets.to_excel(writer, 'top_retweets')

popular_fav = wb.favorite_count >= 300
top_fav = wb[popular_fav].sort_values("favorite_count", ascending=False)
top_fav.to_excel(writer, 'top_favs')

### Predictions: what determines high retweets and favorites?

#### Linear Regression

How much of the retweet or favorite response can be explained by non-text features such as `has_at`, `has_ht` and `has_link`?

In [264]:
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import train_test_split

In [265]:
# Exercise :  define a function that accepts a list of features and returns testing RMSE
def train_test_rmse(cols, target='retweet75', random_state=99):
    """Fit a linear regression on the given feature columns and return its testing RMSE.

    Parameters
    ----------
    cols : list of str
        Names of the columns of the notebook-level DataFrame ``wb`` used as features.
    target : str, optional
        Name of the ``wb`` column to predict. Defaults to ``'retweet75'``, the
        target that used to be hard-coded.
    random_state : int, optional
        Seed forwarded to ``train_test_split``. Defaults to 99, the value that
        used to be hard-coded, so existing calls give the same split.

    Returns
    -------
    float
        Root mean squared error of the predictions on the held-out test split.
    """
    X = wb[cols]
    y = wb[target]

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)

    # instantiate and fit on the training split only
    linreg = LinearRegression()
    linreg.fit(X_train, y_train)

    # score the fitted model on the held-out split
    y_pred = linreg.predict(X_test)
    return np.sqrt(metrics.mean_squared_error(y_test, y_pred))

In [266]:
# compare the testing RMSE of different feature sets
#1) has_at
print train_test_rmse(['has_at'])

#2) has a hashtag
print train_test_rmse(['has_ht'])

#3) has_at, has a hashtag, has a link (no retweet flag: rows with is_RT != 0 were filtered out above)
print train_test_rmse(['has_at', 'has_ht', 'has_link'])

0.122310737948
0.122376569849
0.122311457742


Comparing testing RMSE with null RMSE
Null RMSE is the RMSE that could be achieved by always predicting the mean response value. It is a benchmark against which you may want to measure your regression model.

In [267]:
# testing RMSE of the single-feature model, shown again for comparison with the null RMSE below
print train_test_rmse(['has_at'])

0.122310737948


In [268]:
# Flag tweets that received at least one favorite / retweet, then split the has_at
# feature and the retweet flag into training and testing sets.
wb['favorite0'] = (wb.favorite_count > 0).astype(int)
wb['retweet0'] = (wb.retweet_count > 0).astype(int)

X, y = wb.has_at, wb.retweet0
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [269]:
# Null model: an array shaped like y_test in which every entry is the mean of y_test.
y_null = np.zeros_like(y_test, dtype=float) + y_test.mean()

# compute null RMSE
null_mse = metrics.mean_squared_error(y_test, y_null)
np.sqrt(null_mse)

0.4531526200810515

### Topic Modelling 

In [220]:
# English stop-word list taken from CountVectorizer, stored as a set for the membership
# tests in the tokenisation cells below.
stoplist = set(CountVectorizer(stop_words='english').get_stop_words() )

In [221]:
# Tokenise each cleaned tweet: lower-case, split on whitespace, drop stop words.
X = wb.text_clean
texts = []
for document in list(X):
    texts.append([word for word in document.lower().split() if word not in stoplist])

# corpus-wide frequency of every token
frequency = defaultdict(int)
for token in (tok for text in texts for tok in text):
    frequency[token] += 1

# keep only tokens that occur more than once in the whole corpus (not merely once in a
# single document); this trims a needlessly large feature space
texts = [[token for token in text if frequency[token] > 1] for text in texts]

# token <-> id mapping and bag-of-words encoding of every tweet
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# 10-topic LDA model over all tweets
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=10, alpha = 'auto')

In [222]:
# print the topics of the model fitted above as weighted word lists
print lda.show_topics()

[u'0.017*credit + 0.016*new + 0.012*business + 0.010*online + 0.010*close + 0.009*week + 0.009*soil + 0.009*applications + 0.008*comment + 0.008*jobs', u'0.036*join + 0.035*wbg + 0.027*live + 0.024*climate + 0.020*get2equal + 0.017*change + 0.015*watch + 0.012*learning + 0.012*today + 0.011*corruption', u'0.033*worldbank + 0.029*world + 0.020*bank + 0.018*report + 0.017*new + 0.014*countries + 0.014*financialinclusion + 0.011*mt + 0.011*read + 0.010*global', u'0.027*finance + 0.017*farmers + 0.015*risk + 0.013*smes + 0.009*helping + 0.009*disasters + 0.009*development + 0.009*natural + 0.008*disaster + 0.008*resilience', u'0.031*women + 0.023*financial + 0.021*financialinclusion + 0.020*access + 0.019*finaccess + 0.014*india + 0.012*services + 0.011*help + 0.011*gender + 0.011*agriculture', u'0.026*apply + 0.025*webinar + 0.017*sign + 0.016*land + 0.015*solutions + 0.014*innovation + 0.011*prevent + 0.011*ecourse + 0.009*job + 0.009*youth', u'0.029*ecourse + 0.016*food + 0.011*course +

**Topic model of only the top tweets** (the cell below selects them with the `retweet75` flag; the favorite flag is not used)

In [223]:
# Same steps as for the full corpus, restricted to the tweets flagged retweet75 == 1.
X = wb.loc[wb.retweet75 == 1, 'text_clean']

# lower-case, split on whitespace, drop stop words
texts = []
for document in list(X):
    tokens = document.lower().split()
    texts.append([word for word in tokens if word not in stoplist])

# count up the frequency of each word in this subset
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

# drop tokens that occur only once in the whole subset (not merely once in a single
# document) to keep the feature space small
kept = []
for text in texts:
    kept.append([token for token in text if frequency[token] > 1])
texts = kept

dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# 7-topic LDA model over the flagged tweets only
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=7, alpha = 'auto')

lda.show_topics()



[u'0.016*africa + 0.014*help + 0.014*women + 0.013*poor + 0.009*poverty + 0.009*violence + 0.009*cities + 0.009*climate + 0.008*endpoverty + 0.008*new',
 u'0.018*women + 0.018*poverty + 0.014*access + 0.013*wbg + 0.013*jimkim + 0.012*ppl + 0.011*energy + 0.011*climate + 0.010*countries + 0.009*global',
 u'0.028*countries + 0.017*access + 0.014*climatechange + 0.012*poverty + 0.012*wbg + 0.010*developing + 0.010*ppl + 0.010*endpoverty + 0.009*help + 0.009*people',
 u'0.037*poverty + 0.020*wbg + 0.018*jimkim + 0.015*endpoverty + 0.013*new + 0.011*world + 0.011*women + 0.011*climate + 0.011*climatechange + 0.011*report',
 u'0.019*poverty + 0.017*cop + 0.016*climate + 0.011*global + 0.011*energy + 0.011*africa + 0.011*development + 0.010*ending + 0.009*ppl + 0.009*poor',
 u'0.018*poverty + 0.013*endpoverty + 0.013*people + 0.013*opportunity + 0.012*world + 0.012*women + 0.009*education + 0.009*cop + 0.009*change + 0.009*ending',
 u'0.015*world + 0.014*poverty + 0.012*africa + 0.011*wbg + 0

### Predict retweet or favorites

In [270]:
# Recreate the "at least one favorite / retweet" flags, then split the cleaned tweet text
# (feature) and the favorite flag (target) into training and testing sets.
wb['favorite0'] = (wb.favorite_count > 0).astype(int)
wb['retweet0'] = (wb.retweet_count > 0).astype(int)

X, y = wb.text_clean, wb.favorite0
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [271]:
# Document-term matrices with English stop words removed: the vocabulary is learned on
# the training tweets only and then reused to encode the test tweets.
vect = CountVectorizer(stop_words='english', lowercase=True)
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_test_dtm = vect.transform(X_test)

In [272]:
# use Naive Bayes to predict the binary target y (whether a tweet received a favorite,
# given the split above) from the document-term matrix
nb = MultinomialNB()
nb.fit(X_train_dtm, y_train)
y_pred_class = nb.predict(X_test_dtm)

# calculate accuracy on the held-out test set
print metrics.accuracy_score(y_test, y_pred_class)

0.661766356189


In [273]:
# null accuracy: the share of the more frequent class in the test set
y_test_binary = np.where(y_test == 1, 1, 0)
positive_share = y_test_binary.mean()
max(positive_share, 1 - positive_share)

0.60632925597576026

In [274]:
# null RMSE: error of predicting the mean of y_test for every test row
y_null = np.zeros_like(y_test, dtype=float)
y_null[:] = y_test.mean()
np.sqrt(metrics.mean_squared_error(y_test, y_null))

0.48856329101114554

In [262]:
# define a function that accepts a vectorizer and calculates the accuracy
def tokenize_test(vect):
    X_train_dtm = vect.fit_transform(X_train)
    print 'Features: ', X_train_dtm.shape[1]
    X_test_dtm = vect.transform(X_test)
    nb = MultinomialNB()
    nb.fit(X_train_dtm, y_train)
    y_pred_class = nb.predict(X_test_dtm)
    print 'Accuracy: ', metrics.accuracy_score(y_test, y_pred_class)

In [263]:
# include 1-grams and 2-grams (stop words are kept: no stop_words argument is passed)
vect = CountVectorizer(ngram_range=(1, 2))
tokenize_test(vect)

Features:  266521
Accuracy:  0.661429693637


In [255]:
# include 1-grams, 2-grams and 3-grams
# NOTE(review): the recorded output reports fewer features than the (1, 2) run above,
# which cannot happen on the same training data -- this cell was presumably executed
# against a different X_train; TODO re-run in order.
vect = CountVectorizer(ngram_range=(1, 3))
tokenize_test(vect)

Features:  31208
Accuracy:  0.648860958366


In [246]:
# NOTE(review): `ddd` is not defined anywhere visible, so this cell raises NameError (see
# the recorded output) and stops a "Run All" here -- presumably a deliberate stop marker;
# TODO confirm, then remove it or replace it with an explicit guard.
ddd

NameError: name 'ddd' is not defined

### Term Frequency and Inverse Document Frequency:
- **What:** Computes "relative frequency" that a word appears in a document compared to its frequency across all documents
- **Why:** More useful than "term frequency" for identifying "important" words in each document (high frequency in that document, low frequency in other documents)
- **Notes:** Used for search engine scoring, text summarization, document clustering

In [None]:
#summarize(id_select)

### Calculating the likelihood of favorite and retweet for each word/token 

In [None]:
# Vectorizer over unigrams and bigrams with English stop words removed; its vocabulary is
# learned from ALL tweets.
vect = CountVectorizer(stop_words='english', analyzer='word', ngram_range=(1,2)).fit(wb.text_clean)

# list of every token in the learned vocabulary
all_tokens = vect.get_feature_names()
vect

In [None]:
# Split the tweets at 50 retweets into "high" (rhi) and "low" (rlo) DataFrames ...
many_retweets = wb.retweet_count > 50
few_retweets = wb.retweet_count <= 50
rhi, rlo = wb[many_retweets], wb[few_retweets]

# ... and at 50 favorites into fhi / flo.
many_favorites = wb.favorite_count > 50
few_favorites = wb.favorite_count <= 50
fhi, flo = wb[many_favorites], wb[few_favorites]

In [None]:
# create document-term matrices for retweet high and low
rhi_dtm = vect.transform(rhi.text_clean)
rlo_dtm = vect.transform(rlo.text_clean)

# count how many times EACH token appears across ALL retweet high/low messages.
# The sparse matrices are summed directly: .toarray() would first build a dense
# (n_tweets x n_tokens) array, which does not fit in memory for a unigram + bigram
# vocabulary. The result is the same 1-D array of per-token counts.
rhi_counts = np.asarray(rhi_dtm.sum(axis=0)).ravel()
rlo_counts = np.asarray(rlo_dtm.sum(axis=0)).ravel()

In [None]:
# create document-term matrices for favorite high and low
fhi_dtm = vect.transform(fhi.text_clean)
flo_dtm = vect.transform(flo.text_clean)

# count how many times EACH token appears across ALL favorite high/low messages.
# The sparse matrices are summed directly: .toarray() would first build a dense
# (n_tweets x n_tokens) array, which does not fit in memory for a unigram + bigram
# vocabulary. The result is the same 1-D array of per-token counts.
fhi_counts = np.asarray(fhi_dtm.sum(axis=0)).ravel()
flo_counts = np.asarray(flo_dtm.sum(axis=0)).ravel()

In [None]:
# One row per token with its favorite/retweet high and low counts. Every count is
# incremented by one (a pseudo-count) so the ratios computed in the next step never
# divide by zero.
token_counts = pd.DataFrame({
    'token': all_tokens,
    'flo': flo_counts + 1,
    'fhi': fhi_counts + 1,
    'rlo': rlo_counts + 1,
    'rhi': rhi_counts + 1,
})

In [None]:
# high-to-low count ratio of every token, for favorites and for retweets
for ratio_col, hi_col, lo_col in (('fav_ratio', 'fhi', 'flo'),
                                  ('retweet_ratio', 'rhi', 'rlo')):
    token_counts[ratio_col] = token_counts[hi_col] / token_counts[lo_col]

#export to excel
#token_counts.to_excel(writer,'likelihood_tokens_textclean50')
#token_counts.sort_values("fav_ratio", ascending=False)

### Sentiment Analysis

In [None]:
# define a function that accepts text and returns the polarity
def detect_sentiment(text):
    """Return the TextBlob sentiment polarity of ``text``.

    Byte strings are decoded as UTF-8 before being analysed. Text that is already
    unicode is passed through unchanged: calling ``.decode('utf-8')`` on it makes
    Python 2 encode it to ASCII first, which raises ``UnicodeEncodeError`` as soon
    as the tweet contains a non-ASCII character.
    """
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    return TextBlob(text).sentiment.polarity

In [None]:
# Score every cleaned tweet with detect_sentiment and store it as a new column
# (WARNING: SLOW -- one TextBlob call per tweet).
wb['sentiment'] = wb['text_clean'].map(detect_sentiment)

In [None]:
# Author's hypothesis: more negative tweets get retweeted (information sharing).
# NOTE(review): the plot below uses favorites (ln_fav), not retweets -- TODO confirm.
sns.set(style="ticks", context="talk")

# Log-transform the retweet and favorite counts (+1 so that a count of zero is defined)
wb['ln_RT']=np.log(wb.retweet_count+1)
wb['ln_fav']=np.log(wb.favorite_count+1)
# Linear fit of sentiment against ln_fav, restricted to rows with year_month == "16-01"
g = sns.lmplot(x="ln_fav", y="sentiment", hue="year_month", data=wb[wb.year_month=="16-01"], size=7)

# Use more informative axis labels than are provided by default
# NOTE(review): the x axis is log(favorite_count + 1), not a raw count
g.set_axis_labels("counts", "sentiment")

In [None]:
# list the first tweets whose sentiment score equals 1
wb[wb.sentiment == 1].text.head()

In [None]:
# list the first tweets with a negative sentiment score (any value below 0, not only the most negative)
wb[wb.sentiment <0].text.head()

### Gensim sentiment


http://districtdatalabs.silvrback.com/modern-methods-for-sentiment-analysis


In [None]:
from gensim.models.word2vec import Word2Vec

# Load word vectors from disk. The two calls read two different files (text and binary);
# run as written, both files must exist and the second load overwrites the model
# loaded by the first. Presumably only one of the two is meant to be used -- TODO confirm.
model = Word2Vec.load_word2vec_format('vectors.txt', binary=False) #C text format
model = Word2Vec.load_word2vec_format('vectors.bin', binary=True) #C binary format

Obtain the word vectors

In [None]:
import numpy as np

# Read the three word lists, one entry per line (trailing newlines are kept).
with open('food_words.txt', 'r') as food_file:
    food_words = list(food_file)

with open('sports_words.txt', 'r') as sports_file:
    sports_words = list(sports_file)

with open('weather_words.txt', 'r') as weather_file:
    weather_words = list(weather_file)

def getWordVecs(words, dim=300):
    """Look up the vector of every word in ``words`` and stack the vectors row-wise.

    Parameters
    ----------
    words : iterable of str
        Words to look up in the notebook-level ``model``. Surrounding whitespace
        (e.g. the line ending kept by ``readlines()``, including ``\\r\\n``) is stripped.
    dim : int, optional
        Length of each word vector. Defaults to 300, the value that used to be
        hard-coded.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n_found, dim)``, one row per word found in
        ``model``. Words that raise ``KeyError`` are skipped; if no word is
        found the result has shape ``(0, dim)`` instead of raising.
    """
    vecs = []
    for word in words:
        word = word.strip()
        try:
            vecs.append(model[word].reshape((1, dim)))
        except KeyError:
            # word is not in the model: skip it
            continue
    if not vecs:
        # np.concatenate raises ValueError on an empty list
        return np.empty((0, dim), dtype='float')
    return np.array(np.concatenate(vecs), dtype='float') #TSNE expects float type values

# one matrix of word vectors per word group
food_vecs, sports_vecs, weather_vecs = map(
    getWordVecs, (food_words, sports_words, weather_words))

We can then use TSNE and matplotlib to visualize the clusters with the following code:


In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# project all word vectors (food, then sports, then weather) to two dimensions
ts = TSNE(2)
reduced_vecs = ts.fit_transform(np.concatenate((food_vecs, sports_vecs, weather_vecs)))

# color points by word group to see if Word2Vec can separate them:
# food words blue, sports words red, weather words green
n_food = len(food_vecs)
n_sports = len(sports_vecs)
n_weather = len(reduced_vecs) - n_food - n_sports
colors = ['b'] * n_food + ['r'] * n_sports + ['g'] * n_weather

for i, color in enumerate(colors):
    plt.plot(reduced_vecs[i,0], reduced_vecs[i,1], marker='o', color=color, markersize=8)