### Analyzing the World Bank's Twitter Feed, Judy Yang, DAT10 Project
### Part 2. Text pre-processing

In [95]:
pwd

u'/Users/judyyang/Documents/GA_DS_course/Final_Project/notebooks'

In [96]:
from datetime import datetime
import time
import json
import operator 
import preprocess
from collections import Counter
#from textblob import TextBlob

import pandas as pd
from pandas import ExcelWriter
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
#% sign 

from ttp import ttp

pd.options.display.max_columns = 50
pd.options.display.max_rows= 50
pd.options.display.width= 120

In [97]:
# Create excel to save outputs from this notebook
writer = ExcelWriter('./data/Project02_outputs.xlsx')

In [98]:
#input the three raw files and append
#WB[0:2]
file1 = pd.read_json('./data/WorldBank_request_01feb_2016a')
#WB[3:31]
file2 = pd.read_json('./data/WorldBank_request_14feb_2016b')
#WB[32:38]
file3 = pd.read_json('./data/WorldBank_request_14feb_2016c')

wb = pd.DataFrame()
wb=wb.append(file1)
wb=wb.append(file2)
wb=wb.append(file3)
wb = wb.reset_index()
wb.shape

(93733, 15)

In [133]:
#Check the list of WB accounts
wb.user_screen_name.value_counts()

WBG_Agriculture    3200
worldbankdata      3200
WorldBankPSD       3200
WorldBankSAsia     3200
WBG_Cities         3200
WBG_Finance        3200
wbg_gov            3200
WBG_Environment    3200
WBG_Gender         3200
WorldBank_IEG      3200
WBG_Education      3200
WorldBankWater     3200
WBPubs             3200
WBG_Poverty        3200
WBG_Climate        3200
WBG_LeadINLearn    3200
IFC_org            3200
WorldBankAfrica    3200
WorldBankSEI       3200
WBG_Health         3200
wb_research        3199
WorldBank          3196
WBG_PPP            2961
WorldBankVideos    2725
MIGAWorldBank      2000
WorldBankIndia     1907
WBG_SPLabor        1810
WorldBankKenya     1707
WBG_Energy         1695
WBCaribbean        1597
WBG_Transport      1508
WorldBankPhotos    1259
WBOpenFinances     1123
WorldBankECA        808
impacteval          745
WBG_Dev4Peace       649
WBedutech           508
JimKim_WBG          336
Name: user_screen_name, dtype: int64

In [100]:
wb.user_name.describe()

count                93733
unique                  38
top       World Bank Water
freq                  3200
Name: user_name, dtype: object

In [101]:
#Check the columns
wb.columns

Index([               u'index',           u'created_at',       u'favorite_count',                   u'id',
              u'retweet_count',               u'source',                 u'text',      u'user_created_at',
           u'user_description', u'user_followers_count',   u'user_friends_count',              u'user_id',
              u'user_location',            u'user_name',     u'user_screen_name'],
      dtype='object')

In [102]:
#Keep select columns for text analysis
keepcols=['id', 'user_screen_name', 'created_at', 'favorite_count', 'retweet_count', 'text']
wb=wb[keepcols]

In [103]:
wb.head()

Unnamed: 0,id,user_screen_name,created_at,favorite_count,retweet_count,text
0,694299339649347584,WBG_Poverty,2016-02-01 23:20:36,1,1,Eliminating #inequality is not only necessary for realizing one's potential but also for living with dignity https://t.co/1DAIhpnbRr
1,694250314795454467,WBG_Poverty,2016-02-01 20:05:48,0,0,Faster-than-expected #emergingmarket slowdown could lower #commodityprices. Will this impact #poverty reduction? https://t.co/IjTj2MXvXK
2,693934929642659840,WBG_Poverty,2016-01-31 23:12:34,0,77,RT @WorldBank: It’s easy to be pessimistic about the future. Let your passion drive you. Don’t give up or accept the norm. -Sri Mulyani #Al…
3,689889192038514688,WBG_Poverty,2016-01-20 19:16:15,0,131,RT @WorldBank: “Data deprivation” makes it difficult to set policies benefiting the poor. Let's fix this: https://t.co/Fugfn4YEGk https://t…
4,643911101600632833,WBG_Poverty,2015-09-15 22:15:44,0,0,More efficient energy sources can provide more electricity to larger numbers of households &amp; businesses: http://t.co/x0NnAyKDcS


In [104]:
wb.text.describe()

count                                                                                                                                     93733
unique                                                                                                                                    90547
top       #Socent create marketplaces to bring skills and arts of even the most isolated to the world. Check these out | http://t.co/ADYdNFOImw
freq                                                                                                                                         20
Name: text, dtype: object

In [105]:
#Q: what is unicode exactly?
wb.text.values

array([ u"Eliminating #inequality is not only necessary for realizing one's potential but also for living with dignity https://t.co/1DAIhpnbRr",
       u'Faster-than-expected #emergingmarket slowdown could lower #commodityprices. Will this impact #poverty reduction? https://t.co/IjTj2MXvXK',
       u'RT @WorldBank: It\u2019s easy to be pessimistic about the future. Let your passion drive you. Don\u2019t give up or accept the norm. -Sri Mulyani #Al\u2026',
       ...,
       u'In #Tunisia #IFC support key for #microfinance bank to manage risk during Arab Spring. Story  #A2F13 http://t.co/sKkZeypNm3 @CGAP @IFC_MENA',
       u'#IFC mobile banking products help w/ access to finance for low-income population in #Colombia - #A2F13 http://t.co/njQTKfA1xe - @IFC_LAC',
       u'In #Ethiopia modernized credit registry allows for more financing to entrepreneurs, individuals   #A2F13 http://t.co/njQTKfA1xe @IFCAfrica'], dtype=object)

**Create data vectors**

Make indicator if there is an URL, if there is a RT, if @, and # of words

In [106]:
#text length (should be 0-140 characters)
wb['length'] = wb.text.apply(len)

In [107]:
#Is a RT
sub="RT"
wb['is_RT']=wb.text.str.contains(sub)
wb.is_RT.value_counts()

False    71287
True     22446
Name: is_RT, dtype: int64

In [108]:
# has a hashtag
sub="#"
wb['has_ht']=wb.text.str.contains(sub)
wb.has_ht.value_counts()

True     76759
False    16974
Name: has_ht, dtype: int64

In [109]:
#has a link
sub="http"
wb['has_link']=wb.text.str.contains(sub)
wb.has_link.value_counts()

True     85688
False     8045
Name: has_link, dtype: int64

In [110]:
#has an @
sub="@"
wb['has_at']=wb.text.str.contains(sub)
wb.has_at.value_counts()


False    48793
True     44940
Name: has_at, dtype: int64

**Create lists of text**

https://github.com/edburnett/twitter-text-python

In [111]:
def parse_tags(tweet):
    p=ttp.Parser()   
    results = p.parse(tweet)
    return " ".join(results.tags)

def parse_users(tweet):
    p=ttp.Parser()   
    results = p.parse(tweet)
    return " ".join(results.users)

In [112]:
#create column of hashtags
#create column of links

wb['tags'] = wb.text.apply(parse_tags)
wb['users'] = wb.text.apply(parse_users)
wb.head(2)

Unnamed: 0,id,user_screen_name,created_at,favorite_count,retweet_count,text,length,is_RT,has_ht,has_link,has_at,tags,users
0,694299339649347584,WBG_Poverty,2016-02-01 23:20:36,1,1,Eliminating #inequality is not only necessary for realizing one's potential but also for living with dignity https://t.co/1DAIhpnbRr,132,False,True,True,False,inequality,
1,694250314795454467,WBG_Poverty,2016-02-01 20:05:48,0,0,Faster-than-expected #emergingmarket slowdown could lower #commodityprices. Will this impact #poverty reduction? https://t.co/IjTj2MXvXK,136,False,True,True,False,emergingmarket commodityprices poverty,


In [113]:
#Q: how to create all the tags together, some are one tag, some are 2 tags
wb.tags.value_counts(ascending=False).head()

           17128
PPPs         786
Africa       714
ecourse      675
goodgov      624
Name: tags, dtype: int64

In [114]:
wb.users.value_counts(ascending=False).head()

                   49197
WorldBank           5084
JimKim_WBG          1026
IFC_org              545
WorldBankAfrica      409
Name: users, dtype: int64

**Text clean up**

http://www.analyticsvidhya.com/blog/2014/11/text-data-cleaning-steps-python/

1) Escape HTML characters
2) Decode data
3) Apostrophe lookup
4) Remove stop words
5) Remove punctuation
6) Remove expressions
7) Split attached words
8) Slang lookup
9) Standardizing words
10) Remove URLs

In [115]:
#create clean text for text analysis: remove URL, remove #, remove @, just words
import HTMLParser
import re
html_parser = HTMLParser.HTMLParser()
wb['text_clean'] = html_parser.unescape(wb.text)
pd.set_option('display.max_colwidth', 400)
pd.set_option('display.max_rows', 200)

cols =['text']
wb[cols]

Unnamed: 0,text
0,Eliminating #inequality is not only necessary for realizing one's potential but also for living with dignity https://t.co/1DAIhpnbRr
1,Faster-than-expected #emergingmarket slowdown could lower #commodityprices. Will this impact #poverty reduction? https://t.co/IjTj2MXvXK
2,RT @WorldBank: It’s easy to be pessimistic about the future. Let your passion drive you. Don’t give up or accept the norm. -Sri Mulyani #Al…
3,RT @WorldBank: “Data deprivation” makes it difficult to set policies benefiting the poor. Let's fix this: https://t.co/Fugfn4YEGk https://t…
4,More efficient energy sources can provide more electricity to larger numbers of households &amp; businesses: http://t.co/x0NnAyKDcS
5,Women improve #financialinclusion of the poorest in rural #India. Watch their video story: http://t.co/e9VySOIXw0 http://t.co/CDWNyKlNX2
6,RT @nextgenderation: We want to reduce #violence in #Jamaica. Can #socialmedia help reach teens? Have YOUR say #setthetrend Twitter chat Th…
7,Rapid economic growth between 2002 &amp; 2008 helped reduce #poverty in #Albania by half: http://t.co/skY16glKrS http://t.co/EGQG9oYCdo
8,Laws in close to 155 economies make it difficult for #women to realize full economic potential http://t.co/vY3CsIfSbN #WomenBizLaw
9,Equity &amp; learning will help #endextremepoverty by 2030: http://t.co/1B1nlYaGb7


In [116]:
#Remove htmls
wb['text_clean'] = wb.text.str.replace("([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ")
#check original and cleaned text
pd.set_option('display.max_colwidth', 400)
pd.set_option('display.max_rows', 100)

cols =['text', 'text_clean']
wb[cols]

Unnamed: 0,text,text_clean
0,Eliminating #inequality is not only necessary for realizing one's potential but also for living with dignity https://t.co/1DAIhpnbRr,Eliminating inequality is not only necessary for realizing one s potential but also for living with dignity
1,Faster-than-expected #emergingmarket slowdown could lower #commodityprices. Will this impact #poverty reduction? https://t.co/IjTj2MXvXK,Faster than expected emergingmarket slowdown could lower commodityprices Will this impact poverty reduction
2,RT @WorldBank: It’s easy to be pessimistic about the future. Let your passion drive you. Don’t give up or accept the norm. -Sri Mulyani #Al…,RT WorldBank It s easy to be pessimistic about the future Let your passion drive you Don t give up or accept the norm Sri Mulyani Al
3,RT @WorldBank: “Data deprivation” makes it difficult to set policies benefiting the poor. Let's fix this: https://t.co/Fugfn4YEGk https://t…,RT WorldBank Data deprivation makes it difficult to set policies benefiting the poor Let s fix this
4,More efficient energy sources can provide more electricity to larger numbers of households &amp; businesses: http://t.co/x0NnAyKDcS,More efficient energy sources can provide more electricity to larger numbers of households amp businesses
5,Women improve #financialinclusion of the poorest in rural #India. Watch their video story: http://t.co/e9VySOIXw0 http://t.co/CDWNyKlNX2,Women improve financialinclusion of the poorest in rural India Watch their video story
6,RT @nextgenderation: We want to reduce #violence in #Jamaica. Can #socialmedia help reach teens? Have YOUR say #setthetrend Twitter chat Th…,RT nextgenderation We want to reduce violence in Jamaica Can socialmedia help reach teens Have YOUR say setthetrend Twitter chat Th
7,Rapid economic growth between 2002 &amp; 2008 helped reduce #poverty in #Albania by half: http://t.co/skY16glKrS http://t.co/EGQG9oYCdo,Rapid economic growth between 2002 amp 2008 helped reduce poverty in Albania by half
8,Laws in close to 155 economies make it difficult for #women to realize full economic potential http://t.co/vY3CsIfSbN #WomenBizLaw,Laws in close to 155 economies make it difficult for women to realize full economic potential WomenBizLaw
9,Equity &amp; learning will help #endextremepoverty by 2030: http://t.co/1B1nlYaGb7,Equity amp learning will help endextremepoverty by 2030


In [117]:
#Remove word amp
wb['text_clean']=wb.text_clean.str.replace("amp","")
#check original and cleaned text
pd.set_option('display.max_colwidth', 400)
pd.set_option('display.max_rows', 100)

cols =['text', 'text_clean']
wb[cols]

Unnamed: 0,text,text_clean
0,Eliminating #inequality is not only necessary for realizing one's potential but also for living with dignity https://t.co/1DAIhpnbRr,Eliminating inequality is not only necessary for realizing one s potential but also for living with dignity
1,Faster-than-expected #emergingmarket slowdown could lower #commodityprices. Will this impact #poverty reduction? https://t.co/IjTj2MXvXK,Faster than expected emergingmarket slowdown could lower commodityprices Will this impact poverty reduction
2,RT @WorldBank: It’s easy to be pessimistic about the future. Let your passion drive you. Don’t give up or accept the norm. -Sri Mulyani #Al…,RT WorldBank It s easy to be pessimistic about the future Let your passion drive you Don t give up or accept the norm Sri Mulyani Al
3,RT @WorldBank: “Data deprivation” makes it difficult to set policies benefiting the poor. Let's fix this: https://t.co/Fugfn4YEGk https://t…,RT WorldBank Data deprivation makes it difficult to set policies benefiting the poor Let s fix this
4,More efficient energy sources can provide more electricity to larger numbers of households &amp; businesses: http://t.co/x0NnAyKDcS,More efficient energy sources can provide more electricity to larger numbers of households businesses
5,Women improve #financialinclusion of the poorest in rural #India. Watch their video story: http://t.co/e9VySOIXw0 http://t.co/CDWNyKlNX2,Women improve financialinclusion of the poorest in rural India Watch their video story
6,RT @nextgenderation: We want to reduce #violence in #Jamaica. Can #socialmedia help reach teens? Have YOUR say #setthetrend Twitter chat Th…,RT nextgenderation We want to reduce violence in Jamaica Can socialmedia help reach teens Have YOUR say setthetrend Twitter chat Th
7,Rapid economic growth between 2002 &amp; 2008 helped reduce #poverty in #Albania by half: http://t.co/skY16glKrS http://t.co/EGQG9oYCdo,Rapid economic growth between 2002 2008 helped reduce poverty in Albania by half
8,Laws in close to 155 economies make it difficult for #women to realize full economic potential http://t.co/vY3CsIfSbN #WomenBizLaw,Laws in close to 155 economies make it difficult for women to realize full economic potential WomenBizLaw
9,Equity &amp; learning will help #endextremepoverty by 2030: http://t.co/1B1nlYaGb7,Equity learning will help endextremepoverty by 2030


In [118]:
#Remove single letters
wb['text_clean']=wb.text_clean.str.replace(" [A-Za-z] ","")
wb[cols]

Unnamed: 0,text,text_clean
0,Eliminating #inequality is not only necessary for realizing one's potential but also for living with dignity https://t.co/1DAIhpnbRr,Eliminating inequality is not only necessary for realizing onepotential but also for living with dignity
1,Faster-than-expected #emergingmarket slowdown could lower #commodityprices. Will this impact #poverty reduction? https://t.co/IjTj2MXvXK,Faster than expected emergingmarket slowdown could lower commodityprices Will this impact poverty reduction
2,RT @WorldBank: It’s easy to be pessimistic about the future. Let your passion drive you. Don’t give up or accept the norm. -Sri Mulyani #Al…,RT WorldBank Iteasy to be pessimistic about the future Let your passion drive you Dongive up or accept the norm Sri Mulyani Al
3,RT @WorldBank: “Data deprivation” makes it difficult to set policies benefiting the poor. Let's fix this: https://t.co/Fugfn4YEGk https://t…,RT WorldBank Data deprivation makes it difficult to set policies benefiting the poor Letfix this
4,More efficient energy sources can provide more electricity to larger numbers of households &amp; businesses: http://t.co/x0NnAyKDcS,More efficient energy sources can provide more electricity to larger numbers of households businesses
5,Women improve #financialinclusion of the poorest in rural #India. Watch their video story: http://t.co/e9VySOIXw0 http://t.co/CDWNyKlNX2,Women improve financialinclusion of the poorest in rural India Watch their video story
6,RT @nextgenderation: We want to reduce #violence in #Jamaica. Can #socialmedia help reach teens? Have YOUR say #setthetrend Twitter chat Th…,RT nextgenderation We want to reduce violence in Jamaica Can socialmedia help reach teens Have YOUR say setthetrend Twitter chat Th
7,Rapid economic growth between 2002 &amp; 2008 helped reduce #poverty in #Albania by half: http://t.co/skY16glKrS http://t.co/EGQG9oYCdo,Rapid economic growth between 2002 2008 helped reduce poverty in Albania by half
8,Laws in close to 155 economies make it difficult for #women to realize full economic potential http://t.co/vY3CsIfSbN #WomenBizLaw,Laws in close to 155 economies make it difficult for women to realize full economic potential WomenBizLaw
9,Equity &amp; learning will help #endextremepoverty by 2030: http://t.co/1B1nlYaGb7,Equity learning will help endextremepoverty by 2030


In [119]:
#remove all numbers unless its part of a word
wb['text_clean']=wb.text_clean.str.replace("[0-9][0-9][0-9][0-9][0-9][0-9]\s","")
wb['text_clean']=wb.text_clean.str.replace("[0-9][0-9][0-9][0-9][0-9]\s","")
wb['text_clean']=wb.text_clean.str.replace("[0-9][0-9][0-9][0-9]\s","")
wb['text_clean']=wb.text_clean.str.replace("[0-9][0-9][0-9]\s","")
wb['text_clean']=wb.text_clean.str.replace("[0-9][0-9]\s","")
wb['text_clean']=wb.text_clean.str.replace("[0-9]\s","")

wb[cols]

Unnamed: 0,text,text_clean
0,Eliminating #inequality is not only necessary for realizing one's potential but also for living with dignity https://t.co/1DAIhpnbRr,Eliminating inequality is not only necessary for realizing onepotential but also for living with dignity
1,Faster-than-expected #emergingmarket slowdown could lower #commodityprices. Will this impact #poverty reduction? https://t.co/IjTj2MXvXK,Faster than expected emergingmarket slowdown could lower commodityprices Will this impact poverty reduction
2,RT @WorldBank: It’s easy to be pessimistic about the future. Let your passion drive you. Don’t give up or accept the norm. -Sri Mulyani #Al…,RT WorldBank Iteasy to be pessimistic about the future Let your passion drive you Dongive up or accept the norm Sri Mulyani Al
3,RT @WorldBank: “Data deprivation” makes it difficult to set policies benefiting the poor. Let's fix this: https://t.co/Fugfn4YEGk https://t…,RT WorldBank Data deprivation makes it difficult to set policies benefiting the poor Letfix this
4,More efficient energy sources can provide more electricity to larger numbers of households &amp; businesses: http://t.co/x0NnAyKDcS,More efficient energy sources can provide more electricity to larger numbers of households businesses
5,Women improve #financialinclusion of the poorest in rural #India. Watch their video story: http://t.co/e9VySOIXw0 http://t.co/CDWNyKlNX2,Women improve financialinclusion of the poorest in rural India Watch their video story
6,RT @nextgenderation: We want to reduce #violence in #Jamaica. Can #socialmedia help reach teens? Have YOUR say #setthetrend Twitter chat Th…,RT nextgenderation We want to reduce violence in Jamaica Can socialmedia help reach teens Have YOUR say setthetrend Twitter chat Th
7,Rapid economic growth between 2002 &amp; 2008 helped reduce #poverty in #Albania by half: http://t.co/skY16glKrS http://t.co/EGQG9oYCdo,Rapid economic growth between helped reduce poverty in Albania by half
8,Laws in close to 155 economies make it difficult for #women to realize full economic potential http://t.co/vY3CsIfSbN #WomenBizLaw,Laws in close to economies make it difficult for women to realize full economic potential WomenBizLaw
9,Equity &amp; learning will help #endextremepoverty by 2030: http://t.co/1B1nlYaGb7,Equity learning will help endextremepoverty by


** Create some more variables **

In [120]:
# 1) Create dependent variables
wb['has_retweet']= np.where(wb.retweet_count>1, 1, 0)
wb['has_favorite']= np.where(wb.favorite_count>1, 1, 0)

In [121]:
# 2) Create dummy variables for each account
#Iuser = pd.get_dummies(wb.user_screen_name, prefix='Iuser')
# concatenate the original DataFrame and the dummy DataFrame
#wb = pd.concat([wb, Iuser], axis=1)

In [130]:
#3) Create time columns and dummies, and append
wb['year']=wb.created_at.apply(lambda x: x.strftime('%y'))
wb['year_cat'] = wb.year.astype('category', categories=["09", "10", "11" ,"12", "13", "14" , "15", "16"], ordered=True)
wb['year_month']=wb.created_at.apply(lambda x: x.strftime('%y-%m'))
wb['hour']=wb.created_at.apply(lambda x: x.strftime('%H'))

In [132]:
cols =['year', 'created_at',  'hour']
wb[cols]

Unnamed: 0,year,created_at,hour
0,16,2016-02-01 23:20:36,23
1,16,2016-02-01 20:05:48,20
2,16,2016-01-31 23:12:34,23
3,16,2016-01-20 19:16:15,19
4,15,2015-09-15 22:15:44,22
5,15,2015-09-15 21:05:53,21
6,15,2015-09-15 20:24:19,20
7,15,2015-09-15 17:40:51,17
8,15,2015-09-15 16:10:57,16
9,15,2015-09-15 12:15:49,12


In [126]:
#ratio of favorite/tweet
wb['fav_RT_ratio']=wb.favorite_count/wb.retweet_count

## Save cleaned data

In [127]:
wb.to_pickle('./data/WorldBank_all_processed_17feb_2016')