# Data Cleaning - Women's March 2017 dataset

In [1]:
%matplotlib inline

import re
import pandas as pd
from pandas.io.json import json_normalize

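The helper functions below use regular expressions to pull structured fields out of each tweet: the hashtags and @-mentions in the text, the first URL in the text, and the client name embedded in the HTML anchor of the `source` field.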

In [9]:
# Helper functions
def get_hashtags(text):
    """Return the hashtags found in ``text``.

    A hashtag is a ``#`` (or full-width ``＃``) located at the very start of
    the text or directly after a whitespace character, followed by one or
    more word characters.

    Args:
        text: The tweet text to scan.

    Returns:
        A list with the tag names (without the leading ``#``), or the empty
        string ``''`` when the text contains no hashtag.
    """
    # Raw string: '\s' and '\w' are regex escapes, not Python string escapes
    # (non-raw they trigger an invalid-escape warning on recent Pythons).
    tags = re.findall(r'(?:^|\s)[＃#]{1}(\w+)', text)
    return tags if tags else ''

def get_mentions(text):
    """Return the @-mentions found in ``text``.

    A mention is an ``@`` (or full-width ``＠``) followed by one or more
    characters that are not whitespace and not one of
    ``# < > [ ] | { } : ; , . ( ) = +``.

    Note that every alternative of the optional prefix group can match the
    empty string, so an ``@`` anywhere in the text starts a mention -- this
    includes the ``@`` inside an e-mail address.

    Args:
        text: The tweet text to scan.

    Returns:
        A list with the mentioned names (without the leading ``@``), or the
        empty string ``''`` when the text contains no mention.
    """
    # Raw string so the regex escapes are not read as Python string escapes;
    # '[' inside the character class is escaped so it is unambiguously a
    # literal bracket.
    names = re.findall(
        r'(?:^|\s?|\.?)[＠@]{1}([^\s#<>\[\]|{}:;,.\(\)=+]+)', text)
    return names if names else ''

def get_source(text):
    """Extract the link label from an HTML anchor string.

    The expected shape is ``<a href="..." rel="...">label</a>``; the text
    between the opening and closing tag is returned.

    Args:
        text: The raw ``source`` value of a tweet.

    Returns:
        The label of the first matching anchor, or the empty string ``''``
        when ``text`` contains no anchor of that shape.
    """
    # Raw string: '\s' stays a regex escape; the quotes and the slash need no
    # escaping inside a single-quoted raw literal.
    labels = re.findall(r'<a\s+?href="[^"]*"\s+?rel="[^"]*">([^<>]+)</a>', text)
    return labels[0] if labels else ''

def get_urls(text):
    """Return the first ``http://`` or ``https://`` URL found in ``text``.

    Despite the plural name, only the first match is returned.

    Args:
        text: The tweet text to scan.

    Returns:
        The first URL as a string, or the empty string ``''`` when the text
        contains no URL.
    """
    # Raw string so '\(' and '\)' are regex escapes. The character class
    # previously carried an HTML-entity artefact ('&amp;'); the extra
    # characters it added (a, m, p, ;) are already covered by [a-z] and by
    # the '$-_' range, so the set of accepted characters is unchanged.
    urls = re.findall(
        r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',
        text)
    return urls[0] if urls else ''

In [10]:
filename = '../data/wm2017_tweets.json'

# Read the raw JSON Lines file lazily, 1024 records per chunk.
json_reader = pd.read_json(filename, lines=True, chunksize=1024)

# Cleaned chunks are collected in a list and concatenated once after the
# loop. Growing a DataFrame with .append() per chunk re-copies every
# previously accumulated row on each iteration, and DataFrame.append no
# longer exists in pandas >= 2.0.
cleaned_chunks = []

for chunk in json_reader:
    # Keep only complete (non-truncated) English tweets.
    not_truncated = chunk[chunk['truncated'] == False]
    # reset_index() gives a fresh 0..n-1 index so the tweet columns can be
    # aligned with user_df in the axis=1 concat below (assumes
    # json_normalize returns a default 0..n-1 index -- TODO confirm).
    only_english = not_truncated[not_truncated['lang'] == 'en'].reset_index()

    # Nothing to extract from a chunk without matching tweets.
    if only_english.empty:
        continue

    only_english['hashtags'] = only_english['text'].apply(get_hashtags)
    only_english['mentions'] = only_english['text'].apply(get_mentions)
    only_english['urls'] = only_english['text'].apply(get_urls)
    only_english['source'] = only_english['source'].apply(get_source)
    # Flatten the nested 'user' records into their own columns.
    user_df = json_normalize(only_english['user'])

    # Selecting only few columns
    tweet_df = only_english[['id_str', 'created_at', 'source', 'text', 'hashtags', 'mentions',
                             'urls', 'favorite_count', 'retweet_count']]
    user_df = user_df[['location', 'name', 'screen_name', 'followers_count', 'verified']]

    frames = [tweet_df, user_df]

    # Side-by-side join of tweet fields and user fields for this chunk.
    df = pd.concat(frames, axis=1)

    cleaned_chunks.append(df)

# Single concatenation of all chunks; the per-chunk index values are kept,
# exactly as repeated appends would have left them.
wm_data = pd.concat(cleaned_chunks) if cleaned_chunks else pd.DataFrame()

In [11]:
# Sanity check on the combined frame: (number of rows, number of columns).
wm_data.shape

(553117, 14)

In [12]:
# Per-column non-null counts and dtypes, plus the frame's memory usage.
wm_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 553117 entries, 0 to 432
Data columns (total 14 columns):
id_str             553117 non-null int64
created_at         553117 non-null datetime64[ns]
source             553117 non-null object
text               553117 non-null object
hashtags           553117 non-null object
mentions           553117 non-null object
urls               553117 non-null object
favorite_count     553117 non-null int64
retweet_count      553117 non-null int64
location           553117 non-null object
name               553117 non-null object
screen_name        553117 non-null object
followers_count    553117 non-null int64
verified           553117 non-null bool
dtypes: bool(1), datetime64[ns](1), int64(4), object(8)
memory usage: 59.6+ MB


In [13]:
# Persist the cleaned tweets as JSON Lines: one record per line.
clean_path = '../data/clean_wm2017tweets.json'
serialized = wm_data.to_json(orient='records', lines=True)
with open(clean_path, 'w') as out_file:
    out_file.write(serialized)

# Data Wrangling

In [15]:
# Reload the cleaned JSON Lines file from disk; this rebinds `wm_data`.
wm_data = pd.read_json('../data/clean_wm2017tweets.json', lines=True)

In [19]:
# Preview the first rows of the reloaded data.
wm_data.head()

Unnamed: 0,created_at,favorite_count,followers_count,hashtags,id_str,location,mentions,name,retweet_count,screen_name,source,text,urls,verified
0,2017-01-21 05:03:04,0,128,[WomensMarch],822670827871092736,"Brentwood, CA",[katyperry],Jesmarie,5015,jesmarie_avila,Twitter for iPhone,"RT @katyperry: Then, I'm marching. #WomensMarch",,False
1,2017-01-21 05:01:15,0,4092,"[amplify, womensmarch, whyimarch, wmwcanada, s...",822670369425358848,"Richmond, KY",[CdnWomenMarch],Zezrie💕,208,zezrie,Twitter for Android,RT @CdnWomenMarch: What does it mean to suppor...,https://t.co/DrBsA4B4,False
2,2017-01-21 05:01:41,0,974,"[womensmarch, womensmarchsydney, wmw, wmglobal]",822670479735529472,"Zwolle, Ov, the Netherlands",[womensmarchsyd],Miriam Martina Dion,31,MiriamMartina2,Twitter for Android,RT @womensmarchsyd: Six days left! Who's with ...,https://t.co/XcrxWFsfxF,False
3,2017-01-21 05:01:23,0,4605,,822670402153422848,"iPhone: 41.362907,-70.516632",[latimes],Susan Saint James,177,Saintjames46,Twitter for iPhone,RT @latimes: Headed to the Women's March in L....,https://t.co/usOG7ThkVM,False
4,2017-01-21 05:01:57,1,233,"[womensmarch, dc, firedup, readytogo]",822670548085968896,"Baltimore, Maryland",,Joules,0,madamejoules,Instagram,I'm ready to march.....\n\n#womensmarch #dc #f...,https://t.co/i5TxiB2iGa,False
