In [68]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
from pandas.io.json import json_normalize
import re
%matplotlib inline

In [69]:
from mpi4py import MPI

## 1. Read in the tweets data file

In [70]:
with open('./data/tinyTwitter.json') as tw:
    tweets = tw.read()
    tweets_json = json.loads(tweets)

In [71]:
tweets_df= json_normalize(tweets_json['rows'])
tweets_df_info = pd.concat([tweets_df['value.properties.text'].apply(lambda text: text.lower()),tweets_df['doc.coordinates.coordinates']],axis=1)
tweets_df_info.columns = ['Text','Coordinates']

In [72]:
tweets_df_info[['lat','long']] = pd.DataFrame(tweets_df_info['Coordinates'].tolist(), index=tweets_df_info.index)

In [74]:
tweets_df_info.head()
len(tweets_df_info)

19

In [49]:
tweets_df_info.iloc()

<pandas.core.indexing._iLocIndexer at 0x7fa242410350>

## 2. Read in the box cooredinates into a dataframe

In [50]:
with open('./data/melbGrid.json') as f_m_grid:
    mgrid = f_m_grid.read()
    mgrid_json = json.loads(mgrid)
    mgrid_df = json_normalize(mgrid_json['features'])

In [51]:
mgrid_df_lite = pd.concat([mgrid_df['properties.id'], mgrid_df['properties.xmin'], mgrid_df['properties.xmax'], mgrid_df['properties.ymin'], mgrid_df['properties.ymax']],axis=1)
mgrid_df_lite.columns = ['id','xmin','xmax','ymin','ymax']
mgrid_df_lite=mgrid_df_lite.set_index('id')

In [52]:
mgrid_df_lite

Unnamed: 0_level_0,xmin,xmax,ymin,ymax
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A1,144.7,144.85,-37.65,-37.5
A2,144.85,145.0,-37.65,-37.5
A3,145.0,145.15,-37.65,-37.5
A4,145.15,145.3,-37.65,-37.5
B1,144.7,144.85,-37.8,-37.65
B2,144.85,145.0,-37.8,-37.65
B3,145.0,145.15,-37.8,-37.65
B4,145.15,145.3,-37.8,-37.65
C1,144.7,144.85,-37.95,-37.8
C2,144.85,145.0,-37.95,-37.8


## 3. Define the function which determins the position of a data point

In [53]:
def get_point_region(coord,mgrid_df):
    allocated_region = None
    for id,region in mgrid_df.iterrows(): 
        left_margin = coord[0]-region.xmin
        right_margin = coord[0]-region.xmax
        top_margin = coord[1]-region.ymax
        bottom_margin = coord[1]-region.ymin
        if((left_margin > 0 and right_margin <= 0) and (top_margin <= 0 and bottom_margin > 0)):
            allocated_region = id
            break
    return allocated_region   

In [54]:
tweets_df_info['region'] = tweets_df_info['Coordinates'].apply(get_point_region,mgrid_df=mgrid_df_lite)

In [55]:
tweets_df_info

Unnamed: 0,Text,Coordinates,lat,long,region
0,"i'm at @rivastkilda in st kilda, vic https://t...","[144.97481346, -37.87559865]",144.974813,-37.875599,C2
1,good guidance for all you ideas people coming ...,"[144.97490982, -37.87546411]",144.974910,-37.875464,C2
2,my neighbours no matter where i live!! 💖� alis...,"[144.97492373, -37.8754259]",144.974924,-37.875426,C2
3,😎�😎�😎�👯� @ riva st.kilda marina http://t.co/e6...,"[144.97492373, -37.8754259]",144.974924,-37.875426,C2
4,pretty awesome day 1 @ riva st kilda https://t...,"[144.97490908, -37.87539256]",144.974909,-37.875393,C2
...,...,...,...,...,...
9741,my classmates and i have done a good arrangeme...,"[144.97549, -37.83737]",144.975490,-37.837370,C2
9742,selamat hari raya aidiladha (@ malaysian consu...,"[144.97550286, -37.83742225]",144.975503,-37.837422,C2
9743,i'm at malaysian consulate general in melbourn...,"[144.97550286, -37.83742225]",144.975503,-37.837422,C2
9744,underwater rugby: grrrls training session. htt...,"[144.975159, -37.836926]",144.975159,-37.836926,C2


* **clean up tweet text with regex**

In [56]:
pattern = re.compile(r'(?:^|\s)([A-Za-z\'\.\’]+[A-Za-z])(?=\s|\Z)')
tweets_df_info['Text'] = tweets_df_info['Text'].apply(lambda text: " ".join(pattern.findall(text)))

In [57]:
tweets_df_info

Unnamed: 0,Text,Coordinates,lat,long,region
0,i'm at in st vic,"[144.97481346, -37.87559865]",144.974813,-37.875599,C2
1,good guidance for all you ideas people coming ...,"[144.97490982, -37.87546411]",144.974910,-37.875464,C2
2,my neighbours no matter where alisonhardi riva...,"[144.97492373, -37.8754259]",144.974924,-37.875426,C2
3,riva st.kilda marina,"[144.97492373, -37.8754259]",144.974924,-37.875426,C2
4,pretty awesome day riva st kilda,"[144.97490908, -37.87539256]",144.974909,-37.875393,C2
...,...,...,...,...,...
9741,my classmates and have done good arrangement t...,"[144.97549, -37.83737]",144.975490,-37.837370,C2
9742,selamat hari raya aidiladha malaysian consulat...,"[144.97550286, -37.83742225]",144.975503,-37.837422,C2
9743,i'm at malaysian consulate general in vic,"[144.97550286, -37.83742225]",144.975503,-37.837422,C2
9744,underwater grrrls training,"[144.975159, -37.836926]",144.975159,-37.836926,C2


## 4. Read in the sentimental scroe guideline file

In [58]:
sentiment_guide = pd.read_table('./data/AFINN.txt',names=['word','score'],index_col='word')

In [59]:
sentiment_guide.head()
sentiment_guide.at["can't stand",'score']

-3

## 5. Calculate the sentiment score of a tweet
* use index to search the score value will be 3x faster

### 5.1 Define the function to calculate total sentiment score of given text

In [60]:
def get_sentiment_score(tweet):
    score = 0
    if "can't stand" in tweet: score += sentiment_guide.at["can't stand",'score']
    for token in tweet.split():
        score += sentiment_guide.at[token,'score'] if token in sentiment_guide.index else 0
    return score

In [61]:
tweets_df_info['sentiment'] = tweets_df_info['Text'].apply(get_sentiment_score)

In [62]:
tweets_df_info[tweets_df_info.apply(lambda df: df.index%4 == 1)]

Unnamed: 0,Text,Coordinates,lat,long,region,sentiment
0,,,,,,
1,good guidance for all you ideas people coming ...,"[144.97490982, -37.87546411]",144.974910,-37.875464,C2,3.0
2,,,,,,
3,,,,,,
4,,,,,,
...,...,...,...,...,...,...
9741,my classmates and have done good arrangement t...,"[144.97549, -37.83737]",144.975490,-37.837370,C2,6.0
9742,,,,,,
9743,,,,,,
9744,,,,,,


## 6. Calculate the Happy Region Ranking

In [63]:
region_counts = tweets_df_info.groupby('region').count()
region_counts = region_counts.drop(columns=['lat','long','sentiment','Coordinates'])
region_counts.columns = ['#Total Tweets']

In [64]:
region_score = tweets_df_info.groupby('region').sum()
region_score = region_score.drop(columns=['lat','long'])
region_score.columns = ['#Overall Sentiment Score']

In [65]:
result = region_counts.join(region_score,on="region").sort_values('region')

Unnamed: 0_level_0,#Total Tweets,#Overall Sentiment Score
region,Unnamed: 1_level_1,Unnamed: 2_level_1
C2,9746,9068


## Testing Blocks

In [66]:
text = "good guidance for all you ideas people coming..."
pattern = re.compile(r'(?:^|\s)([A-Za-z\'\.\’]+[A-Za-z])(?=\b|\Z)')
pattern.findall(text)

['good', 'guidance', 'for', 'all', 'you', 'ideas', 'people', 'coming']

In [67]:
1%0

ZeroDivisionError: integer division or modulo by zero

In [None]:
1%4

In [None]:
tweets_df_info[tweets_df_info.apply(lambda df: df.index%4 == 2)].dropna()

In [None]:
tweets_list= tweets_df_info.values.tolist()
tweets_list