This notebook shows different analyses attempted to analyse the content of the Facebook pages.
* Analyse content from the posts
    * What difference is there between PUK and not PUK?
* Find the most "successful" posts
    * at what time?
    * what content

In [1]:
from __future__ import division

import io
import json
import re
import sqlite3
import string
from collections import Counter, OrderedDict

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from nltk import bigrams
from nltk.corpus import stopwords
from nltk.text import TokenSearcher

%matplotlib inline

# 1- Clean the data

We start by removing the punctuation and splitting the text word by word, so that the common words of the English language can be removed. We also replace "parkinson's uk" with a single word so that it can be told apart from "parkinson's", and correct some obvious misspellings of common words that appear many times.

In [3]:
# Tokens ignored by the frequency counts: English stop words, ASCII punctuation,
# and the URL fragments / typographic characters listed below.
punctuation = list(string.punctuation)
stop = stopwords.words('english') + punctuation + ['https','http','org',u'“',u'’',u'–','www']

# Ordered (pattern -> replacement) table applied by preprocess() and lightclean()
# with plain substring replacement, in this order:
# * the order matters: "parkinson's uk " has to be rewritten before the shorter
#   "parkinson's", otherwise it would become "parkinsons uk " instead of the
#   single word "parkinsonsuk" (a plain dict gives no ordering guarantee on
#   Python 2, hence the OrderedDict);
# * the curly apostrophe is written as a real unicode escape so that the
#   pattern can actually match text containing it;
# * a pattern that ends with a space is replaced by text that ends with a space,
#   so the following word is not glued to the replacement.
dic_replace = OrderedDict([
    (u"parkinson\u2019s uk ", u'parkinsonsuk '),
    (u"parkinson's uk ", u'parkinsonsuk '),
    (u"parkinson\u2019s", u'parkinsons'),
    (u"parkinson's", u'parkinsons'),
    ('carers ', 'carer '),
    ('thank you ', 'thanks '),
    ('weeks ', 'week '),
    ('treatmen ', 'treatment '),
    ('sympto ', 'symptoms '),
    ('symptom ', 'symptoms '),
    ('dads ', 'dad '),
    ('mums', 'mum'),
])
 
def tokenize(s):
    """Split the string ``s`` into tokens with NLTK's word tokenizer."""
    tokens = nltk.word_tokenize(s)
    return tokens
 
def preprocess(s):
    """Normalise the text ``s`` and return its alphabetic tokens.

    The text is lower-cased, every pattern of ``dic_replace`` is substituted by
    its replacement, the result is tokenized with NLTK, and only the tokens made
    of letters exclusively are kept (numbers and punctuation are dropped).
    """
    text = s.lower()
    for pattern in dic_replace:
        text = text.replace(pattern, dic_replace[pattern])
    alphabetic = []
    for token in nltk.word_tokenize(text):
        if token.isalpha():
            alphabetic.append(token.lower())
    return alphabetic

def lightclean(s):
    """Return ``s`` lower-cased, normalised and stripped of ASCII punctuation.

    Unlike preprocess(), the result is still one string (not a token list):
    the text is lower-cased, the substitutions of ``dic_replace`` are applied,
    and every character listed in ``punctuation`` is removed.
    """
    # str.lower() returns a new string; the result has to be kept, otherwise the
    # lower-case patterns of dic_replace never match capitalised text.
    s = s.lower()
    for w in dic_replace:
        s = s.replace(w, dic_replace[w])
    for p in punctuation:
        s = s.replace(p, '')
    return s

def cleantext(fname, analysis_name, n_elements=50):
    """Count the most frequent words and bigrams of a posts file and export them.

    Parameters
    ----------
    fname : str
        Path of a text file in which every non-empty line is a JSON array of
        posts; the text of a post is read from its 'content' entry.
    analysis_name : str
        Suffix used to build the names of the two output files.
    n_elements : int, optional
        Number of most common words / bigrams exported (50 by default).

    Side effects
    ------------
    Writes 'bigrams_<analysis_name>.txt' (string form of the most common
    bigrams) and 'wordfreq_<analysis_name>.json' (most common words as
    [word, count] pairs) in the working directory, and prints 'done!'.
    """
    skipped = 0
    count_stop = Counter()
    count_bigram = Counter()
    with open(fname, 'r') as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines, e.g. a trailing newline
            for post in json.loads(line):
                try:
                    terms_stop = [term for term in preprocess(post['content'])
                                  if term not in stop]
                except (KeyError, TypeError, AttributeError):
                    # Post without usable text: skip it entirely, so that the
                    # terms of the previous post are not counted a second time.
                    skipped += 1
                    continue
                count_stop.update(terms_stop)
                count_bigram.update(bigrams(terms_stop))

    with open('bigrams_' + analysis_name + '.txt', 'w') as f:
        f.write(str(count_bigram.most_common(n_elements)))

    # Export the word frequency to json
    word_freq = count_stop.most_common(n_elements)
    payload = json.dumps(word_freq, ensure_ascii=False)
    if isinstance(payload, bytes):
        # Python 2 can hand back a byte string; io.open() in text mode needs text.
        payload = payload.decode('utf-8')
    with io.open('wordfreq_' + analysis_name + '.json', 'w', encoding='utf-8') as f:
        f.write(payload)

    if skipped:
        print('skipped %d posts without usable content' % skipped)
    print('done!')

# Text cleaning

There are three datasets here: 
* all the posts, 
* all the posts from Parkinson's UK (PUK), 
* and all the posts not from Parkinson's UK.

In [4]:
# Word and bigram frequencies over every post; writes bigrams_all.txt and wordfreq_all.json.
cleantext('posts.json','all')

FileNotFoundError: [Errno 2] No such file or directory: 'posts.json'

In [None]:
# Same counts for the posts file 'posts_puk.json'; writes bigrams_puk.txt and wordfreq_puk.json.
cleantext('posts_puk.json','puk')

In [None]:
# Same counts for the posts file 'posts_notpuk.json'; writes bigrams_notpuk.txt and wordfreq_notpuk.json.
cleantext('posts_notpuk.json','notpuk')

## Compare words used: PUK and not PUK

We expect that Parkinson's UK (PUK) and their readers (not PUK) will have used different words in their text. This reflects their different interests but also their different terminology.

In [None]:
# Load the posts written by Parkinson's UK and the posts written by everybody else.
# io.open() already decodes the files as UTF-8, so json.load() needs no encoding hint.
with io.open('posts_puk.json', encoding='utf-8') as f_puk, \
        io.open('posts_notpuk.json', encoding='utf-8') as f_notpuk:
    posts_puk = json.load(f_puk)
    posts_notpuk = json.load(f_notpuk)
print('PUK wrote %d posts' % len(posts_puk))
print('Not PUK wrote %d posts.' % len(posts_notpuk))

# Lightly cleaned text of every post (kept as one string per post).
content_puk = [lightclean(post['content']) for post in posts_puk]
content_notpuk = [lightclean(post['content']) for post in posts_notpuk]

# Distinct authors of the non-PUK posts.
authors = set(post['person_hash_id'] for post in posts_notpuk)
# Guard against an empty file: no author means no meaningful average.
posts_per_author = round(len(posts_notpuk) / len(authors), 1) if authors else 0
print('Not PUK posts were written by %d authors. So %s posts per authors'
      % (len(authors), posts_per_author))


We prepare the data to make a bar chart of both 50 most common words in PUK and not PUK posts.

In [None]:
# Reload the two frequency tables exported by cleantext().
with open('wordfreq_puk.json', 'r') as fpuk, open('wordfreq_notpuk.json', 'r') as fnotpuk:
    words_puk, words_notpuk = json.load(fpuk), json.load(fnotpuk)

# Union of the frequent words of both author types (the set removes duplicates).
freqwords_all = list(set([entry[0] for entry in words_puk] +
                         [entry[0] for entry in words_notpuk]))

# One row per word with its count for each author type; a count is NaN when the
# word is missing from the table of that author type.
df_puk = pd.DataFrame(words_puk, columns=['word', 'PUK'])
df_notpuk = pd.DataFrame(words_notpuk, columns=['word', 'NotPUK'])
df = df_puk.merge(df_notpuk, on='word', how='outer')
df['diff'] = df['PUK'].sub(df['NotPUK']).fillna(0)
df = df.sort_values(by='diff')

# Long format (word, variable, value), handed to the bar chart as a plain array.
df_plot = pd.melt(df, id_vars=['word'], value_vars=['PUK', 'NotPUK']).values

We now make a chart of the 50 most used words by Parkinson's UK and their readers. The following barchart shows words grouped together, one bar per writer type. 

In [None]:
# Grouped bar chart: for every frequent word, one bar per author type.
fig = plt.figure(figsize=(15,5))
ax = fig.add_subplot(111)
space = 0.3                                  # share of each group left empty
authorcat = np.unique(df_plot[:,1])          # author types, in sorted order
freqwords = np.unique(df_plot[:,0])          # distinct words
n = len(authorcat)
width = (1 - space) / n                      # width of a single bar
colors = ['b','r']                           # one colour per author type
indices = list(range(1, len(freqwords)+1))   # x position of every word group
for i, category in enumerate(authorcat):
    rows = df_plot[df_plot[:,1] == category]
    # Words in plotting order; the order is the same for every author type
    # because df_plot was melted from a single sorted frame.
    allwords = rows[:,0]
    # The builtin float replaces the np.float alias, which newer NumPy no longer has.
    vals = rows[:,2].astype(float)
    pos = [j - (1 - space) / 2. + i * width for j in indices]
    ax.bar(pos, vals, width=width, label=category,
               color = colors[i])
ax.set_xticks(indices)
ax.set_xticklabels(allwords)
plt.setp(plt.xticks()[1], rotation=90)
ax.set_ylabel("Frequency")
ax.set_xlabel("Words")
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles[::-1], labels[::-1], loc='upper left')
plt.savefig('Word_Frequency_pukornot.png')
plt.show()

## Quick analysis
Words with only a red bar are frequent only in Parkinson's UK's posts, while words with only a blue bar are frequent only in the posts of Parkinson's UK's readers. 
* Readers posting on Parkinson's UK's page express themselves differently: they use polite words such as 'Hi', 'thanks', or 'please'. 
* They refer to the Parkinson's condition as a 'disease' or 'pd' (for Parkinson's disease), while Parkinson's UK use the word 'condition'. 
* Parkinson's UK frequently speaks only about 'diagnosis', while readers speak of 'diagnosed'.
* Regarding their use of indefinite pronouns, Parkinson's UK most frequently uses 'something' (along with the word 'things'), while readers of Parkinson's UK use 'anyone'.
* Both use 'raise', 'support' and 'awareness' similarly.

## We define four clusters:
* The words frequent only in PUK's text
* The words frequent only in not PUK's text
* The words frequent for both, more often in PUK
* The words frequent for both, more often in not PUK

We first look at all the text from either author type.

In [None]:
# Split the merged frequency table into the four clusters listed above.
# A count is NaN when the word is not in the frequency table of that author type.
in_puk = df['PUK'].notnull()
in_notpuk = df['NotPUK'].notnull()
in_both = in_puk & in_notpuk

df_pukonly = df[~in_notpuk]
df_notpukonly = df[~in_puk]
# Every comparison is parenthesised: `&` binds tighter than `>` / `<`.
# Words with exactly the same count on both sides fall in neither cluster.
df_morepuk = df[in_both & (df['PUK'] > df['NotPUK'])]
df_morenotpuk = df[in_both & (df['PUK'] < df['NotPUK'])]

clusters = [
    ('* Only frequent in PUKs posts:', df_pukonly),
    ('* Only frequent in not PUKs posts:', df_notpukonly),
    ('* Frequent words in both, more frequent for PUK:', df_morepuk),
    ('* Frequent words in both, more frequent for not PUK:', df_morenotpuk),
]
for title, frame in clusters:
    print(title)
    print(', '.join(frame['word']))

## Frequent words per author, per category
Our first approximation overcounts some words when they are repeated by the same person. Therefore, here we count the number of distinct people who mention each word.

In [None]:
# Raw posts of both author types; both files are read as UTF-8 so that the
# result does not depend on the default encoding of the machine.
with io.open('posts_puk.json', encoding='utf-8') as f:
    puk = json.load(f)
with io.open('posts_notpuk.json', encoding='utf-8') as f:
    notpuk = json.load(f)

# Rows accumulated by count_s(): [author type, word, count, percentage].
comp = []
def count_s(s,data):
    """Count how often the text ``s`` occurs in ``data`` and append a row to ``comp``.

    Parameters
    ----------
    s : str
        Word (or any piece of text) searched in the 'content' of every post.
        The search is a case-insensitive substring search.
    data : list of dict
        Posts to search. When ``data`` equals the global ``puk`` the count is
        the number of posts containing ``s``; otherwise it is the number of
        distinct authors ('person_hash_id') with at least one such post.

    Returns
    -------
    list
        The global list ``comp``, extended with
        ``[author type, s, count, percentage]`` where the percentage is
        ``100 * count / len(data)`` rounded to one decimal (0.0 for empty data).
    """
    # The search words come from lower-cased text, so the posts are lower-cased too.
    needle = s.lower()
    total = len(data)
    if data == puk:
        author_type = "Parkinson's UK"
        count = sum(1 for post in data if needle in post['content'].lower())
    else:
        author_type = "Not Parkinson's UK"
        # A set gives constant-time "has this author been counted already?".
        authors = set(post['person_hash_id'] for post in data
                      if needle in post['content'].lower())
        count = len(authors)
    # NOTE(review): for non-PUK data this divides a number of authors by a number
    # of posts - confirm whether the number of distinct authors was intended.
    # NOTE(review): substring search also matches inside longer words
    # (e.g. 'dad' in 'daddy') - confirm that this is acceptable.
    percentage = round(100 * count / total, 1) if total else 0.0
    comp.append([author_type, s, count, percentage])
    return comp

# Fill `comp` with one PUK row followed by one not-PUK row for every frequent word.
for frequent_word in freqwords_all:
    for posts in (puk, notpuk):
        count_s(frequent_word, posts)
comparison_columns = ['AuthorType', 'Word', 'NbpostsAuthors', 'Percentage']
df_comp_all = pd.DataFrame(comp, columns=comparison_columns)
    

# Reshape the long table (one row per author type and word) into one row per
# word with one percentage column per author type.
df_scatter = df_comp_all.copy()
# append=True keeps the original row number as the first index level.
df_scatter = df_scatter[['AuthorType','Word','Percentage']].set_index(['AuthorType','Word'],append=True)
# One column per author type.
df_scatter = df_scatter.unstack('AuthorType')
df_scatter = df_scatter.stack(0)
# Drop the two unnamed helper index levels once they have become columns.
df_scatter = df_scatter.reset_index().drop(['level_0','level_2'],axis=1)
# NOTE(review): the two fills below appear to rely on the PUK row of a word
# sitting immediately before its not-PUK row (the order in which count_s() is
# called for each word) - confirm before changing how `comp` is filled.
df_scatter["Not Parkinson's UK"] = df_scatter["Not Parkinson's UK"].fillna(method='bfill')
df_scatter["Parkinson's UK"] = df_scatter["Parkinson's UK"].fillna(method='ffill')
# Keep a single row per word.
df_scatter = df_scatter.drop_duplicates('Word')

#print df_scatter
#df_comp_puk = df_comp[['Word','NbpostsAuthors']].ix[df_comp['AuthorType']=="Parkinson's UK"]
#df_comp_notpuk = df_comp[['Word','NbpostsAuthors']].ix[df_comp['AuthorType']=="Not Parkinson's UK"]

import matplotlib  # import kept for any later use of the name
# The former matplotlib.use('agg') call was dropped: pyplot is already imported
# at this point, and a non-interactive backend is not wanted for inline figures.

# Scatter plot of the frequent words: percentage for Parkinson's UK (x)
# against percentage for everybody else (y).
fig, ax = plt.subplots(figsize=(8,8))

labels = df_scatter['Word']
x = df_scatter["Parkinson's UK"]
y = df_scatter["Not Parkinson's UK"]

# Label every point with its word.
for label, x_val, y_val in zip(labels, x, y):
    ax.annotate(
        label,
        xy = (x_val, y_val), xytext = (10, 50),
        textcoords = 'offset points', ha = 'right', va = 'bottom',
        bbox = dict(boxstyle = 'round,pad=0.1', fc = 'white', alpha = 0.5),
        arrowprops = dict(arrowstyle = '->', connectionstyle = 'arc3,rad=0'))

# Least-squares line through the points.
fit = np.polyfit(x, y, deg=1)
ax.plot(x, fit[0] * x + fit[1], 'g--', label='Linear fit')
# Diagonal on which a word is equally frequent for both author types;
# its length follows the data instead of a fixed value.
limit = max(x.max(), y.max())
ax.plot([0, limit], [0, limit], 'r--', label='Equal percentage (y = x)')
ax.scatter(x, y, c='blue', s = 1)

ax.set_xlabel("Parkinson's UK (%)")
ax.set_ylabel("Not Parkinson's UK (%)")
ax.legend(loc='upper left')

# NOTE(review): this part plots bokeh's bundled iris sample data and uses none
# of the Facebook data prepared above - presumably a leftover experiment with
# bokeh scatter charts; confirm whether it is still needed.
# NOTE(review): `bokeh.charts` may not be available in recent bokeh releases -
# verify against the installed version.
from bokeh.charts import Scatter, show
from bokeh.io import output_notebook
# Render bokeh output inside the notebook.
output_notebook()
from bokeh.sampledata.iris import flowers as data

# Petal length against petal width, coloured and marked by species.
scatter = Scatter(data, x='petal_length', y='petal_width',
                  color='species', marker='species',
                  title='Iris Dataset Color and Marker by Species',
                  legend=True)

#output_file("iris_simple.html", title="iris_simple.py example")

show(scatter,notebook_handle=True)


We look at a list of words which seem to have a different frequency in both texts:

* 'dad' vs 'mum'
* 'disease','condition'
* 'help','support'
* 'diagnosis', 'diagnosed'
* 'research','money'
* 'family','friends'

In [None]:
# Fresh accumulator for count_s(), and the word pairs compared in the bar charts.
comp = []
listoflistwords = [
    ['dad', 'mum'],
    ['disease', 'condition'],
    ['help', 'support'],
    ['diagnosis', 'diagnosed'],
    ['research', 'money'],
    ['family', 'friends'],
]
# The same words as one flat list, in the same order.
listwords = [word for pair in listoflistwords for word in pair]
def barplotdata(listoflistwords,plotnb):
    """Return the percentages of one group of words, split by author type.

    Parameters
    ----------
    listoflistwords : list of list of str
        Groups of words to compare.
    plotnb : int
        1-based position of the group to use.

    Returns
    -------
    tuple of two DataFrames
        The 'Percentage' and 'Word' columns of the rows produced by count_s()
        for "Parkinson's UK" and for "Not Parkinson's UK", in that order.

    The global accumulator ``comp`` is emptied first, so the result only
    contains the words of the requested group.
    """
    words = listoflistwords[plotnb-1]
    # count_s() appends to the global list; clear it in place so that rows of an
    # earlier call cannot leak into this result.
    del comp[:]
    for w in words:
        count_s(w,puk)
        count_s(w,notpuk)
    df_comp = pd.DataFrame(comp,columns=['AuthorType','Word','NbpostsAuthors','Percentage'])
    wanted = ['Percentage','Word']
    is_puk = df_comp['AuthorType'] == "Parkinson's UK"
    is_notpuk = df_comp['AuthorType'] == "Not Parkinson's UK"
    # .loc with a boolean mask replaces the .ix indexer, which newer pandas no longer has.
    return df_comp.loc[is_puk, wanted], df_comp.loc[is_notpuk, wanted]


def create_subplot(nbplot,ax=None):
    """Draw the bars of word group number ``nbplot`` on the axes ``ax``.

    The percentages for Parkinson's UK are drawn in red and those for the
    other authors in blue, side by side. Returns the two objects returned by
    the pandas plot calls (PUK first).
    """
    puk_rows, notpuk_rows = barplotdata(listoflistwords, nbplot)
    shared = dict(kind='bar', ax=ax, width=0.25)
    puk_bars = puk_rows.plot(color='red', position=0, **shared)
    notpuk_bars = notpuk_rows.plot(color='blue', position=1, **shared)
    return puk_bars, notpuk_bars

# make figure with subplots
# 2 x 3 grid of panels sharing the y axis: one panel per word pair.
fig, ((ax1, ax2, ax3), (ax4, ax5, ax6)) = plt.subplots(2,3,sharey=True, figsize=(10,5))
panels = [ax1, ax2, ax3, ax4, ax5, ax6]

for plot_number, (panel, pair) in enumerate(zip(panels, listoflistwords), 1):
    # count_s() appends to the global `comp`; every panel starts from an empty list.
    comp = []
    create_subplot(plot_number, panel)
    panel.set_xticklabels(pair, minor=False, rotation=0)

# Only the first panel keeps a legend; it applies to all of them.
ax1.legend(["Parkinson's UK","Not Parkinson's UK"],fancybox=True,loc='upper left')
for panel in panels[1:]:
    panel.legend().remove()


## Context
Let's look now into the context in which these words are used

In [None]:
# we need to use the tokenisation from NLTK
import nltk.collocations
import nltk.corpus
import collections
from nltk import word_tokenize, FreqDist
#for x in content_puk:
#    print x
#[print str(x) for x in content_puk]
# One lower-cased string per author type, with ' | ' between consecutive posts,
# tokenized and wrapped in an nltk.Text for the searches below.
post_separator = ' | '
text_puk = post_separator.join(post.lower() for post in content_puk)
text_notpuk = post_separator.join(post.lower() for post in content_notpuk)
tokens_puk = word_tokenize(text_puk)
tokens_notpuk = word_tokenize(text_notpuk)
textnltk_puk = nltk.Text(tokens_puk)
textnltk_notpuk = nltk.Text(tokens_notpuk)
def find_unique_exp(text,exp):
    """Return the distinct expressions of ``text`` that match the pattern ``exp``.

    ``exp`` is a TokenSearcher pattern. The tokens of every match are joined
    with single spaces, and duplicates are removed, so the order of the
    returned list is arbitrary.
    """
    matches = TokenSearcher(text).findall(exp)
    return list(set([' '.join(tokens) for tokens in matches]))

Let's first look at the context in which some words appear. 
PUK and not PUK talked about dad and mum at different frequencies. Let's see in which contexts these words appear.
#For instance, puk and not puk talked with a different frequency.

In [None]:
# TokenSearcher patterns: a word for the female / male parent together with the
# two tokens that precede it. The plural 'fathers' is included, in parallel with
# 'mothers' (the second <father> alternative used to be a duplicate).
FEMALE_PARENT = r"<.*> <.*> <mum> | <.*> <.*> <mums>| <.*> <.*> <mother>| <.*> <.*> <mothers>"
MALE_PARENT = r"<.*> <.*> <dad> | <.*> <.*> <dads>| <.*> <.*> <father>| <.*> <.*> <fathers>"

print('Unique expressions of PUK for the female parent: ')
print(find_unique_exp(textnltk_puk, FEMALE_PARENT))
print('** and for the male parent')
print(find_unique_exp(textnltk_puk, MALE_PARENT))
print('------------------')
print('Unique expressions of Not PUK for the female parent: ')
print(find_unique_exp(textnltk_notpuk, FEMALE_PARENT))
print('** and for the male parent')
print(find_unique_exp(textnltk_notpuk, MALE_PARENT))
# Maybe we could look into the occurrences of 'my dad' vs 'my mum': are they speaking about a parent in general
# or about their own parent?

In [None]:
# Expressions that contain 'Parkinson's
# TokenSearcher patterns: 'parkinson(s)' with its two preceding tokens, and
# 'money' with its preceding token.
PARKINSON_3WORDS = r"<.*> <.*> <parkinson> | <.*> <.*> <parkinsons>"
MONEY_2WORDS = r"<.*> <money>"

# The 3-word expressions are stored for inspection and are not printed here.
print('* PUK: Find 3-word expression ending with Parkinson:')
puk_3words_parkinson = find_unique_exp(textnltk_puk, PARKINSON_3WORDS)
print('* Not PUK: Find 3-word expression ending with Parkinson:')
notpuk_3words_parkinson = find_unique_exp(textnltk_notpuk, PARKINSON_3WORDS)
print('-------')
print('* PUK: Find 2-word expression with money:')
print(find_unique_exp(textnltk_puk, MONEY_2WORDS))
print('* Not PUK: Find 2-word expression with money:')
print(find_unique_exp(textnltk_notpuk, MONEY_2WORDS))

In [None]:
print('-------')
print('* Find words that often appear together:')
#print textnltk_puk.collocations()
print('-------')
# concordance() prints its matches itself and returns None, so its result is
# not passed to print (which would add a stray "None" line).
textnltk_puk.concordance('mum')
textnltk_notpuk.concordance('mum')
print('-------')
#print textnltk_puk.concordance('raise')
#print textnltk_notpuk.concordance('raise')

In [None]:
# We could look into the context in which 'challenge' appears.

In [None]:
# Money seems to be mentioned more often by not PUK
# what about 'raise money' vs 'raise awareness'?

In [None]:
from collections import defaultdict

# Score every bigram of both corpora with the likelihood-ratio association measure.
bgm    = nltk.collocations.BigramAssocMeasures()
finder = nltk.collocations.BigramCollocationFinder.from_words(textnltk_puk)
finder_notpuk = nltk.collocations.BigramCollocationFinder.from_words(textnltk_notpuk)
scored = finder.score_ngrams(bgm.likelihood_ratio)
scored_notpuk = finder_notpuk.score_ngrams(bgm.likelihood_ratio)

# Group the PUK bigrams by their first word: first word -> [(second word, score), ...]
prefix_keys = defaultdict(list)
for (first, second), score in scored:
    prefix_keys[first].append((second, score))

# Sort the followers of every word by strongest association.
for followers in prefix_keys.values():
    followers.sort(key=lambda item: -item[1])

# Five strongest followers of a few words of interest. The printed label and the
# looked-up key are the same word ('parkinsons', the form produced by the
# cleaning step), so the label can no longer disagree with the lookup.
for word in ['parkinsons', 'diagnosed', 'affected', 'raise']:
    print('%s %s' % (word, prefix_keys[word][:5]))


Now let's look at how effective the different posts are, i.e. which ones get more reactions.

They seem to be talking about raising money, research, awareness, ...? Maybe we first need to read all the posts. Argh

How often do people post?