# PART 01 - Feature Selection

In [1]:
import time
print(time.time())

1469668764.461166


In [64]:

#generic imports
import json
import pandas as pd
import numpy as np
import re
import string

#pyspark imports
from pyspark import SQLContext
from pyspark.sql import SQLContext, Row
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType
from pyspark.sql.functions import udf
from pyspark.sql.functions import col

#text analytics imports
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer


# Normally don't need to initialise the sql (hive) context with
#sqlContext = SQLContext(sc) 
# it should already be available under the variable 'sqlContext'. Are you able to replace all occurrences of sql with sqlContext and remove the above line?
#from pyspark.sql import HiveContext
#sqlContext = HiveContext(sc)


#regex patterns
problemchars = re.compile(r'[\[=\+/&<>;:!\\|*^\'"\?%$.@)°#(_\,\t\r\n0-9-—\]]')
url_finder = re.compile(r'http[s]?:\/\/(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
emojis = re.compile(u'['
    u'\U0001F300-\U0001F64F'
    u'\U0001F680-\U0001F6FF'
    u'\u2600-\u26FF\u2700-\u27BF]+', 
    re.UNICODE)
stop = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
# username = re.compile(r'(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9]+)')
username = re.compile(r'(@)\w+( )')
# hashtag = re.compile(r'#(\w+)')
redate = re.compile(r'^(?:(?:(?:0?[13578]|1[02])(\/|-|\.)31)\1|(?:(?:0?[1,3-9]|1[0-2])(\/|-|\.)(?:29|30)\2))(?:(?:1[6-9]|[2-9]\d)?\d{2})$|^(?:0?2(\/|-|\.)29\3(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))$|^(?:(?:0?[1-9])|(?:1[0-2]))(\/|-|\.)(?:0?[1-9]|1\d|2[0-8])\4(?:(?:1[6-9]|[2-9]\d)?\d{2})$')
reempty = re.compile(r'^$|\s+')


#nltk sets
PUNCTUATION = set(string.punctuation)
STOPWORDS = set(stopwords.words('english'))
STEMMER = PorterStemmer()
LEMMER = WordNetLemmatizer()
tweet_tokenizer = TweetTokenizer()


KEYWORDS=["protest","muslim","islam","outcry","asylum","organisation","threat","union","centrelink","opposition","parade","council",
"federal","strike","harass","refugee","riot","community","reclaim","poster","demonstration","petition","funding","barrier","march",
"crowd","celebration","action","barricade","placard","gather","resident","patriot","bigot","racism","national","decision","movement",
"mentality","racist","agency","mosque","highlight","halaal","turmoil","activist","disturbance","victory","equality","blockade","anger",
"ideal","unite","extremist","anzac","rally","culture","unrest","terror","terrorist"]


#based on cities in the GSR
CITIES=["melbourne","canberra", "adelaide", "perth", "brisbane", "sydney", "hobart", "borroloola", "fremantle","darwin", 
"orchard hills","geelong", "gold coast","pilliga","bendigo","cairns", "townsville","seaspray", "port kembla","warrnambool", 
"devonport","mackay", "onkaparinga","ballina", "goldcoast","wollongong","port botany","kwinana", "ballandean","yeelanna", 
"ipswich","crafers", "ulverstone","hunter valley","sunshine coast", "bermagui", "hervey bay","tatura", "inverell", "port adelaide", 
"pilliga","newcastle","maningrida","gippsland", "heirisson island","northam","morwell","drayton", "armidale", "raymond terrace",
"st kilda", "melton", "upper hunter", "maroubra", "alphington","tomago", "southport","launceston","freemantle","wangaratta", 
"maitland", "grafton","haberfield","kew", "richmond", "alice springs","rockhampton","dalby", "laverton", "nowra","mount isa",
"murrumbeena", "bondi beach", "ballarat", "coffs harbour","whyalla", "bulga","kirribilli","victor harbour", "syndey", 
"lismore","moulting lagoon","adelaide hills", "collingwood", "coolangata","wodonga","kilcoy", "rockingham", "williamton",
"lake illawarra", "salamander bay", "maules creek", "gosford","newcastke","canbera","mount gambier", "parkes", "great keppel island",
"edgecliff","coolum", "homebush", "heirisson island", "tiwi island","albany", "narrabri", "lapoinya", "parkdale", "broken hill", 
"geraldton","kilburn","victoria","illawarra", "esssendon"]

#days of week - many protest related tweets have a day of the week in the tweet body
DOW = ["monday","tuesday","wednesday","thrusday","friday","saturday","sunday"]

#future action verbs
ACTION = ["accelerate","accomplish","achieve","acquire","activate","adapt","adjust","administer","advise","allocate","analyse","annotate",
"anticipate", "apply", "appraise", "arrange", "articulate", "assemble", "assess", "assign", "attain", "author", "balance", "brief", 
"budget", "calculat", "catalogue", "categorise", "chair", "chang", "channel", "chart", "clarify", "coach", "code", "collaborate", 
"collect", "communicate", "compare", "compete", "compile", "complete", "compose", "compute", "conceive", "conduct", "confront", 
"consolidate", "construct", "contact", "continue", "contract", "convene", "coordinate", "correspond", "counsel", "create", "critique", 
"define", "delegate", "deliver", "demonstrate", "derive", "design", "detect", "determine", "develop", "devise", "diagnose", "direct", 
"discover", "dispense", "display", "distribute", "draft", "dramatise", "earn", "edit", "educate", "effect", "elicit", "employ", 
"encourage", "endure", "enlist", "entertain", "establish", "estimate", "evaluate", "examine", "exchange", "execute", "exercise", 
"exhibit", "expand", "expedite", "experiment", "explain", "explore", "facilitate", "finance", "focus", "forecast", "formulate", 
"foster", "group", "guide", "identify", "illustrate", "implement", "impose", "improve", "increase", "influence", "inform", "initiate", 
"inquire", "inspect", "install", "instill", "institute", "instruct", "insure", "interpret", "intervene", "interview", "introduce", 
"invent", "inventory", "investigate", "judge", "lecture", "listen", "locate", "maintain", "manage", "market", "master", "measure", 
"mediate", "model", "modify", "mold", "monitor", "motivate", "negotiate", "observe", "obtain", "operate", "organise", "outline", 
"oversee", "participate", "perceive", "perform", "persuade", "plan", "predict", "prepare", "prescribe", "present", "preside", 
"process", "produce", "programme", "promote", "protect", "provide", "publicise", "publish", "purchase", "question", "recommend", 
"record", "recruit", "reduce", "regulate", "reinforce", "render", "repair", "report", "represent", "reproduce", "research", "resolve",
"respond", "restore", "retain", "retrieve", "review", "revise", "rewrite", "route", "schedule", "search", "select", "serve", "service", 
"shape", "share", "simplify", "solicit", "solve", "seek", "specify", "stimulate", "study", "succeed", "suggest", "summarise", 
"supervise", "support", "survey", "synthesise", "systematise", "target", "teach", "test", "train", "translate", "tutor", "update", 
"utilise", "verify", "visualise", "write"]

NEGATIVE = ["abandon", "abuse", "accuse", "addict", "afraid", "aggravated", "aggressive", "alone", "angry", "anguish", "anxious",
"apprehensive", "argumentative", "artificial", "ashamed", "assaulted", "loss", "risk", "atrocious", "attacked", "avoided", "awful",
"awkward", "bad", "badge", "baffled", "banned", "barren", "beat", "beaten", "belittled", "berated", "betrayal", "betrayed", "bitched",
"bitter", "bizzare", "blacklisted", "blackmailed", "blame", "blamed", "bleak", "blownaway", "blur", "bored", "boring", "bossed-around",
"bothered", "bothersome", "bounded", "boxed", "broken", "bruised", "brushed-off", "bugged", "bullied", "bummed", "bummedout", "burdened",
"burdensome", "burned-out", "caged", "careless", "chaotic", "chased", "cheated", "cheatedon", "chicken", "claustrophobic", "clingy", "closed",
"clueless", "clumsy", "coaxed", "codependent", "coerced", "cold-hearted", "combative", "commanded", "compared", "competitive", "compulsive",
"conceited", "concerned", "condescended", "confined", "conflicted", "confronted", "confused", "conned", "consumed", "contemplative", "contempt",
"contentious", "controlled", "convicted", "cornered", "corralled", "cowardly", "crabby", "cramped", "cranky", "crap", "crappy", "crazy", 
"creeped","out", "creepy", "critical", "criticized", "cross", "crowded", "cruddy", "crummy", "crushed", "cut-down", "cut-off", "cynical",
"abysmal", "adverse","alarming", "annoy", "apathy", "appalling", "banal", "barbed", "belligerent", "bemoan", "beneath", "callous", "cant", 
"coarse", "collapse", "contradictory", "contrary", "corrosive", "corrupt", "criminal", "cruel", "cry", "cutting", "dead", "decaying", "damage",
"damaging", "dastardly", "deplorable", "depressed", "deprived", "deformed", "deny", "despicable", "detrimental", "dirty", "disease", "disgusting",
"disheveled", "dishonest", "dishonorable", "dismal", "distress", "dont", "dreadful", "dreary", "enraged", "eroding", "evil", "fail", "faulty",
"fear", "feeble", "fight", "filthy", "foul", "frighten", "frightful", "gawky", "ghastly", "grave", "greed", "grim", "grimace", "gross", "grotesque",
"gruesome", "guilt", "guilty", "haggard", "hard", "hard-hearted", "harmful", "hate", "hideous", "homely", "horrendous", "horrible", "hostile", "hurt",
"hurtful", "icky", "ignore", "ignorant", "ill", "immature", "imperfect", "impossible", "inane", "inelegant", "infernal", "injure", "injurious", "insane",
"insidious", "insipid", "jealous", "junky", "lose", "lousy", "lumpy", "malicious", "mean", "menacing", "messy", "misshapen", "missing", "misunderstood",
"moan", "moldy", "monstrous", "naive", "nasty", "naughty", "negate", "negative", "never", "no", "nobody", "nondescript", "nonsense", "not", "noxious",
"objection", "objectionable", "odious", "offensive", "old", "oppressive", "pain", "perturb", "pessimistic", "petty", "plain", "poisonous", "poor",
"prejudice", "questionable", "quirky", "quit", "reject", "renege", "repellant", "reptilian", "repulsive", "repugnant", "revenge", "revolting", "rocky",
"rotten", "rude", "ruthless", "sad", "savage", "scare", "scary", "scream", "severe", "shoddy", "shocking", "sick", "sickening", "sinister", "slimy",
"smelly", "sobbing", "sorry", "spiteful", "sticky", "stinky", "stormy", "stressful", "stuck", "stupid", "substandard", "suspect", "suspicious", "tense",
"terrible", "terrifying", "threatening", "ugly", "undermine", "unfair", "unfavorable", "unhappy", "unhealthy", "unjust", "unlucky", "unpleasant", 
"upset", "unsatisfactory", "unsightly", "untoward", "unwanted", "unwelcome", "unwholesome", "unwieldy", "unwise", "vice", "vicious", "vile",
"villainous", "vindictive", "wary", "weary", "wicked", "woeful", "worthless", "wound", "yell", "yucky", "zero", "damaged", "damned", "dangerous",
"dark", "dazed", "deceived", "deep", "defamed", "defeated", "defective", "defenseless", "defensive", "defiant", "deficient", "deflated", "degraded",
"dehumanized", "dejected", "delicate", "deluded", "demanding", "demeaned", "demented", "demoralized", "demotivated", "dependent", "depleted",
"depraved", "deserted", "deserving", "punishment", "desolate", "despair", "despairing", "desperate", "despised", "destroyed", "destructive", 
"detached", "detest", "detestable", "detested", "devalued", "devastated", "deviant", "devoid", "diagnosed", "dictatedto", "different", "difficult",
"directionless", "disabled", "disagreeable", "disappointed", "disappointing", "disapprovedof", "disbelieved", "discardable", "discarded",
"disconnected", "discontent", "discouraged", "discriminated", "disdain", "disdainful", "disempowered", "disenchanted", "disgraced", 
"disgruntled", "disgust", "disgusted", "disheartened", "disillusioned", "dislike", "disliked", "dismayed", "disorganized", "disoriented",
"disowned", "displeased", "disposable", "disregarded", "disrespected", "dissatisfied", "distant", "distracted", "distraught", "distressed",
"disturbed", "dizzy", "dominated", "doomed", "double-crossed", "doubted", "doubtful", "down", "downandout", "downinthedumps", "downhearted", 
"downtrodden", "drained", "dramatic", "dread", "dropped", "drunk", "dry", "dumb", "dumped", "dumpedon", "duped", "edgy", "egocentric", "egotistic",
"egotistical", "elusive", "emancipated", "emasculated", "embarrassed", "emotional", "emotionless", "emotionally", "bankrupt", "empty", "encumbered",
"endangered", "enslaved", "entangled", "evaded", "evasive", "evicted", "excessive", "excluded", "exhausted", "exploited", "exposed", "failful", 
"fake", "FALSE", "fearful", "fedup", "flawed", "forced", "forgetful", "forgettable", "forgotten", "fragile", "freakedout", "frightened", "frigid", 
"frustrated", "furious", "gloomy", "glum", "gothic", "grey", "grief", "grossed-out", "grouchy", "grounded", "grumpy", "guilt-tripped", "harassed",
"harmed", "hassled", "hateful", "hatred", "haunted", "heartbroken", "heartless", "heavy-hearted", "helpless", "hesitant", "hindered", "hopeless",
"horrified", "horror", "hot-tempered", "humiliated", "hungup", "hungover", "hurried", "hysterical", "idiotic", "ignored", "ill-tempered", "imbalanced",
"imposed-upon", "impotent", "imprisoned", "impulsive", "inthedumps", "intheway", "inactive", "inadequate", "incapable", "incommunicative", "incompetent",
"incompatible", "incomplete", "incorrect", "indecisive", "indifferent", "indoctrinated", "inebriated", "ineffective", "inefficient", "inferior", 
"infuriated", "inhibited", "inhumane", "injured", "injusticed", "insecure", "insignificant", "insincere", "insufficient", "insulted", "intense",
"interrogated", "interrupted", "intimidated", "intoxicated", "invalidated", "invisible", "irrational", "irritable", "irritated", "isolated",
"jaded", "jerkedaround", "joyless", "judged", "keptapart", "keptaway", "keptin", "keptout", "keptquiet", "labeled", "laughable", "laughedat", 
"lazy", "leanedon", "lecturedto", "leftout", "letdown", "liedabout", "liedto", "limited", "little", "lonely", "lonesome", "longing", "lost", 
"loveless", "low", "mad", "madefunof", "manhandled", "manipulated", "masochistic", "messedwith", "messedup", "miffed", "miserable", "misled",
"mistaken", "mistreated", "mistrusted", "mixed-up", "mocked", "molested", "moody", "nagged", "needy", "nervous", "neurotic", "nonconforming",
"numb", "nuts", "nutty", "objectified", "obligated", "obsessed", "obsessive", "obstructed", "odd", "offended", "ondisplay", "opposed", 
"oppressed", "outofplace", "outoftouch", "over-controlled", "over-protected", "overwhelmed", "panic", "paranoid", "passive", "pathetic",
"petrified", "phony", "pickedon", "pissed", "pissedoff", "playedwith", "pooped", "powerless", "pre-judged", "preachedto", "preoccupied", 
"predjudiced", "pressured", "prosecuted", "provoked", "psychopathic", "psychotic", "pulledapart", "pulledback", "punished", "pushed", 
"pushedaway", "putdown", "puzzled", "quarrelsome", "queer", "questioned", "quiet", "rage", "raped", "rattled", "regret", "rejected", "resented",
"resentful", "responsible", "retarded", "avenge", "evenge", "revengeful", "ridiculed", "ridiculous", "robbed", "sadistic", "sarcastic",
"scared", "scarred", "screwed", "screwedover", "screwedup", "self-centered", "self-conscious", "self-destructive", "self-hatred", "selfish", 
"sensitive", "shoutedat", "shy", "singled-out", "slow", "small", "smothered", "snappedat", "stereotyped", "strange", "stressed", "stretched",
"submissive", "suffering", "suffocated", "suicidal", "superficial", "suppressed"]


RELIGION = ["koran", "quran", "islam", "church", "'asr", "asr", "bismillah", "dua", "dojakh", "jum'ah", "jumah", "jahanam", "mazhab",
"majhab", "spiritual", "faith", "hindu", "mosque", "masjid", "peace", "buddhism", "buddha", "hinduism", "devil", "jesus", "allah", "khuda",
"bible", "mullah", "muslman", "christian", "catholic", "prayer", "atheist", "jew", "religious", "muslim", "grave", "hijab", "hizb",
"demon", "jihad", "god", "maulana", "kabba", "azan", "burka", "prophet", "koum", "apostate", "sikh", "muhajir", "immigrant", "hijra", "hijr",
"amen", "fatwa", "dargah", "mandir", "temple", "orthodox", "profane", "profanity", "adoration", "agape", "allegorical", "angel", "anglican", 
"apologetics", "apostle", "apostolic", "apostolic", "fathers", "apostolic", "succession", "arianism", "assembly", "assurance", "baptism", 
"bible", "bishop", "church", "congregation", "coptic", "church", "criticism", "cross", "denominational", "deacon", "dean", "diocese", "disciple",
"dissenter", "elim", "church", "episcopal", "escatology", "'established", "church'", "eschatology", "evangelist", "excegesis", "eucharist",
"filioque", "first", "fruits", "jehovah", "heaven", "hell", "messiah", "miracle", "mission", "monotheism", "mystery", "religions", "myth",
"non-conformist", "obedience", "omnipotence", "omniprescence", "ordination", "original", "sin", "orthodox", "orthodox", "church", "pantheism", 
"paradise", "parousia", "paschal", "passover", "pastor", "pluralism", "priest", "polytheism", "prayer", "predestination", "propitiation", 
"prophecy", "prophet", "purgatory", "redemption", "reformation", "repentance", "resurrection", "revelation", "rite", "ritual", "romantic", 
"movement", "sabellian", "sacrament", "sacrifice", "sacristan", "sacristy", "salvation", "sanctification", "sanctum", "sanctus", "sanctuary", 
"satan", "schism", "scripture", "secularisation", "sin", "socinian", "son", "of", "god", "son", "of", "man", "soul", "source", "criticism", 
"suffering", "suffragan", "trinity", "understand", "something", "unitarian", "vicar", "yahweh", "abbot", "abide", "absolution", "acolyte", 
"affect", "affirm", "afterlife", "agnostic", "aid", "alleluia", "altar", "ancestors", "angel", "anglican", "apostle", "archangel", "archbishop", 
"asceticism", "atone", "attendance", "awe", "banns", "baptism", "beads", "beatitudes", "belief", "believer", "benediction", "benefits", 
"benevolent", "bestow", "bishop", "bless", "blessings", "bliss", "born", "again", "bow", "candles", "canon", "cantor", "care", "cathedral", 
"celebrant", "celestial", "ceremony", "chancel", "chapel", "chaplain", "charity", "cherub", "choices", "choir", "chorale", "chorus", 
"christianity", "church-going", "clergy", "cloister", "comfort", "commandments", "commitment", "communion", "community", "compassion",
"comprehension", "conclave", "confession", "confidence", "confirmation", "conflicted", "congregation", "connection", "conscience",
"consecrate", "conservative", "contemplate", "convent", "conversion", "convert", "conviction", "convocation", "core", "counsel", "courage", 
"coven", "covenant", "creator", "credence", "credibility", "credo", "creed", "cross", "crusade", "curative", "decision", "dedication", "deism",
"deity", "deliverance", "denomination", "devotee", "devotion", "devout", "diaspora", "disciple", "discipline", "discussion", "divine", 
"divinity", "doctrine", "dogma", "doubts", "doxology", "duty", "ecclesiastical", "effect", "elder", "elevate", "embodiment", "emotion", 
"empathy", "enlightenment", "epiphany", "epistle", "essence", "eternal", "ethics", "eucharist", "evangelical", "everlasting", "exalt", 
"exaltation", "exodus", "express", "faithful", "family", "fasting", "fate", "father", "forgiveness", "fundamental", "gentile", "genuflect",
"glory", "godliness", "good", "news", "goodness", "gospel", "grace", "gratitude", "graven", "growth", "guidance", "guilt", "guru", "habit",
"habitual", "hallow", "halo", "happiness", "harmony", "healing", "heathen", "heaven", "hebrew", "heresy", "heretic", "holy", "holy", "days", 
"hope", "host", "humane", "humble", "humility", "hymn", "hymnal", "idol", "idyllic", "illuminate", "immortal", "implication", "in", "vain", 
"incarnate", "indulgence", "infallible", "infidel", "influence", "inherent", "insight", "inspiration", "instruct", "integral", "intercede",
"interdenominational", "interfaith", "intuition", "investiture", "invocation", "issue", "jesuit", "joy", "joyful", "judaism", "judgment",
"just", "karma", "keen", "keep", "watch", "kingdom", "kneel", "laity", "latin", "laud", "lay", "person", "leaded", "glass", "life", "light",
"litany", "love", "loving", "loyalty", "manifestation", "mantra", "marriage", "martyr", "meanings", "meditate", "mega-church", "mercy", "messiah",
"mindful", "minister", "miracles", "mission", "missionary", "monarchy", "monastery", "monastic", "monotheism", "morality", "mormon", "mortal",
"movement", "music", "mystery", "mystical", "nature", "neophyte", "nod", "nomadic", "nonbeliever", "novice", "nun", "nurture", "observance", 
"offertory", "official", "omnipotent", "omniscience", "oracle", "oration", "ordain", "order", "organ", "outlook", "pacific", "pagan", "papal",
"parish", "participate", "pastor", "pastoral", "patron", "saint", "peal", "penance", "penchant", "perception", "permanence", "perpetual", 
"persevere", "personal", "perspective", "petition", "piety", "pilgrim", "pilgrimage", "polytheism", "pontiff", "postulate", "power", "practice",
"pray", "preach", "premonition", "prescient", "presence", "priest", "principle", "priory", "privacy", "private", "proclaim", "profound", "programs",
"promise", "proof", "proselytize", "prosperity", "protection", "psalm", "psalter", "pulpit", "purity", "purpose", "query", "quest", "questions",
"quiet", "quintessence", "qur'an", "radical", "rally", "realization", "reassurance", "rebirth", "reciprocal", "rector", "redemption", "refectory", 
"reflection", "refuge", "reincarnation", "relationship", "relative", "religion", "repent", "resent", "restrict", "retreat", "revelation",
"reverence", "reverent", "revile", "righteous", "rites", "ritual", "role", "rosary", "sacrament", "sacred", "sacrifice", "sacrilege", "sage", 
"saint", "saintly", "salvation", "same-sex", "sanctification", "sanctify", "sanctity", "sanctuary", "saturday", "savior", "scandal", "schism",
"scripture", "sect", "sectarian", "secular", "security", "seeker", "seminary", "sense", "sensitive", "serenity", "serious", "sermon", "serve", 
"service", "sexton", "sharia", "law", "shepherd", "shinto", "shrine", "silence", "sin", "sinful", "skeptical", "society", "solace", "solemn", 
"solitude", "sorrow", "soul", "source", "spirit", "split", "stance", "statute", "succor", "suffering", "sufi", "sunday", "supernatural", 
"support", "supreme", "sustenance", "synagogue", "tabernacle", "talmud", "taoism", "teaching", "tests", "text", "theologian", "theology",
"timelessness", "tithe", "torah", "tradition", "traditional", "tranquility", "transcendence", "transgression", "transitory", "trepidation",
"trespass", "tribute", "trust", "unction", "understanding", "unique", "unity", "universal", "unknown", "uplift", "validation", "valor", 
"value", "vanity", "venial", "sin", "vespers", "vestment", "vicar", "vicissitudes", "vigil", "virgin", "mary", "virtue", "vision", "visitation",
"vizier", "voice", "voices", "volunteering", "vows", "watch", "wayward", "weight", "whole", "wisdom", "witness", "woe", "wonders", "word", 
"worldwide", "worry", "worship", "worthiness", "yang", "yearning", "yin", "yogi", "youth", "programs", "zeal", "zealot", "zealous"]


#list for NOISE tweets related to sport, entertainment etc
NOISE = ["eurovision","pregnancy","abortion","vaccination",
"cricket","football","basketball","tennis","soccer","swimming","rugby","nba","afl","nrl","wnbl","wicket",
"surf","ultraviolet","sunscreen","sunburn","cannabis","suncor","thetyee","philippines","duterte","unilife",
"maya","timberlake","thrones","housewives","gosling","depp","hatter","kardashian","kidman","madonna","prince",
"jolie","beckham","rhianna","gaga","bieber","nishikori","draymond",
"mkr", "election","auspol"]


#sqlContext = SQLContext(sc) 

#SELECT YOUR DATE RANGE
startTime = "2015-07-17T05:00:00+00:00"
endTime   = "2015-07-17T11:00:00+00:00"
#endTime   = "2015-07-17T01:00:00+00:00"
#endTime   = "2015-07-18T23:59:00+00:00"


#generic function to create dataframe for further exploration
def flatten(indict, current_key=None, outerdict=None):
    if outerdict is None:
        outerdict = {}
    for key, value in indict.items():
        newkey = current_key + '__' + key if current_key else key
        if type(value) is not dict:
            outerdict[newkey] = value
        else:
            flatten(value, current_key=newkey, outerdict=outerdict)
    return outerdict

#extractwords from twitter
def word_extraction(bodytext):
    nltk.data.path.append("/local/hdfs-volume/data/nltk_data")
    
    token_list = []
    tokens = tweet_tokenizer.tokenize(bodytext)
    
    lowercased = [t.lower() for t in tokens]
    no_punctuation = []
    for word in lowercased:
        punct_removed = ''.join([letter for letter in word if not letter in PUNCTUATION and not letter.isdigit()])
        no_punctuation.append(punct_removed)
    no_stopwords = [w for w in no_punctuation if not w in STOPWORDS]
    stemmed = [LEMMER.lemmatize(w) for w in no_stopwords]
    no_links = [w for w in stemmed if (not 'http' in w) and len(w)>2]
    #return [w for w in no_links if w]
    #for w in no_links:
    #    token_list.append(w)
    return no_links


#function to lookup and count occurences of specific words in Tweet body
def list_validator(bodyText, lookup_list):
    
    #append nltk libraries
    nltk.data.path.append("/local/hdfs-volume/data/nltk_data")
    
    tokens = tweet_tokenizer.tokenize(bodyText)
    count=0
    
    lowercased = [t.lower() for t in tokens]
    no_punctuation = []
    for word in lowercased:
        punct_removed = ''.join([letter for letter in word if not letter in PUNCTUATION and not letter.isdigit()])
        no_punctuation.append(punct_removed)
    no_stopwords = [w for w in no_punctuation if not w in STOPWORDS]
    #stemmed = [STEMMER.stem(w) for w in no_stopwords]
    stemmed = [LEMMER.lemmatize(w) for w in no_stopwords]
    no_links = [w for w in stemmed if (not 'http' in w) and len(w)>2]
    for w in no_links:
        if w in lookup_list:
            count+=1
            
    return count



#function using regex to determine if tweet contains a date
def has_date(bodyText):
    
    if redate.search(bodyText) is not None:
        hasdate = 1
    else:
        hasdate = 0
    
    return hasdate


#sentiment extractors
#extract sentiment wiht UDF
def get_pos_sentiment(data_str):
    if data_str is not None:
        return json.loads(data_str).get("positiveScore","0") 
    else:
        return 0
    

def get_neg_sentiment(data_str):
    if data_str is not None:
        return json.loads(data_str).get("negativeScore")
    else:
        return 0
            


#binary politicalParty function
def polilicatPartyToCategory(politicalParty):
    if politicalParty is '': return '0'
    else: return '1'



#function to filter as RDD and output as Dataframe
def create_dataframe(startTime, endTime):
    
    
    df = sqlContext.read.load(format = "au.com.d2dcrc.carbon.spark.tweets", 
                   startTime = startTime, 
                   endTime   = endTime)
    
    print('Original size of dataset: ' + str(df.count()))    
    
    #filter and select RDD object - also includes regex based text cleaning
    df_flat = df.map(lambda row: row.data)\
        .map(lambda data: json.loads(data))\
        .filter(lambda r: "location" in r)\
        .filter(lambda line_tuple: 'en' in line_tuple['twitter_lang'])\
        .map(lambda line: flatten(line))\
        .filter(lambda line_tuple: 'Australia' in line_tuple['location__country_code'])\
        .map(lambda line: (
                           line.get('id',''),
                           line.get('actor__id',''),
                           line.get('actor__displayName',''),
                           line.get('actor__favoritesCount','0'),
                           line.get('actor__followersCount','0'),
                           line.get('actor__friendsCount','0'),
                           line.get('actor__listedCount','0'),
                           line.get('retweetCount','0'),
                           line.get('actor__postedTime',''),
                           #stop.sub('', problemchars.sub('', emojis.sub('', url_finder.sub('', username.sub('', line.get('body','').lower().strip()))))),
                           line.get('body',''),
                           word_extraction(line.get("body",'')),
                           list_validator(line.get("body",''), KEYWORDS),
                           list_validator(line.get("body",''), CITIES),
                           has_date(line.get("body",'')),
                           list_validator(line.get("body",''), DOW),
                           list_validator(line.get("body",''), ACTION),
                           list_validator(line.get("body",''), NEGATIVE),    
                           list_validator(line.get("body",''), RELIGION),  
                           line.get('geo__coordinates',''),
                           line.get('location__country_code',''),
                           line.get('location__displayName',''),
                           line.get('location__name',''),
                           line.get('location__twitter_place_type',''),
                           line.get('gnip__klout_score','0'),
                           line.get('gnip__matching_rules',''),
                           line.get('twitter_entities__hashtags',''),
                           line.get('twitter_entities__user_mentions',''),
                           line.get('twitter_lang',''),
                           list_validator(line.get("body",''), NOISE)
            ))
    
    
    print('Filtered size of dataset: {0}'.format(str(df_flat.count())))  
    return
    
    #define schema and variable types
    fields0 =  [
          StructField("id", StringType(), True),
          StructField("actor__id", StringType(), True),
          StructField("actor__displayName", StringType(), True),
          StructField("actor__favoritesCount", StringType(), True),
          StructField("actor__followersCount", StringType(), True),
          StructField("actor__friendsCount", StringType(), True),
          StructField("actor__listedCount", StringType(), True),
          StructField("retweetCount", StringType(), True),
          StructField("actor__postedTime", StringType(), True),
          #StructField("body_CLEAN", StringType(), True),
          StructField("body", StringType(), True),
          StructField("body_TOKENS", ArrayType(StringType(), True), True),
          StructField("count_PROTEST", IntegerType(), True),
          StructField("count_CITY", IntegerType(), True),
          StructField("has_DATE", IntegerType(), True),
          StructField("count_DOW", IntegerType(), True),
          StructField("count_ACTION", IntegerType(), True),
          StructField("count_NEGATIVE", IntegerType(), True),
          StructField("count_RELIGION", IntegerType(), True),
          StructField("geo__coordinates", StringType(), True),
          StructField("location__country_code", StringType(), True),
          StructField("location__displayName", StringType(), True),
          StructField("location__name", StringType(), True),
          StructField("location__twitter_place_type", StringType(), True),
          StructField("gnip__klout_score", StringType(), True),
          StructField("gnip__matching_rules", StringType(), True),
          StructField("twitter_entities__hashtags", StringType(), True),
          StructField("twitter_entities__user_mentions", StringType(), True),
          StructField("twitter_lang", StringType(), True),
          StructField("count_NOISE", IntegerType(), True)
        ] 

    schema0 = StructType(fields0) 
    
    #convert RDD object to Spark dataframe
    df_flat = sqlContext.createDataFrame(
        df_flat,   
        schema0
    )
    
    
    #register RDDs as SQL table
    df.registerTempTable("originalTable")    
    df_flat.registerTempTable("myDFTable") #extracted details from data column

    #inner join tables
    joined_dataframe = sqlContext.sql("SELECT a.*, b.sentiment, b.politicalParty,b.civilEvents, b.`key:id` \
                                        FROM myDFTable a, originalTable b \
                                        WHERE a.id=b.`key:id`")
    

    get_posSentiment = udf(get_pos_sentiment, IntegerType())
    joined_dataframe = joined_dataframe.withColumn("positiveSentiment",get_posSentiment(joined_dataframe.sentiment))


    get_negSentiment = udf(get_neg_sentiment, IntegerType())
    joined_dataframe = joined_dataframe.withColumn("negativeSentiment",get_negSentiment(joined_dataframe.sentiment))
    
    
    #binary variable for political party presence
    udfScoreToCategory = udf(polilicatPartyToCategory, StringType())
    joined_dataframe = joined_dataframe.withColumn("has_politicalparty", udfScoreToCategory(joined_dataframe.politicalParty))
    
    
    #create new dataframe with outer_dict timestamp and join on key:id
    #df_key = df.select('key:id', 'key:timestamp', 'civilEvents')
    #myDF_join = joined_dataframe.join(df_key, joined_dataframe['key:id'] == df_key['key:id'], 'inner')
    
    
    #filter final dataframe to remove any tweets with a count of 'NOISE' words > 0
    #myDF_final = myDF_join.where(col('count_NOISE') == 0)
    myDF_final = joined_dataframe.where(col('count_NOISE') == 0)
       
    print('Filtered size of dataset: {0}'.format(str(myDF_final.count())))
    #select columns to drop
#     myDF_final = myDF_final.drop(myDF_final.sentiment)
    myDF_final = myDF_final.select([c for c in myDF_final.columns if c not in {'sentiment','key:id'}])

    myDF_final.printSchema()    
    
    return myDF_final

    
print(time.time())
#%time myDF = create_dataframe(startTime, endTime)
myDF = create_dataframe(startTime, endTime)
print(time.time())


1469684825.1124685
Original size of dataset: 417106
Filtered size of dataset: 407
1469685031.4002376


In [42]:
'''This cell selects tweets whose country name under profileLocations is Australia and whose twitter_lang is en'''

from IPython import display
import json

startTime = "2015-07-17T05:00:00+00:00"
endTime   = "2015-07-17T11:00:00+00:00"

df = sqlContext.read.load(format = "au.com.d2dcrc.carbon.spark.tweets", 
               startTime = startTime, 
               endTime   = endTime)

df.cache()
print('Original size of dataset: ' + str(df.count()))

#df_flat2 = df.map(lambda row: json.loads(row.data))
#df_flat2 = df.rdd
#df_flat2 = df.select(df.data)
#df_flat2.printSchema()
#df_flat2.show(truncate=False)
#print("df_flat2-->", str(df_flat2.count()))

rdd = df.map(lambda row: json.loads(row.data))\
    .filter(lambda row: ("gnip" in row and "profileLocations" in row["gnip"] and row["gnip"]["profileLocations"][0]["address"]["country"]=="Australia" and row["twitter_lang"]=="en"))\
    .map(lambda row: (row["id"], row["gnip"]["profileLocations"][0]["address"]["country"], row["twitter_lang"], row["body"]))

print("Filtered size:", rdd.count())
display.display(rdd.take(10))


Original size of dataset: 417106
Filtered size: 15329


[('tag:search.twitter.com,2005:621907186264993792',
  'Australia',
  'en',
  'Another Flaming Coals Portable Camping Spit Roaster in action over a home-made bricked in fire pit. Great example... http://t.co/FXPayn6QC9'),
 ('tag:search.twitter.com,2005:621907199875485697',
  'Australia',
  'en',
  "Say 'NO!' to racism. Say 'NO!' to 'Reclaim Australia'. Rally at 10am Sat July 18 @ Vic Parliament #NoRoomForRacism\n http://t.co/VJqtC6q5rS"),
 ('tag:search.twitter.com,2005:621907336790052865',
  'Australia',
  'en',
  'RT @renew_economy: New China data shows how Australia’s #coal industry is at risk http://t.co/5Fdednk7CW #auspol'),
 ('tag:search.twitter.com,2005:621907515542933504',
  'Australia',
  'en',
  'Good on @EwenJonesMP "I would never speak at a Reclaim Australia rally. I believe that whatever people bring to this country, they add to it'),
 ('tag:search.twitter.com,2005:621907593653432320',
  'Australia',
  'en',
  "RT @ReclaimWhatNet: Say 'NO!' to racism. Say 'NO!' to 'Reclaim A

In [43]:
'''This cell differs from the above cell only in the country filter condition. 
That is, we will select a tweet if its country_code of the check-in info is Australia.
If check-in is not available, we just the profileLocations info as in the previous cell.
'''


from IPython import display
import json

startTime = "2015-07-17T05:00:00+00:00"
endTime   = "2015-07-17T11:00:00+00:00"

df = sqlContext.read.load(format = "au.com.d2dcrc.carbon.spark.tweets", 
               startTime = startTime, 
               endTime   = endTime)

df.cache()
print('Original size of dataset: ' + str(df.count()))

#df_flat2 = df.map(lambda row: json.loads(row.data))
#df_flat2 = df.rdd
#df_flat2 = df.select(df.data)
#df_flat2.printSchema()
#df_flat2.show(truncate=False)
#print("df_flat2-->", str(df_flat2.count()))

rdd = df.map(lambda row: json.loads(row.data))\
    .filter(lambda row: ((row["location"]["country_code"]=="Australia" if ("location" in row and "country_code" in row["location"]) else ("gnip" in row and "profileLocations" in row["gnip"] and row["gnip"]["profileLocations"][0]["address"]["country"]=="Australia")) and row["twitter_lang"]=="en"))\
    .map(lambda row: [row["id"]] + ([row["location"]["country_code"]] if ("location" in row and "country_code" in row["location"]) else [row["gnip"]["profileLocations"][0]["address"]["country"]]) + [row["twitter_lang"], row["body"]])

print("Filtered size:", rdd.count())
display.display(rdd.take(10))

Original size of dataset: 417106
Filtered size: 15477


[['tag:search.twitter.com,2005:621907186264993792',
  'Australia',
  'en',
  'Another Flaming Coals Portable Camping Spit Roaster in action over a home-made bricked in fire pit. Great example... http://t.co/FXPayn6QC9'],
 ['tag:search.twitter.com,2005:621907199875485697',
  'Australia',
  'en',
  "Say 'NO!' to racism. Say 'NO!' to 'Reclaim Australia'. Rally at 10am Sat July 18 @ Vic Parliament #NoRoomForRacism\n http://t.co/VJqtC6q5rS"],
 ['tag:search.twitter.com,2005:621907336790052865',
  'Australia',
  'en',
  'RT @renew_economy: New China data shows how Australia’s #coal industry is at risk http://t.co/5Fdednk7CW #auspol'],
 ['tag:search.twitter.com,2005:621907515542933504',
  'Australia',
  'en',
  'Good on @EwenJonesMP "I would never speak at a Reclaim Australia rally. I believe that whatever people bring to this country, they add to it'],
 ['tag:search.twitter.com,2005:621907593653432320',
  'Australia',
  'en',
  "RT @ReclaimWhatNet: Say 'NO!' to racism. Say 'NO!' to 'Reclaim A

In [66]:
'''This cell builds inverted lists for each keyword in KEYWORDS'''

KEYWORDS=["protest","muslim","islam","outcry","asylum","organisation","threat","union","centrelink","opposition","parade","council",
"federal","strike","harass","refugee","riot","community","reclaim","poster","demonstration","petition","funding","barrier","march",
"crowd","celebration","action","barricade","placard","gather","resident","patriot","bigot","racism","national","decision","movement",
"mentality","racist","agency","mosque","highlight","halaal","turmoil","activist","disturbance","victory","equality","blockade","anger",
"ideal","unite","extremist","anzac","rally","culture","unrest","terror","terrorist"]


'''use a given tweet to update the dictionary, which is used as the inverted index
'''
def buildInvertedIndex(dictionary, tid, tweetBody):
    import re
    keys = re.findall(r"[\w']+", tweetBody)
    keys = tuple(map(lambda x: x.lower(), keys))
    #print("keys:", keys)
    tid = tid.split(":")[2] # remove the prefix string at the beginning of each original tweet id like "tag:search.twitter.com,2005:621952279726305284"  
    for key in keys:
        if key in dictionary:
            dictionary[key].add(tid)
    return keys

#example 1
invertedLists = {key:set() for key in KEYWORDS}
buildInvertedIndex(invertedLists, "111:11:1111111", "we ARe funding unite  tHe ,, best 3person. #hashtag haha3... :)")
buildInvertedIndex(invertedLists, "222:222:2222222", "we ARe funding unite bigot  the ,, best 3person. #hashtag haha3... :)")
print(invertedLists, "\n\n")

#example 2
invertedLists = {key:set() for key in KEYWORDS}
collectedTweets = rdd.map(lambda row: (row[0], row[3])).collect()
for tweet in collectedTweets:
    buildInvertedIndex(invertedLists, tweet[0], tweet[1])

# print the inverted lists
print(list(invertedLists.items())[:3], "\n") # now only print the first 3 keywords and their tids for reading purpose

# print # of occurrences for each keyword
stats = ""
for key,tidSet in invertedLists.items():
    stats += (key + ":" + str(len(tidSet)) + "  ")
print(stats)


{'movement': set(), 'culture': set(), 'harass': set(), 'victory': set(), 'equality': set(), 'threat': set(), 'terrorist': set(), 'ideal': set(), 'gather': set(), 'bigot': {'2222222'}, 'strike': set(), 'opposition': set(), 'racist': set(), 'activist': set(), 'unrest': set(), 'blockade': set(), 'protest': set(), 'agency': set(), 'patriot': set(), 'funding': {'2222222', '1111111'}, 'unite': {'2222222', '1111111'}, 'petition': set(), 'racism': set(), 'centrelink': set(), 'council': set(), 'resident': set(), 'union': set(), 'refugee': set(), 'extremist': set(), 'parade': set(), 'highlight': set(), 'celebration': set(), 'anger': set(), 'crowd': set(), 'poster': set(), 'action': set(), 'barricade': set(), 'organisation': set(), 'anzac': set(), 'halaal': set(), 'federal': set(), 'march': set(), 'decision': set(), 'riot': set(), 'terror': set(), 'muslim': set(), 'placard': set(), 'disturbance': set(), 'rally': set(), 'islam': set(), 'mosque': set(), 'community': set(), 'demonstration': set(), '

# PART 02 - D2D GSR Analysis

# GSR Extract

In [10]:



#pyspark imports
from pyspark import SQLContext
from pyspark.sql import SQLContext, Row
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.functions import udf

#text analytics imports
import nltk
# nltk.download()
from nltk.corpus import stopwords

#generic imports
import json
import pandas
import re


#regex patterns
problemchars = re.compile(r'[\[=\+/&<>;:!\\|*^\'"\?%$.@)°#(_\,\t\r\n0-9-—\]]')
url_finder = re.compile(r'http[s]?:\/\/(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
emojis = re.compile(u'['
    u'\U0001F300-\U0001F64F'
    u'\U0001F680-\U0001F6FF'
    u'\u2600-\u26FF\u2700-\u27BF]+', 
    re.UNICODE)
stop = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
username = re.compile(r'(@)\w+( )')
# hashtag = re.compile(r'#(\w+)')           #maintain hashtags as a 'common word'
redate = re.compile(r'^(?:(?:(?:0?[13578]|1[02])(\/|-|\.)31)\1|(?:(?:0?[1,3-9]|1[0-2])(\/|-|\.)(?:29|30)\2))(?:(?:1[6-9]|[2-9]\d)?\d{2})$|^(?:0?2(\/|-|\.)29\3(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))$|^(?:(?:0?[1-9])|(?:1[0-2]))(\/|-|\.)(?:0?[1-9]|1\d|2[0-8])\4(?:(?:1[6-9]|[2-9]\d)?\d{2})$')



# sql = SQLContext(sc) 

startTime = "2015-01-01T00:00:00+00:00"
endTime   = "2016-05-31T00:00:00+00:00"



#function to filter as RDD and output as Dataframe
def create_dataframe_gsr(startTime, endTime):
    
    
    df_gsr = sqlContext.read.load(format = "au.com.d2dcrc.carbon.spark.gsr", 
                   startTime = startTime, 
                   endTime   = endTime)
    
    print('Original size of dataset: ' + str(df_gsr.count()))
    
    
    #filter and select from original dataframe - Australian stories only
    df_gsr_filt = df_gsr.select(['key:id',
                                 'key:eventDate',
                                 'eventType',
                                 'populationGroup',
                                 'newsSourceName',
                                 'country',
                                 'state',
                                 'city',
                                 'crowdSize',
                                 'isViolent',
                                 'englishHeadline',
                                 'eventDescription'
                                ])\
    .filter(df_gsr.country == 'Australia')\
    .dropna()
    
    print('Filtered size of dataset: ' + str(df_gsr_filt.count()))

    
    #convert to pandas and 'clean' text
    df_gsr_pandas = df_gsr_filt.toPandas()

    df_gsr_pandas['eventDescription_clean'] = df_gsr_pandas['eventDescription'].map(lambda w: stop.sub('', problemchars.sub('', emojis.sub('', url_finder.sub('', username.sub('', w.lower().strip()))))))

    
    #convert back to dataframe
    df_gsr_clean = sqlContext.createDataFrame(df_gsr_pandas)
           

    df_gsr_clean.printSchema()
    
    
    return df_gsr_clean
    
    

%time myDF_GSR = create_dataframe_gsr(startTime, endTime)


myDF_GSR.toPandas().head(10)

Original size of dataset: 3364
Filtered size of dataset: 588
root
 |-- key:id: long (nullable = true)
 |-- key:eventDate: date (nullable = true)
 |-- eventType: string (nullable = true)
 |-- populationGroup: string (nullable = true)
 |-- newsSourceName: string (nullable = true)
 |-- country: string (nullable = true)
 |-- state: string (nullable = true)
 |-- city: string (nullable = true)
 |-- crowdSize: string (nullable = true)
 |-- isViolent: boolean (nullable = true)
 |-- englishHeadline: string (nullable = true)
 |-- eventDescription: string (nullable = true)
 |-- eventDescription_clean: string (nullable = true)

CPU times: user 364 ms, sys: 20 ms, total: 384 ms
Wall time: 21.6 s


Unnamed: 0,key:id,key:eventDate,eventType,populationGroup,newsSourceName,country,state,city,crowdSize,isViolent,englishHeadline,eventDescription,eventDescription_clean
0,47107655216,2015-10-30,Other Social Disruption,Labour,ABC,Australia,WA,Geraldton,small,False,Inmates end rooftop protest at WA prison as un...,Six prisoners at Greenough Regional Prison in ...,six prisoners greenough regional prison gerald...
1,25317655224,2015-10-12,Other Social Disruption,Medical,Greenleft,Australia,VIC,Geelong,small,False,Protest targets Geelong mayor's sexism,A rally was held in Geelong on October 12 to p...,rally held geelong october protest sexism cit...
2,28725655224,2015-07-19,Other Social Disruption,General,SMH,Australia,QLD,Brisbane,large,False,Police separate protesters as Brisbane Reclaim...,\nA strong police presence is separating vocal...,strong police presence separating vocal antiis...
3,64114655216,2015-11-12,Employment and Wages,Labour,ABC,Australia,SA,Kilburn,small,False,TWU protesters threatened with police action w...,A small Transport Workers Union (TWU) rally ta...,small transport workers union twu rally target...
4,84620655224,2015-06-13,Other Social Disruption,General,The age,Australia,VIC,Melbourne,unknown,False,Melbourne marriage equality rally: Love and po...,Human rights commissioner and freedom enthusia...,human rights commissioner freedom enthusiast t...
5,260213825240,2016-02-12,Other Government and Political Issues,General,ABC News,Australia,QLD,Brisbane,large,False,Brisbane's Lady Cilento Children's Hospital re...,About 50 protesters gathered outside the hospi...,protesters gathered outside hospital friday ni...
6,311116825224,2016-01-12,Other Government and Political Issues,General,ABC News,Australia,NSW,Wollongong,large,False,Horde of Kiama residents protest proposed merg...,More than 500 residents packed the Kiama Pavil...,residents packed kiama pavilion tuesday night ...
7,349206825224,2016-02-03,Employment and Wages,Labour,The Sydney Morning Herald,Australia,NSW,Newcastle,small,False,Sacked crew on CSL Melbourne stir new shipping...,16 Crew members on board bulk carrier ship CSL...,crew members board bulk carrier ship csl melb...
8,399305825224,2016-03-02,Other Government and Political Issues,Medical,ABC News,Australia,VIC,Bendigo,large,False,Bendigo disability workers feel betrayed; rall...,Members of a health union rallied in Bendigo o...,members health union rallied bendigo wednesday...
9,491123825224,2016-01-18,Employment and Wages,Labour,ABC News,Australia,SA,Port Adelaide,unknown,False,Tugboats strike delays ships entering and leav...,Strike action by tugboat engineers has delayed...,strike action tugboat engineers delayed ships...


# Topic Modelling Alternative - Scikit
http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html
http://scikit-learn.org/stable/auto_examples/applications/topics_extraction_with_nmf_lda.html

In [11]:
from __future__ import print_function
from time import time
from itertools import chain

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups



# CHANGE PARAMETERS AS REQUIRED
n_samples = 100
n_features = 1000
#n_topics = 3
#n_top_words = 15
n_topics = 10
n_top_words = 30



# create sequence of strings to feed into SKlearn Topic Model
GSR_text = myDF_GSR.select(['eventDescription_clean']).toPandas()
GSR_text = list(chain.from_iterable(GSR_text.values.tolist()))


# output human readable topic content
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()
    


# create list of topic words for labelling
def list_top_words(model, feature_names, n_top_words):
    list_topic = []
    
    for topic_idx, topic in enumerate(model.components_):
        list_mini_topic = []

        for i in topic.argsort()[:-n_top_words - 1:-1]:
            list_mini_topic.append(feature_names[i])
        
        list_topic.append(list_mini_topic)
        
    return list_topic


    
# MODEL: LDA

print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(input='content',
                                #max_df=0.95, 
                                #min_df=2, 
                                max_features=n_features,
                                stop_words='english')

tf = tf_vectorizer.fit_transform(GSR_text)


print("Fitting LDA models with tf features, n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
lda = LatentDirichletAllocation(n_topics=n_topics, 
                                max_iter=5,
                                learning_method='online', 
                                learning_offset=50.,
                                random_state=0).fit(tf)

print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, 
                tf_feature_names, 
                n_top_words)



# build list of lists with top n topic words - used in labelling functions to follow
topic_words = list_top_words(lda, 
                tf_feature_names, 
                n_top_words)

Extracting tf features for LDA...
Fitting LDA models with tf features, n_samples=100 and n_features=1000...

Topics in LDA model:
Topic #0:
match camp roof richmond banner standoff police aboriginal heirisson united island dawn carried activists morning riot thursday collingwood antimuslim boys patriots friday people mcg set victim stop said centre respects
Topic #1:
protest people rally gathered protesters australia group government police workers parliament melbourne hundreds action outside union sydney house community day marched brisbane australian pay rallied members street federal streets strike
Topic #2:
minister prime malcolm turnbull women protest briggs animal maiden week protesters activist dutton friday text held peter act turnbulls abuse duttons road mr activists climbed education immigration melbournes group hit
Topic #3:
marriage hospital equality support rally brisbanes vote royal march na cleaners gathered perth colourful hey st sex crowd streets supporters catholic we

In [12]:
# Check the topic words
print(len(GSR_text))
print("nonzero values: %d" % tf.nnz)
print(tf)



588
nonzero values: 9277
  (0, 673)	1
  (0, 716)	1
  (0, 672)	2
  (0, 307)	1
  (0, 685)	1
  (0, 611)	1
  (0, 107)	1
  (1, 685)	1
  (1, 704)	1
  (1, 420)	1
  (1, 609)	2
  (1, 173)	1
  (1, 513)	1
  (1, 321)	1
  (1, 974)	1
  (1, 883)	1
  (1, 100)	1
  (1, 805)	1
  (1, 389)	1
  (1, 377)	1
  (2, 704)	1
  (2, 864)	1
  (2, 656)	1
  (2, 668)	1
  (2, 794)	1
  :	:
  (586, 993)	1
  (586, 739)	1
  (586, 216)	1
  (586, 252)	1
  (586, 635)	1
  (586, 363)	1
  (587, 685)	2
  (587, 173)	1
  (587, 688)	1
  (587, 710)	1
  (587, 148)	1
  (587, 534)	1
  (587, 142)	1
  (587, 669)	1
  (587, 172)	1
  (587, 860)	1
  (587, 642)	1
  (587, 879)	1
  (587, 76)	1
  (587, 601)	1
  (587, 536)	1
  (587, 407)	1
  (587, 790)	1
  (587, 491)	1
  (587, 515)	1


# Set Label based on number of words in Topic Model word lists

In [4]:
from pyspark.sql.functions import lit

def label_maker_topic(tokens,topic_word_list):
    twt_list = []
    for i in range(0, len(topic_word_list)):
        count = 0
        #print(topic_words[i])
        for tkn in tokens:
            if tkn in topic_word_list[i]:
                count += 1
        twt_list.append(count)
    
    return twt_list
        

%time topicWord=udf(lambda tkn: label_maker_topic(tkn,topic_words),  ArrayType(IntegerType(), True))
#topicWord=udf(lambda tkn: tkn, StringType())
#topicWord = udf(label_maker_topic(,topic_words),StringType())
myDF=myDF.withColumn("topic_word_count",topicWord(myDF.body_TOKENS))

CPU times: user 16 ms, sys: 0 ns, total: 16 ms
Wall time: 27 ms


# Create label based on list of tweet topic word counts + feature with sum of topic words

In [5]:
import numpy as np

def class_label_maker(protest_word_count,topic_model_result):
    
    if protest_word_count >= 1 and np.mean(topic_model_result) > 0:
        return '1'
    else:
        return '0'


    
classLabel=udf(class_label_maker,StringType())
%time myDF=myDF.withColumn("class_label",classLabel(myDF.count_PROTEST,myDF.topic_word_count))

myDF.printSchema()

#myDF.select(myDF["body_TOKENS"],myDF["count_PROTEST"],myDF["topic_word_count"],myDF["class_label"]).show()
#myDF.groupBy('body_TOKENS','count_PROTES','topic_word_count','class_label').count().show()
# myDF.groupBy('class_label').count().show()

CPU times: user 8 ms, sys: 4 ms, total: 12 ms
Wall time: 32.6 ms
root
 |-- id: string (nullable = true)
 |-- actor__id: string (nullable = true)
 |-- actor__displayName: string (nullable = true)
 |-- actor__favoritesCount: string (nullable = true)
 |-- actor__followersCount: string (nullable = true)
 |-- actor__friendsCount: string (nullable = true)
 |-- actor__listedCount: string (nullable = true)
 |-- retweetCount: string (nullable = true)
 |-- actor__postedTime: string (nullable = true)
 |-- body: string (nullable = true)
 |-- body_TOKENS: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- count_PROTEST: integer (nullable = true)
 |-- count_CITY: integer (nullable = true)
 |-- has_DATE: integer (nullable = true)
 |-- count_DOW: integer (nullable = true)
 |-- count_ACTION: integer (nullable = true)
 |-- count_NEGATIVE: integer (nullable = true)
 |-- count_RELIGION: integer (nullable = true)
 |-- geo__coordinates: string (nullable = true)
 |-- location__count

KeyboardInterrupt: 

# Logistic Regression - Spark 

In [6]:
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel
from pyspark.mllib.regression import LabeledPoint

In [8]:
def DF_creator(myDF_split, variable_list):
    
    myDF_split_XY = myDF_split.select(variable_list)
    
    return myDF_split_XY
    


# list of PREDICTOR VARIABLES
variables =  ['actor__favoritesCount',
                'actor__followersCount',
                'actor__friendsCount',
                'actor__listedCount',
                'retweetCount',
                'count_PROTEST',
                'count_CITY',
                'has_DATE',
                'count_DOW', 
                'count_ACTION',
                'count_NEGATIVE',
                'count_RELIGION',
                'gnip__klout_score',
                'positiveSentiment',
                'negativeSentiment',
                'has_politicalparty',
                'class_label']

###TRAIN DATASET
myDF_selected = DF_creator(myDF, variables)

In [9]:
myDF_selected.printSchema()

root
 |-- actor__favoritesCount: string (nullable = true)
 |-- actor__followersCount: string (nullable = true)
 |-- actor__friendsCount: string (nullable = true)
 |-- actor__listedCount: string (nullable = true)
 |-- retweetCount: string (nullable = true)
 |-- count_PROTEST: integer (nullable = true)
 |-- count_CITY: integer (nullable = true)
 |-- has_DATE: integer (nullable = true)
 |-- count_DOW: integer (nullable = true)
 |-- count_ACTION: integer (nullable = true)
 |-- count_NEGATIVE: integer (nullable = true)
 |-- count_RELIGION: integer (nullable = true)
 |-- gnip__klout_score: string (nullable = true)
 |-- positiveSentiment: integer (nullable = true)
 |-- negativeSentiment: integer (nullable = true)
 |-- has_politicalparty: string (nullable = true)
 |-- class_label: string (nullable = true)



# Split Data

In [None]:
train_data_set, test_data_set =  myDF_selected.randomSplit([0.8, 0.2])

train_count = train_data_set.count()
print('Training set:'+str(train_count))
test_count = test_data_set.count()
print('Testing set:'+str(test_count))

print('Training dataset label profile:')
train_data_set.groupBy('class_label').count().show()
    
print('Test dataset label profile:')
test_data_set.groupBy('class_label').count().show()


# Convert Data - Label Point

In [11]:
# Load and parse the data
def parsePoint(dfline):
    
    return LabeledPoint(dfline[16], dfline[:16])

DF_training = train_data_set.map(lambda line: parsePoint(line))
DF_testing = test_data_set.map(lambda line: parsePoint(line))

# Model Creation

In [None]:
%time modelLR = LogisticRegressionWithLBFGS.train(DF_training,
                                           intercept=1)

print('Learned LogisticRegressionModel:')
print('\t Intercept: %g' % modelLR.intercept)
print('\t Feature\tWeight')
for i in range(len(variables)-1):
    print('\t %s\t\t%g' % (variables[i], modelLR.weights[i]))

In [None]:
modelLR.save(sc, "INFS5099")
modelLR_load = LogisticRegressionModel.load(sc, "INFS5099")

In [24]:
# Evaluating the model on training data
labelsAndPreds = DF_testing.map(lambda p: (p.label, float(modelLR.predict(p.features))))

In [25]:
labelsAndPreds.take(20)

[(0.0, 0.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (0.0, 0.0)]

In [None]:
predictions  = labelsAndPreds.collect()

In [26]:
print("Summary Statistics for the Overall Model \n")
accuracy = 1.0 * labelsAndPreds.filter(lambda result_line: result_line[0] == result_line[1]).count() / test_count #DF_testing.count()
print("Accuracy of model = %0.3f" %accuracy)
trainErr = labelsAndPreds.filter(lambda result_line: result_line[0] != result_line[1]).count() / test_count #float(DF_testing.count())
print("Training Error = %0.3f" %trainErr)

Summary Statistics for the Overall Model 

Accuracy of model = 0.986
Training Error = 0.014


In [22]:
modelLR.threshold

0.5

# Validation

In [19]:
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.mllib.util import MLUtils

In [20]:
metrics_1 = BinaryClassificationMetrics(labelsAndPreds)

# Area under precision-recall curve
print("Area under PR = %0.3f" % metrics_1.areaUnderPR)

# Area under ROC curve
print("Area under ROC = %0.3f" % metrics_1.areaUnderROC)

Area under PR = 0.835
Area under ROC = 0.654


In [21]:
matrix = MulticlassMetrics(labelsAndPreds)
matrix.confusionMatrix().toArray()

array([[ 35.,  12.],
       [ 41.,  53.]])

In [23]:
precision = matrix.precision()
recall = matrix.recall()
f1Score = matrix.fMeasure()
print("Precision = %0.3f" % precision)
print("Recall = %0.3f" % recall)
print("F1 Score = %0.3f" % f1Score)

Precision = 0.624
Recall = 0.624
F1 Score = 0.624


In [30]:
modelLR.clearThreshold()
labelAndProbs = DF_testing.map(lambda p: (p.label, modelLR.predict(p.features)))

In [32]:
labelAndProbs.take(20)

[(0.0, 0.0007813295136940701),
 (0.0, 0.008161574838311493),
 (0.0, 0.010885576908843097),
 (0.0, 0.006490605225947802),
 (0.0, 0.006086685443789945),
 (0.0, 0.0019480959456905694),
 (0.0, 0.04057505796660552),
 (0.0, 0.0001664672902922221),
 (0.0, 0.07969214033875202),
 (0.0, 0.011476845996616981),
 (0.0, 0.0013084367570740038),
 (0.0, 0.00215536471832972),
 (0.0, 0.001815567497571877),
 (0.0, 0.04813760460406153),
 (0.0, 0.04827637919284899),
 (0.0, 0.0015965951682102878),
 (0.0, 0.009361603253084263),
 (0.0, 0.006397516737990364),
 (0.0, 0.022186662136991626),
 (0.0, 0.028314095751258126)]

In [31]:
probabilities = labelAndProbs.collect()

In [33]:
probabilities

[(0.0, 0.0007813295136940701),
 (0.0, 0.008161574838311493),
 (0.0, 0.010885576908843097),
 (0.0, 0.006490605225947802),
 (0.0, 0.006086685443789945),
 (0.0, 0.0019480959456905694),
 (0.0, 0.04057505796660552),
 (0.0, 0.0001664672902922221),
 (0.0, 0.07969214033875202),
 (0.0, 0.011476845996616981),
 (0.0, 0.0013084367570740038),
 (0.0, 0.00215536471832972),
 (0.0, 0.001815567497571877),
 (0.0, 0.04813760460406153),
 (0.0, 0.04827637919284899),
 (0.0, 0.0015965951682102878),
 (0.0, 0.009361603253084263),
 (0.0, 0.006397516737990364),
 (0.0, 0.022186662136991626),
 (0.0, 0.028314095751258126),
 (0.0, 0.001995103641520666),
 (0.0, 0.0016833893445857048),
 (0.0, 0.013242391553350016),
 (0.0, 0.005417573190981347),
 (0.0, 0.2289518541745043),
 (0.0, 0.007765820243489427),
 (0.0, 0.029192512677676256),
 (0.0, 0.006386840310710083),
 (0.0, 0.0030813686576654188),
 (0.0, 0.0026606551435090914),
 (0.0, 0.0008072119625092177),
 (0.0, 0.03926393067552028),
 (0.0, 0.00046524140243617767),
 (0.0, 

In [48]:
probabilities,labelsAndPreds

#myDF_GSR.toPandas().to_csv('d2d_gsr_LABEL_extract(17Jul15).csv')
#myDF.toPandas().to_csv('d2d_tweet_LABEL_extract(17Jul15).csv')

([(0.0, 0.0007813295136940701),
  (0.0, 0.008161574838311493),
  (0.0, 0.010885576908843097),
  (0.0, 0.006490605225947802),
  (0.0, 0.006086685443789945),
  (0.0, 0.0019480959456905694),
  (0.0, 0.04057505796660552),
  (0.0, 0.0001664672902922221),
  (0.0, 0.07969214033875202),
  (0.0, 0.011476845996616981),
  (0.0, 0.0013084367570740038),
  (0.0, 0.00215536471832972),
  (0.0, 0.001815567497571877),
  (0.0, 0.04813760460406153),
  (0.0, 0.04827637919284899),
  (0.0, 0.0015965951682102878),
  (0.0, 0.009361603253084263),
  (0.0, 0.006397516737990364),
  (0.0, 0.022186662136991626),
  (0.0, 0.028314095751258126),
  (0.0, 0.001995103641520666),
  (0.0, 0.0016833893445857048),
  (0.0, 0.013242391553350016),
  (0.0, 0.005417573190981347),
  (0.0, 0.2289518541745043),
  (0.0, 0.007765820243489427),
  (0.0, 0.029192512677676256),
  (0.0, 0.006386840310710083),
  (0.0, 0.0030813686576654188),
  (0.0, 0.0026606551435090914),
  (0.0, 0.0008072119625092177),
  (0.0, 0.03926393067552028),
  (0.0,

In [None]:
#def toCSVLine(data):
 # return ','.join(str(d) for d in data)

#lines = probabilities.map(toCSVLine)
#lines.saveAsTextFile('probabilities.csv')

In [49]:
labelsAndPreds

PythonRDD[1149] at RDD at PythonRDD.scala:53

In [59]:
myDF_GSR.toPandas().to_csv('d2d_gsr_LABEL_extract(28may16-myDF).csv')
myDF.toPandas().to_csv('d2d_tweet_LABEL_extract(28may16-myDF).csv')

In [57]:
myDF_selected.toPandas().to_csv('d2d_gsr_LABEL_extract(28may16).csv')
myDF_selected.toPandas().to_csv('d2d_tweet_LABEL_extract(28may16).csv')

# Logistic Regression - Sklearn

In [34]:
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.metrics import (brier_score_loss, precision_score, recall_score,
                             f1_score)
from sklearn import metrics
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score
from sklearn.feature_selection import chi2

# #Split Data

In [35]:
def train_test_splitter(myDF, n_train, n_test):
    
    myDF_train, myDF_test = myDF.randomSplit([n_train, n_test], seed=0)
                                                                                   
                                              
    #print('Size of training dataset by labelled data: ' + str(myDF_train.count()))
    #print('Size of test dataset by labelled data: ' + str(myDF_test.count()))
       
    #print('Training dataset label profile:')
    #myDF_train.groupBy('class_label').count().show()
    
    #print('Test dataset label profile:')
    #myDF_test.groupBy('class_label').count().show()
                         
    return myDF_train, myDF_train
                                              

%time myDF_train, myDF_test = train_test_splitter(myDF, 0.6, 0.4)
                                              


CPU times: user 0 ns, sys: 4 ms, total: 4 ms
Wall time: 32 ms


In [37]:
myDF_train.show()
#myDF_test.groupBy('class_label').count()

+--------------------+--------------------+--------------------+---------------------+---------------------+-------------------+------------------+------------+--------------------+--------------------+--------------------+-------------+----------+--------+---------+------------+--------------+--------------+--------------------+----------------------+---------------------+---------------+----------------------------+-----------------+--------------------+--------------------------+-------------------------------+------------+-----------+--------------+-----------+-----------------+-----------------+------------------+------------------+-----------+
|                  id|           actor__id|  actor__displayName|actor__favoritesCount|actor__followersCount|actor__friendsCount|actor__listedCount|retweetCount|   actor__postedTime|                body|         body_TOKENS|count_PROTEST|count_CITY|has_DATE|count_DOW|count_ACTION|count_NEGATIVE|count_RELIGION|    geo__coordinates|location__c

In [38]:
# FUCNTION TO SPLIT X and Y variables for LOGIT
def X_Y_creator(myDF_split, variable_list):
    
    myDF_split_XY = myDF_split.select(variable_list)
    
    #myDF_split_XY = myDF_split_XY.toPandas().as_matrix()
    
    return myDF_split_XY
    
def convert_pd(df):
    myDF_split_XY = df.toPandas().as_matrix()
    
    return myDF_split_XY

# list of PREDICTOR VARIABLES
variables_x =  ['actor__favoritesCount',
                'actor__followersCount',
                'actor__friendsCount',
                'actor__listedCount',
                'retweetCount',
                'count_PROTEST',
                'count_CITY',
                'has_DATE',
                'count_DOW', 
                'count_ACTION',
                'count_NEGATIVE',
                'count_RELIGION',
                'gnip__klout_score',
                'has_politicalparty',
                'positiveSentiment',
                'negativeSentiment',
                'has_politicalparty']

###TRAIN DATASET
#myDF_train_x = X_Y_creator(myDF_train, variables_x)
myDF_train = convert_pd(train_data_set)

#myDF_train_y = X_Y_creator(myDF_train, ['class_label'])

### TEST DATASET
#myDF_test_x = X_Y_creator(myDF_test, variables_x)

#myDF_test_y = X_Y_creator(myDF_test, ['class_label'])


###LOGISTIC REGRESSION TYPES - uncomment for standard vs. LASSO
#logit_1 = LogisticRegression(fit_intercept=True,
#                                          solver='sag'
#                                         )

###LASSO
###logit_1 = Lasso(fit_intercept=True )


###fit model to training data
#%time logit_1 = logit_1.fit(myDF_train_x, np.ravel(myDF_train_y))


###pedictions
#predicted = logit_1.predict(myDF_test_x)
#print(predicted)

###prediction probability - comment this out for LASSO
#probabilities = logit_1.predict_proba(myDF_test_x)      #printing 20 probabilities only
#probabilities_print = logit_1.predict_proba(myDF_test_x[0:20])      #printing 20 probabilities only
#print(probabilities_print)


### generate evaluation metrics
#print('\n\nEVALUATION METRICS')
#print('\nLogit COEFFICIENT OF DETERMINATION (R^2):\n')
### NOTE: If the chosen model fits worse than a horizontal line, then R2 is negative
#print(metrics.r2_score(np.ravel(myDF_test_y), predicted))
#print('\nLogit ACCURACY:\n')
#print(metrics.accuracy_score(np.ravel(myDF_test_y), predicted))
#print('\nLogit ROC/AUC SCORE:\n')                                        
#print(metrics.roc_auc_score(np.ravel(myDF_test_y), probabilities[:, 1]))
#print('\nLogit CONFUSION MATRIX:\n')
#print(metrics.confusion_matrix(np.ravel(myDF_test_y), predicted))
#print('\nLogit CLASSIFICATION REPORT:\n')
#print(metrics.classification_report(np.ravel(myDF_test_y), predicted))

###p-values - manual calculation as Scikit doesn't offer 'summary' output
#scores, pvalues = chi2(myDF_train_x, np.ravel(myDF_train_y))
#pvalues=["{0:.7f}".format(x)for x in pvalues] 

###examine coefficients
#logit_summary = pd.DataFrame(list(zip(variables_x, 
#                      np.transpose(logit_1.coef_),
#                      np.transpose(pvalues))))

#logit_summary.columns = ['variable', 'coefficient', 'p-value']

#logit_summary

In [None]:
#myDF_train_x.show()

# TEST DATA