In [1]:
import string
import json 
import re
import nltk
import pandas as pd
import datetime as dt
import time

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer

from pyspark import SparkContext
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import NaiveBayes
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.mllib.util import MLUtils

In [2]:
sqlContext = SQLContext(sc)

#global variables for the tokenize function 
PUNCTUATION = set(string.punctuation)
STOPWORDS = set(stopwords.words('english'))
STEMMER = PorterStemmer()
LEMMER = WordNetLemmatizer()
tweet_tokenizer = TweetTokenizer()

In [3]:
htf = HashingTF(50000) #vectorize the hash with 50k features. Determines the size of a sparse vector

# GSR Records

Load GSR records

In [4]:
#load gsr web article data in json format
def load_gsr_articles():
    #raw_data = sc.textFile("hdfs:///gsr.json") #includes months data 
    raw_data = sc.textFile("hdfs:///gsr3.json") #only english
    data = raw_data.map(lambda line: json.loads(line))
    return data

Following sampling methods were done to balance the population groups
If the population is balance there is no need of a sampling method. Data can be identified as labels and features

In [5]:
#The articles should be balanced for populations. 
#This function will up-sample the minority in gsr3 records. Where "general" and "labor" are a majority.
#minorityFraction was calculated using number of articles for 'labour' and 'education'.
#Becaues 'labour' has the lowest frequency in majority classes and 'education' has the highest of the minority classes
#This fraction will not oversample the minority class (similar case for down-sampling)
def upsample_minority(df):
    
    #Identify the labels and content for articles. 
    data_pared = df.map(lambda line: {'populationGroup': line['populationGroup'], 'article': line['article']})
    
    labour_class = data_pared.filter(lambda line: 'lab' in line['populationGroup'].lower()).count()
    edu_class = data_pared.filter(lambda line: 'education' in line['populationGroup'].lower()).count()
    
    minorityFraction = labour_class / float(edu_class)

    genaralSample = data_pared.filter(lambda line: 'general' in line['populationGroup'].lower())
    labourSample = data_pared.filter(lambda line: 'lab' in line['populationGroup'].lower())
    MajoritySample = genaralSample.union(labourSample)

    MinoritySample = data_pared.filter(lambda line: 'general' not in line['populationGroup'].lower())\
    .filter(lambda line: 'lab' not in line['populationGroup'].lower()).sample(withReplacement=True, fraction=minorityFraction)

    balancedData = MajoritySample.union(MinoritySample)
    return balancedData

In [6]:
#down sampling
#This function will down-sample the majority in gsr3 records. Where "general" and "labor" are a majority.
def downsample_majority(df):
    
    data_pared = df.map(lambda line: {'populationGroup': line['populationGroup'], 'article': line['article']})
    
    labour_class = data_pared.filter(lambda line: 'lab' in line['populationGroup'].lower()).count()
    edu_class = data_pared.filter(lambda line: 'education' in line['populationGroup'].lower()).count()
    
    majorityFraction = float(edu_class)/labour_class 

    genaralSample = data_pared.filter(lambda line: 'general' in line['populationGroup'].lower())\
                            .sample(withReplacement=True, fraction=majorityFraction)
    labourSample = data_pared.filter(lambda line: 'lab' in line['populationGroup'].lower())\
                            .sample(withReplacement=True, fraction=majorityFraction)
        
    MajoritySample = genaralSample.union(labourSample)

    MinoritySample = data_pared.filter(lambda line: 'general' not in line['populationGroup'].lower())\
    .filter(lambda line: 'lab' not in line['populationGroup'].lower())

    balancedData = MajoritySample.union(MinoritySample)

    return balancedData

Tokenize content to a bag of words, extract features. And label population groups neumerically

In [7]:
# Function to break text into "tokens", lowercase them, remove punctuation and stopwords, and lemmatize
def tokenize(text):
    #load nltk dictionaries to cluster. temporary 
    nltk.data.path.append("/local/hdfs-volume/data/nltk_data")
    
    tokens = word_tokenize(text)
    lowercased = [t.lower() for t in tokens]
    no_punctuation = []
    for word in lowercased:
        punct_removed = ''.join([letter for letter in word if not letter in PUNCTUATION and not letter.isdigit()])
        no_punctuation.append(punct_removed)
    no_stopwords = [w for w in no_punctuation if not w in STOPWORDS]
    #stemmed = [STEMMER.stem(w) for w in no_stopwords]
    stemmed = [LEMMER.lemmatize(w) for w in no_stopwords]
    return [w for w in stemmed if w]


#label classes for Naive Bayse considering multiple targets
#all population groups should be converted to a numeric format to generate the sparce vector
#therefore, the 10 poppulation group were assigned with a number 1-10
def lable_classes(pop_item):
    #population = pop_item.lower()
    if "education" in pop_item.lower():
        pop_item = 1
    elif "general" in pop_item.lower():
        pop_item = 2
    elif "legal" in pop_item.lower():
        pop_item = 3
    elif re.compile('bus[iness|siness]').match(pop_item.lower()):
        pop_item = 4
    elif re.compile('eth[nic|ic]').match(pop_item.lower()):
        pop_item = 5
    elif "medical" in pop_item.lower():
        pop_item = 6  
    elif "religious" in pop_item.lower():
        pop_item = 7  
    elif re.compile('agricul[tural|ture]').match(pop_item.lower()):
        pop_item = 8
    elif re.compile('lab[or|our]').match(pop_item.lower()):
        pop_item = 9
    elif "media" in pop_item.lower():
        pop_item = 10
    return pop_item



#extract intrested features from article content for population group
#input: gsr article data in json format
#output: population group and article content
def tokenize_articles(data_paired):
    
    data_cleaned = data_paired.map(lambda pared_line: {'populationGroup': lable_classes(pared_line['populationGroup']),'article':tokenize(pared_line['article'])})
    return data_cleaned

# Twitter Data

Twitter feature extraction

In [8]:
#extract meaningful words as features from twitter
def feature_extraction(tokens):
    nltk.data.path.append("/local/hdfs-volume/data/nltk_data")
    
    lowercased = [t.lower() for t in tokens]
    no_punctuation = []
    for word in lowercased:
        punct_removed = ''.join([letter for letter in word if not letter in PUNCTUATION and not letter.isdigit()])
        no_punctuation.append(punct_removed)
    no_stopwords = [w for w in no_punctuation if not w in STOPWORDS]
    #stemmed = [STEMMER.stem(w) for w in no_stopwords]
    stemmed = [LEMMER.lemmatize(w) for w in no_stopwords]
    no_links = [w for w in stemmed if (not 'http' in w) and len(w)>2]
    return [w for w in no_links if w]

Twitter Data for Training

In [22]:
def load_tweets(tweets_data_path):
    
    tweets_data = []
    tweets_file = open(tweets_data_path, "r")
    for line in tweets_file:
        try:
            tweet = json.loads(line)
            #print(tweet['text'])
            tweets_data.append(tweet)
        except:
            continue
    return tweets_data



#load labeled tweets
business_data = sc.parallelize(load_tweets("Tweets_NB/business_tweet.txt"))
agri_data = sc.parallelize(load_tweets("Tweets_NB/agri_tweet.txt"))
edu_data = sc.parallelize(load_tweets("Tweets_NB/education_tweet.txt"))
ethic_data = sc.parallelize(load_tweets("Tweets_NB/ethic_tweet.txt"))
labour_data = sc.parallelize(load_tweets("Tweets_NB/labour_tweet.txt"))
medical_data = sc.parallelize(load_tweets("Tweets_NB/medical_tweet.txt"))
religious_data = sc.parallelize(load_tweets("Tweets_NB/religious_tweet.txt"))
general_data = sc.parallelize(load_tweets("Tweets_NB/general_tweet.txt"))

In [23]:
#create a twitter dataset including labels and features for each tweet
agri_features = agri_data.map(lambda row: row.get('text',None)).map(lambda line: tweet_tokenizer.tokenize(line))\
.map(lambda tokens: {"article":feature_extraction(tokens),"populationGroup":8})
business_features = business_data.map(lambda row: row.get('text',None)).map(lambda line: tweet_tokenizer.tokenize(line))\
.map(lambda tokens: {"article":feature_extraction(tokens),"populationGroup":4})
edu_features = edu_data.map(lambda row: row.get('text',None)).map(lambda line: tweet_tokenizer.tokenize(line))\
.map(lambda tokens: {"article":feature_extraction(tokens),"populationGroup":1})
ethic_features = ethic_data.map(lambda row: row.get('text',None)).map(lambda line: tweet_tokenizer.tokenize(line))\
.map(lambda tokens: {"article":feature_extraction(tokens),"populationGroup":5})
labour_features = labour_data.map(lambda row: row.get('text',None)).map(lambda line: tweet_tokenizer.tokenize(line))\
.map(lambda tokens: {"article":feature_extraction(tokens),"populationGroup":9})
medical_features = medical_data.map(lambda row: row.get('text',None)).map(lambda line: tweet_tokenizer.tokenize(line))\
.map(lambda tokens: {"article":feature_extraction(tokens),"populationGroup":6})
religious_features = religious_data.map(lambda row: row.get('text',None)).map(lambda line: tweet_tokenizer.tokenize(line))\
.map(lambda tokens: {"article":feature_extraction(tokens),"populationGroup":7})
general_features = general_data.map(lambda row: row.get('text',None)).map(lambda line: tweet_tokenizer.tokenize(line))\
.map(lambda tokens: {"article":feature_extraction(tokens),"populationGroup":2})


labeled_twitter_data = business_features.union(edu_features.union(ethic_features.union(labour_features.union(agri_features))))
labeled_twitter_data  = labeled_twitter_data.union(general_features.union(medical_features.union(religious_features)))

In [24]:
labeled_twitter_data.count()

800

# Model 

In [12]:
#transform extracted features to a sparse vector (hashing)
#input: rdd of label(population group) and features(extracted words) for each article or tweet 
def transform_data(data_cleaned):
    #vectorize the hash with 50k features. Determines the size of a sparse vector
    #htf = HashingTF(50000)
    
    data_hashed = data_cleaned.map(lambda dict_line: LabeledPoint(dict_line['populationGroup'], htf.transform(dict_line['article'])))
    
    return data_hashed    

In [13]:
def train_model(model_data):

    model = NaiveBayes.train(model_data)
    return model


#inputs: train and test data (vector in label feture format)
#output: returns a list of labels predicted for tweets  (population group predicted, default label) 
def predict_tweets(train_data, test_data):
    
    NB_model = train_model(train_data)
    
    prediction_and_label = test_data.map(lambda p : (float(NB_model.predict(p.features)), p.label))
    
    return prediction_and_label

In [14]:
#obtain metrics based on model predictins
#input: model output of list with predicted and initial label
#output: metrics object
def get_model_metrics(predict_and_label):
    
    return MulticlassMetrics(predict_and_label)


#Obtain the confusion matrics in a array
#input: metric object created from model output
#output: confusion metrics
def get_confusionMatrix(metrics):
    
    return metrics.confusionMatrix().toArray()

In [15]:
#This function is to relabel the numeric class labels to user friendly format.
#all labels were transformed to a numeric format to generate the sparse vector to get the hash
def rename_label(label):
    
    if label==1:
        label="education"
    elif label==2:
        label="general"
    elif label==3:
        label="legal"
    elif label==4:
        label="business"
    elif label==5:
        label="ethnic"
    elif label==6:
        label="medical"
    elif label==7:
        label="religious"
    elif label==8:
        label="agricultural"
    elif label==9:
        label="labour"
    elif label==10:
        label="media"
        
    return label

In [4]:
def NaiveBayes_Model():
    
    data = load_gsr_articles()
    data_paired = upsample_minority(data)
    #data_paired = downsample_majority(data)
    data_cleaned = tokenize_articles(data_paired)
    
    #### method 1 ####
    #twitter_train_sample, twitter_test_sample = labeled_twitter_data.randomSplit([0.8,0.2])
    #data_set = data_cleaned.union(twitter_train_sample) 
    #train_data = transform_data(data_set)
    #test_data = transform_data(twitter_test_sample)
    
    #### method 2 ####
    twitter_data = transform_data(labeled_twitter_data) #only tweets as taining and test set
    train_data, test_data =  twitter_data.randomSplit([0.8, 0.2])
    
    
    #### method 3 ####
    #data_set = data_cleaned.union(labeled_twitter_data)   #combine gsr and tweets
    #data_set_hashed = transform_data(data_set)
        
    #train_data, test_data = data_set_hashed.randomSplit([0.8, 0.2])    #split training and test data
    
    
    
    #### method 4 ####
    #train_data=transform_data(data_cleaned)   #only gsr data as training set
    #test_data = transform_data(labeled_twitter_data)   #only tweets as test set
  
      
    ##################
        
    #apply hashed test and training data to the model
    prediction_label = predict_tweets(train_data, test_data)
    
    metrics = get_model_metrics(prediction_label)
    
    confusion_matrix = get_confusionMatrix(metrics)
    print(confusion_matrix)

    print("\n Summary Statistics for the Overall Model \n")
    accuracy = 1.0 * prediction_label.filter(lambda result_line: result_line[0] == result_line[1]).count() / test_data.count()
    print("Accuracy of model = %0.2f" %accuracy)
    
    precision = metrics.precision()
    recall = metrics.recall()
    f1Score = metrics.fMeasure()
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)   
    
    labels = test_data.map(lambda lp: lp.label).distinct().collect()
    print("\n Summary Statistics for Each Tested Class \n")
    for label in sorted(labels):
        print("Class %s precision = %s" % (rename_label(label), metrics.precision(label)))
        print("Class %s recall = %s" % (rename_label(label), metrics.recall(label)))
        print("Class %s F1 Measure = %s" % (rename_label(label), metrics.fMeasure(label, beta=1.0)))
        print("\n")

 

In [2]:
NaiveBayes_Model() # gsr+twitter / twitter

NameError: name 'load_gsr_articles' is not defined

In [3]:
NaiveBayes_Model() #twitter/twitter (including general)

NameError: name 'load_gsr_articles' is not defined

In [98]:
NaiveBayes_Model() #twitter/twitter (no general)

[[ 18.   0.   0.   2.   2.   2.   0.   0.]
 [  0.   0.   1.   0.   0.   0.   0.   0.]
 [  0.   0.  18.   1.   1.   0.   1.   2.]
 [  2.   0.   0.  18.   2.   0.   0.   0.]
 [  1.   0.   0.   1.  15.   0.   0.   0.]
 [  2.   0.   0.   2.   1.  15.   0.   1.]
 [  0.   0.   0.   0.   1.   0.  22.   1.]
 [  2.   0.   0.   1.   1.   0.   0.  15.]]

 Summary Statistics for the Overall Model 

Accuracy of model = 0.80
Precision = 0.8013245033112583
Recall = 0.8013245033112583
F1 Score = 0.8013245033112583

 Summary Statistics for Each Tested Class 

Class education precision = 0.72
Class education recall = 0.75
Class education F1 Measure = 0.7346938775510204


Class general precision = 0.0
Class general recall = 0.0
Class general F1 Measure = 0.0


Class business precision = 0.9473684210526315
Class business recall = 0.782608695652174
Class business F1 Measure = 0.8571428571428571


Class ethnic precision = 0.72
Class ethnic recall = 0.8181818181818182
Class ethnic F1 Measure = 0.765957446808

In [20]:
def NaiveBayes_Model04():
    
    data = load_gsr_articles()
    data_paired = upsample_minority(data)
    #data_paired = downsample_majority(data)
    data_cleaned = tokenize_articles(data_paired)
    
    #### method 1 ####
    #twitter_train_sample, twitter_test_sample = labeled_twitter_data.randomSplit([0.8,0.2])
    #data_set = data_cleaned.union(twitter_train_sample) 
    #train_data = transform_data(data_set)
    #test_data = transform_data(twitter_test_sample)
    
    #### method 2 ####
    #twitter_data = transform_data(labeled_twitter_data) #only tweets as taining and test set
    #train_data, test_data =  twitter_data.randomSplit([0.8, 0.2])
    
    
    #### method 3 ####
    data_set = data_cleaned.union(labeled_twitter_data)   #combine gsr and tweets
    data_set_hashed = transform_data(data_set)
        
    train_data, test_data = data_set_hashed.randomSplit([0.8, 0.2])    #split training and test data

      
    ##################
        
    #apply hashed test and training data to the model
    prediction_label = predict_tweets(train_data, test_data)
    
    metrics = get_model_metrics(prediction_label)
    
    confusion_matrix = get_confusionMatrix(metrics)
    print(confusion_matrix)

    print("\n Summary Statistics for the Overall Model \n")
    accuracy = 1.0 * prediction_label.filter(lambda result_line: result_line[0] == result_line[1]).count() / test_data.count()
    print("Accuracy of model = %0.2f" %accuracy)
    
    precision = metrics.precision()
    recall = metrics.recall()
    f1Score = metrics.fMeasure()
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)   


In [22]:
NaiveBayes_Model04() #gsr+tweet/gsr+tweet

[[ 35.   5.   0.   5.   0.   0.   0.   0.]
 [  3.  12.   0.   0.   0.   0.   0.   0.]
 [  5.   1.  25.   1.   0.   1.   1.   2.]
 [  6.   8.   4.  18.   1.   1.   0.   2.]
 [  5.   4.   5.   0.  18.   1.   0.   2.]
 [  4.   4.   1.   0.   0.   9.   2.   0.]
 [  2.   1.   6.   1.   0.   0.  17.   1.]
 [  6.   5.   0.   0.   2.   0.   0.  28.]]

 Summary Statistics for the Overall Model 

Accuracy of model = 0.62
Precision = 0.6230769230769231
Recall = 0.6230769230769231
F1 Score = 0.6230769230769231


# Make Predictions

Tweets for predictions using the models

In [33]:
#class LoadTwitterData:
    
def get_data_bydate():
    day = dt.datetime(2015,10,10)
    day_ago = day - dt.timedelta(days=1)
    
    end_time = time.mktime(day.timetuple())
    start_time = time.mktime(day_ago.timetuple())
    
    return sqlContext.read.load(format = "au.com.d2dcrc.carbon.spark.tweets", startTimestamp = int(start_time), endTimestamp = int(end_time))

#returns tweets based on number of hours provided
def get_twitter_data(hrs):
    time_now = dt.datetime.now()
    hour_ago = time_now - dt.timedelta(hours=hrs)
    #print(time_now,hour_ago)

    end_time = time.mktime(time_now.timetuple())
    start_time = time.mktime(hour_ago.timetuple())
    #print(start_time,end_time)
    
    return sqlContext.read.load(format = "au.com.d2dcrc.carbon.spark.tweets", startTimestamp = int(start_time), endTimestamp = int(end_time))

In [34]:
#extract english tweet text from loaded json data
def extract_tweet_body(df):
    
    body_text = df.map(lambda row: row.data).map(lambda data: json.loads(data))\
    .filter(lambda line_tuple: 'en' in line_tuple['twitter_lang']).map(lambda line:line.get('body',None))
    
    return body_text   
    
    
#preperation of label feature structure using tweet features, for sparse vector
def extract_tweet_features(tweet_body):
    
    body_tokens = tweet_body.map(lambda line: tweet_tokenizer.tokenize(line)) #tokenize twitter text
    #extract twitter features and assigne a default population label
    twitter_features = body_tokens.map(lambda token_list: {"article":feature_extraction(token_list),"populationGroup":0}) 
   
    return twitter_features

In [35]:
#map labeled populations to text
def _rename_label(item):
    
    if item[0]==1:
        item[1]="education"
    elif item[0]==2:
        item[1]="general"
    elif item[0]==3:
        item[1]="legal"
    elif item[0]==4:
        item[1]="business"
    elif item[0]==5:
        item[1]="ethnic"
    elif item[0]==6:
        item[1]="medical"
    elif item[0]==7:
        item[1]="religious"
    elif item[0]==8:
        item[1]="agricultural"
    elif item[0]==9:
        item[1]="labour"
    elif item[0]==10:
        item[1]="media"
        
    return item


#format predicted labels (population groups) to text
def get_predictions(predictions):
    answer = predictions.map(lambda l: _rename_label(list(l))).map(lambda x: x[1])
    #answer = predictions.map(lambda l: _rename_label(list(l))).map(lambda x: tuple(x))#.zipWithIndex().map(lambda tuple_set: (tuple_set[1],tuple_set[0]))
    return answer

def display(tweet_body,indexed_answer):
    return tweet_body.zip(indexed_answer)

In [36]:
def Predict_tweets_main():
    
    data = load_gsr_articles()
    data_paired = upsample_minority(data)
    data_cleaned = tokenize_articles(data_paired)
    
    #### method 1 ####
    #predict tweets on model created with gasr + labeled tweets
    data_set = data_cleaned.union(labeled_twitter_data)   #combine gsr and labeled tweets
    train_data = transform_data(data_set)
    
    #### method 2 ####
    #### predict tweets on model created with twitter data
    #train_data = transform_data(labeled_twitter_data)
    
    df = get_twitter_data(1) #load tweets within past hour
    tweet_body = extract_tweet_body(df)
    twitter_data = extract_tweet_features(tweet_body)
    
    test_data = transform_data(twitter_data)
    
    predictions = predict_tweets(train_data, test_data)
    indexed_answer = get_predictions(predictions)

    joined_ans = display(tweet_body,indexed_answer)
    print(joined_ans.take(20))

In [130]:
Predict_tweets_main()

[('RT @smoshanthony: I voted for @mielmonster in the #ShortyAwards because she brings abundant happiness and laughter: https://t.co/UaC90gdazE', 'general'), ('RT @MaitlandBusines: Chamber Chat: Chamber season  kicks off with a big 2016 planned https://t.co/XP8OkNZWrs via @maitlandmercury', 'labour'), ('@TairyGreene01 that sounds to liberal for me.', 'agricultural'), ('LISTEN: @SenBrettMason has an #EqualityCalling msg from QLD: https://t.co/hMKMfYYB1j | Supporters call: 1300663679', 'business'), ('TWENTY WAN https://t.co/BoWAm4dE2a', 'medical'), ('RT @Andrew_Nelson9: Bendigo man Daniel Reimers has pleaded guilty to 28 charges. Case moves to County Court next month. @9NewsMelb https://…', 'legal'), ("RT @MicroSFF: Worried about Skynet's killer drones, he looked up 'how not be tagged as extremist' on the dark web.\nFirst tip was 'Stay off …", 'legal'), ('"Would a Federal VET funding system be good for TAFE?" by @paul_learning on @LinkedIn https://t.co/xRzGNV3Loz', 'legal'), ('@jonkudelka

In [96]:
Predict_tweets_main()

[("'American Crime Story' is expertly tackling the issue of fame https://t.co/x17guw5Ycq https://t.co/Z8OU0whTrl", 'education'), ('RT @MIB_India: Union Minister Shri @sarbanandsonwal &amp; Shri N Ramachandran hand over the flag of  SouthAsianGames at Closing ceremony #South…', 'business'), ('RT @MIB_India: Union Minister Shri @sarbanandsonwal &amp; Shri N Ramachandran hand over the flag of  SouthAsianGames at Closing ceremony #South…', 'business'), ('The Muses in Greek mythology, poetry, and literature, are the... https://t.co/EfbayNpQhI #concertfilms', 'medical'), ('RT @MIB_India: Union Minister Shri @sarbanandsonwal &amp; Shri N Ramachandran hand over the flag of  SouthAsianGames at Closing ceremony #South…', 'business'), ('Has Islamic State run out of cash? https://t.co/gdMerpWyMQ #Australia #business', 'ethnic'), ('RT @sunandavashisht: On JNU Protests @HarithaPusarla asks -Are they mere voices of dissent? Very well argued piece  https://t.co/LDhTC85HTH', 'labour'), ("#Wisconsin #Mi

Issues Related to Labeling Tweets Based on GSR

The predictions can be biased towards the training data set. Since the model is trained on words extracted from the training text it is likely to predict tweets which has similar words falling into that class. However, there can be situations where tweets describe a content which is relevant but having words out of the training corpus. 
Another issue is that the tweets which contain general context are predicted using web articles. Web articles considered are regarding certain events occurred. Eg: protest, rally etc. They are grouped to different population groups based on the people or the community who are related to the event. Eg: An article regarding a protest about medical issues conducted by doctors and medical staff will be labeled under “medical”. An article regarding a protest about medical issues conducted by people around the area or public will be labeled “general”. Since it is the addressing the general community.
The general content in a tweet can be informative or not informative (“just general content”). Considering tweet terms related to an event might not be enough to categorize it into a population group. Additionally the tweet should be analyzed to identify the people related to it. A text can be regarding a protest. However, that text falls under a population group based on the people who handled it.


# TF-IDF

Alternative method in building a frequency hash. Weigh the features based on the ocuurance of the words within the whole corpus.

In [27]:
from pyspark.mllib.feature import HashingTF, IDF
htf = HashingTF(50000)

def get_tfidf_data(label_features):
    
    labels = label_features.map(lambda doc: doc["populationGroup"])
    #htf = HashingTF(50000)
    tf = htf.transform(label_features.map(lambda doc: doc["article"]))
       
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf)
    
    #Combine using zip
    data_set_hashed = labels.zip(tfidf).map(lambda x: LabeledPoint(x[0], x[1]))
    
    return data_set_hashed

In [28]:
def main_tdif():
    
    data = load_gsr_articles()
    data_paired = upsample_minority(data)
    #data_paired = downsample_majority(data)
    data_cleaned = tokenize_articles(data_paired)
    #print(data_cleaned.take(2))
    
    #### method 1 ####
    train_twitter_data, test_twitter_data = labeled_twitter_data.randomSplit([0.8, 0.2])    #split training and test data
    train_set = data_cleaned.union(train_twitter_data)   #combine gsr and tweets
    
    train_data = get_tfidf_data(train_set)
    test_data =  get_tfidf_data(test_twitter_data)
    
    
    
    #### method 2 ####
    #train_data=get_tfidf_data(data_cleaned)   #only gsr data as training set
    #test_data = get_tfidf_data(labeled_twitter_data)   #only tweets as test set

   
    #### method 3 ####
    #data_set =  get_tfidf_data(data_cleaned)
    #train_data, test_data = data_set.randomSplit([0.8, 0.2])
             
    #apply hashed test and training data to the model
    prediction_label = predict_tweets(train_data, test_data)
    
    metrics = get_model_metrics(prediction_label)
    
    confusion_matrix = get_confusionMatrix(metrics)
    print(confusion_matrix)

    print("\n Summary Statistics for the Overall Model \n")
    accuracy = 1.0 * prediction_label.filter(lambda result_line: result_line[0] == result_line[1]).count() / test_data.count()
    print("Accuracy of model = %0.2f" %accuracy)
    
    precision = metrics.precision()
    recall = metrics.recall()
    f1Score = metrics.fMeasure()
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)   
    
    labels = test_data.map(lambda lp: lp.label).distinct().collect()
    print("\n Summary Statistics for Each Tested Class \n")
    for label in sorted(labels):
        print("Class %s precision = %s" % (rename_label(label), metrics.precision(label)))
        print("Class %s recall = %s" % ((rename_label(label), metrics.recall(label))))
        print("Class %s F1 Measure = %s" % ((rename_label(label), metrics.fMeasure(label, beta=1.0))))
        print("\n")


In [30]:
main_tdif()

[[  8.   4.   0.   5.   0.   1.   1.   0.]
 [  4.   8.   0.   0.   2.   2.   3.   3.]
 [  2.   2.   6.   2.   0.   1.   1.   6.]
 [  2.   5.   0.   9.   1.   0.   0.   5.]
 [  1.   5.   2.   0.   7.   1.   0.   3.]
 [  2.   8.   1.   1.   0.   6.   0.   1.]
 [  1.   7.   0.   0.   2.   0.  10.   0.]
 [  2.   2.   1.   2.   0.   0.   0.   9.]]

 Summary Statistics for the Overall Model 

Accuracy of model = 0.40
Precision = 0.4012738853503185
Recall = 0.4012738853503185
F1 Score = 0.4012738853503185

 Summary Statistics for Each Tested Class 

Class education precision = 0.36363636363636365
Class education recall = 0.42105263157894735
Class education F1 Measure = 0.3902439024390244


Class general precision = 0.1951219512195122
Class general recall = 0.36363636363636365
Class general F1 Measure = 0.25396825396825395


Class business precision = 0.6
Class business recall = 0.3
Class business F1 Measure = 0.4


Class ethnic precision = 0.47368421052631576
Class ethnic recall = 0.409090909

# Predict Tweets

In [31]:
def main_tdif_test():
    
    data = load_gsr_articles()
    data_paired = upsample_minority(data)
    #data_paired = downsample_majority(data)
    data_cleaned = tokenize_articles(data_paired)
    
    #### method 1 ####
    data_set = data_cleaned.union(labeled_twitter_data)   #combine gsr and tweets
    train_data = get_tfidf_data(data_set)

    
    df = get_twitter_data(1)
    #df = get_data_bydate()
    tweet_body = extract_tweet_body(df)
    twitter_data = extract_tweet_features(tweet_body)
        
    test_data = get_tfidf_data(twitter_data)
    
    predictions = predict_tweets(train_data, test_data)
    indexed_answer = get_predictions(predictions)
        
    joined_ans = display(tweet_body,indexed_answer)
    print(joined_ans.take(20))

In [37]:
main_tdif_test()

[("@pnfinn @RLW_Mole @fitbit If Moley needed two hands he wouldn't be a journo.....", 'labour'), ("RT @JohnCleese: We're holding auditions and are stunned by just how many really good, funny Aussie performers there are.The FT tour opens A…", 'general'), ('Looks like Sarah.. got Busteed. https://t.co/1j3vOmGT7O https://t.co/81D5G9DGTc', 'agricultural'), ('RT @TooSexist: How unfortunate... http://t.co/0QDzYwLcBu', 'agricultural'), ("@msmaryandes I don't fully understand it myself, but basically the government pays for the bulk of most treatments, but some costs get thru.", 'general'), ('RT @MaryJeanAdams: Prudence has 3 months to find a husband... Willing Love #99cents #Romance https://t.co/FRvkFET7Bb', 'education'), ('RT @dick_nixon: If only Bush could get his hands on the Glengarry Highlands leads.', 'business'), ('RT @StopShenhua: "If mining damages the water supplies, there\'ll be no vote, &amp; no election, that can help us" #auspol #shenhua @nocsg https…', 'general'), ('RT @muniqui

# Binary Classification

In [None]:
#This labeling is needed only if NB binary classification is considered
#label classes for Naive Bayse considering education and all others
def lable_class_edu(item_pop):
    if item_pop.lower() == "education":
        item_pop = 1
    else:
        item_pop = 2
    return item_pop

#label classes for Naive Bayse considering general population and all others
def lable_class_gen(item_pop):
    if "general" in item_pop.lower():
        item_pop = 1
    else:
        item_pop = 2
    return item_pop

#label classes for Naive Bayse considering legal and all others
def lable_class_leg(item_pop):
    if "legal" in item_pop.lower():
        item_pop = 1
    else:
        item_pop = 2
    return item_pop

#label classes for Naive Bayse considering ethnic and all others
def lable_class_eth(item_pop):
    if "ethnic" in pop_item.lower():
        item_pop = 1
    else:
        item_pop = 2
    return item_pop

#label classes for Naive Bayse considering religious and all others
def lable_class_reli(item_pop):
    if "religious" in item_pop.lower():
        item_pop = 1
    else:
        item_pop = 2
    return item_pop

#label classes for Naive Bayse considering media and all others
def lable_class_reli(item_pop):
    if "media" in item_pop.lower():
        item_pop = 1
    else:
        item_pop = 2
    return item_pop

#label classes for Naive Bayse considering media and all others
def lable_class_reli(item_pop):
    if re.compile('agricul[tural|ture]').match(item_pop.lower()):
        item_pop = 1
    else:
        item_pop = 2
    return item_pop

#label classes for Naive Bayse considering media and all others
def lable_class_labour(item_pop):
    if re.compile('lab[or|our]').match(item_pop.lower()):
        item_pop = 1
    else:
        item_pop = 2
    return item_pop

In [None]:
from pyspark.mllib.evaluation import BinaryClassificationMetrics

def NaiveBayes_Binary_main():
    
    data = load_gsr_articles()
    data_cleaned = tokenize_articles_binary(data)
    hashed_data = transform_data(data_cleaned)
    
    train_hashed, test_hashed = hashed_data.randomSplit([0.8, 0.2])
    
    NB_model = train_model(train_hashed)
    #model.save(sc, "ModelPath")
    #sameModel = NaiveBayesModel.load(sc, "ModelPath")

    predictionAndLabel = test_hashed.map(lambda p : (float(NB_model.predict(p.features)), p.label))

    #accuracy = 1.0 * predictionAndLabel.filter(lambda result_line: result_line[0] == result_line[1]).count() / test_hashed.count()
    #print("Accuracy of model = %0.2f" %accuracy)
    
    #print(predictionAndLabel.take(5))
    metrics = BinaryClassificationMetrics(predictionAndLabel)
    
    print("Area under PR = %s" % metrics.areaUnderPR)
    print("Area under ROC = %s" % metrics.areaUnderROC)