In [10]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [11]:
import os
import pickle
import random
import sys
import time

import numpy as np
import pandas as pd
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import average_precision_score

# Put the parent of the working directory on sys.path before importing
# `twitterquery` (presumably the package lives there — confirm).
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
import twitterquery as tq

# NOTE(review): later cells call bare `utils`, `data`, `transformers`, `milp` and
# `classification`, none of which are imported in this notebook. They presumably are
# `twitterquery` submodules; add the matching imports here once that is confirmed so
# the notebook survives Restart & Run All.

## Data Size Statistics 

In [3]:
# Load the raw tweet data from the staging directory through the project loader.
# NOTE(review): `spark` is not created in any visible cell — presumably injected by
# the kernel (e.g. a PySpark session); confirm before relying on Restart & Run All.
# NOTE(review): absolute, machine-specific mount path; consider a configurable DATA_DIR.
directory = '/mnt/66e695cd-1a0c-4e3b-9a50-55e01b788529/Training_data/Staging_final/'
raw_data = tq.data.get_raw_data(spark, directory)

In [8]:
# Report the size of the raw data set, formatted with thousands separators.
print("Number of Tweets: {:,}".format(raw_data.count()))

Number of Tweets: 135,910,871


In [13]:
# Build a per-topic summary table: number of positively labeled tweets and
# total number of labeled tweets for each topic.
topics = tq.utils.get_topics()
topics_human_readable = tq.utils.get_readable_topic()

# Collect one record per topic and build the frame once; growing a DataFrame
# row by row with DataFrame.append is quadratic and the method no longer
# exists in pandas >= 2.0.
topical_count_rows = []
for topic in topics:
    labeled_topic_data = tq.data.load_labeled_data(spark, raw_data, topic)
    topical_count_rows.append({
        # Fall back to the raw topic key when no readable name is registered.
        'Topic': topics_human_readable[topic] if topic in topics_human_readable else topic,
        'Positives': tq.data.get_num_of_positive_labels(labeled_topic_data),
        'Total': labeled_topic_data.count(),
    })

topical_counts = pd.DataFrame(topical_count_rows, columns=['Topic', 'Positives', 'Total'])

In [14]:
# Display the per-topic positive/total counts assembled in the previous cell.
topical_counts

Unnamed: 0,Topic,Positives,Total
0,Natural Disaster,89440,13877076


## Labeling Hashtags

Ten randomly sampled examples, per topic, of the hashtags used to label a tweet as topical:

In [18]:
# Sample a few labeling hashtags per topic and lay them out one column per topic.
NUM_EXAMPLE_HASHTAGS = 10
HASHTAG_SAMPLE_SEED = 42

topics = tq.settings.TOPICS
topic_dict = tq.hashtag_dict.topic_dict

# A private, seeded generator makes the displayed table reproducible without
# touching the global `random` state.
hashtag_rng = random.Random(HASHTAG_SAMPLE_SEED)

hashtag_columns = []
for topic in topics:
    # sorted() yields a list, so this also works when the hashtags are stored in a
    # set (random.sample rejects sets on Python >= 3.11), and it gives the seed a
    # stable ordering to sample from. Assumes hashtags are mutually comparable
    # (strings in the output of this notebook).
    candidates = sorted(topic_dict[topic])
    # Clamp so a topic with fewer hashtags than requested yields a shorter,
    # NaN-padded column instead of raising ValueError.
    sample_size = min(NUM_EXAMPLE_HASHTAGS, len(candidates))
    hashtag_columns.append(pd.Series(hashtag_rng.sample(candidates, sample_size)))

# Concatenate once instead of re-concatenating inside the loop.
hashtags_pd = pd.concat(hashtag_columns, axis=1) if hashtag_columns else pd.DataFrame()
hashtags_pd.columns = [topics_human_readable[t] if t in topics_human_readable else t for t in topics]

In [19]:
# Show the sampled hashtags, one column per topic.
hashtags_pd

Unnamed: 0,Natural Disaster,Social Issues,Space,Soccer,Human Disasters,Tennis,Health,LGBT
0,tsunami,aurarosser,lunar,usavsger,israel,wimbledonfinals,eplague,gaymoment
1,chileearthquake,ericgarner,planetsunburn,sportsbetting,igad,wimbledon2013,theplague,sacksheila
2,hurricanekatrina,debts,apollo11,worldcup2014brazil,jamesfoley,atpworldtourfinals,uniteblue,loveislove
3,sandy,moa,cometlanding,dortmund,freesyria,teamnovak,factsnotfear,p2
4,hurricanekid,thisstopstoday,planets,ballislife,malala,rafanadaltour,nursesfightebola,equalityformen
5,flood2013,mikebrown,antares,socce,antiwar,tenniscourt,humanitarianheroes,homos
6,tsunami2004,freetheweed,iris,mu,234whitegirls,cincytennis,depressionawareness,ccot
7,tsunami4nayapakistan,policelivesmatter,meteorjs,nufc,stayoutofsyria,teamdjokovic,publichealth,equalityforall
8,laquake,antoniomartin,science,soccergrlprobs,famine,frenchopen2013,stopebola,2a
9,earthquake,nonewnjgunlaws,meteors,league1,notinmyname,frenchopentennis,virus,acceptancematters


# Split and Save the Labeled Data

In [6]:
# Label the raw data for each topic and split it into 10 folds.
# NOTE(review): absolute, machine-specific output path.
kfold_split_directory = '/mnt/1e69d2b1-91a9-473c-a164-db90daf43a3d/splitted_data/kfold/'
topics = ["Natr_Disaster"]  # currently restricted to one topic; full list: tq.settings.TOPICS
for topic in topics:
    labeled_data = tq.data.get_labeled_data(raw_data, topic)
    splitted_data = tq.data.split_kfold(labeled_data, k=10)
    # The per-split CSV export below is disabled here and performed by the next
    # cell instead, which reuses `splitted_data` and `topic` from this loop. With
    # more than one topic only the last topic's splits would survive the loop.
#     for i, split in enumerate(splitted_data):
#         split_pd = split.toPandas().dropna()
#         output_path = kfold_split_directory + topic + "/" + str(i+1) + ".csv"
#         split_pd.to_csv(output_path, encoding='utf-8', index=False)

In [7]:
#TODELETE
# TODO(delete): temporary cell. It writes every fold of `splitted_data` to
# <kfold_split_directory><topic>/<n>.csv (n starting at 1), relying on
# `splitted_data` and the loop variable `topic` left over from the previous cell.
topic_output_prefix = kfold_split_directory + topic + "/"
for split_number, split in enumerate(splitted_data, 1):
    # Rows containing missing values are dropped before export.
    split_pd = split.toPandas().dropna()
    output_path = topic_output_prefix + str(split_number) + ".csv"
    split_pd.to_csv(output_path, encoding='utf-8', index=False)

# Performance Results

## Natural Disaster

In [24]:
# Load the previously split data for the "Soccer" topic via the project loader.
# NOTE(review): this cell sits under the "Natural Disaster" heading but loads
# "Soccer" — confirm which topic is intended.
dat = tq.data.load_splitted_data("Soccer", num_of_splits=5, shuffle=True)

In [25]:
# (rows, columns) of the loaded data.
dat.shape

(15012414, 8)

In [16]:
# Inspect the tweet id and label columns.
# NOTE(review): tweet_id is rendered in float notation in the output below
# (e.g. 5.472153e+17). If the column really is float64, ids of this magnitude
# cannot be represented exactly — consider loading it as an integer or string.
dat.loc[:, ['tweet_id', 'label']]

Unnamed: 0,tweet_id,label
0,5.472153e+17,0.0
1,4.941503e+17,0.0
2,3.352578e+17,0.0
3,4.172473e+17,1.0
4,4.368761e+17,0.0
5,3.553335e+17,0.0
6,3.930382e+17,0.0
7,5.242198e+17,0.0
8,5.427285e+17,0.0
9,5.390291e+17,0.0


In [None]:
# For every topic, run a k-fold evaluation of the "firehose" classifier (trained on
# all features) and of seven feature-selection methods, then save one CSV of
# results per (topic, method).
#
# NOTE(review): `utils`, `data`, `transformers`, `milp` and `classification` are used
# as bare names but are not imported in the visible cells — presumably
# `twitterquery` submodules; confirm and import them in the imports cell.
num_of_splits = 5
num_of_features = 20

# Result lists kept per method; each receives one entry per split.
RESULT_KEYS = ('phase1_stats', 'train_aveps', 'test_aveps', 'patks', 'feats')
# Feature-selection methods, in the order in which they are run and saved.
SELECTION_METHODS = ('topk',
                     'gurobi_cilp', 'greedy_cilp',
                     'gurobi_wilp', 'greedy_wilp',
                     'gurobi_cailp', 'greedy_cailp')


def _new_results(method_names):
    """Return ``{method: {result_key: []}}`` with a fresh list for every key."""
    return {name: {key: [] for key in RESULT_KEYS} for name in method_names}


def _record_selection(method_results, feat_index, data_transformer, split_data):
    """Evaluate one selected feature set on a split and append its results.

    Args:
        method_results: per-method dict of lists, as created by ``_new_results``.
        feat_index: feature indices returned by the selection method.
        data_transformer: object used to map the indices back to features.
        split_data: ``(train_data, train_target, test_data, test_target)``.
    """
    train_data, train_target, test_data, test_target = split_data
    method_results['feats'].append(data_transformer.get_features_by_index(feat_index))
    ph1, train_avep, test_avep, patk = utils.run_experiment(train_data, train_target,
                                                            test_data, test_target,
                                                            feat_index)
    method_results['phase1_stats'].append(ph1)
    method_results['train_aveps'].append(train_avep)
    method_results['test_aveps'].append(test_avep)
    method_results['patks'].append(patk)


for topic in tq.utils.get_topics():
    print("Processing Topic: {}".format(utils.get_readable_topic(topic)))
    preprocess_start_time = time.time()
    data_features, data_labels, data_transformer = data.get_transformed_data(topic,
                                                                             shuffle=True)
    print("Data Preprocess in {:.2f} Seconds".format(time.time() - preprocess_start_time))

    # perform k fold evaluation
    training_start_time = time.time()
    results = _new_results(('firehose',) + SELECTION_METHODS)
    firehose = results['firehose']  # its 'feats' list stays empty and is never saved

    for i in range(num_of_splits):
        print("===split {}/{}===".format(i+1, num_of_splits))
        print("getting train/test data")
        start_time = time.time()
        (train_data, train_target,
         test_data, test_target) = transformers.train_test_split(data_features,
                                                                 data_labels,
                                                                 test_split_index=i,
                                                                 num_of_splits=num_of_splits)
        split_data = (train_data, train_target, test_data, test_target)
        firehose['phase1_stats'].append(utils.get_labeled_data_statistics(train_target))
        print("split done in {:.2f}s".format(time.time() - start_time))

        print("training firehose classifier")
        start_time = time.time()
        firehose_classifier = SGDClassifier(loss='log', class_weight='balanced', penalty='elasticnet')
        firehose_classifier.fit(train_data, train_target)
        preds_proba = firehose_classifier.predict_proba(train_data)[:, 1]
        firehose['train_aveps'].append(average_precision_score(train_target, preds_proba))
        preds_proba = firehose_classifier.predict_proba(test_data)[:, 1]
        firehose['test_aveps'].append(average_precision_score(test_target, preds_proba))
        firehose['patks'].append(classification.p_at_k_score(test_target, preds_proba, k=100))
        print("firehose classifier trained in {:.2f}s".format(time.time() - start_time))

        # topk
        print("**TopK")
        topk_feat_index = classification.get_classifier_top_weighted_features(firehose_classifier,
                                                                              k=num_of_features)
        _record_selection(results['topk'], topk_feat_index, data_transformer, split_data)

        # MILP formulations
        # NOTE(review): the positive/negative sets (and the MI scores below) are built
        # from the *test* split, and the selected features are then evaluated on that
        # same split. Confirm this is intended and not train/test leakage.
        positive_set, negative_set = transformers.get_positive_negative_set(test_data, test_target)

        # gurobi_cilp
        print("**Gurobi CILP")
        gur_cilp_feat_index = milp.gurobi_cilp(positive_set, k=num_of_features)
        _record_selection(results['gurobi_cilp'], gur_cilp_feat_index, data_transformer, split_data)

        # greedy_cilp
        print("**Greedy CILP")
        greedy_cilp_feat_index = milp.greedy_cilp(positive_set, k=num_of_features)
        _record_selection(results['greedy_cilp'], greedy_cilp_feat_index, data_transformer, split_data)

        # get mi scores
        print("**Getting MI Scores")
        start_time = time.time()
        mi_scores = data.get_mi_scores(test_data, test_target)
        print("MI Scores done in {:.2f} seconds".format(time.time() - start_time))

        # gurobi_wilp
        print("**Gurobi WILP")
        gur_wilp_feat_index = milp.gurobi_wilp(positive_set, mi_scores, k=num_of_features)
        _record_selection(results['gurobi_wilp'], gur_wilp_feat_index, data_transformer, split_data)

        # greedy_wilp
        print("**Greedy WILP")
        greedy_wilp_feat_index = milp.greedy_wilp(positive_set, mi_scores, k=num_of_features)
        _record_selection(results['greedy_wilp'], greedy_wilp_feat_index, data_transformer, split_data)

        # gurobi cailp
        print("**Gurobi CAILP")
        start_time = time.time()
        gur_cailp_feat_index = milp.gurobi_cailp(positive_set, negative_set, k=num_of_features)
        print("Gur CAILP in {:.2f} seconds".format(time.time() - start_time))
        _record_selection(results['gurobi_cailp'], gur_cailp_feat_index, data_transformer, split_data)

        # greedy cailp
        print("**Greedy CAILP")
        start_time = time.time()
        greedy_cailp_feat_index = milp.greedy_cailp(positive_set, negative_set, k=num_of_features)
        print("Greedy CAILP in {:.2f} seconds".format(time.time() - start_time))
        _record_selection(results['greedy_cailp'], greedy_cailp_feat_index, data_transformer, split_data)

    print("Training Done in {:.2f} Seconds".format(time.time() - training_start_time))
    print("******")

    # The firehose baseline has no selected features, so it is saved without them.
    utils.save_results_to_csv(topic,
                              'firehose',
                              firehose['phase1_stats'],
                              firehose['train_aveps'],
                              firehose['test_aveps'],
                              firehose['patks'])

    for method_name in SELECTION_METHODS:
        method_results = results[method_name]
        utils.save_results_to_csv(topic,
                                  method_name,
                                  method_results['phase1_stats'],
                                  method_results['train_aveps'],
                                  method_results['test_aveps'],
                                  method_results['patks'],
                                  method_results['feats'])

Processing Topic: Social Issues


# Varying K

In [None]:
# For each topic and each split, run the Gurobi and greedy CAILP feature selection
# for several values of K, record the label statistics of the training data that
# remains after filtering by the selected features, and pickle the statistics
# per topic.
#
# NOTE(review): `utils`, `data`, `transformers` and `milp` are bare names that are
# not imported in the visible cells, and `pickle` must be imported before this
# cell runs.
topics = [#'Social_issue',
         'Space',
         'Soccer',
         'Human_Disaster',
         'Tennis',
         'Health',
         'LGBT']
ks = [10, 20, 50, 100, 200]
num_of_splits = 5

for topic in topics:
    print("Processing Topic: {}".format(utils.get_readable_topic(topic)))
    preprocess_start_time = time.time()
    data_features, data_labels, data_transformer = data.get_transformed_data(topic,
                                                                             shuffle=True)
    print("Data Preprocess in {:.2f} Seconds".format(time.time() - preprocess_start_time))

    # Unfiltered training-label statistics, one entry per split (collected but not saved).
    tots = []
    # gurobi_stats[j] / greedy_stats[j] hold one entry per split for K = ks[j].
    gurobi_stats = [[] for _ in ks]
    greedy_stats = [[] for _ in ks]

    for i in range(num_of_splits):
        print("===split {}/{}===".format(i+1, num_of_splits))
        print("getting train/test data")
        start_time = time.time()
        (train_data, train_target,
         test_data, test_target) = transformers.train_test_split(data_features,
                                                                 data_labels,
                                                                 test_split_index=i,
                                                                 num_of_splits=num_of_splits)
        tots.append(utils.get_labeled_data_statistics(train_target))
        print("split done in {:.2f}s".format(time.time() - start_time))
        # NOTE(review): the selection sets are drawn from the test split — confirm intended.
        positive_set, negative_set = transformers.get_positive_negative_set(test_data, test_target, pos_count=3000)

        # Use a dedicated index name: the original inner loop reused `i` and
        # clobbered the split index of the enclosing loop.
        for k_index, k in enumerate(ks):
            print("K = {}".format(k))
            start_time = time.time()
            # Gurobi first, then greedy, same as before; the timer covers both.
            for stats, select_features in ((gurobi_stats, milp.gurobi_cailp),
                                           (greedy_stats, milp.greedy_cailp)):
                feat_index = select_features(positive_set, negative_set, k=k)
                _, filtered_train_labels = data.filter_matrix_by_index(train_data,
                                                                       train_target,
                                                                       feat_index)
                stats[k_index].append(utils.get_labeled_data_statistics(filtered_train_labels))
            print("filtered in {}".format(time.time() - start_time))

    for file_name, stats in (('gurobi_vark.pickle', gurobi_stats),
                             ('greedy_vark.pickle', greedy_stats)):
        with open('../results/2019/' + topic + '/' + file_name, 'wb') as f:
            pickle.dump(stats, f)

Processing Topic: Space
filtered in 1354.32101202
===split 2/5===
getting train/test data
split done in 3.40s
K = 10
0
1
2
3
4
5
6
7
8
9
