In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np 
import re

from sklearn.metrics import precision_recall_curve, mutual_info_score, normalized_mutual_info_score

from pyspark.ml.feature import HashingTF, Tokenizer, IDF, StopWordsRemover, CountVectorizer, VectorAssembler
from pyspark.sql import functions as F
from pyspark.sql.functions import explode, lit, col, when, lower

import matplotlib.pyplot as plt

from hashtag_dict import topic_dict

from Evaluation import *
from FeatureExtraction import *
from Filter import *
from TestFramework import *
from LPFormulations import *
from CoverageCalculator import *

In [3]:
class API:
    """Loads a parquet dataset into a Spark DataFrame and exposes filtered views.

    Relies on a notebook-level ``spark`` session object being in scope.
    """

    def __init__(self, directory, frac=1, seed=123):
        """Read ``directory`` as parquet, optionally down-sampling it.

        directory -- path handed to ``spark.read.parquet``.
        frac      -- when < 1, keep roughly this fraction of rows
                     (sampling without replacement); otherwise keep all rows.
        seed      -- seed for the sampling; ignored when ``frac`` >= 1.
        """
        data = spark.read.parquet(directory)
        if frac < 1:
            data = data.sample(False, frac, seed)
        self.data = data

    def get_raw_data(self):
        """Return the DataFrame loaded (and possibly sampled) in ``__init__``."""
        return self.data

    def get_filtered_data(self, terms=None, hashtags=None, users=None, locs=None, mentions=None):
        """Return ``filter_data`` applied to the loaded DataFrame.

        All arguments are forwarded positionally to ``filter_data``. ``terms``
        defaults to a fresh empty list per call, so the default can never be
        shared (or mutated) across calls.
        """
        if terms is None:
            terms = []
        return filter_data(self.data, terms, hashtags, users, locs, mentions)
    
    
class DataPreprocessor():
    """Labels a Spark DataFrame of tweets for one topic and splits it by time.

    Attributes set by ``__init__``:
      data          -- the input DataFrame (optionally without rows whose
                       ``hashtag`` column equals "empty_hashtag").
      labled_data   -- the labelled DataFrame, or None until
                       ``get_labled_data`` builds it.
      hash_exploded -- one row per (tweet_id, create_time, each_hashtag),
                       obtained by tokenizing and exploding ``hashtag``.

    Relies on the notebook-level ``spark`` session and ``topic_dict`` mapping.
    """

    def __init__(self, data, topic, load_labled=False, filter_empty_hashtags=False):
        """Prepare the data and, optionally, load pre-computed labels.

        data                  -- Spark DataFrame; the code below reads its
                                 ``hashtag``, ``tweet_id`` and ``create_time`` columns.
        topic                 -- topic name; only used to locate the labelled
                                 parquet directory when ``load_labled`` is set.
        load_labled           -- when truthy, read positive/negative tweet ids
                                 from ``<labeled_data>/<topic>/pos`` and ``/neg``
                                 and join them with ``data``.
        filter_empty_hashtags -- when truthy, drop rows whose ``hashtag`` is
                                 the literal "empty_hashtag".
        """
        if filter_empty_hashtags:
            self.data = data.filter(col("hashtag") != "empty_hashtag")
        else:
            self.data = data

        if load_labled:
            labled_dir = "/mnt/1e69d2b1-91a9-473c-a164-db90daf43a3d/labeled_data/" + topic
            pos_dir = labled_dir + "/pos"
            topical_tweet_ids = spark.read.parquet(pos_dir)
            topical_tweets = topical_tweet_ids.join(self.data, "tweet_id").withColumn("label", lit(1))

            neg_dir = labled_dir + "/neg"
            # Only 10% of the stored negative ids are kept (fixed seed 123).
            non_topical_tweet_ids = spark.read.parquet(neg_dir).sample(False, 0.1, 123)
            non_topical_tweets = non_topical_tweet_ids.join(self.data, "tweet_id").withColumn("label", lit(0))

            self.labled_data = topical_tweets.union(non_topical_tweets)
        else:
            self.labled_data = None

        tokenizer = Tokenizer(inputCol="hashtag", outputCol="each_hashtag")
        hashtags_df = tokenizer.transform(self.data)

        hashtag = hashtags_df.select("tweet_id", "create_time", "each_hashtag")
        self.hash_exploded = hashtag.withColumn('each_hashtag', explode('each_hashtag'))

    def get_labled_data(self, topic):
        """Return the labelled DataFrame, building it on first use.

        When no labels were loaded, a tweet gets label 1.0 if any of its
        hashtags is listed in ``topic_dict[topic]`` and 0.0 otherwise. The
        result is stored on the instance, so later calls return the same
        DataFrame regardless of the ``topic`` passed.
        """
        if self.labled_data is None:
            topic_lables = topic_dict[topic]

            topical_ids = (self.hash_exploded
                           .select(self.hash_exploded.tweet_id)
                           .where(self.hash_exploded.each_hashtag.isin(topic_lables))
                           .distinct()
                           .cache())

            labled_topical = topical_ids.withColumn("topical", lit(1))
            # Left join keeps every tweet; rows without a match have a null
            # ``topical`` and therefore fall into the ``otherwise(0.0)`` branch.
            self.labled_data = (self.data
                                .join(labled_topical,
                                      self.data.tweet_id == labled_topical.tweet_id,
                                      "left")
                                .select(self.data.create_time,
                                        self.data.from_id,
                                        self.data.from_user,
                                        self.data.hashtag,
                                        self.data.location,
                                        self.data.mention,
                                        self.data.tweet_id,
                                        self.data.term,
                                        F.when(labled_topical.topical == 1, 1.0)
                                         .otherwise(0.0).alias("label")))
        return self.labled_data

    def get_num_of_positvies(self):
        """Return the number of rows labelled 1.0, or 0 if nothing is labelled yet."""
        if self.labled_data is None:
            return 0

        return self.labled_data.where(col("label") == 1.0).count()

    def get_data_size(self):
        """Return the number of labelled rows, or 0 if nothing is labelled yet."""
        if self.labled_data is None:
            return 0

        return self.labled_data.count()

    @staticmethod
    def _word_boundary_regex(hashtags):
        """Build an alternation matching any of ``hashtags`` as a whole word.

        Each hashtag is inserted verbatim (not escaped) between ``\\b`` anchors.
        """
        return '|'.join(r'\b' + hashtag + r'\b' for hashtag in hashtags)

    @staticmethod
    def _negative_fraction(pos_count, neg_count, multiplier):
        """Sampling fraction for negatives: ``multiplier * pos / neg`` capped at 1.0.

        Returns 1.0 when there are no negatives, which avoids a division by zero.
        """
        if neg_count <= 0:
            return 1.0
        return min(1.0, multiplier * float(pos_count) / neg_count)

    def _remove_tweets_with_hashtags(self, dataset, hashtags, ch):
        """Drop from ``dataset`` every tweet whose ``hashtag`` text matches ``hashtags``.

        Returns ``(filtered_dataset, removed_tweet_ids)``. With ``ch`` truthy
        the hashtag column is lower-cased before matching. An empty
        ``hashtags`` list leaves the dataset untouched (an empty regex would
        otherwise match, and remove, every row).
        """
        if not hashtags:
            return dataset, []

        pattern = self._word_boundary_regex(hashtags)
        hashtag_col = lower(dataset['hashtag']) if ch else dataset['hashtag']
        invalid_ids = (dataset.filter(hashtag_col.rlike(pattern))
                       .select("tweet_id").distinct().rdd.flatMap(lambda x: x).collect())
        return dataset.where(~col("tweet_id").isin(invalid_ids)), invalid_ids

    def temporal_split(self, topic, balance_data=True, seed=0, remove_topic_hashtags=False, ch=False):
        """Split the labelled data into train/validation/test by hashtag "birthday".

        The topic's hashtags are ordered by their earliest ``create_time``; the
        birthdays at the 75% and 85% positions of that ordering are the two
        cut-off timestamps.

        topic                 -- key into ``topic_dict``.
        balance_data          -- when truthy, also return down-sampled
                                 training and validation sets.
        seed                  -- seed passed to ``sampleBy``.
        remove_topic_hashtags -- when truthy, remove from each split the tweets
                                 that mention a topic hashtag born in one of
                                 the other two splits.
        ch                    -- when truthy, lower-case the hashtag column
                                 before that matching.

        Returns ``(train, train_balanced, valid, valid_balanced, test)`` when
        ``balance_data`` is truthy, otherwise ``(train, valid, test)``.
        """
        topic_lables = topic_dict[topic]
        labled_data = self.get_labled_data(topic)
        tweet_labels = labled_data.select("tweet_id", "label")

        df_birthday = self.hash_exploded.join(tweet_labels,
                                              self.hash_exploded.tweet_id == tweet_labels.tweet_id,
                                              "inner").select(self.hash_exploded.create_time,
                                                              self.hash_exploded.each_hashtag,
                                                              self.hash_exploded.tweet_id)

        # "Birthday" = earliest appearance of each topic hashtag, oldest first.
        ordered_hashtag_set = (df_birthday
                               .groupby("each_hashtag")
                               .agg({"create_time": "min"})
                               .orderBy('min(create_time)', ascending=True)
                               .withColumnRenamed("min(create_time)", "birthday")
                               .where(df_birthday.each_hashtag.isin(topic_lables))
                               .cache())

        time_span = ordered_hashtag_set.count()

        train_valid_split_time = int(np.floor(time_span * 0.75))
        valid_test_split_time = int(np.floor(time_span * 0.85))

        # Converting to Pandas for random row access.
        pd_ordered_hashtag_set = ordered_hashtag_set.toPandas()

        # Timestamps of the two cut-off points, used to split the DataFrame.
        train_valid_time = pd_ordered_hashtag_set.iloc[train_valid_split_time]['birthday']
        valid_test_time = pd_ordered_hashtag_set.iloc[valid_test_split_time]['birthday']

        training_set = labled_data.where(col("create_time") <= train_valid_time)
        validation_set = labled_data.where((col("create_time") > train_valid_time)
                                           & (col("create_time") <= valid_test_time))
        test_set = labled_data.where(col("create_time") > valid_test_time)

        if remove_topic_hashtags:
            train_hashtags = pd_ordered_hashtag_set[:train_valid_split_time]['each_hashtag'].tolist()
            valid_hashtags = pd_ordered_hashtag_set[train_valid_split_time:valid_test_split_time]['each_hashtag']\
                                .tolist()
            test_hashtags = pd_ordered_hashtag_set[valid_test_split_time:]['each_hashtag'].tolist()

            # Each split is purged of hashtags that belong to the OTHER two splits.
            training_set, _ = self._remove_tweets_with_hashtags(
                training_set, valid_hashtags + test_hashtags, ch)

            validation_set, invalid_validation_ids = self._remove_tweets_with_hashtags(
                validation_set, train_hashtags + test_hashtags, ch)
            print("Invalid Validation:" + str(len(invalid_validation_ids)))

            # The test split is filtered against train + validation hashtags
            # in both the ``ch`` and non-``ch`` paths.
            test_set, invalid_test_ids = self._remove_tweets_with_hashtags(
                test_set, train_hashtags + valid_hashtags, ch)
            print("Invalid Test:" + str(len(invalid_test_ids)))

        if balance_data:
            train_pos_count = training_set.where(col("label") == 1).count()
            train_neg_count = training_set.where(col("label") == 0).count()

            # Training keeps about two negatives per positive.
            training_set_balanced = training_set.sampleBy(
                "label",
                fractions={0: self._negative_fraction(train_pos_count, train_neg_count, 2), 1: 1},
                seed=seed)

            valid_pos_count = validation_set.where(col("label") == 1).count()
            valid_neg_count = validation_set.count() - valid_pos_count

            # Validation keeps about one negative per positive.
            validation_set_balanced = validation_set.sampleBy(
                "label",
                fractions={0.0: self._negative_fraction(valid_pos_count, valid_neg_count, 1), 1.0: 1},
                seed=seed)

            return training_set, training_set_balanced, validation_set, validation_set_balanced, test_set

        return training_set, validation_set, test_set

In [18]:
def get_selected_features(topic, tf, test, num_feat=50):
    """Run every feature-selection method on ``test`` and return their picks.

    topic    -- accepted for call-site symmetry; not used inside the function.
    tf       -- object exposing ``get_pipeline()``.
    test     -- pandas DataFrame with a ``label`` column (it is sized with
                ``len`` and sub-sampled with ``.sample`` below).
    num_feat -- number of features requested from each method.

    Returns ``(baseline, basic, mi, greedy, solver)``, the selections of the
    top-k baseline, the basic max-cover, the mutual-information variant, the
    greedy max-cover and the negative-aware solver, in that order.
    """
    domain_stopwords = ['empty_location', 'empty_mention']#,'rt', 'amp', 'http']
    positive_set, negative_set = get_transformed_data(test, tf.get_pipeline(),
                                                      stopwords=domain_stopwords)

    # Only the positive coverage of the full set is needed (for the MI scores).
    _, feature_positive_coverage, _, _ = \
        get_coverage(tf.get_pipeline(), positive_set, negative_set)

    positive_set_subset, negative_set_subset = subset_transformed_data(positive_set, 2500, negative_set=negative_set)

    num_positive_subset, feature_positive_coverage_subset, num_negative_subset, feature_negative_coverage_subset = \
        get_coverage(tf.get_pipeline(), positive_set_subset, negative_set_subset)

    print("Subsetted")

    # Ask for 20 extra features and discard the first 20 returned.
    baseline = topk_features(tf.get_pipeline(), k=num_feat + 20)
    baseline = baseline[20:]

    basic = max_cover(feature_positive_coverage_subset, num_positive_subset, k=num_feat)
    print("basic")

    # Mutual information is estimated on at most 3000 rows (fixed seed).
    if len(test) > 3000:
        test_subset = test.sample(n=3000, random_state=123)
    else:
        test_subset = test

    # NOTE(review): the MI scores use the full-set positive coverage while the
    # optimisation below receives the subset coverage -- confirm this is intended.
    mi_scores = get_mi_scores(test_subset, tf.get_pipeline(), feature_positive_coverage)
    # NOTE(review): ``num_positive_subset`` is passed twice; verify against the
    # signature of max_cover_with_mutual_information_normalized.
    mi = max_cover_with_mutual_information_normalized(feature_positive_coverage_subset,
                                                      mi_scores, num_positive_subset,
                                                      num_positive_subset, k=num_feat)
    print("mi")
    greedy = greedy_max_cover(positive_set, negative_set, tf.get_pipeline(), k=num_feat)

    solver = max_cover_with_negs_unweighted(feature_positive_coverage_subset,
                                            feature_negative_coverage_subset,
                                            num_positive_subset, num_negative_subset,
                                            k=num_feat, time_limit_secs=600)

    return baseline, basic, mi, greedy, solver

In [8]:
# Evaluate the feature-selection methods on every topic and print one LaTeX
# table per topic. The top-k baseline is deliberately left out of this run.
temporal_split_directory = '/mnt/1e69d2b1-91a9-473c-a164-db90daf43a3d/splitted_data/'
topics = ["Human_Disaster", "Tennis", "LGBT","Health", "Social_issue", "Space", "Soccer"]
#topics = [ "LGBT","Health", "Social_issue", "Space", ]
#"Natr_Disaster",
result_columns = ["Model", "Topical Retrieved", "Total Retrieved", "Recall", "Precision", "AveP"]

for topic in topics:
    print(topic)
    train_pd = pd.read_csv(temporal_split_directory + topic + "/training2.csv").dropna()
    validation_pd = pd.read_csv(temporal_split_directory + topic + "/validation2.csv").dropna()
    test_pd = pd.read_csv(temporal_split_directory + topic + "/test2.csv").dropna().reset_index()

    # Keep a single row per distinct `term` value in each split.
    test = test_pd.drop_duplicates(subset='term').reset_index()
    valid = validation_pd.drop_duplicates(subset='term').reset_index()
    train = train_pd.drop_duplicates(subset='term').reset_index()

    tf = TestFramework(train, valid, test)

    baseline, basic, mi, greedy, solver = get_selected_features(topic, tf, test)

    _, _, ideal_avep, _ = tf.get_ideal_performance()

    # (display name, selected feature indices), in the order they are evaluated.
    selections = [
        ("Basic LP Formulation", basic),
        ("LP Formulation With Mutual Information", mi),
        ("LP with Negative Coverage (Greedy Solver)", greedy),
        ("LP with Negative Coverage (Gurobi Solver)", solver),
    ]

    evaluated = []
    for model_name, feature_idx in selections:
        filtered = tf.get_filtered_data_by_index(feature_idx)
        _, _, avep, _ = tf.run_with_filtered_test_data(filtered)
        evaluated.append((model_name, filtered, avep))

    pos = len(test[test.label == 1])
    tot = len(test)
    print("Number of Topical Tweets w/o duplicates= {0}".format(pos))
    print("Number of Tweets w/o duplicates = {0}".format(tot))

    # Collect all rows first and build the frame once: this avoids repeated
    # DataFrame.append calls and gives every row a distinct index label.
    rows = [["Ideal", pos, tot, 1.0, float(pos) / tot, ideal_avep]]
    for model_name, filtered, avep in evaluated:
        pos_ret = len(filtered[filtered.label == 1])
        tot_ret = len(filtered)
        recall = float(pos_ret) / pos
        rows.append([model_name, pos_ret, tot_ret, recall, float(pos_ret) / tot_ret, avep])

    results_pd = pd.DataFrame(rows, columns=result_columns)

    print(topic)
    print(results_pd.to_latex())

Human_Disaster


  interactivity=interactivity, compiler=compiler, result=result)


KeyboardInterrupt: 

In [16]:
# Evaluate the baseline and all feature-selection methods on the listed
# topics and print one LaTeX table per topic.
temporal_split_directory = '/mnt/1e69d2b1-91a9-473c-a164-db90daf43a3d/splitted_data/'
topics = ["Soccer"]
result_columns = ["Model", "Topical Retrieved", "Total Retrieved", "Recall", "Precision", "AveP"]

for topic in topics:
    print(topic)
    train_pd = pd.read_csv(temporal_split_directory + topic + "/training2.csv").dropna()
    validation_pd = pd.read_csv(temporal_split_directory + topic + "/validation2.csv").dropna()
    test_pd = pd.read_csv(temporal_split_directory + topic + "/test2.csv").dropna().reset_index()

    # Keep a single row per distinct `term` value in each split.
    test = test_pd.drop_duplicates(subset='term').reset_index()
    valid = validation_pd.drop_duplicates(subset='term').reset_index()
    train = train_pd.drop_duplicates(subset='term').reset_index()

    tf = TestFramework(train, valid, test)

    baseline, basic, mi, greedy, solver = get_selected_features(topic, tf, test)

    _, _, ideal_avep, _ = tf.get_ideal_performance()

    # (display name, selected feature indices), in the order they are evaluated.
    selections = [
        ("Baseline", baseline),
        ("Basic LP Formulation", basic),
        ("LP Formulation With Mutual Information", mi),
        ("LP with Negative Coverage (Greedy Solver)", greedy),
        ("LP with Negative Coverage (Gurobi Solver)", solver),
    ]

    evaluated = []
    for model_name, feature_idx in selections:
        filtered = tf.get_filtered_data_by_index(feature_idx)
        _, _, avep, _ = tf.run_with_filtered_test_data(filtered)
        evaluated.append((model_name, filtered, avep))

    pos = len(test[test.label == 1])
    tot = len(test)
    print("Number of Topical Tweets w/o duplicates= {0}".format(pos))
    print("Number of Tweets w/o duplicates = {0}".format(tot))

    # Collect all rows first and build the frame once: this avoids repeated
    # DataFrame.append calls and gives every row a distinct index label.
    rows = [["Ideal", pos, tot, 1.0, float(pos) / tot, ideal_avep]]
    for model_name, filtered, avep in evaluated:
        pos_ret = len(filtered[filtered.label == 1])
        tot_ret = len(filtered)
        recall = float(pos_ret) / pos
        rows.append([model_name, pos_ret, tot_ret, recall, float(pos_ret) / tot_ret, avep])

    results_pd = pd.DataFrame(rows, columns=result_columns)

    print(topic)
    print(results_pd.to_latex())

Soccer
Initializing Classifier....
Complete!
#pos/#neg = 0.00506089887743
12254
24508
transform done!
Subsetted
basic
mi
Number of Topical Tweets w/o duplicates= 12254
Number of Tweets w/o duplicates = 2433563
Soccer
\begin{tabular}{llrrrrr}
\toprule
{} &                                      Model &  Topical Retrieved &  Total Retrieved &    Recall &  Precision &      AveP \\
\midrule
0 &                                      Ideal &              12254 &          2433563 &  1.000000 &   0.005035 &  0.040087 \\
0 &                                   Baseline &                 43 &              472 &  0.003509 &   0.091102 &  0.261755 \\
0 &                       Basic LP Formulation &               8203 &           886806 &  0.669414 &   0.009250 &  0.059153 \\
0 &     LP Formulation With Mutual Information &               7742 &           818448 &  0.631794 &   0.009459 &  0.062986 \\
0 &  LP with Negative Coverage (Greedy Solver) &               6942 &           220222 &  0.566509 &   0

In [5]:
# Compare the three greedy selectors (CILP, WILP, CAILP) with k=50 on each
# topic and print one LaTeX table per topic.
temporal_split_directory = '/mnt/1e69d2b1-91a9-473c-a164-db90daf43a3d/splitted_data/'
topics = ["Tennis", "Health", "LGBT"]
#"Natr_Disaster", "Social_issue", "Space", "Soccer", "Human_Disaster", 
result_columns = ["Model", "Topical Retrieved", "Total Retrieved", "Recall", "Precision", "AveP"]

for topic in topics:
    train_pd = pd.read_csv(temporal_split_directory + topic + "/training2.csv").dropna()
    validation_pd = pd.read_csv(temporal_split_directory + topic + "/validation2.csv").dropna()
    test_pd = pd.read_csv(temporal_split_directory + topic + "/test2.csv").dropna().reset_index()

    # Keep a single row per distinct `term` value in each split.
    test = test_pd.drop_duplicates(subset='term').reset_index()
    valid = validation_pd.drop_duplicates(subset='term').reset_index()
    train = train_pd.drop_duplicates(subset='term').reset_index()

    tf = TestFramework(train, valid, test)

    domain_stopwords = ['empty_location', 'empty_mention','rt', 'amp', 'http']
    print("cov begin")
    positive_set, negative_set = get_transformed_data(test, tf.get_pipeline(),
                                                      stopwords=domain_stopwords)

    # Only the positive coverage is used below (as input to the MI scores).
    _, feature_positive_coverage, _, _ = \
        get_coverage(tf.get_pipeline(), positive_set, negative_set)

    print("Coverage done")
    # Mutual information is estimated on at most 6000 rows (fixed seed).
    if len(test) > 6000:
        test_subset = test.sample(n=6000, random_state=123)
    else:
        test_subset = test

    mi_scores = get_mi_scores(test_subset, tf.get_pipeline(), feature_positive_coverage)

    print("MI done")
    cilp = greedy_cilp(positive_set, tf.get_pipeline(), k=50)
    wilp = greedy_wilp(positive_set, mi_scores, tf.get_pipeline(), k=50)
    cailp = greedy_max_cover(positive_set, negative_set, tf.get_pipeline(), k=50)

    pos = len(test[test.label == 1])
    tot = len(test)

    # Collect all rows first and build the frame once: this avoids repeated
    # DataFrame.append calls and gives every row a distinct index label.
    rows = []
    for model_name, feature_idx in [("CILP", cilp), ("WILP", wilp), ("CAILP", cailp)]:
        filtered = tf.get_filtered_data_by_index(feature_idx)
        _, _, avep, _ = tf.run_with_filtered_test_data(filtered)

        pos_ret = len(filtered[filtered.label == 1])
        tot_ret = len(filtered)
        recall = float(pos_ret) / pos
        rows.append([model_name, pos_ret, tot_ret, recall, float(pos_ret) / tot_ret, avep])

    results_pd = pd.DataFrame(rows, columns=result_columns)

    print(topic)
    print(results_pd.to_latex())

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


Initializing Classifier....
Complete!
cov begin
#pos/#neg = 0.000238417915043
491
982
transform done!
Coverage done




MI done
Tennis
\begin{tabular}{llrrrrr}
\toprule
{} &  Model &  Topical Retrieved &  Total Retrieved &    Recall &  Precision &      AveP \\
\midrule
0 &   CILP &                393 &           568264 &  0.800407 &   0.000692 &  0.035997 \\
0 &   WILP &                393 &           568264 &  0.800407 &   0.000692 &  0.035997 \\
0 &  CAILP &                358 &           154982 &  0.729124 &   0.002310 &  0.047435 \\
\bottomrule
\end{tabular}



  interactivity=interactivity, compiler=compiler, result=result)


Initializing Classifier....
Complete!
cov begin
#pos/#neg = 0.00113502198081
970
1940
transform done!
Coverage done
MI done
Health
\begin{tabular}{llrrrrr}
\toprule
{} &  Model &  Topical Retrieved &  Total Retrieved &    Recall &  Precision &      AveP \\
\midrule
0 &   CILP &                702 &           305196 &  0.723711 &   0.002300 &  0.010576 \\
0 &   WILP &                702 &           305196 &  0.723711 &   0.002300 &  0.010576 \\
0 &  CAILP &                618 &           104669 &  0.637113 &   0.005904 &  0.019160 \\
\bottomrule
\end{tabular}

Initializing Classifier....
Complete!
cov begin
#pos/#neg = 0.000473015924282
1611
3222
transform done!
Coverage done
MI done
LGBT
\begin{tabular}{llrrrrr}
\toprule
{} &  Model &  Topical Retrieved &  Total Retrieved &    Recall &  Precision &      AveP \\
\midrule
0 &   CILP &                914 &          1478121 &  0.567349 &   0.000618 &  0.001090 \\
0 &   WILP &                914 &          1478121 &  0.567349 &   0.000618 &

# Effect of Varying K

In [5]:
# Measure how the greedy max-cover selector behaves as the number of selected
# features k grows, for every topic, and print a single LaTeX table.
temporal_split_directory = '/mnt/1e69d2b1-91a9-473c-a164-db90daf43a3d/splitted_data/'
topics = ["Social_issue", "Soccer", "Space", "Human_Disaster", "Tennis", "Health", "LGBT", "Natr_Disaster"]
ks = [10, 20, 50, 100, 200]
result_columns = ["K", "Topical Retrieved", "Total Retrieved", "Recall", "Precision", "AveP"]

# Rows are accumulated in a plain list and turned into a DataFrame once at the
# end, instead of calling DataFrame.append for every row.
result_rows = []

for topic in topics:
    train_pd = pd.read_csv(temporal_split_directory + topic + "/training2.csv").dropna()
    validation_pd = pd.read_csv(temporal_split_directory + topic + "/validation2.csv").dropna()
    test_pd = pd.read_csv(temporal_split_directory + topic + "/test2.csv").dropna().reset_index()

    # Keep a single row per distinct `term` value in each split.
    test = test_pd.drop_duplicates(subset='term').reset_index()
    valid = validation_pd.drop_duplicates(subset='term').reset_index()
    train = train_pd.drop_duplicates(subset='term').reset_index()

    tf = TestFramework(train, valid, test)

    domain_stopwords = ['rt', 'amp', 'empty_location', 'empty_mention']
    positive_set, negative_set = get_transformed_data(test, tf.get_pipeline(),
                                                      stopwords=domain_stopwords)

    pos = len(test[test.label == 1])
    tot = len(test)

    for k in ks:
        greedy = greedy_max_cover(positive_set, negative_set, tf.get_pipeline(), k=k)
        filtered_greedy = tf.get_filtered_data_by_index(greedy)

        _, _, avep, _ = tf.run_with_filtered_test_data(filtered_greedy)

        pos_ret = len(filtered_greedy[filtered_greedy.label == 1])
        tot_ret = len(filtered_greedy)
        recall = float(pos_ret) / pos

        result_rows.append([k, pos_ret, tot_ret, recall, float(pos_ret) / tot_ret, avep])

    # A marker row holding only the topic name FOLLOWS that topic's k rows;
    # its remaining columns are NaN.
    result_rows.append([topic] + [np.nan] * (len(result_columns) - 1))

results_pd = pd.DataFrame(result_rows, columns=result_columns)
print(results_pd.to_latex())

  interactivity=interactivity, compiler=compiler, result=result)


Initializing Classifier....
Complete!
#pos/#neg = 0.0193934962322
4774
9548
transform done!




Initializing Classifier....
Complete!
#pos/#neg = 0.00506089887743
12254
24508
transform done!


  interactivity=interactivity, compiler=compiler, result=result)


Initializing Classifier....
Complete!
#pos/#neg = 0.000943669106367
2623
5246
transform done!


  interactivity=interactivity, compiler=compiler, result=result)


Initializing Classifier....
Complete!
#pos/#neg = 0.008543642424
12452
24904
transform done!
Initializing Classifier....
Complete!
#pos/#neg = 0.000238417915043
491
982
transform done!
Initializing Classifier....
Complete!
#pos/#neg = 0.00113502198081
970
1940
transform done!
Initializing Classifier....
Complete!
#pos/#neg = 0.000473015924282
1611
3222
transform done!
Initializing Classifier....
Complete!
#pos/#neg = 0.00115122440869
1179
2358
transform done!
\begin{tabular}{llrrrrr}
\toprule
{} &               K &  Topical Retrieved &  Total Retrieved &    Recall &  Precision &      AveP \\
\midrule
0 &              10 &             1913.0 &           9218.0 &  0.400712 &   0.207529 &  0.377452 \\
0 &              20 &             2394.0 &          12289.0 &  0.501466 &   0.194808 &  0.352256 \\
0 &              50 &             3133.0 &          25110.0 &  0.656263 &   0.124771 &  0.230567 \\
0 &             100 &             3578.0 &          35998.0 &  0.749476 &   0.099394 &  0.19

In [6]:
# Write `results_pd` to vark.csv without the index column.
# NOTE(review): `results_pd` is whatever table the most recently executed cell
# left in the kernel -- presumably the varying-K table; verify execution order.
results_pd.to_csv("vark.csv", encoding='utf-8', index=False)

In [14]:
# Scratch cell: displays `results_pd` with one more row appended. The result
# is not assigned, so `results_pd` itself is unchanged. It depends on
# `k`, `pos_ret`, `tot_ret`, `recall` and `avep` still being in the kernel
# namespace from a previously executed loop, so it fails on a fresh kernel.
results_pd.append([[k, pos_ret, tot_ret\
                                         , recall, (float)(pos_ret) / tot_ret, avep]])

Unnamed: 0,0,1,2,3,4,5,AveP,K,Precision,Recall,Topical Retrieved,Total Retrieved
0,,,,,,,0.377452,10.0,0.207529,0.400712,1913.0,9218.0
0,,,,,,,0.352256,20.0,0.194808,0.501466,2394.0,12289.0
0,,,,,,,0.230567,50.0,0.124771,0.656263,3133.0,25110.0
0,,,,,,,0.192682,100.0,0.099394,0.749476,3578.0,35998.0
0,,,,,,,0.145151,200.0,0.070177,0.836406,3993.0,56899.0
0,,,,,,,0.109731,10.0,0.044405,0.341603,4186.0,94268.0
0,,,,,,,0.101396,20.0,0.036273,0.445406,5458.0,150471.0
0,,,,,,,0.087788,50.0,0.031523,0.566509,6942.0,220222.0
0,,,,,,,0.078214,100.0,0.025887,0.655704,8035.0,310391.0
0,,,,,,,0.073354,200.0,0.021898,0.721315,8839.0,403642.0


## Selected Features

In [19]:
# For every topic, record which features each method selects when asked for
# 5, 10, 20 and 100 features. The greedy selection is computed by
# get_selected_features but not recorded here.
temporal_split_directory = '/mnt/1e69d2b1-91a9-473c-a164-db90daf43a3d/splitted_data/'
topics = ["Natr_Disaster", "Soccer","Space", "Human_Disaster", "Tennis", "Health", "LGBT", "Social_issue"]
feature_counts = [5, 10, 20, 100]

# One list of per-topic feature tables for each requested feature count.
tables_by_count = {num_feat: [] for num_feat in feature_counts}

for topic in topics:
    print(topic)
    train_pd = pd.read_csv(temporal_split_directory + topic + "/training2.csv").dropna()
    validation_pd = pd.read_csv(temporal_split_directory + topic + "/validation2.csv").dropna()
    test_pd = pd.read_csv(temporal_split_directory + topic + "/test2.csv").dropna().reset_index()

    # Keep a single row per distinct `term` value in each split.
    test = test_pd.drop_duplicates(subset='term').reset_index()
    valid = validation_pd.drop_duplicates(subset='term').reset_index()
    train = train_pd.drop_duplicates(subset='term').reset_index()

    tf = TestFramework(train, valid, test)

    for num_feat in feature_counts:
        baseline, basic, mi, _, solver = get_selected_features(topic, tf, test, num_feat=num_feat)

        baseline_features = get_feature_by_index(tf.get_pipeline(), baseline)
        basic_features = get_feature_by_index(tf.get_pipeline(), basic)
        mi_features = get_feature_by_index(tf.get_pipeline(), mi)
        solver_features = get_feature_by_index(tf.get_pipeline(), solver)

        # Side by side: baseline | basic | MI | solver.
        tables_by_count[num_feat].append(
            pd.concat([baseline_features, basic_features, mi_features, solver_features], axis=1))

# Stack the per-topic tables; the names below are read by the cells that follow.
results5_pd = pd.concat(tables_by_count[5]) if tables_by_count[5] else pd.DataFrame()
results10_pd = pd.concat(tables_by_count[10]) if tables_by_count[10] else pd.DataFrame()
results20_pd = pd.concat(tables_by_count[20]) if tables_by_count[20] else pd.DataFrame()
results100_pd = pd.concat(tables_by_count[100]) if tables_by_count[100] else pd.DataFrame()

Natr_Disaster
Initializing Classifier....
Complete!
#pos/#neg = 0.00115122440869
1179
2358
transform done!




Subsetted
basic
mi
#pos/#neg = 0.00115122440869
1179
2358
transform done!
Subsetted
basic
mi
#pos/#neg = 0.00115122440869
1179
2358
transform done!
Subsetted
basic
mi
#pos/#neg = 0.00115122440869
1179
2358
transform done!
Subsetted
basic
mi
Soccer
Initializing Classifier....
Complete!
#pos/#neg = 0.00506089887743
12254
24508
transform done!
Subsetted
basic
mi
#pos/#neg = 0.00506089887743
12254
24508
transform done!
Subsetted
basic
mi
#pos/#neg = 0.00506089887743
12254
24508
transform done!
Subsetted
basic
mi
#pos/#neg = 0.00506089887743
12254
24508
transform done!
Subsetted
basic
mi
Space
Initializing Classifier....
Complete!
#pos/#neg = 0.000943669106367
2623
5246
transform done!
Subsetted
basic
mi
#pos/#neg = 0.000943669106367
2623
5246
transform done!
Subsetted
basic
mi
#pos/#neg = 0.000943669106367
2623
5246
transform done!
Subsetted
basic
mi
#pos/#neg = 0.000943669106367
2623
5246
transform done!
Subsetted
basic
mi
Human_Disaster
Initializing Classifier....
Complete!
#pos/#neg = 0

In [20]:
# Write every feature-selection table to its own CSV file, omitting the
# DataFrame row index from the output.
for _frame, _out_name in (
    (results5_pd, "feats5.csv"),
    (results10_pd, "feats10.csv"),
    (results20_pd, "feats20.csv"),
    (results100_pd, "feats100.csv"),
):
    _frame.to_csv(_out_name, encoding='utf-8', index=False)

In [16]:
# NOTE(review): this targets "feats100.csv", the same path the results100_pd
# export writes to -- whichever of the two cells runs last determines the
# file's content.
results_pd.to_csv(
    "feats100.csv",
    index=False,
    encoding='utf-8',
)

In [23]:
# Number of rows accumulated in the 20-feature table.
results20_pd.shape[0]

160

In [27]:
# Export only the first 800 rows of the 100-feature table.
results100_pd.head(800).to_csv("feats100.csv", index=False, encoding='utf-8')

In [30]:
# Keep the first 800 rows of the 100-feature table.
# NOTE(review): 800 presumably equals 8 topics x 100 features -- confirm.
r = results100_pd.head(800)

In [None]:
# Save the truncated table `r` as CSV without the row index.
r.to_csv(
    "feats100.csv",
    index=False,
    encoding='utf-8',
)

In [31]:
# Read the exported CSV back and display it, to inspect what was written.
pd.read_csv("feats100.csv")

Unnamed: 0,Feature,Type,Feature.1,Type.1,Feature.2,Type.2,Feature.3,Type.3
0,hurricanesandy,Hashtag,aerial,Term,aerial,Term,61,Term
1,earthquakeph,Hashtag,aftermath,Term,aftermath,Term,aap,Term
2,abfloods,Hashtag,afternoon,Term,alive,Term,affected,Term
3,philippines,Term,alive,Term,allah,Term,aftermath,Term
4,philippines,Hashtag,allah,Term,amp,Term,aftershock,Term
5,storm,Term,amp,Term,appears,Term,aftershocks,Term
6,typhoon,Term,appears,Term,area,Term,aid,Term
7,victims,Term,area,Term,areas,Term,appears,Term
8,corkfloods,Hashtag,army,Term,army,Term,area,Term
9,earthquake,Term,beautiful,Term,avoid,Term,areas,Term


In [7]:
# Root directory of the temporally split data; each topic sub-directory is
# read for training2.csv, validation2.csv and test2.csv below.
temporal_split_directory = '/mnt/1e69d2b1-91a9-473c-a164-db90daf43a3d/splitted_data/'
topics = ["Natr_Disaster", "Soccer","Space", "Human_Disaster", "Tennis", "Health", "LGBT", "Social_issue"]


def _count_positive(frame):
    """Return how many rows of `frame` have a `label` value equal to 1."""
    return int((frame["label"] == 1).sum())


# Report, for every topic, the number of positively labelled rows per split.
for topic in topics:
    print(topic)
    topic_dir = temporal_split_directory + topic

    # Rows containing any missing value are discarded before counting.
    train_pd = pd.read_csv(topic_dir + "/training2.csv").dropna()
    validation_pd = pd.read_csv(topic_dir + "/validation2.csv").dropna()
    # reset_index() does not change the row count; it is kept so test_pd
    # has the same layout as before for any later use.
    test_pd = pd.read_csv(topic_dir + "/test2.csv").dropna().reset_index()

    # Print one pre-formatted string per line: print("label", value) emits a
    # tuple such as ('Training: ', 49664) when run under Python 2.
    print("Training: %d" % _count_positive(train_pd))
    print("Validation: %d" % _count_positive(validation_pd))
    print("Test: %d" % _count_positive(test_pd))

Natr_Disaster
('Training: ', 49664)
('Validation: ', 2830)
('Test: ', 1307)
Soccer
('Training: ', 19140)
('Validation: ', 1968)
('Test: ', 16570)
Space


  interactivity=interactivity, compiler=compiler, result=result)


('Training: ', 9963)
('Validation: ', 372)
('Test: ', 4202)
Human_Disaster


  interactivity=interactivity, compiler=compiler, result=result)


('Training: ', 162206)
('Validation: ', 54070)
('Test: ', 17513)
Tennis
('Training: ', 33770)
('Validation: ', 442)
('Test: ', 508)
Health
('Training: ', 146110)
('Validation: ', 856)
('Test: ', 1510)
LGBT
('Training: ', 2625)
('Validation: ', 6)
('Test: ', 2004)
Social_issue
('Training: ', 28102)
('Validation: ', 3086)
('Test: ', 7529)
