In [2]:
import csv, os, collections, json
import numpy as np
import pandas as pd
import collections
import preprocessor as p
import itertools

### Labels

In [3]:
## The annotators entry is just the wiki information about the events.
## The events entry holds 6 events in total; each event has the following structure:
#
#
#
# 
# events 
#     |_event1
#     |      |_ eventid
#     |      |_  tweets   
#     |             |_[{tweet_1: id, category et al}, {tweet_2}, {tweet_3},......{tweet_n}]
#     |
#     |_ event2
#     |       |_ eventid
#     |       |_  tweets   
#     |             |_[{tweet_1: id, category et al}, {tweet_2}, {tweet_3},......{tweet_n}]
#
#
#
#

In [4]:
# Directory holding the label JSON files. The trailing slash matters: paths are
# later built by plain string concatenation (filen + file name).
filen = 'TRECIS-2018-TestEvents-Labels/'
# Every entry of that directory is later passed to gettweets as a label file.
files = os.listdir(filen)

In [5]:
def getevents(filename):
    """Return the ``eventid`` of every event stored in a label JSON file.

    Parameters
    ----------
    filename : str
        Path to a JSON file whose top-level object has an ``events`` list,
        each entry of which carries an ``eventid`` key.

    Returns
    -------
    list
        The ``eventid`` values, in the order the events appear in the file.
    """
    with open(filename, 'r') as handle:
        payload = json.load(handle)
    return [event['eventid'] for event in payload['events']]

In [6]:
def gettweets(filename):
    """Flatten a label JSON file into three parallel, per-tweet lists.

    Parameters
    ----------
    filename : str
        Path to a JSON file whose top-level object has an ``events`` list;
        each event carries an ``eventid`` and a ``tweets`` list, and each
        tweet carries ``postID`` and ``categories``.

    Returns
    -------
    tuple of (list, list, list)
        ``(post ids, categories, event ids)`` with one entry per tweet, in
        file order. The event id is repeated for every tweet of that event.
    """
    with open(filename, 'r') as handle:
        payload = json.load(handle)

    post_ids, label_sets, event_ids = [], [], []
    for event in payload['events']:
        for tweet in event['tweets']:
            # One entry per tweet in each list keeps the three lists aligned.
            event_ids.append(event['eventid'])
            post_ids.append(tweet['postID'])
            label_sets.append(tweet['categories'])
    return post_ids, label_sets, event_ids

In [7]:
# Accumulate tweet ids, event ids and category lists across every label file.
# The three lists stay aligned: entry i of each describes the same tweet.
theirids, theirtopics, categories = [], [], []
for label_file in files:
    # gettweets returns (post ids, categories, event ids) in that order.
    file_ids, file_categories, file_topics = gettweets(filen + label_file)
    theirids += file_ids
    theirtopics += file_topics
    categories += file_categories

In [8]:
# Sanity check: the three lists are extended in lockstep, so their lengths should match.
len(categories), len(theirids), len(theirtopics)

(19784, 19784, 19784)

In [9]:
# Stack the three parallel lists as rows, then transpose to get one row per tweet
# with positional columns 0, 1, 2 (tweet ids, event ids, category lists).
labeldf = pd.DataFrame([theirids, theirtopics, categories]).T

In [10]:
# Name the columns; 'tweetids' is the key get_features merges on.
labeldf.columns = ['tweetids', 'alltopics', 'categories']

### get methods for features

In [11]:
# Directory listed to discover the per-event dataset files.
# NOTE(review): get_dataset opens files from 'datasets_json/', not from `path` --
# confirm that both directories contain the same file names.
path = 'datasets/'
# Rebinds the global `files` (previously the label-file listing); get_features
# reads this global when it calls get_dataset.
files = os.listdir(path)

In [20]:
def get_dataset(name, files, data_dir='datasets_json/'):
    """Load tweet ids, texts and topics for every dataset file matching ``name``.

    Each matching file is read as JSON Lines: one JSON object per line with a
    top-level ``topic`` key and an ``allProperties`` object holding ``id`` and
    ``text``.

    Parameters
    ----------
    name : str
        Substring used to select file names from ``files``.
    files : list of str
        Candidate file names (not full paths).
    data_dir : str, optional
        Directory the selected files are opened from. Defaults to
        ``'datasets_json/'``.

    Returns
    -------
    tuple of (list, list, list)
        ``(tweet ids, tweet texts, topics)`` with one entry per record, in
        file order.

    Raises
    ------
    OSError
        If a selected file cannot be opened.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    KeyError
        If a record lacks ``topic``, ``allProperties``, ``id`` or ``text``.
    """
    event = [eve for eve in files if name in eve]
    print("Files found:", event)
    tweetids = []
    alltweets = []
    alltopics = []
    for filename in event:
        # JSON text is UTF-8; being explicit avoids platform-dependent decoding.
        with open(os.path.join(data_dir, filename), 'r', encoding='utf-8') as fp:
            for line in fp:
                # Tolerate blank or trailing empty lines instead of crashing in json.loads.
                if not line.strip():
                    continue
                # Parse each record once and pull all three fields from it.
                record = json.loads(line)
                properties = record['allProperties']
                tweetids.append(properties['id'])
                alltweets.append(properties['text'])
                alltopics.append(record['topic'])
    return tweetids, alltweets, alltopics

In [17]:
# Event name = second dot-separated part of each file name, e.g.
# 'trecis2019-A-test.floodChoco2019.json' -> 'floodChoco2019'.
events = [each.split('.')[1] for each in files]

In [29]:
# Event-type keywords.
# NOTE(review): not referenced by any other cell visible here -- confirm it is still needed.
events_names =    ['Earthquake','fire', 'typhoon', 'Floods', 'Tornado', 'Bombing', 'Shooting', 'Attacks']

### Event features

In [21]:
def get_features(name, df2):
    """Build and save the feature table for one event.

    Loads the event's tweets, inner-joins them with the label table on
    ``tweetids``, adds one 0/1 column per category label, replaces the raw
    tweet text with a cleaned version and writes the result to
    ``Features/<name>_features.csv``. Diagnostic counts are printed along
    the way.

    Parameters
    ----------
    name : str
        Event name; used both to select dataset files and to name the CSV.
    df2 : pandas.DataFrame
        Label table with at least a ``tweetids`` column and a ``categories``
        column whose values are lists of label strings.

    Returns
    -------
    None

    Notes
    -----
    Reads the module-level ``files`` list (passed on to ``get_dataset``), so
    that global must hold the dataset file names when this is called.
    """
    tweetids, alltweets, alltopics = get_dataset(name, files)
    # One row per tweet. dtype=object matches the dtypes the merge key had when
    # the frame was built by transposing a list of lists, and (unlike the
    # transpose) still yields the three named columns when no tweets were loaded.
    df = pd.DataFrame(
        {'tweetids': tweetids, 'topics': alltopics, 'tweets': alltweets},
        dtype=object,
    )
    print("Number of tweet ids found: ",len(set(tweetids)))
    # Inner join with the labels; keep the first row for each tweet id.
    dfn = pd.merge(df, df2, on = 'tweetids').drop_duplicates(subset = ['tweetids'])
    # Not every tweet necessarily has a label, so the merge can shrink the data.
    print("Number of records seen after merging with labels:",len(dfn))
    # Independent count of shared ids, to cross-check the merge result.
    print("Actual number of common ids seen: ",len(set(df['tweetids'].tolist()).intersection(set(df2['tweetids'].tolist()))))
    # Tweet ids that did not survive the merge (duplicates in the input are
    # counted each time). A set makes the membership test O(1) per id.
    merged_ids = set(dfn['tweetids'].tolist())
    nf = [each for each in tweetids if each not in merged_ids]
    print("Not found tweets count:",len(nf))

    # Per-row category lists from the merged frame.
    allcategories = dfn['categories'].tolist()
    # Distinct labels in first-seen order; this fixes the column order.
    labels = list(dict.fromkeys(itertools.chain.from_iterable(allcategories)))
    # One-hot encode: one column per label, 1 where the row carries that label.
    for label in labels:
        dfn[label] = [1 if label in row_categories else 0
                      for row_categories in allcategories]

    # Clean the tweets before writing: strip URLs, emojis, reserved words and mentions.
    p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.RESERVED,  p.OPT.MENTION)
    dfn['cleanedtweets'] = [p.clean(data) for data in dfn['tweets'].tolist()]
    # Keep only the cleaned text in the output.
    dfn = dfn.drop('tweets', axis=1)

    # Make sure the output directory exists so to_csv cannot fail on a fresh checkout.
    os.makedirs('Features', exist_ok=True)
    dfn.to_csv('Features/'+name+'_features.csv')
    print("File save as: {}_features.csv".format(name))
    print("-----------------------------------------------------------------------------")

    return


In [22]:
# Build and save the feature CSV for every event, merging against the label table.
for each in events:
    get_features(each, labeldf)

Files found: ['trecis2019-A-test.earthquakeCalifornia2014.json']
Number of tweet ids found:  128
Number of records seen after merging with labels: 0
Actual number of common ids seen:  0
Not found tweets count: 128
File save as: earthquakeCalifornia2014_features.csv
-----------------------------------------------------------------------------
Files found: ['trecis2019-A-test.floodChoco2019.json']
Number of tweet ids found:  674
Number of records seen after merging with labels: 0
Actual number of common ids seen:  0
Not found tweets count: 854
File save as: floodChoco2019_features.csv
-----------------------------------------------------------------------------
Files found: ['trecis2019-A-test.hurricaneFlorence2018.json']
Number of tweet ids found:  2500
Number of records seen after merging with labels: 0
Actual number of common ids seen:  0
Not found tweets count: 2500
File save as: hurricaneFlorence2018_features.csv
----------------------------------------------------------------------