# Send data to CrowdFlower for crowdsourced annotation

In [1]:
import pandas as pd
import re
import csv

#### define events
- titles, descriptions

In [2]:
# Maps each event title to the free-text description stored alongside every
# tweet of that event in the exported CSVs. The titles double as the values of
# the 'event' column, so the keys must match that column exactly.
event_metadata = {
    'UEFA Champions League Final' : "The 2016 UEFA Champions League Final was the final match of the 2015–16 UEFA Champions League, the 61st season of Europe's premier club football tournament organised by UEFA, and the 24th season since it was renamed from the European Champion Clubs' Cup to the UEFA Champions League. It was played at the San Siro stadium in Milan, Italy, on 28 May 2016, between Spanish teams Real Madrid and Atlético Madrid, in a repeat of the 2014 final.",
    'French Open Finals' : "The 2016 French Open was a tennis tournament played on outdoor clay courts. It was the 115th edition of the French Open and the second Grand Slam event of the year. It took place at the Stade Roland Garros from 22 May to 5 June and consisted of events for professional players in singles, doubles and mixed doubles play.",
    'Monaco Grand Prix' : "The 2016 Monaco Grand Prix (formally known as the Formula 1 Grand Prix de Monaco 2016) was a Formula One motor race held on 29 May 2016 at the Circuit de Monaco, a street circuit that runs through the Principality of Monaco. It was the sixth round of the 2016 season, and marked the seventy-fourth time that the Monaco Grand Prix had been held, as well as the sixty-third time it had been held as a round of the Formula One World Championship since the series inception in 1950.",
    'Stanley Cup Playoffs Final Game 7' : "The 2016 Stanley Cup playoffs of the National Hockey League (NHL) began on April 13, 2016 and ended on June 12, 2016, with the Pittsburgh Penguins defeating the San Jose Sharks four games to two in the 2016 Stanley Cup Finals.",
    '24 Hours of Le Mans' : "The 84th 24 Hours of Le Mans (French: 84e 24 Heures du Mans) was an automobile endurance event held from 15 to 19 June 2016 at the Circuit de la Sarthe, Le Mans, France. It was the 84th running of the 24 Hour race organised by the Automobile Club de l'Ouest as well as the third round of the 2016 FIA World Endurance Championship.",
    'NBA Playoffs Finals Game 7': "The 2016 NBA Playoffs were the postseason tournament of the National Basketball Association's 2015-16 season. The tournament ended with the Eastern Conference champion Cleveland Cavaliers defeating the Western Conference champion Golden State Warriors 4 games to 3 in the 2016 NBA Finals.",
    # Wikipedia footnote markers ("[5]", "[10]") were stripped from the
    # descriptions: they are copy-paste residue, not part of the text.
    'The Next Web Conference Europe' : "TNW Conferences are an annual set of conferences hosted by The Next Web in Amsterdam and New York. The first event took place in 2006.",
    'Recode Code Conference' : "Code Conference is an annual invitation-only event hosted by Recode, and takes place in Dana Point, California in late May. The conference features top industry influencers who gather for in-depth conversations about the current and future impact of digital technology on our lives — at home, at work, in our communities and the world.",
    'Google I/O' : "Google I/O is an annual developer-focused conference held and organized by Google in San Francisco, California. Google I/O features highly technical, in-depth sessions focused on building web, mobile, and enterprise applications with Google and open web technologies such as Android, Chrome, Chrome OS, APIs, Google Web Toolkit, App Engine, and more.",
    'Apple Worldwide Developer Conference' : "The Apple Worldwide Developers Conference (WWDC), is a conference held annually in California by Apple Inc. The conference is used by Apple to showcase its new software and technologies for software developers.",
    'Lenovo Tech World' : "The 2016 Lenovo Tech World was the second edition of an annual technology conference held in San Francisco, California. The first was held in Beijing, China in 2015, and the conference is used to showcase new technologies developed by Lenovo.",
    'Xbox E3' : "The Electronic Entertainment Expo 2016, commonly known as E3 2016, was the 22nd Electronic Entertainment Expo, during which several hardware manufacturers and software developers and publishers from the video game industry presented new and upcoming products to the attendees, primarily retailers and members of the video game press. The event, organized by the Entertainment Software Association, took place at the Los Angeles Convention Center from June 14 to 16, 2016."
    }

### get summary data

In [3]:
# Event titles, in the order they are declared in event_metadata.
events = list(event_metadata.keys())

# Summary variants; one tab-separated file is read per (event, variant) pair.
summary_types = [
    'engagements',
    'pr_subevents',
    'pr_e_subevents',
    'prplus_e_subevents'
    ]

# data maps each event title to one DataFrame holding all of its summary
# variants stacked on top of each other.
data = {}
for event_name in events:
    # File names use the event title with every non-word character removed.
    # The pattern is a raw string so that "\W" is not parsed as an (invalid)
    # string escape sequence.
    event_filename = re.sub(r"\W+", "", event_name.replace('/', '').strip())

    # Collect the per-variant frames in a local list instead of temporarily
    # storing a dict under data[event_name] and then overwriting it.
    summaries = []
    for summary_type in summary_types:
        d = pd.read_csv('data/final/summary/foreval_event_%s_summary_%s.txt' % (event_filename, summary_type), sep='\t', encoding='utf-8', header=0, parse_dates=['created_at'], dtype={'twitter_id' : 'str'})
        # Tag every row with its origin so it survives the concatenation.
        d['event'] = event_name
        d['summary_type'] = summary_type
        summaries.append(d)

    data[event_name] = pd.concat(summaries)

allevents = list(data.values())

# Single frame with the summaries of all events.
df = pd.concat(allevents)
df.shape

(1200, 15)

In [4]:
# Keep only the columns needed for the export. The explicit copy makes df an
# independent frame, so adding a column below does not write into a slice of
# the full summary table (SettingWithCopyWarning).
df = df[['event', 'twitter_id', 'created_at', 'text']].copy()

# Replace each event title with its description from event_metadata.
# The result is assigned back rather than calling
# df['description'].replace(..., inplace=True): that chained in-place call
# acts on a temporary Series and may leave df unchanged.
df['description'] = df['event'].replace(event_metadata)

In [5]:
# Write every summary row (duplicates included) as a comma-separated file,
# quoting all non-numeric fields.
df.to_csv(
    'data/final/summary/allsummaries_foreval.csv',
    sep=',',
    quoting=csv.QUOTE_NONNUMERIC,
    encoding='utf-8',
    header=True,
    index=False,
)

In [6]:
# The same tweet can appear in several summary variants; keep only the first
# row seen for each twitter_id.
df_deduped = df.drop_duplicates(subset=['twitter_id'], keep='first')
df_deduped.shape

(886, 5)

In [7]:
# Write the de-duplicated summary rows as a comma-separated file, quoting all
# non-numeric fields.
df_deduped.to_csv(
    'data/final/summary/allsummaries_foreval_deduped.csv',
    sep=',',
    quoting=csv.QUOTE_NONNUMERIC,
    encoding='utf-8',
    header=True,
    index=False,
)

# OR...

### random samples of all data

In [None]:
# Load the full data file of every event; data maps event title -> DataFrame.
data = {}
for event_name in event_metadata:
    # File names use the event title with every non-word character removed
    # (this also drops the '/' in 'Google I/O'). The pattern is a raw string so
    # that "\W" is not parsed as an (invalid) string escape sequence.
    event_filename = re.sub(r"\W+", "", event_name.strip())
    data[event_name] = pd.read_csv('data/final/event_%s_data.txt' % event_filename, sep='\t', encoding='utf-8', header=0, parse_dates=['created_at'], dtype={'twitter_id' : 'str'})

allevents = list(data.values())

In [None]:
# Read the tab-separated file of classified rows; they are excluded from the
# random sample drawn below.
classified = pd.read_csv(
    'data/final/final_data_classified.txt',
    sep='\t',
    encoding='utf-8',
    header=0,
)
classified.shape

#### Random sample: k rounds × 100 rows per event
- stack 100 sampled rows per event
- excluding rows that are already classified
- excluding rows already in the sample being created

In [None]:
# Draw k rounds of 100 random rows per event, skipping rows whose event_id is
# already in `classified` and tweets that were drawn earlier in this run.
k = 1

# Sampled frames are collected in a list and concatenated once at the end.
# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on
# every call.
picked = []
sampled_ids = set()  # twitter_ids drawn so far, across all rounds and events

for _ in range(k):
    for eventdf in allevents:

        # Each per-event frame is expected to hold a single event title.
        event_name = eventdf['event'].unique()[0]
        event_classified = classified[classified['event'] == event_name]

        # Drop rows that have already been classified for this event.
        eventdf = eventdf[~eventdf['event_id'].isin(event_classified['event_id'])]

        # Drop tweets that are already part of the sample being built.
        candidates = eventdf[~eventdf['twitter_id'].isin(sampled_ids)]

        # NOTE: sample() raises ValueError when fewer than 100 candidate rows
        # remain; this is kept so an undersized sample is never produced
        # silently.
        chosen = candidates.sample(n=100, replace=False)
        picked.append(chosen)
        sampled_ids.update(chosen['twitter_id'])

if picked:
    samples = pd.concat(picked)
else:
    # Nothing was drawn (e.g. k == 0): keep an empty frame with the data columns.
    samples = pd.DataFrame(columns=allevents[0].columns)

samples.shape

In [None]:
# Keep only the columns needed for the export. The explicit copy makes samples
# an independent frame, so adding a column below does not write into a slice
# (SettingWithCopyWarning).
samples = samples[['event', 'event_id', 'twitter_id', 'created_at', 'text']].copy()

# Replace each event title with its description from event_metadata.
# The result is assigned back rather than calling
# samples['description'].replace(..., inplace=True): that chained in-place
# call acts on a temporary Series and may leave samples unchanged.
samples['description'] = samples['event'].replace(event_metadata)

In [None]:
# Show the first sampled row as a quick check of the columns.
samples.head(n=1)

In [None]:
# Write the sampled rows as a comma-separated file, quoting all non-numeric
# fields.
samples.to_csv(
    'data/final/events_annsample_forcrowdflower5.csv',
    sep=',',
    quoting=csv.QUOTE_NONNUMERIC,
    encoding='utf-8',
    header=True,
    index=False,
)