In [1]:
import pandas as pd
import numpy as np
from utils import remove_unicode, remove_digits, remove_punctuation, remove_stopwords

In [2]:
df = pd.read_table("tagged.tsv")

In [3]:
df.head()

Unnamed: 0,title,content_urls,description,last_updated,n_comments,n_votes,prerequisites,section,speaker_info,speaker_links,target_audience,type,year,proposal_type,selected
0,Make your own Packet Sniffer,GitHub,A Sniffer is a computer program or a piece of ...,"29 May, 2015",0,8,,Network Programming,,GitHub,Intermediate,Workshops,2015,,False
1,Building NextGen IoT solutions using Python an...,Draft of the PowerPoint\n[work in progress],"In this topic, we will use Raspberry Pi B+ run...","11 Sep, 2015",0,13,Basic Knowledge about Communication Protocols ...,Embedded Python,"Abhishek Narain – Technical Evangelist, Micros...",Abhishek Narain - https://twitter.com/narainab...,Beginner,Talks,2015,,False
2,Integrating Django with Centralised Identity M...,http://www.freeipa.org/page/Web_App_Authentica...,Most Django developers are familiar with authe...,"29 May, 2015",0,1,Attendees should have a passing familiarity wi...,Security,Fraser works at Red Hat on the FreeIPA central...,,Beginner,Talks,2015,,False
3,An introduction to Computer Vision using OpenC...,Code:,Computer Vision is the process of extracting r...,"10 Jun, 2015",0,1,,Scientific Computing,I am a second year student of BMS College of E...,GitHub : https://github.com/Specas,Beginner,Talks,2015,,False
4,Behavior-driven development | Web Testing auto...,,Many of us work for startups. And as the web a...,"30 May, 2015",0,0,,Testing,Shekhar <3 building products using open source...,He maintains few open source project and contr...,Intermediate,Workshops,2015,,False


In [5]:
text_cols = "title speaker_info section target_audience type prerequisites description".split()
pipe = [remove_unicode, remove_digits, remove_punctuation, remove_stopwords, lambda x: x.lower()]
for col in text_cols:
    s = df.pop(col)
    for cleaner in pipe:
        s = s.astype(str).apply(cleaner)
    df[col] = s

In [6]:
df.head()

Unnamed: 0,content_urls,last_updated,n_comments,n_votes,speaker_links,year,proposal_type,selected,title,speaker_info,section,target_audience,type,prerequisites,description
0,GitHub,"29 May, 2015",0,8,GitHub,2015,,False,make packet sniffer,,network programming,intermediate,workshops,,a sniffer computer program piece computer hard...
1,Draft of the PowerPoint\n[work in progress],"11 Sep, 2015",0,13,Abhishek Narain - https://twitter.com/narainab...,2015,,False,building nextgen iot solutions using python cloud,abhishek narain technical evangelist microsof...,embedded python,beginner,talks,basic knowledge communication protocols http p...,in topic use raspberry pi b running linux leve...
2,http://www.freeipa.org/page/Web_App_Authentica...,"29 May, 2015",0,1,,2015,,False,integrating django centralised identity manage...,fraser works red hat freeipa centralised ident...,security,beginner,talks,attendees passing familiarity http the\napache...,most django developers familiar authentication...
3,Code:,"10 Jun, 2015",0,1,GitHub : https://github.com/Specas,2015,,False,an introduction computer vision using opencvpy...,i second year student bms college engineering ...,scientific computing,beginner,talks,,computer vision process extracting relevant in...
4,,"30 May, 2015",0,0,He maintains few open source project and contr...,2015,,False,behaviordriven development web testing automa...,shekhar building products using open source t...,testing,intermediate,workshops,,many us work startups and web application grow...


In [7]:
df['speaker_link_present'] = False
df['content_url_present'] = False

In [9]:
for col in df:
    if df[col].dtype is np.dtype('O'):
        df[col].fillna(value="", inplace=True)

In [10]:
URL_PATTERN = "(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?"
df.loc[df.speaker_links.str.contains(URL_PATTERN, case=False), "speaker_link_present"] = True
df.loc[df.content_urls.str.contains(URL_PATTERN, case=False), "content_url_present"] = True

  from ipykernel import kernelapp as app
  app.launch_new_instance()


In [11]:
df.content_url_present.sum() / float(df.shape[0])

0.20344827586206896

In [12]:
del df['content_urls']
del df['speaker_links']

In [13]:
df['last_updated'] = pd.to_datetime(df.last_updated)

In [14]:
df.head()

Unnamed: 0,last_updated,n_comments,n_votes,year,proposal_type,selected,title,speaker_info,section,target_audience,type,prerequisites,description,speaker_link_present,content_url_present
0,2015-05-29,0,8,2015,,False,make packet sniffer,,network programming,intermediate,workshops,,a sniffer computer program piece computer hard...,False,False
1,2015-09-11,0,13,2015,,False,building nextgen iot solutions using python cloud,abhishek narain technical evangelist microsof...,embedded python,beginner,talks,basic knowledge communication protocols http p...,in topic use raspberry pi b running linux leve...,True,False
2,2015-05-29,0,1,2015,,False,integrating django centralised identity manage...,fraser works red hat freeipa centralised ident...,security,beginner,talks,attendees passing familiarity http the\napache...,most django developers familiar authentication...,False,True
3,2015-06-10,0,1,2015,,False,an introduction computer vision using opencvpy...,i second year student bms college engineering ...,scientific computing,beginner,talks,,computer vision process extracting relevant in...,True,False
4,2015-05-30,0,0,2015,,False,behaviordriven development web testing automa...,shekhar building products using open source t...,testing,intermediate,workshops,,many us work startups and web application grow...,True,False


In [15]:
deadline_16 = pd.to_datetime("1 July 2016")
deadline_15 = pd.to_datetime("1 June 2015")

In [16]:
df['deadlinediff'] = 0
df.loc[df.year == 2015, "deadlinediff"] = deadline_15 - df[df.year == 2015]['last_updated']
df.loc[df.year == 2016, "deadlinediff"] = deadline_16 - df[df.year == 2016]['last_updated']

In [17]:
del df['last_updated']

In [18]:
df.head()

Unnamed: 0,n_comments,n_votes,year,proposal_type,selected,title,speaker_info,section,target_audience,type,prerequisites,description,speaker_link_present,content_url_present,deadlinediff
0,0,8,2015,,False,make packet sniffer,,network programming,intermediate,workshops,,a sniffer computer program piece computer hard...,False,False,3 days
1,0,13,2015,,False,building nextgen iot solutions using python cloud,abhishek narain technical evangelist microsof...,embedded python,beginner,talks,basic knowledge communication protocols http p...,in topic use raspberry pi b running linux leve...,True,False,-102 days
2,0,1,2015,,False,integrating django centralised identity manage...,fraser works red hat freeipa centralised ident...,security,beginner,talks,attendees passing familiarity http the\napache...,most django developers familiar authentication...,False,True,3 days
3,0,1,2015,,False,an introduction computer vision using opencvpy...,i second year student bms college engineering ...,scientific computing,beginner,talks,,computer vision process extracting relevant in...,True,False,-9 days
4,0,0,2015,,False,behaviordriven development web testing automa...,shekhar building products using open source t...,testing,intermediate,workshops,,many us work startups and web application grow...,True,False,2 days


In [19]:
df.deadlinediff.min()

Timedelta('-370 days +00:00:00')

In [20]:
df['deadlinediff'] = df.deadlinediff.apply(lambda x: x.days)

AttributeError: 'NaTType' object has no attribute 'days'

In [20]:
df.head()

Unnamed: 0,n_votes,n_comments,year,title,speaker_info,section,target_audience,type,prerequisites,description,speaker_link_present,content_url_present,deadlinediff
0,58,0,2015,consuming government data python d,pratap vardhan data scientist gramenercom data...,data visualization analytics,intermediate,talks,,the explosion open data especially government ...,False,False,-94
1,19,1,2015,dont get scared get started,tapasweni pathaki done bachelors it igdtuw i w...,others,beginner,talks,nothingother passion coding,opensource world full excitement knowledge enc...,False,False,1
2,4,0,2015,distributed scheduling leveraging multiple nod...,i software engineer red hat inc working gluste...,concurrency,beginner,talks,a basic understanding distributed system works...,setting cron job machine perhaps easiest way s...,True,True,35
3,22,0,2015,analyzing python code pylint,im open source enthusiast coming romania lead ...,others,intermediate,talks,the participants basic understanding python no...,given dynamic nature python bugs tend creep co...,False,False,-112
4,70,0,2015,python metaprogramming macros madness more,suhas data scientist gramener previously engin...,core python,intermediate,talks,,summaryever wanted conquer world fell short kn...,False,False,-94


In [21]:
sel_2015 = """
        simple hacks make your django website faster
        pretty printing in python
        machine learning techniques for building a large scale
        laying out your django projects
        python and riakdb
        building flexible filesystems with fuse-python
        symengine: the future fast core of computer algebra systems
        test driven development with ansible
        explore big data using simple python code
        introduction to nipype and how do we create
        python load balancer: 0 to 1 million requests per second
        creating, deployment & customizing
        building nextgen iot solutions
        consuming government data with python and d3
        python traceback for humans
        how to build microservices using zeromq and wsgi
        rip nagios. hello docker shinken
        building offensive web security framework in python
        how to detect phishing urls using pyspark decision trees
        fedmsg: the message bus of fedora infrastructure
        concurrent data processing in python
        analyzing arguments during a debate using natural language processing
        avoiding common pitfalls of datetime from a webapp
        python 2 metaprogramming, macros, madness & more
        rest apis - what, why and how
        solving logical puzzles with natural language processing
        getting started with ansible
        let's learn statistics
        using devstack to contribute to openstack
        building nextgen iot solutions using python and cloud
        reasoning under uncertainty with python
        python on your mobile phone(advanced concepts)
        django projects the right way
        symbolic computation with python, sympy
        thinking in functions
        """

sel_2016 = """
        hacking the python ast
        helix and salt: case study in high volume and distributed python applications
        realtime microservices with server side flux
        building an automatic keyphrase extraction system using nltk
        testing native binaries using cffi and py.test
        the trends in choosing licenses in python ecosystems
        good bye, call stack; hello, event driven architectures
        algorithmic music generation
        python byte code hacks
        load testing using locust.io
        continuous integration for data scientists
        building companion chatbot with python
        deploying your python backend with
        big data analysis using pyspark
        flying a drone
        containerize upstream projects effortlessly
        financial modelling and simulation with python
        micropython - porting python to microcontrollers
        creating a recommendation engine based on nlp and contextual
        open source health monitoring and evaluation systems
        concurrency in modern robots
        building a secure iot platform using paho and flask
        don't write tests, generate them
        real time sentiment analysis with apache storm and python
        building a lie detector: multi-modal sentiment analysis
        docker workshop
        talking to machines: optimizing neural networks with theano
        productive coding with pycharm
        demystifying the django rest framework
        scaling django with kubernetes
        """

sel_2015 = [l.rstrip().lstrip() for l in sel_2015.splitlines() if l]
sel_2016 = [l.rstrip().lstrip() for l in sel_2016.splitlines() if l]

In [22]:
df['selected'] = False

In [23]:
sel_2015 = [l for l in sel_2015 if l]
sel_2016 = [l for l in sel_2016 if l]

In [25]:
for proposal in sel_2015:
    proposal = remove_stopwords(proposal)
    proposal = remove_punctuation(proposal)
    sdf = df[df.year == 2015][df.title.str.contains(proposal, case=False)]
    if sdf.shape[0] != 1:
        print("2015", proposal)
    else:
        df.loc[sdf.index[0], "selected"] = True
for proposal in sel_2016:
    proposal = remove_stopwords(proposal)
    proposal = remove_punctuation(proposal)
    sdf = df[df.year == 2016][df.title.str.contains(proposal, case=False)]
    if sdf.shape[0] != 1:
        print("2016", proposal)
    else:
        df.loc[sdf.index[0], "selected"] = True



2015 simple hacks make django website faster
2015 python riakdb
2015 symengine future fast core computer algebra systems
2015 python load balancer 0 1 million requests per second
2015 consuming government data python d3
2015 python traceback humans
2015 fedmsg message bus fedora infrastructure
2015 python 2 metaprogramming macros madness 
2016 trends choosing licenses python ecosystems
2016 flying drone
2016 talking machines optimizing neural networks theano




In [52]:
df[df.year == 2016][df.title.str.contains("theano", case=False)]

  if __name__ == '__main__':


Unnamed: 0,n_votes,n_comments,year,title,speaker_info,section,target_audience,type,prerequisites,description,speaker_link_present,content_url_present,deadlinediff,selected
174,19,1,2016,theano keras teaching python learn english,i free software enthusiast researcher computer...,scientific computing,intermediate,talks,understanding machine learning algorithms favo...,the main intention talk introduce people thean...,False,False,20,False
214,6,0,2016,talking machines optimizing neural networks t...,deep learning robotics enthusiast presently fi...,scientific computing,intermediate,workshops,while topics introduced talk scratch familiari...,with recent advances field deep learning compu...,False,False,-27,False


In [53]:
df.loc[214, "selected"] = True

In [54]:
df.to_csv("tagged.tsv", index=False, sep="\t")