In [14]:
# http://www.kdd.org/exploration_files/KDDCUP2005Report_Shen.pdf
# http://research.microsoft.com/pubs/81350/sigir09.pdf
# http://research.microsoft.com/pubs/79487/Query%20Enrichment%20for%20Web-query%20Classification.Shen.HKUST.TOIS.2006.Paper.pdf
%matplotlib inline

import pandas as pd
import numpy as np

import glob
import re

from textblob import TextBlob
from nltk.tag import pos_tag
from itertools import compress

import matplotlib.pyplot as plt
import plotly.plotly as py

In [2]:
###########################################
# create a function to combine data files #
###########################################

def frame_masher(pattern='data/Labeled800Queries/*'):
    """Load every labeled-query file matching ``pattern`` into one DataFrame.

    Each file is read as headerless, tab-separated text and must contain
    exactly six columns, which are named ``query`` and ``lab1`` .. ``lab5``.

    Parameters
    ----------
    pattern : str, optional
        Glob pattern selecting the input files. Defaults to the location of
        the labeled query files used by this notebook.

    Returns
    -------
    pandas.DataFrame
        All rows of all files, stacked in sorted-path order, with a fresh
        0..n-1 index.

    Raises
    ------
    ValueError
        If no file matches ``pattern``, or a file does not have six columns.
    """
    columns = ['query', 'lab1', 'lab2', 'lab3', 'lab4', 'lab5']

    # glob returns paths in arbitrary, platform-dependent order; sorting makes
    # the row order of the combined data frame reproducible between runs
    query_files = sorted(glob.glob(pattern))
    if not query_files:
        raise ValueError('no query files match %r' % (pattern,))

    # read each file into its own frame, then stack them once at the end
    frame_list = []
    for path in query_files:
        frame = pd.read_csv(path, header=None, delimiter='\t')
        frame.columns = columns
        frame_list.append(frame)

    # ignore_index renumbers the rows 0..n-1 instead of repeating per-file indices
    return pd.concat(frame_list, ignore_index=True)

In [3]:
# create the frame: every labeled-query file combined, with the columns
# query, lab1 .. lab5 that frame_masher assigns; the cells below add to it
df = frame_masher()

In [4]:
###############################################################
# create a function to count how many words are in each query #
###############################################################

def word_counter():
    """Return the number of whitespace-separated tokens in each query.

    Reads the module-level ``df`` and returns a list holding one integer per
    entry of ``df['query']``, in row order.
    """
    return [len(text.split()) for text in df['query']]

In [5]:
######################################################################################
# create a function that informs us of whether or not there are numbers in the query #
######################################################################################

def hasNumbers(inputString):
    """Return True if at least one character of ``inputString`` is a digit."""
    for symbol in inputString:
        if symbol.isdigit():
            return True
    return False

In [6]:
###########################################################
# create a function to apply the hasNumbers to each query #
###########################################################

def has_num():
    """Flag every query in the module-level ``df`` that contains a digit.

    Returns a list of booleans, one per entry of ``df['query']`` in row
    order, obtained by calling ``hasNumbers`` on each query.
    """
    return [hasNumbers(text) for text in df['query']]

In [7]:
#######################################################
# create a function to count the characters per query #
#######################################################

def char_counter():
    """Return the length, in characters, of each query.

    Reads the module-level ``df`` and returns a list with one integer per
    entry of ``df['query']``, in row order. Whitespace is counted.
    """
    return list(map(len, df['query']))

In [8]:
#######################################
# count the parts of speech per query #
#######################################

# Tags already computed for a query string, keyed by the query text. Shared by
# every call so that counting a second or third tag reuses the first pass
# instead of running the tagger over the whole query column again.
_POS_TAG_CACHE = {}


def partOfSpeechCounter(p='NN'): # others include 'VB' and 'IN'
    """Count, for each query, how many tokens were tagged exactly ``p``.

    Reads the module-level ``df``; each entry of ``df['query']`` is split on
    whitespace and tagged with ``pos_tag``.

    Parameters
    ----------
    p : str, optional
        Part-of-speech tag to count. Tags are compared for exact equality, so
        a tag that merely starts with ``p`` (for example ``'NNS'`` when
        ``p='NN'``) is not counted.

    Returns
    -------
    list of int
        One count per query, in row order.
    """
    part_count = []
    for q in df['query']:
        tags = _POS_TAG_CACHE.get(q)
        if tags is None:
            # first time this query text is seen: tag it once and remember
            # only the tags, which is all any later call needs
            tags = [pos for _word, pos in pos_tag(q.split())]
            _POS_TAG_CACHE[q] = tags
        part_count.append(tags.count(p))
    return part_count

In [9]:
###########################################################
# create a function to combine categories into one column #
###########################################################

def categoryCombine():
    """Collect each row's non-missing labels into a list of strings.

    Reads columns 1 to 5 (by position) of the module-level ``df``, which are
    the five label columns that follow the query column.

    The lists are built directly from the cell values rather than by joining
    on commas and splitting again, so a label that itself contains a comma
    stays in one piece, and a row with no labels yields an empty list rather
    than a list holding one empty string.

    Returns
    -------
    list of list of str
        One list per row, in row order, keeping the column order of the
        labels.
    """
    label_rows = df[df.columns[1:6]].values.tolist()
    return [
        [str(label) for label in row if pd.notna(label)]
        for row in label_rows
    ]

In [10]:
# Hard-coded super-category names that superCatBool and superCats look for in
# each row's labels. The order here fixes the order of every boolean vector
# built from it, and therefore of the label1 .. label7 columns further down.
# NOTE(review): the KDD Cup 2005 taxonomy appears to call one top-level
# category "Online Community" - confirm that 'Online' matches the label text.
super_categories = ['Computers','Entertainment','Information','Living','Online','Shopping','Sports']

In [11]:
#############################################################
# create a function that returns true if the super-category #
# exists in the categories column and false otherwise       #
#############################################################

def superCatBool():
    """Build, for every row, one boolean per entry of ``super_categories``.

    Reads the module-level ``df`` and ``super_categories``. A super-category
    counts as present when at least one label in the row's
    ``df['categories']`` list starts with that name. Plain list membership
    would only accept a label that is exactly the super-category name, so a
    longer label beginning with the name could never match; a prefix test
    accepts both forms.

    NOTE(review): this assumes the label text begins with its super-category
    name - confirm against the raw label files.

    Returns
    -------
    list of list of bool
        One vector per row, ordered like ``super_categories``.
    """
    return [
        [
            any(str(label).startswith(category) for label in labels)
            for category in super_categories
        ]
        for labels in df['categories']
    ]

In [12]:
#########################################################################
# a function to print out the super-categories associated with each row #
#########################################################################

def superCats():
    """List, for every row, the super-categories flagged by ``superCatBool``.

    Each boolean vector returned by ``superCatBool()`` is paired position by
    position with ``super_categories``; the names whose flag is truthy are
    kept, in ``super_categories`` order.

    Returns a list with one list of names per row.
    """
    return [
        [name for name, present in zip(super_categories, flags) if present]
        for flags in superCatBool()
    ]

In [None]:
# convert bools to string for R format
def boolToString():
    """Translate each vector in ``df['sup_cat_bool']`` to 'TRUE'/'FALSE' text.

    Reads the module-level ``df``. A value becomes 'TRUE' when it compares
    equal to ``True`` and 'FALSE' otherwise. Returns one list of strings per
    row, in row order.
    """
    return [
        ['TRUE' if flag == True else 'FALSE' for flag in cell]
        for cell in df['sup_cat_bool']
    ]

In [None]:
# store the 'TRUE'/'FALSE' vectors as a column; boolToString reads
# df['sup_cat_bool'], which is only assigned in a cell further down this
# notebook, so that cell has to be run before this one
df['bools']=boolToString()

In [None]:
# create a way to create separate columns
def boolRowMaker(x):
    """Return element ``x`` of every vector stored in ``df['bools']``.

    Reads the module-level ``df``. ``x`` is the position to pick from each
    row's vector; the result is a list with one value per row, in row order.
    """
    return [cell[x] for cell in df['bools']]

In [15]:
# per-query text features
df['word_count'] = word_counter()
df['has_num'] = has_num()
df['char_count']=char_counter()
# average word length, from the two counts above
df['char_per_word'] =df['char_count']/df['word_count']
# label features: 'categories' has to be assigned first, because superCats()
# and superCatBool() both read df['categories']
df['categories'] = categoryCombine()
df['super_categories'] = superCats()
df['sup_cat_bool'] = superCatBool()

In [16]:
# count three part-of-speech tags per query: the default 'NN', then 'VB' and
# 'IN'; each call iterates over every entry of df['query']
df['noun_count'] = partOfSpeechCounter()
df['verb_count'] = partOfSpeechCounter('VB')
df['prep_count'] = partOfSpeechCounter('IN')

KeyboardInterrupt: 

In [None]:
# create new columns: label1 .. label7 hold positions 0 .. 6 of each
# df['bools'] vector
for position in range(7):
    df['label%d' % (position + 1)] = boolRowMaker(position)

In [None]:
#######################################################
# split a string column into multiple column function #
#######################################################

def foo(x):
    """Return the pieces of ``x`` in reverse order as a ``pandas.Series``.

    Parameters
    ----------
    x : str or sequence
        A comma-separated string, which is split on ``','``, or an already
        split sequence such as the lists stored in ``df['super_categories']``,
        which is used as it is.

    Returns
    -------
    pandas.Series
        The pieces, last one first, indexed 0, 1, 2, ...
    """
    # anything offering .split is treated as text; lists and tuples do not,
    # and calling .split on them would raise AttributeError
    parts = x.split(',') if hasattr(x, 'split') else list(x)
    return pd.Series(parts[::-1])

In [None]:
# apply foo to super_categories column: foo returns each row's pieces in
# reverse order as a Series, and the positions of that Series (0, 1, 2, ...)
# are what the next cell reads as rev[0], rev[1], ...
rev=df['super_categories'].apply(foo)

In [None]:
# copy the first four columns of rev into df as sup1 .. sup4; reindex pads
# with all-NaN columns when no row produced that many pieces, where indexing
# rev[3] directly would raise a KeyError
rev_padded = rev.reindex(columns=range(4))
for position in range(4):
    df['sup%d' % (position + 1)] = rev_padded[position]

In [None]:
##################################################
# strip list punctuation via literal replacement #
##################################################

def colCleaner(column):
    """Strip list-formatting characters from every text cell of ``column``.

    Removes every ``[``, ``]``, ``'`` and space character by plain substring
    replacement (no regular expressions are involved). Spaces inside a value
    are removed as well, not only the ones around it.

    Parameters
    ----------
    column : iterable
        Cell values, for example a DataFrame column.

    Returns
    -------
    list
        One entry per input cell, in order. Cells without a ``replace``
        method, such as the float NaN pandas uses for missing values, are
        returned unchanged instead of raising ``AttributeError``.
    """
    unwanted_chars = ("[", "'", " ", "]")
    cleaned = []
    for cell in column:
        if hasattr(cell, 'replace'):
            for unwanted in unwanted_chars:
                cell = cell.replace(unwanted, "")
        cleaned.append(cell)
    return cleaned

In [None]:
# replace sup1 with its cleaned values
# NOTE(review): sup2 .. sup4 are not passed through colCleaner here - confirm
# whether that is intended
df['sup1']=colCleaner(df['sup1'])

In [207]:
# persist the feature frame; no index argument is passed, so pandas' default
# applies and the row index is written as an unnamed first column
df.to_csv('data/queries.csv') # write to csv