In [204]:
# http://www.kdd.org/exploration_files/KDDCUP2005Report_Shen.pdf
# http://research.microsoft.com/pubs/81350/sigir09.pdf
import pandas as pd
import numpy as np

import glob

from textblob import TextBlob
from nltk.tag import pos_tag


In [72]:
###########################################
# create a function to combine data files #
###########################################

def frame_masher(pattern='data/Labeled800Queries/*'):
    """Combine every labeled-query file matching ``pattern`` into one frame.

    Each file is read as headerless, tab-delimited text and its six columns
    are named ``query`` and ``lab1`` .. ``lab5``.

    Parameters
    ----------
    pattern : str, optional
        Glob pattern selecting the files to load. Defaults to the location
        that used to be hard-coded, so ``frame_masher()`` behaves as before.

    Returns
    -------
    pandas.DataFrame
        Rows of all matched files stacked in sorted-path order, with a fresh
        0..n-1 index.

    Raises
    ------
    ValueError
        If no file matches ``pattern``, or if a file does not have exactly
        six columns (raised by the column assignment).
    """
    column_names = ['query', 'lab1', 'lab2', 'lab3', 'lab4', 'lab5']

    # glob returns paths in arbitrary, OS-dependent order; sorting keeps the
    # row order of the combined frame reproducible between runs.
    query_files = sorted(glob.glob(pattern))
    if not query_files:
        # pd.concat would raise an opaque "No objects to concatenate" here
        raise ValueError('no query files found matching %r' % (pattern,))

    # read each file into its own frame, then stack them once at the end
    frame_list = []
    for path in query_files:
        frame = pd.read_csv(path, header=None, delimiter='\t')
        frame.columns = column_names
        frame_list.append(frame)

    # ignore_index renumbers the rows, replacing reset_index() + drop('index')
    return pd.concat(frame_list, ignore_index=True)

In [74]:
# Build the combined query frame. The feature functions defined below read
# this notebook-level `df`, so this cell must run before any of them is called.
df = frame_masher()

In [110]:
###############################################################
# create a function to count how many words are in each query #
###############################################################

def word_counter(queries=None):
    """Count the whitespace-separated words in each query.

    Parameters
    ----------
    queries : iterable of str, optional
        Queries to measure. Defaults to the ``query`` column of the
        notebook-level ``df``, so existing ``word_counter()`` calls keep
        working.

    Returns
    -------
    list of int
        One word count per query, in input order. A query with no
        non-whitespace characters counts as 0 words.
    """
    if queries is None:
        # fall back to the shared frame only when the caller passes nothing
        queries = df['query']

    # str.split() with no argument splits on any run of whitespace
    return [len(query.split()) for query in queries]

In [117]:
######################################################################################
# create a function that informs us of whether or not there are numbers in the query #
######################################################################################

def hasNumbers(inputString):
    """Return True if any character of ``inputString`` is a digit, else False.

    "Digit" follows ``str.isdigit``; an empty string yields False.
    """
    for symbol in inputString:
        if symbol.isdigit():
            # one digit is enough, no need to scan the rest
            return True
    return False

In [133]:
###########################################################
# create a function to apply the hasNumbers to each query #
###########################################################

def has_num(queries=None):
    """Flag, for each query, whether it contains at least one digit.

    Parameters
    ----------
    queries : iterable of str, optional
        Queries to inspect. Defaults to the ``query`` column of the
        notebook-level ``df``, so existing ``has_num()`` calls keep working.

    Returns
    -------
    list of bool
        ``hasNumbers(query)`` for each query, in input order.
    """
    if queries is None:
        # fall back to the shared frame only when the caller passes nothing
        queries = df['query']
    return [hasNumbers(query) for query in queries]

In [139]:
#######################################################
# create a function to count the characters per query #
#######################################################

def char_counter(queries=None):
    """Count the characters (whitespace included) in each query.

    Parameters
    ----------
    queries : iterable of str, optional
        Queries to measure. Defaults to the ``query`` column of the
        notebook-level ``df``, so existing ``char_counter()`` calls keep
        working.

    Returns
    -------
    list of int
        ``len(query)`` for each query, in input order.
    """
    if queries is None:
        # fall back to the shared frame only when the caller passes nothing
        queries = df['query']
    return [len(query) for query in queries]

In [202]:
#######################################
# count the parts of speech per query #
#######################################

def partOfSpeechCounter(p='NN', queries=None, match_prefix=False):
    """Count the words in each query whose part-of-speech tag matches ``p``.

    Every query is split on whitespace and tagged with ``nltk.tag.pos_tag``.

    Parameters
    ----------
    p : str, optional
        Tag to count, e.g. ``'NN'``, ``'VB'`` or ``'IN'``.
    queries : iterable of str, optional
        Queries to tag. Defaults to the ``query`` column of the
        notebook-level ``df``, so existing calls keep working.
    match_prefix : bool, optional
        False (default) keeps the original exact comparison ``tag == p``.
        True counts every tag that starts with ``p``. With the Penn Treebank
        tags NLTK's default English tagger emits, that lets ``'NN'`` also
        count ``NNS``/``NNP``/``NNPS`` and ``'VB'`` also count
        ``VBD``/``VBG``/``VBN``/``VBP``/``VBZ``, which exact matching skips.

    Returns
    -------
    list of int
        One count per query, in input order.
    """
    if queries is None:
        # fall back to the shared frame only when the caller passes nothing
        queries = df['query']

    part_count = []
    for q in queries:
        tagged = pos_tag(q.split())
        if match_prefix:
            hits = sum(1 for _word, pos in tagged if pos.startswith(p))
        else:
            hits = sum(1 for _word, pos in tagged if pos == p)
        part_count.append(hits)
    return part_count

In [203]:
# Add the per-query features as new columns on the notebook-level df.
df['word_count'] = word_counter()
df['has_num'] = has_num()
df['char_count']=char_counter()
# Characters per word. NOTE(review): a query with no whitespace-separated
# tokens has word_count 0, which makes this a division by zero — confirm that
# cannot occur in the data, or handle the resulting values downstream.
df['char_per_word'] =df['char_count']/df['word_count']

In [205]:
# One count column per part-of-speech tag. partOfSpeechCounter compares tags
# exactly, and 'NN' is also its default argument.
for column_name, pos_label in (('noun_count', 'NN'),
                               ('verb_count', 'VB'),
                               ('prep_count', 'IN')):
    df[column_name] = partOfSpeechCounter(pos_label)

In [207]:
# Save the frame, including the feature columns added above, to disk.
# NOTE(review): to_csv writes the row index as an extra unnamed first column
# by default — confirm that readers of this file expect it.
df.to_csv('data/queries.csv') # write to csv