# Autosuggestion Collection
This function handles the core process of collecting autosuggestion data from Google or Bing.

In [6]:
import requests
import urllib

# ----------------------------------------------------------------------------------------------------------------
# collect_autosuggestions
#
# parameters:
# "source" is either "google" or "bing"
# "tld" stands for "top level domain" and can be any of the 2-letter country codes listed here where google operates: https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2
# "lang" is the language of the suggestions returned, should be two letter codes from here: https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
# "query" is the query that you would like to see autocompleted
# ----------------------------------------------------------------------------------------------------------------

def collect_autosuggestions(source, tld, lang, query):
    if source == "google":
        # Some info on this api: http://shreyaschand.com/blog/2013/01/03/google-autocomplete-api/
        print urllib.urlencode({'q': query, 'hl': lang})
        url = 'http://www.google.'+tld+'/complete/search?&client=chrome&%s' % (urllib.urlencode({'q': query, 'hl': lang}))
       
    elif source == "bing":
        # Note: for Bing the language is controlled by the tld, so the lang parameter will have no effect on its own
        url = 'http://api.bing.com/osjson.aspx?%s' % (urllib.urlencode({'query': query.encode('utf-8'), 'cc': tld}))
   
    r = requests.get(url)
    suggestions = r.json()[1]
    return suggestions

In [8]:
import pandas as pd
import datetime
import goslate
import itertools
import os

required_countries = {"brazil", "india", "united states"}

evaluators = {"brazil":["brazil", "india", "argentina", "paraguay", "united kingdom", "united states"],
              "india":["india", "brazil", "pakistan", "nepal", "united kingdom", "united states"],
              "united states":["united states", "india", "brazil", "mexico", "canada", "united kingdom"]
             }

nationalities = {"united states":"american",
               "india":"indian",
              "brazil":"brazilian",
              "argentina":"argentinian",
              "paraguay": "Paraguayan",
              "united kingdom":"british",
              "pakistan":"pakistani",
              "nepal": "nepali",
              "mexico":"mexican",
              "canada":"canadian"
            }

hlds = {"united states":"com",
               "india":"co.in",
              "brazil":"com.br",
              "argentina":"com.ar",
              "paraguay": "com.py",
              "united kingdom":"co.uk",
              "pakistan":"com.pk",
              "nepal": "com.np",
              "mexico":"com.mx",
              "canada":"ca"
             }

languages = {  "united states":"en",
              "india":"en",
              "brazil":"pt",
              "argentina":"es",
              "paraguay": "es",
              "united kingdom":"en",
              "pakistan":"ur",
              "nepal": "ne",
              "mexico":"es",
              "canada":"en"
             }

queries_object = { "why is ",
                   "why are"
                     }

queries_subject = {"men",
                   "women",
                   "cities",
                   "tourists"
                  }

def send_query(query, hld, language, path, csv_name):
    #try:
    suggestions = collect_autosuggestions("google", hld, language, query)

    suggestions_df = pd.DataFrame({"suggestion": suggestions})
    suggestions_df["datetime"] = datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S')
    suggestions_df["search_term"] = query
    suggestions_df["tld"] = hld
    suggestions_df["language"] = language
    suggestions_df["engine"] = "google"
    suggestions_df.to_csv(os.path.join(path, csv_name + ".csv"), mode='a', header=False)

    #except:
     #   print "Query Failed"

def get_list_country(country, query):
    for cur_country in evaluators[country]:
        hld = hlds[cur_country]
        language = languages[cur_country]
        
        query_list = []
        if query == "query1":
            query_list = ["why", "is", country]
        elif query == "query2":
            query_list = ["why", "are", nationalities[country]]
        elif query == "query3":
            query_list = ["why", "are", nationalities[country], "men"]
        elif query == "query4":
            query_list = ["why", "are", nationalities[country], "women"]
        elif query == "query5":
            query_list = ["why", "are", nationalities[country], "tourists"]
        elif query == "query6":
            query_list = ["why", "are", nationalities[country], "cities"]
            
        for permutation in itertools.permutations(query_list):
            #print permutation
            query_string = ""
            for item in permutation:
                query_string+=str(item) + " "
            
            #gs = goslate.Goslate(writing=goslate.WRITING_ROMAN)
            #query = gs.translate(query_string, language)    
            print query_string
            language = 'en'
            send_query(query_string, hld, language, country+ "/" + query + "/", "suggestions_" + query + "_"+cur_country)
        

#Why is {country}
get_list_country("brazil", "query1")
get_list_country("india", "query1")
get_list_country("united states", "query1")

#Why are {nationality}
get_list_country("brazil", "query2")
get_list_country("india", "query2")
get_list_country("united states", "query2")

#Why are {nationality} men
get_list_country("brazil", "query3")
get_list_country("india", "query3")
get_list_country("united states", "query3")


#Why are {nationality} women
get_list_country("brazil", "query4")
get_list_country("india", "query4")
get_list_country("united states", "query4")

#Why are {nationality} tourists
get_list_country("brazil", "query5")
get_list_country("india", "query5")
get_list_country("united states", "query5")

#Why are {nationality} cities
get_list_country("brazil", "query6")
get_list_country("india", "query6")
get_list_country("united states", "query6")


#Uncomment to get the translation

#gs = goslate.Goslate(writing=goslate.WRITING_ROMAN)
#query = gs.translate('why is china ', 'zh-CN')
#u"Wèishéme zhōngguó".encode('utf-8')
#send_query('Why china is', 'cn', 'zh-CF')
        


('why', 'is', 'brazil')
q=why+is+brazil+&hl=en
('why', 'brazil', 'is')
q=why+brazil+is+&hl=en
('is', 'why', 'brazil')
q=is+why+brazil+&hl=en
('is', 'brazil', 'why')
q=is+brazil+why+&hl=en
('brazil', 'why', 'is')
q=brazil+why+is+&hl=en
('brazil', 'is', 'why')
q=brazil+is+why+&hl=en
('why', 'is', 'brazil')
q=why+is+brazil+&hl=en
('why', 'brazil', 'is')
q=why+brazil+is+&hl=en
('is', 'why', 'brazil')
q=is+why+brazil+&hl=en
('is', 'brazil', 'why')
q=is+brazil+why+&hl=en
('brazil', 'why', 'is')
q=brazil+why+is+&hl=en
('brazil', 'is', 'why')
q=brazil+is+why+&hl=en
('why', 'is', 'brazil')
q=why+is+brazil+&hl=en
('why', 'brazil', 'is')
q=why+brazil+is+&hl=en
('is', 'why', 'brazil')
q=is+why+brazil+&hl=en
('is', 'brazil', 'why')
q=is+brazil+why+&hl=en
('brazil', 'why', 'is')
q=brazil+why+is+&hl=en
('brazil', 'is', 'why')
q=brazil+is+why+&hl=en
('why', 'is', 'brazil')
q=why+is+brazil+&hl=en
('why', 'brazil', 'is')
q=why+brazil+is+&hl=en
('is', 'why', 'brazil')
q=is+why+brazil+&hl=en
('is', 'brazi

KeyboardInterrupt: 