In [3]:
import os
import pandas as pd
from collections import defaultdict, Counter
from nltk.util import ngrams

# Load resources

In [4]:
# Set(location): one lower-cased entry per line of the gazetteer file.
with open('../resource/gazetteer.txt') as f:
    gazetteer = {line.strip().lower() for line in f}

# Set(common words): entries are stripped but kept in their original case.
with open('../resource/common_word_10000.txt') as f:
    common_word = {line.strip() for line in f}

# Set(location) / Set(common words): drop every gazetteer entry that is also
# a common word.
gazetteer -= common_word

# 'corona' is excluded explicitly (presumably because it collides with the
# topic of the queries -- confirm).  discard() is used instead of remove() so
# the cell does not raise KeyError when the word is absent from the gazetteer
# or was already removed above as a common word.
gazetteer.discard('corona')

# Dict[Abbr] -> full name
# Each line is split on whitespace: the last token is the abbreviation (key)
# and the preceding tokens, re-joined with single spaces, are the full name.
abbr = {}
with open('../resource/abbr.txt') as f:
    for raw in f:
        tokens = raw.split()
        full_name = ' '.join(tokens[:-1])
        abbr[tokens[-1]] = full_name

# Dict[location] -> country code
def _outranks(candidate, incumbent):
    """Return True when ``candidate`` should replace ``incumbent``.

    Both arguments are raw text fields taken from the file.  When both parse
    as numbers they are compared numerically (a plain string comparison would
    rank '9' above '10000').  Otherwise the original string comparison is
    used, so an empty field never replaces a filled-in one.
    """
    try:
        return float(candidate) > float(incumbent)
    except ValueError:
        return candidate > incumbent


# Maps column 2 of each row to a (ranking field, column 8) tuple.  When the
# same name occurs on several rows, the row with the largest ranking field
# (4th column from the end) wins; on ties the first row seen is kept.
# NOTE(review): in the stock GeoNames dump the 4th column from the end is
# elevation and population is the 5th -- confirm which column is intended.
gazetteer_map = {}
# The file is iterated lazily instead of via readlines() so the whole dump is
# never held in memory.  GeoNames documents its dumps as UTF-8 encoded.
with open('../resource/allCountries.txt', encoding='utf-8') as f:
    for line in f:
        fields = line.strip().lower().split('\t')
        name = fields[2]
        entry = (fields[-4], fields[8])
        current = gazetteer_map.get(name)
        if current is None or _outranks(entry[0], current[0]):
            gazetteer_map[name] = entry

# Dict[country code] -> country
# Lines starting with '#' are skipped.  Every other line is lower-cased and
# split on tabs; column 0 becomes the key and column 4 the value.
country_map = {}
with open('../resource/countryInfo.txt') as f:
    for raw in f:
        if raw[0] == '#':
            continue
        cols = raw.lower().split('\t')
        country_map[cols[0]] = cols[4]

# Load Query Data

In [5]:
# Read every tab-separated 'QueriesByCountry*' file in ../data/ (in sorted
# file-name order) and stack them into a single frame.
raw_data = [
    pd.read_csv(os.path.join('../data/', fname), sep='\t')
    for fname in sorted(os.listdir('../data/'))
    if fname.startswith('QueriesByCountry')
]
# Concatenate once, after all files are loaded.  Doing it inside the loop
# rebuilt the full frame for every file and left `df` undefined when no file
# matched; now an empty file list fails here with pandas' own error.
df = pd.concat(raw_data, axis=0)
print (f'Number of records: {len(df)}')

Number of records: 1644205


# Data Pre-processing

In [6]:
# remove abbr
# Memo of query -> expanded query, so repeated queries are expanded only once.
abbr_query_map = {}

def check_abbr(x):
    """Expand known abbreviations in a query string.

    The query is split on whitespace; every token that is a key of ``abbr``
    is replaced by its full name, and the tokens are re-joined with single
    spaces.  Results are memoised in ``abbr_query_map``.

    Args:
        x: the query string.

    Returns:
        The query with abbreviations expanded.
    """
    cached = abbr_query_map.get(x)
    if cached is not None:
        return cached
    expanded = ' '.join(abbr[tok] if tok in abbr else tok for tok in x.split())
    abbr_query_map[x] = expanded
    return expanded

# Replace every query with its abbreviation-expanded form.
df['Query'] = df['Query'].apply(check_abbr)

In [7]:
# detect loc
# Memo of query -> detected location (None when the query has no location).
loc_query_map = {}

def _first_gazetteer_span(tokens):
    """Return the first token span found in ``gazetteer``, or None.

    Start positions are tried left to right; for each start the longest span
    is tried first.
    """
    n = len(tokens)
    for start in range(n):
        # The longest useful slice ends at n.  The previous upper bound of
        # n + 1 produced the same slice again and only repeated the lookup.
        for end in range(n, start, -1):
            piece = ' '.join(tokens[start:end])
            if piece in gazetteer:
                return piece
    return None

def detect_loc(x):
    """Find a gazetteer location mentioned in a query.

    The query is split on whitespace and every contiguous token span is
    looked up in ``gazetteer`` (leftmost start first, longest span first).
    Both hits and misses are memoised in ``loc_query_map``: misses require
    the full scan, so they are the costliest results to recompute.

    Args:
        x: the query string.

    Returns:
        The matching location string, or None when no span matches.
    """
    if x in loc_query_map:
        return loc_query_map[x]
    found = _first_gazetteer_span(x.split())
    loc_query_map[x] = found
    return found
# Location detected in each query (None when there is none).
df['Loc'] = df['Query'].apply(detect_loc)

In [8]:
# Delete queries that do not include a location
print (f'Number of records: {len(df)}')
# .copy() gives the filtered frame its own data, so adding the 'LocCountry'
# column afterwards does not trigger pandas' SettingWithCopyWarning.
df = df[df['Loc'].notnull()].copy()
print (f'Number of records contain location: {len(df)}')

Number of records: 1644205
Number of records contain location: 823379


In [9]:
# add country of loc
# Memo of location -> country name (None when the country is unknown).
loc_country_map = {}

def loc_to_country(x):
    """Map a location name to a country name.

    A location that is itself one of the values of ``country_map`` is
    returned unchanged.  Otherwise the location is looked up in
    ``gazetteer_map`` and the second element of its entry is translated
    through ``country_map``.  Results are memoised in ``loc_country_map``.

    Args:
        x: the location string.

    Returns:
        The country name, or None when either lookup has no entry.
    """
    if x in loc_country_map:
        return loc_country_map[x]
    if x in country_map.values():
        country = x
    else:
        try:
            country = country_map[gazetteer_map[x][1]]
        except KeyError:
            # Only a missing key means "unknown location".  The previous
            # bare `except:` also hid KeyboardInterrupt and programming
            # errors.
            country = None
    loc_country_map[x] = country
    return country
# Country of the location detected in each query (None when unknown).
df['LocCountry'] = df['Loc'].apply(loc_to_country)

# Save the queries that contain a location.
df.to_csv('../results/QueryContainLoc.tsv', sep='\t', index=False)

In [11]:
# For queries from each country, split them into batches and count frequency of the location names for each batch
from collections import defaultdict

def get_word_freq(df, ngram=1, top_k=20):
    """Rank the n-grams of the queries by accumulated popularity.

    Every query is split on whitespace, and each of its n-grams has the
    row's 'PopularityScore' added to its total.

    Args:
        df: frame with 'Query' and 'PopularityScore' columns.
        ngram: n-gram length (1 = unigram, 2 = bigram, ...).
        top_k: number of top entries to return.  Defaults to 20, the value
            that was previously hard-coded.

    Returns:
        A list of ``(ngram_tuple, total_score)`` pairs sorted by descending
        score, at most ``top_k`` long.  Ties keep first-seen order.
    """
    word_freq = defaultdict(int)
    # Iterating the two columns directly avoids building a Series per row,
    # which is what iterrows() does.
    for query, score in zip(df['Query'].tolist(), df['PopularityScore'].tolist()):
        for gram in ngrams(query.split(), ngram):
            word_freq[gram] += score
    ranked = sorted(word_freq.items(), key=lambda item: item[1], reverse=True)
    return ranked[:top_k]

def get_trend(country_df, country):
    """Accumulate popularity per 'LocCountry' over consecutive date batches.

    The distinct values of 'Date' are sorted and cut into at most 10
    consecutive batches of ``len(dates) // 10`` dates each (at least one
    date per batch).  For every batch, the 'PopularityScore' of its rows is
    summed per 'LocCountry' value.

    NOTE(review): when the number of distinct dates is larger than 10 and not
    a multiple of 10, the dates after the 10th batch are not counted (for
    example, 25 dates give batches of 2 and dates 21-25 are ignored).  This
    matches the previous behaviour; confirm that it is intended.

    Args:
        country_df: frame with 'Date', 'LocCountry' and 'PopularityScore'
            columns.
        country: unused; kept so existing callers keep working.

    Returns:
        A list with one ``defaultdict(int)`` per batch, mapping each
        'LocCountry' value to its summed score.  The list is empty when the
        frame has no dates.
    """
    batch = 10
    dates = sorted(set(country_df['Date']))
    # With fewer than `batch` distinct dates the integer division yields 0,
    # and range() rejects a zero step; fall back to one date per batch.
    batch_size = max(1, len(dates) // batch)
    batch_dates = [dates[i:i + batch_size] for i in range(0, len(dates), batch_size)][:batch]

    count = []
    for cur_batch_dates in batch_dates:
        cur_df = country_df[country_df['Date'].isin(set(cur_batch_dates))]
        batch_count = defaultdict(int)
        # Column-wise iteration; iterrows() would build a Series per row.
        for loc_country, score in zip(cur_df['LocCountry'].tolist(),
                                      cur_df['PopularityScore'].tolist()):
            batch_count[loc_country] += score
        count.append(batch_count)
    return count
            

import json

# Countries with fewer location-bearing records than this are skipped.
MIN_RECORDS = 1000

# Dict[country] -> {'trend': ..., 'unigram': ..., 'bigram': ...}
trend_by_country = {}
# groupby visits every country exactly once and in sorted order.  Iterating
# a set of names gave a different order on each kernel start and re-scanned
# the whole frame for every country.
for country, country_df in df.groupby('Country'):
    # filter the country with few records
    if len(country_df) < MIN_RECORDS:
        continue

    print (f'Country: {country:20} Number of records: {len(country_df)}')
    trend_by_country[country] = {
        'trend': get_trend(country_df, country),
        'unigram': get_word_freq(country_df, 1),
        'bigram': get_word_freq(country_df, 2),
    }

# Save to trend
with open('../results/trend.json', 'w') as f:
    json.dump(trend_by_country, f, indent=4)

Country: China                Number of records: 4482
Country: Netherlands          Number of records: 3349
Country: Brazil               Number of records: 8326
Country: Sweden               Number of records: 1475
Country: Canada               Number of records: 38027
Country: Spain                Number of records: 20739
Country: United States        Number of records: 437962
Country: Austria              Number of records: 1753
Country: Italy                Number of records: 47089
Country: Switzerland          Number of records: 4375
Country: Australia            Number of records: 15716
Country: Germany              Number of records: 46664
Country: Mexico               Number of records: 6483
Country: Argentina            Number of records: 2234
Country: India                Number of records: 9163
Country: South Africa         Number of records: 3416
Country: Malaysia             Number of records: 1128
Country: United Kingdom       Number of records: 79921
Country: Belgium    

In [None]:
# # Word Count


# def output_freq(unigram_freq,bigram_freq,fname):
#     with open('unigram_'+fname, 'w') as fout:
#         for w,f in unigram_freq:
#             fout.write(f"{' '.join(w):30}\t{f}\n")
#     with open('bigram_'+fname, 'w') as fout:
#         for w,f in bigram_freq:
#             fout.write(f"{' '.join(w):30}\t{f}\n")
                       
# unigram_freq = get_word_freq(data,1)
# bigram_freq = get_word_freq(data,2)
# output_freq(unigram_freq,bigram_freq,fname.lower())