In [1]:
query = ':)'
language = 'en'
metric = ['rank','counts','freq']

In [2]:
import pandas as pd
import numpy as np
import flask
import datetime as dt
import pymongo
import json
import datetime
from dotenv import load_dotenv
load_dotenv()
import os
password = os.getenv("PASSWORD")
username = os.getenv("USERNAME")

In [5]:
def get_data(query):
    errs=[]
    try:
        # Connect to mongo using the credentials from .env file
        client = pymongo.MongoClient('mongodb://%s:%s@127.0.0.1' % (username, password))
        # Select the location based on the wordcount (1grams, 2grams, 3grams, etc.), by counting spaces
        ngram=str(query.count(' ')+1)+'grams'
        db = client[ngram]
        print("connected to mongo client "+ngram)
    except:
        return str("Couldn't connect to the "+language+" "+ngram+" database")
    output=dict()
    output['word']=query
    def add_err(e,errors):
        if len(errors) > 0:
            errs = [x + e for x in errors]
            print("adding ",x," to errs list")
            print(errs)
        else:
            errs=[e]
    try:
        # Build df
        df = pd.DataFrame(list(db[language].find({"word": query})))
        df = df.dropna(how='all')
        try:
            df = df.sort_values(by=['time'])
            df['year'] = [date.year for date in df['time']]
            df['day'] = [date.timetuple().tm_yday for date in df['time']]
            # Pull out beginning part of date (YY-MM-DD)
            df['time'] = [str(t)[:10] for t in df['time']]
            # Convert date to date object
            df['time'] = [dt.datetime.strptime(t, '%Y-%m-%d').date() for t in df['time']]
            # Remove dates before 2010
            df=df[df['time']>=(dt.date(2009,8,1))]
        except:
            output['dates']=[]
            add_err('error gathering dates',errs)
            pass
        # Calculate min, max, and median rank
        try:
            output['maxrank']=int(min(df['rank'].values))
        except:
            output['maxrank']=int(0)
            add_err('error computing maxrank',errs)
            pass
        try:
            output['minrank']=int(max(df['rank'].values))
        except:
            output['minrank']=int(0)
            add_err('error computing minrank',errs)
            pass
        try:
            output['medianrank']=int(np.round(np.median(df['rank'].values)))
        except:
            output['medianrank']=int(0)
            add_err('error computing medianrank',errs)
            pass
        # Index df by date
        df.set_index('time',inplace=True)
        #print('Indexed df by date')
        # Drop the id field (used for indexing in the database)
        df.drop(columns=["_id"]);
        #print('Dropped the id field (used for indexing in the database)')
        # Sort by date
        df.sort_values(by='time',ascending=True,inplace=True)
        #print('Sorted by date')
        # Convert time back to a string
        df.index=[t.strftime("%Y-%m-%d") for t in df.index]
        #print('Converted time back to a string')
        # Send dates and metrics as arrays to the output dict
        try:
            output['dates']=df.index.values.tolist()
        except:
            output['dates']=[]
            add_err('error formatting dates',errs)
            pass
        #print('Sent dates and metrics as arrays to the output dict')
        # Fill the requested metric values
        for item in ['rank','counts','freq']:
            print('Testing to see if ',item,' is in the list of requested metrics...')
            if item in metric:
                print('Found ',item,' in list of requested metrics')
                try:
                    output[item]=[int(r) for r in df[item].values] # Convert from int64 to Python integers
                except:
                    output[item]=[float(f) for f in df[item].values] # Convert from float64 Python float
                    add_err(str('Warning - output['+item+'] is a float: '+output[item][0:5]+'...'),errs)
    except:
        for item in ['rank','counts','freq']:
            if item in metric:
                output[item]=[]
        add_err(str("Couldn't find data for "+query+" in the "+language+" "+ngram+" database"),errs)
    if len(errs) > 0:
        output['error_count']=len(errs)
        output['errors']=errs
    # Send the object to a JSON file
    url = '../ui/data/'+query+'.json'
    with open(url, 'w') as outfile:
        print('opened ',url)
        json.dump(output, outfile)
        outfile.close()
        print('closed ',url)
    return output

In [6]:
get_data(':)')

connected to mongo client 1grams
opened  ../ui/data/:).json
closed  ../ui/data/:).json


{'word': ':)',
 'dates': [],
 'maxrank': 0,
 'minrank': 0,
 'medianrank': 0,
 'rank': [],
 'counts': [],
 'freq': []}

In [10]:
client = pymongo.MongoClient('mongodb://%s:%s@127.0.0.1' % (username, password))
db = client["1grams"]
df = pd.DataFrame(list(db['en'].find({"word": ":)"})))

In [11]:
df