In [1]:
# Default imports.
import numpy as np 
from numpy import random as rnd 
from matplotlib import pyplot as plt 
import os,sys,datetime,time,warnings,itertools,math 

# Always need pandas.
import pandas as pd 

# LangChain OpenAI imports.
from langchain.llms import OpenAI
from langchain import ConversationChain
from langchain import PromptTemplate

# SPARQL Wrapper
from SPARQLWrapper import SPARQLWrapper,JSON

In [2]:
# Pick the key file for the current platform. Each non-blank line of that file is
# expected to look like "<NAME>: <value>".
if sys.platform=='win32':
    api_key_file_target = r'C:\\Users\\z003mxpm\\Desktop\\Library_SHS\\keys\\langchain.txt'
else:
    api_key_file_target = r'/Users/janlucasdeinhard/Local Keystore/langchain.txt'
# Parse the key file into a {name: value} dictionary.
api_keys = {}
with open(api_key_file_target,'r') as f:
    for text in f:
        # Blank lines (e.g. a trailing newline at the end of the file) carry no entry.
        if not text.strip():
            continue
        # Split on the first colon only, so a value may itself contain ':'.
        name, sep, value = text.partition(':')
        if not sep:
            raise ValueError('Target file corrupted!')
        api_keys[name.strip()] = value.strip()
# Publish the selected key through the OPENAI_API_KEY environment variable.
os.environ['OPENAI_API_KEY'] = api_keys['CITY_EVOLUTION_KEY']

In [116]:
# Create the model client for 'gpt-4' and wrap it in a non-verbose conversation
# chain; the cells below talk to the model through `conv`.
# ('gpt-4-1106-preview' was the alternative model name tried here.)
llm = OpenAI(model_name='gpt-4')
conv = ConversationChain(verbose=False, llm=llm)



In [126]:
# Prime the conversation with the answer format wanted from the model.
# Any further instructions belong inside this text.
format_instruction = '''
    This is a fine-tuning instruction. You will now act as a question-answer machine and restrict your answers to one-word responses whereever possible. Specifically I want you
    to return the closest integer year if I ask you a date. If the city was founded BCE, you give me the negative integer. Please be extra careful about this! 
    Along with the founding date in this format, I want you
    output the confidence you have in this date, on a scale from 0 to 1, all right? Please use forward slash as a separator.
'''
print(conv.predict(input=format_instruction))

Understood.


In [127]:
# Single-city question; the {City} slot is filled in via prompt.format(City=...).
single_city_question = '''
        When was the city of {City} first mentioned in historical records?
    '''
prompt = PromptTemplate(template=single_city_question, input_variables=['City'])

In [128]:
# Ten-city question with slots City1 .. City10.
# Note: this rebinds `prompt`, so whatever template was bound to that name before
# is no longer reachable through it.
city_slots = ['City{0}'.format(i) for i in range(1, 11)]
prompt = PromptTemplate(
    template='''
        When were the following cities first mentioned in historical records? {City1},{City2},{City3},{City4},{City5},{City6},{City7},{City8},{City9},{City10}
    ''',
    input_variables=city_slots
)

In [129]:
# Send one request covering ten sample cities; the keyword names City1 .. City10
# are generated to line up with the slots of the ten-city template.
sample_cities = ['Tokyo','Jakarta','Delhi','Mumbai','Manila','Seoul','Cairo','Dhaka','Bangkok','Moscow']
slot_values = {'City{0}'.format(pos): name for pos, name in enumerate(sample_cities, start=1)}
resp = conv.predict(input=prompt.format(**slot_values))

print(resp)

Tokyo/1603/0.9, Jakarta/397/0.75, Delhi/-2800/0.6, Mumbai/-1500/0.8, Manila/1571/0.95, Seoul/-18/0.8, Cairo/969/0.9, Dhaka/1608/0.8, Bangkok/1378/0.7, Moscow/1147/0.85


In [37]:
# The data directory is only configured for Windows; on any other platform stop
# here until a path has been added.
if sys.platform!='win32':
    raise Exception('Please configure a file path for this platform.')
file_path = r'..\\..\\..\\TF_Data\\Dropbox\\TF_data\\world_cities\\'
# Read the city table from the CSV file inside that directory.
df = pd.read_csv('{0}worldcities.csv'.format(file_path))

In [40]:
# SPARQL endpoint of the Wikidata query service; queries are sent through `sparql`.
WIKIDATA_SPARQL_URL = "https://query.wikidata.org/sparql"
sparql = SPARQLWrapper(WIKIDATA_SPARQL_URL)

In [60]:
# Query text: distinct city / country / continent (each with its English label) for
# items whose P31/P279* chain reaches wd:Q515, that have a population (P1082) above
# 10000 and whose country (P17) lies on continent (P30) Q46 or Q48.
city_query = """
SELECT DISTINCT ?city ?cityLabel ?country ?countryLabel ?continent ?continentLabel
WHERE {
  ?city wdt:P31/wdt:P279* wd:Q515.
  ?city wdt:P17 ?country.            
  ?country wdt:P30 ?continent.       
  ?city wdt:P1082 ?population.

  FILTER (?population > 10000)  
  FILTER (?continent IN (wd:Q46, wd:Q48))  # Filter for Europe (Q46) or Asia (Q48)

  SERVICE wikibase:label {
    bd:serviceParam wikibase:language "en".
  }
}
ORDER BY ?continentLabel ?countryLabel ?cityLabel
"""
# Ask for JSON, run the query and keep the decoded response.
sparql.setReturnFormat(JSON)
sparql.setQuery(city_query)
sparql_results = sparql.query().convert()
# Flatten the records under results -> bindings into a table, one row per binding.
bindings_parent = sparql_results['results']
wds_cities = pd.json_normalize(bindings_parent,record_path=['bindings'])

In [102]:
# Shorten the three label columns: '<name>Label.value' becomes '<name>'.
label_columns = {
    '{0}Label.value'.format(field): field
    for field in ('city','continent','country')
}
wds_cities = wds_cities.rename(columns=label_columns)

In [105]:
# Keep only the rows whose city and country appear in both tables.
rf = pd.merge(df, wds_cities, how='inner', on=['city','country'])

# Remove the columns listed below, then discard rows that are identical in every
# remaining column.
# NOTE(review): the chunk listing printed further down shows names such as 'Moscow'
# and 'Istanbul' twice, so some rows presumably still differ in a kept column
# (perhaps the continent) -- verify whether those repeats are intended.
unused_columns = [
    'country.value',
    'countryLabel.xml:lang',
    'countryLabel.type',
    'continent.type',
    'continent.value',
    'continentLabel.xml:lang',
    'continentLabel.type',
    'city.value',
    'cityLabel.xml:lang',
    'cityLabel.type',
    'id',
    'city.type',
    'country.type',
]
rf = rf.drop(columns=unused_columns).drop_duplicates()

In [113]:
# Ask the model about every city in rf, one request per city, and collect the raw
# replies in L (same order as the rows of rf).
#
# The question template is built here instead of reading the global `prompt`: the
# last cell that assigns `prompt` binds it to the ten-city template, whose format()
# call needs City1 .. City10 and therefore fails when only City=... is supplied.
single_city_prompt = PromptTemplate(
    input_variables=['City'],
    template='''
        When was the city of {City} first mentioned in historical records?
    '''
)

L = []
MAXCOUNTER = rf.shape[0]

# NOTE(review): the recorded run of this cell stopped with a RateLimitError shortly
# after the 100-city mark; replies appended before such an error remain in L.
for ctr,city in enumerate(rf['city']):
    resp = conv.predict(input=single_city_prompt.format(City=city))
    L.append(resp)
    # Progress line every 100 cities.
    if ctr%100==0: print('{0} / {1} completed'.format(ctr,MAXCOUNTER))

0 / 4845 completed
100 / 4845 completed


Retrying langchain.llms.openai.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for gpt-4-1106-preview in organization org-ETO7hv3Zc6cvPXSTScQLVTnL on tokens per day (TPD): Limit 500000, Used 498556, Requested 5172. Please try again in 10m44.198s. Visit https://platform.openai.com/account/rate-limits to learn more..
Retrying langchain.llms.openai.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for gpt-4-1106-preview in organization org-ETO7hv3Zc6cvPXSTScQLVTnL on tokens per day (TPD): Limit 500000, Used 498531, Requested 5172. Please try again in 10m39.878s. Visit https://platform.openai.com/account/rate-limits to learn more..
Retrying langchain.llms.openai.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for gpt-4-1106-preview in organization org-ETO7hv3Zc6cvPXSTScQLVTnL on tokens pe

RateLimitError: Rate limit reached for gpt-4-1106-preview in organization org-ETO7hv3Zc6cvPXSTScQLVTnL on tokens per day (TPD): Limit 500000, Used 498375, Requested 5172. Please try again in 10m12.921s. Visit https://platform.openai.com/account/rate-limits to learn more.

In [145]:
# Print the city names of rf in consecutive batches of CHUNK_SIZE rows.
CHUNK_SIZE = 10
for chunk_idx in range(0,rf.shape[0],CHUNK_SIZE):
    # A positional slice is clipped at the end of the frame, so the final, shorter
    # batch needs no special handling (and no exception handler).
    print(rf.iloc[chunk_idx:chunk_idx+CHUNK_SIZE]['city'].values.tolist())

['Tokyo', 'Jakarta', 'Delhi', 'Mumbai', 'Manila', 'Seoul', 'Cairo', 'Dhaka', 'Bangkok', 'Moscow']
['Moscow', 'Istanbul', 'Istanbul', 'Karachi', 'Bangalore', 'Ho Chi Minh City', 'Tehran', 'Chennai', 'Lahore', 'London']
['Paris', 'Nagoya', 'Taipei', 'Kuala Lumpur', 'Hanoi', 'Pune', 'Ahmedabad', 'Riyadh', 'Madrid', 'Baghdad']
['Singapore', 'Prayagraj', 'Giza', 'Ankara', 'Ankara', 'Saint Petersburg', 'Saint Petersburg', 'Alexandria', 'Barcelona', 'Berlin']
['İzmir', 'İzmir', 'Kabul', 'Amman', 'Jeddah', 'Yokohama', 'Busan', 'Lucknow', 'Mashhad', 'Dubai']
['Faisalabad', 'Jaipur', 'Jaipur', 'Athens', 'Taichung', 'Kuwait City', 'Budapest', 'Quezon City', 'Kyiv', 'Sanaa']
['Incheon', 'Birmingham', 'Bursa', 'Bursa', 'Rome', 'Pyongyang', 'Stuttgart', 'Kaohsiung', 'Lisbon', 'Manchester']
['Munich', 'Tashkent', 'Hamburg', 'Daegu', 'Antalya', 'Antalya', 'Colombo', 'Baku', 'Baku', 'Fukuoka']
['Konya', 'Konya', 'Phnom Penh', 'Haiphong', 'Rawalpindi', 'Vadodara', 'Gujranwala', 'Gaziantep', 'Gaziantep',

4845