In [1]:
from faker import Faker
import random
from generatorUtils import LabelTransformations

In [2]:
# Base label names. CityGenerator.generateCities expands them through
# LabelTransformations and prefixes one variant to every generated input.
labels = [
    'city',
    'place',
    'location',
    'destination',
    'source',
]

In [3]:
class CityGenerator:
    """Generate (labelled input city, output city) string pairs with Faker.

    For every locale in ``self.locales`` the generator draws ``num_of_samples``
    pairs of Faker city names. Both names of a pair go through the same randomly
    chosen transformation, and the first name is prefixed with a randomly chosen
    label variant, giving ``"<label>: <city>"``.

    Args:
        num_of_samples: Number of pairs to generate per locale.
        locales: Optional iterable of Faker locale codes. Defaults to
            ``DEFAULT_LOCALES``.
        seed: Optional seed. When given, label/transformation choices come from a
            private ``random.Random(seed)`` and each Faker instance is seeded from
            that same stream, so a freshly built generator reproduces its output.
            When ``None`` the global ``random`` module is used and Faker is left
            unseeded.
    """

    DEFAULT_LOCALES = ('en_AU', 'en_CA', 'en_GB', 'en_IE', 'en_IN', 'en_NZ', 'en_PH', 'en_TH', 'en_US')

    def __init__(self, num_of_samples, locales=None, seed=None):
        self.num_of_samples = num_of_samples
        self.labelTransformations = LabelTransformations()
        self.locales = list(self.DEFAULT_LOCALES) if locales is None else list(locales)
        self.seed = seed
        # The ``random`` module exposes the same ``choice`` API as a Random
        # instance, so it serves as the unseeded fallback.
        self._rng = random if seed is None else random.Random(seed)

    def data_transform(self, input_city, output_city):
        """Apply one randomly chosen transformation to both city names.

        The choice is uniform over four options: leave the names unchanged, or
        run them through one of three ``LabelTransformations`` helpers
        (presumably casing variants; their exact semantics live in
        ``generatorUtils`` -- TODO confirm).

        Returns:
            tuple: ``(input_city, output_city)`` after the chosen transformation.
        """
        helpers = self.labelTransformations
        options = (
            None,  # keep both names exactly as passed in
            helpers.__all_words_capital__,
            helpers.__all_words_start_capital__,
            helpers.__all_words_small__,
        )
        chosen = self._rng.choice(options)
        if chosen is None:
            return input_city, output_city
        # The helpers are called with a one-element list and indexed at 0,
        # so wrap each name and unwrap the result.
        return chosen([input_city])[0], chosen([output_city])[0]

    def generateCities(self, labels):
        """Build the input and output city lists for all configured locales.

        Args:
            labels: Base label names; expanded by calling
                ``self.labelTransformations(labels)`` and de-duplicated.

        Returns:
            tuple: ``(city_inputs, city_outputs)``, two lists of equal length
            (``num_of_samples`` entries per locale). Inputs look like
            ``"<label>: <city>"``; outputs are bare city strings.

        Raises:
            IndexError: from ``choice`` if the expanded label pool is empty while
                at least one sample is requested.
        """
        # Sort after de-duplicating: set iteration order for strings can differ
        # between interpreter runs, which would defeat any seeding.
        label_pool = sorted(set(self.labelTransformations(labels)))

        city_inputs = []
        city_outputs = []
        for locale in self.locales:
            fake = Faker(locale=locale)
            if self.seed is not None:
                # Derive the Faker seed from the generator's own stream so one
                # seed controls labels, transformations and city names.
                fake.seed_instance(self._rng.getrandbits(32))

            for _ in range(self.num_of_samples):
                label = self._rng.choice(label_pool)
                input_city, output_city = self.data_transform(fake.city(), fake.city())

                city_inputs.append(label + ': ' + input_city)
                city_outputs.append(output_city)

        return city_inputs, city_outputs

In [4]:
# generateCities draws this many pairs for each locale the generator holds.
SAMPLES_PER_LOCALE = 300
cityGenerator = CityGenerator(SAMPLES_PER_LOCALE)

In [5]:
input, output = cityGenerator.generateCities(labels)

In [6]:
list(zip(input, output))[:20]

[('Location: lake jenniferchester', 'kimberlyfurt'),
 ('PLACE: East Veronica', 'Acevedoview'),
 ('source: St. Isaac', 'North Melissa'),
 ('place: wardmouth', 'west jerrystad'),
 ('destination: north charlesview', 'zacharyborough'),
 ('Location: west rachel', 'st. stephanie'),
 ('CITY: melissaton', 'robertside'),
 ('LOCATION: South Steven', 'Port Emily'),
 ('destination: Port Timothyside', 'East Rachel'),
 ('City: Hesterberg', 'Davidmouth'),
 ('Place: Rossfurt', 'South John'),
 ('destination: aaronview', 'st. andrew'),
 ('Location: Lake Julie', 'New Jacobfort'),
 ('CITY: Wilsonchester', 'Ambermouth'),
 ('SOURCE: st. tyler', 'shirleyfort'),
 ('Location: St. Daniel', 'South Amy'),
 ('LOCATION: St. Allenmouth', 'Andersonbury'),
 ('city: Anthonyland', 'North Amychester'),
 ('LOCATION: Rachelshire', 'Briantown'),
 ('SOURCE: lake antonio', 'east andrewport')]

In [7]:
import pandas as pd
res = pd.DataFrame(columns=['input_entity', 'output_entity'])

In [8]:
res['input_entity'] = input
res['output_entity'] = output

In [9]:
# Write the generated pairs to disk; index=False leaves the row index out of the CSV.
res.to_csv('cities_data.csv', index=False)

In [10]:
# Number of rows written: one per generated pair (300 samples x 9 locales in this run).
res.shape[0]

2700