In [1]:
import pandas as pd
from math import sin, cos, asin, radians, sqrt
import numpy as np

## 1. Dataset

In [2]:
# Load the city coordinates, keeping only the code, name and lat/lon columns,
# and switch the source column headers to the names used in the rest of the notebook.
cities_df = pd.read_csv(
    "../raw_data/konum_il.csv",
    usecols=["plaka", "il_adi", "lat", "lon"],
).rename(columns={"plaka": "city_code", "il_adi": "name"})

# Same treatment for the district coordinates; `il_plaka` becomes the shared `city_code` key.
districts_df = pd.read_csv(
    "../raw_data/konum_ilce.csv",
    usecols=["il_plaka", "ilce_adi", "lat", "lon"],
).rename(columns={"il_plaka": "city_code", "ilce_adi": "name"})

cities_df

Unnamed: 0,city_code,name,lat,lon
0,1,ADANA,36.991400,35.330800
1,2,ADIYAMAN,37.764167,38.276167
2,3,AFYONKARAHİSAR,38.763760,30.540340
3,4,AĞRI,39.721667,43.056667
4,5,AMASYA,40.650000,35.833333
...,...,...,...,...
76,77,YALOVA,40.650000,29.266667
77,78,KARABÜK,41.200000,32.633333
78,79,KİLİS,36.718399,37.121220
79,80,OSMANİYE,37.068050,36.261589


In [3]:
# Display the districts table loaded above (columns: city_code, name, lat, lon).
districts_df

Unnamed: 0,city_code,name,lat,lon
0,1,ALADAĞ(KARSANTI),37.546379,35.402962
1,1,CEYHAN,37.031700,35.822750
2,1,ÇUKUROVA,37.040000,35.321333
3,1,FEKE,37.819918,35.272100
4,1,İMAMOĞLU,37.258751,35.672840
...,...,...,...,...
948,81,DÜZCE,40.843849,31.156540
949,81,GÖLYAKA,40.776579,30.995727
950,81,GÜMÜŞOVA,40.850000,30.933333
951,81,KAYNAŞLI,40.777337,31.303267


## 2. Preprocessing

### 2.1 Change names so that only the first letter is capitalized.

In [4]:
# Turkish-specific lowercase rules that plain str.lower() does not apply:
# dotless capital "I" -> dotless "ı", dotted capital "İ" -> plain "i".
_TURKISH_LOWER_MAP = {
    ord(u'I'): u'ı',
    ord(u'İ'): u'i',
}


def my_capitalize(text):
    """Return `text` with its first character kept and the rest lower-cased.

    The tail is lower-cased with Turkish rules for "I" and "İ" before the
    generic ``str.lower()`` is applied.

    Args:
        text: An upper-case name such as "ADANA" or "AĞRI".

    Returns:
        The re-cased name, e.g. "Adana" or "Ağrı". An empty string is
        returned unchanged instead of raising IndexError on ``text[0]``.
    """
    # Guard: nothing to capitalize, and text[0] would raise on an empty string.
    if not text:
        return text

    # Special case: the name starts with a number, so the generic rule would
    # leave the word after it fully lower-cased ("19 mayıs").
    if text == "19 MAYIS":
        return "19 Mayıs"

    return text[0] + text[1:].translate(_TURKISH_LOWER_MAP).lower()

# Capitalize city and district names
cities_df['name'] = cities_df['name'].apply(my_capitalize)

# Strip a parenthesised suffix such as "ALADAĞ(KARSANTI)" -> "ALADAĞ" before capitalizing.
# regex=True must be explicit: since pandas 2.0 `str.replace` treats the pattern as a
# literal string by default, which would silently leave the parentheses in place.
districts_df['name'] = (
    districts_df['name']
    .str.replace(r"\s*\([^()]*\)", "", regex=True)
    .apply(my_capitalize)
)

  districts_df['name'] = districts_df['name'].str.replace(r"\s*\([^()]*\)", "").apply(lambda x: my_capitalize(x))


### 2.2 Rename districts to `{city_name}_{district_name}`

In [5]:
# Remove city centers from districts_df: any district whose name equals a city name
# is dropped (presumably the central district, already represented by the city row).
districts_df = districts_df.loc[~districts_df['name'].isin(cities_df['name'])].reset_index(drop=True)

# Prefix every district with its city name: {city_name}_{district_name}.
# A single code -> name lookup plus a vectorized map replaces scanning cities_df once per row.
# drop_duplicates keeps the first name per code, matching the previous `.values[0]` choice.
city_name_by_code = cities_df.drop_duplicates("city_code").set_index("city_code")["name"]
city_names = districts_df["city_code"].map(city_name_by_code)

# Fail loudly if a district points at a city code that cities_df does not contain.
unknown_codes = districts_df.loc[city_names.isna(), "city_code"].unique().tolist()
if unknown_codes:
    raise ValueError(f"districts reference unknown city codes: {unknown_codes}")

districts_df["name"] = city_names + "_" + districts_df["name"]

districts_df


Unnamed: 0,city_code,name,lat,lon
0,1,Adana_Aladağ,37.546379,35.402962
1,1,Adana_Ceyhan,37.031700,35.822750
2,1,Adana_Çukurova,37.040000,35.321333
3,1,Adana_Feke,37.819918,35.272100
4,1,Adana_İmamoğlu,37.258751,35.672840
...,...,...,...,...
885,81,Düzce_Çilimli,40.896897,31.047014
886,81,Düzce_Gölyaka,40.776579,30.995727
887,81,Düzce_Gümüşova,40.850000,30.933333
888,81,Düzce_Kaynaşlı,40.777337,31.303267


### 2.3 Add population

#### 2.3.1 For districts

In [6]:
def _join_outer_and_bracketed(label):
    """Rewrite 'A(B)...' as 'A_B'; labels without '(' are returned unchanged."""
    if '(' not in label:
        return label
    outer, _, remainder = label.partition('(')
    return outer + '_' + remainder.partition(')')[0]


district_populations = (
    pd.read_csv("../raw_data/nüfus_ilçe.csv", usecols=["İlçe", "Nüfus"])
    .rename(columns={'İlçe': 'name', 'Nüfus': 'population'})
)

# Discard every row whose name contains "Merkez".
is_center_row = district_populations['name'].str.contains("Merkez")
district_populations = district_populations[~is_center_row].reset_index(drop=True)

# Bring the population labels into the same "{outer}_{bracketed}" form used by districts_df['name'].
district_populations['name'] = district_populations['name'].map(_join_outer_and_bracketed)

# Left merge on the shared column(s): districts without a matching label keep a missing population.
districts_df = districts_df.merge(district_populations[['name', 'population']], how='left')
districts_df

Unnamed: 0,city_code,name,lat,lon,population
0,1,Adana_Aladağ,37.546379,35.402962,15897
1,1,Adana_Ceyhan,37.031700,35.822750,158922
2,1,Adana_Çukurova,37.040000,35.321333,389175
3,1,Adana_Feke,37.819918,35.272100,15833
4,1,Adana_İmamoğlu,37.258751,35.672840,27037
...,...,...,...,...,...
885,81,Düzce_Çilimli,40.896897,31.047014,19648
886,81,Düzce_Gölyaka,40.776579,30.995727,20552
887,81,Düzce_Gümüşova,40.850000,30.933333,16844
888,81,Düzce_Kaynaşlı,40.777337,31.303267,20449


#### 2.3.2 For cities

In [7]:
city_populations = (
    pd.read_csv("../raw_data/nüfus_il.csv", usecols=["İl", "Nüfus"], dtype={'Nüfus': 'int'})
    .rename(columns={'İl': 'name', 'Nüfus': 'population'})
)

# Keep only the part of each label that precedes its first hyphen.
city_populations['name'] = [label.partition('-')[0] for label in city_populations['name']]

# Left merge on the shared column(s): cities without a matching label keep a missing population.
cities_df = cities_df.merge(city_populations[['name', 'population']], how='left')
cities_df


Unnamed: 0,city_code,name,lat,lon,population
0,1,Adana,36.991400,35.330800,2274106
1,2,Adıyaman,37.764167,38.276167,635169
2,3,Afyonkarahisar,38.763760,30.540340,747555
3,4,Ağrı,39.721667,43.056667,510626
4,5,Amasya,40.650000,35.833333,338267
...,...,...,...,...,...
76,77,Yalova,40.650000,29.266667,296333
77,78,Karabük,41.200000,32.633333,252058
78,79,Kilis,36.718399,37.121220,147919
79,80,Osmaniye,37.068050,36.261589,559405


### 2.4 Merge cities and districts

In [8]:
# Stack the city rows on top of the district rows and renumber the index from 0.
dataset_df = pd.concat([cities_df, districts_df], ignore_index=True)
dataset_df

Unnamed: 0,city_code,name,lat,lon,population
0,1,Adana,36.991400,35.330800,2274106
1,2,Adıyaman,37.764167,38.276167,635169
2,3,Afyonkarahisar,38.763760,30.540340,747555
3,4,Ağrı,39.721667,43.056667,510626
4,5,Amasya,40.650000,35.833333,338267
...,...,...,...,...,...
966,81,Düzce_Çilimli,40.896897,31.047014,19648
967,81,Düzce_Gölyaka,40.776579,30.995727,20552
968,81,Düzce_Gümüşova,40.850000,30.933333,16844
969,81,Düzce_Kaynaşlı,40.777337,31.303267,20449


### 2.5 Add Region

In [9]:
# Region key -> list of cities that belong to it.
region_dict = {
    "TR10_İstanbul_alt_bölgesi" : ["İstanbul"],
    "TR21_Tekirdağ_alt_bölgesi" : ["Tekirdağ", "Edirne", "Kırklareli"],
    "TR22_Balıkesir_alt_bölgesi" : ["Balıkesir", "Çanakkale"],
    "TR31_İzmir_alt_bölgesi" : ["İzmir"],
    "TR32_Aydın_alt_bölgesi" : ["Aydın", "Denizli", "Muğla"],
    "TR33_Manisa_alt_bölgesi" : ["Manisa", "Afyonkarahisar", "Kütahya", "Uşak"],
    "TR41_Bursa_alt_bölgesi" : ["Bursa", "Eskişehir", "Bilecik"],
    "TR42_Kocaeli_alt_bölgesi" : ["Kocaeli", "Sakarya","Düzce", "Bolu","Yalova"],
    "TR51_Ankara_alt_bölgesi" : ["Ankara"],
    "TR52_Konya_alt_bölgesi" : ["Konya", "Karaman"],
    "TR61_Antalya_alt_bölgesi" : ["Antalya","Isparta","Burdur"],
    "TR62_Adana_alt_bölgesi" : ["Adana", "Mersin"],
    "TR63_Hatay_alt_bölgesi" : ["Hatay","Kahramanmaraş","Osmaniye"],
    "TR71_Kırıkkale_alt_bölgesi" : ["Kırıkkale","Aksaray","Niğde","Nevşehir","Kırşehir"],
    "TR72_Kayseri_alt_bölgesi" : ["Kayseri","Sivas","Yozgat"],
    "TR81_Zonguldak_alt_bölgesi" : ["Zonguldak","Karabük","Bartın"],
    "TR82_Kastamonu_alt_bölgesi" : ["Kastamonu","Çankırı","Sinop"],
    "TR83_Samsun_alt_bölgesi" : ["Samsun","Tokat","Çorum","Amasya"],
    "TR90_Trabzon_alt_bölgesi" : ["Trabzon","Ordu","Giresun","Rize","Artvin","Gümüşhane"],
    "TRA1_Erzurum_alt_bölgesi" : ["Erzurum","Erzincan","Bayburt"],
    "TRA2_Ağrı_alt_bölgesi" : ["Ağrı","Kars","Iğdır","Ardahan"],
    "TRB1_Malatya_alt_bölgesi" : ["Malatya","Elazığ","Bingöl","Tunceli"],
    "TRB2_Van_alt_bölgesi" : ["Van","Muş","Bitlis","Hakkari"],
    "TRC1_Gaziantep_alt_bölgesi" : ["Gaziantep", "Adıyaman", "Kilis"],
    "TRC2_Şanlıurfa_alt_bölgesi" : ["Şanlıurfa","Diyarbakır"],
    "TRC3_Mardin_alt_bölgesi" : ["Mardin","Batman","Şırnak","Siirt"]
}

# Reverse lookup (city -> region key); insertion order follows region_dict.
region_by_city = {city: region for region, cities in region_dict.items() for city in cities}


def find_region(name):
    """Return the region key for a dataset row name, or None if no city matches.

    `name` is either a city name or "{city_name}_{district_name}". Only the part
    before the first "_" is matched, so the district part can never select a
    region. A full-name substring test would, for example, let a name such as
    "Siirt_Aydınlar" match the city "Aydın".

    An exact city match is tried first. If there is none, the first city (in
    region_dict order) that occurs as a substring of the city part is used, so
    city labels carrying extra text still resolve.
    """
    city_part = name.split("_")[0]
    if city_part in region_by_city:
        return region_by_city[city_part]
    return next((region for city, region in region_by_city.items() if city in city_part), None)


dataset_df["region"] = [find_region(name) for name in dataset_df["name"]]

### 2.6 Remove districts that lie close to their city center

In [10]:
# Sphere radius used to turn the haversine angle into a distance.
EARTH_RADIUS_KM = 6371
# Exclusion radius around a city center = sqrt(city area) / RADIUS_DIVISOR.
RADIUS_DIVISOR = 4


def haversine_km(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in km between two points given in degrees."""
    lat1, lon1, lat2, lon2 = (radians(value) for value in (lat1, lon1, lat2, lon2))
    a = sin((lat2 - lat1) / 2) ** 2 + cos(lat1) * cos(lat2) * sin((lon2 - lon1) / 2) ** 2
    # Clamp against floating-point overshoot so asin never receives a value above 1.
    return EARTH_RADIUS_KM * 2 * asin(min(1.0, sqrt(a)))


# Collect one center per city. A row becomes a center when the part of its name before
# "_" has not been registered yet; city rows precede their districts in dataset_df,
# so "{city}_{district}" rows are skipped once their city is present.
city_centers = {}
for index, row in dataset_df.iterrows():
    if row['name'].split("_")[0] not in city_centers:
        city_centers[row['name']] = (row['lat'], row['lon'])

# Attach the area column once; the guard keeps a re-run of this cell from merging twice.
if "area" not in dataset_df.columns:
    city_areas = pd.read_csv("../raw_data/illerin_yüzölçümleri.csv").drop(columns=["Sıra"])
    city_areas.columns = ["name", "area"]
    dataset_df = pd.merge(dataset_df, city_areas, on="name", how="left")

# NOTE(review): assumes `area` is in km² so the radius is in km — confirm against the source file.
radius_values = np.sqrt(dataset_df["area"]) / RADIUS_DIVISOR
radius_km = dict(zip(dataset_df["name"], radius_values))

selected_rows = []
for city, (lat, lon) in city_centers.items():
    # NOTE(review): a city with a missing area has a NaN radius, so `distance > radius`
    # is False for every district and only the city row itself is kept.
    radius = radius_km[city]
    city_code = dataset_df.loc[dataset_df['name'] == city, 'city_code'].iloc[0]
    for i, row in dataset_df.loc[dataset_df['city_code'] == city_code].iterrows():
        distance = haversine_km(lat, lon, row['lat'], row['lon'])
        # Keep the city row itself and every district outside the exclusion radius.
        if distance > radius or row["name"] == city:
            selected_rows.append(i)

# selected_rows holds index labels from iterrows(), so select by label rather than by position.
dataset_df = dataset_df.loc[selected_rows].reset_index(drop=True)

### 2.7 Add demand

In [11]:
# Fraction of citizens of a city that may order a product
FRACTION_DEMAND = 0.02
# Half-width of the uniform error term added to every demand value
DEMAND_NOISE = 10
# Fixed seed so that Restart & Run All regenerates the same demand column
RANDOM_SEED = 42

# Demand is composed of:
#   1. A fraction of the population
#   2. An error term drawn uniformly from [-DEMAND_NOISE, DEMAND_NOISE)
# Note: demand is rounded down to a whole number (np.floor) as its physical meaning denies decimals
rng = np.random.default_rng(RANDOM_SEED)
demand_noise = rng.uniform(-DEMAND_NOISE, DEMAND_NOISE, size=len(dataset_df))
dataset_df['demand'] = np.floor(FRACTION_DEMAND * dataset_df.population + demand_noise)

### 2.8 Reorder columns

In [12]:
# Column order of the exported table; any column not listed here is dropped.
final_columns = ["city_code", "name", "region", "lat", "lon", "population", "demand"]
dataset_df = dataset_df.loc[:, final_columns]

## 3. Export CSV

In [13]:
# Write the final table to disk without the index column, then display it.
output_path = "../dataset/dataset.csv"
dataset_df.to_csv(output_path, index=False)
dataset_df

Unnamed: 0,city_code,name,region,lat,lon,population,demand
0,1,Adana,TR62_Adana_alt_bölgesi,36.991400,35.330800,2274106,45480.0
1,1,Adana_Aladağ,TR62_Adana_alt_bölgesi,37.546379,35.402962,15897,309.0
2,1,Adana_Ceyhan,TR62_Adana_alt_bölgesi,37.031700,35.822750,158922,3178.0
3,1,Adana_Feke,TR62_Adana_alt_bölgesi,37.819918,35.272100,15833,311.0
4,1,Adana_İmamoğlu,TR62_Adana_alt_bölgesi,37.258751,35.672840,27037,533.0
...,...,...,...,...,...,...,...
764,81,Düzce_Cumayeri,TR42_Kocaeli_alt_bölgesi,40.873871,30.949057,15214,300.0
765,81,Düzce_Gölyaka,TR42_Kocaeli_alt_bölgesi,40.776579,30.995727,20552,410.0
766,81,Düzce_Gümüşova,TR42_Kocaeli_alt_bölgesi,40.850000,30.933333,16844,336.0
767,81,Düzce_Kaynaşlı,TR42_Kocaeli_alt_bölgesi,40.777337,31.303267,20449,413.0
