# Python Code for Data Cleaning 
**Final Project / Introduction to Econometrics**
> Benson Chiu @ NTU IM / B10705047

In [None]:
import pandas as pd

#### Read the list of our needed stations
- We collect a list of 104 YouBike stations inside and nearby NTU from https://www.youbike.com.tw/region/main/stations/list/, and put it into `ntu_station_info.txt`.
- Read `ntu_station_info.txt` and put the *name of each station* and *capacity of each station* into the dictionary `data`.
  

In [None]:
# Parse `ntu_station_info.txt` into `data`, a mapping of station name -> capacity.
# The file is read as consecutive groups of three lines: the station name
# followed by two integers whose sum is stored as the capacity.
# NOTE(review): presumably the two integers are available bikes and empty
# docks -- confirm against the source of the file.
with open('ntu_station_info.txt', encoding = "UTF-8") as f:
    # Strip only the newline terminator; slicing with [:-1] would drop a real
    # character from the last line when the file has no trailing newline.
    text = [line.rstrip("\n") for line in f]

# Fail early with a clear message instead of an IndexError half-way through.
if len(text) % 3 != 0:
    raise ValueError(
        "ntu_station_info.txt must contain 3 lines per station, "
        f"but has {len(text)} lines"
    )

data = {}
for t in range(0, len(text), 3):
    name = text[t]
    capacity = int(text[t + 1]) + int(text[t + 2])
    data[name] = capacity


- Convert the dictionary `data` into a dataframe `df`.
- Put the name of all stations into the list `name_list`.

In [None]:
# One row per station: the dict keys become "Name", the values "Capacity".
df = pd.DataFrame({"Name": list(data.keys()), "Capacity": list(data.values())})
# Plain Python list of station names, used later for membership filtering.
name_list = df["Name"].tolist()

#### Subset the original dataset
- We get the renting and returning records of YouBike 2.0 in Taipei City from 2022/09 to 2022/12 from Taipei City Government (https://data.gov.tw/dataset/150635).
- We read each monthly `.csv` file into its own dataframe and then concatenate them into `df_all`.

In [None]:
#Read the csv files (111-1)
# One file per month, listed in chronological order.
monthly_csv_paths = [
    './source_data/2022_09.csv',
    './source_data/2022_10.csv',
    './source_data/2022_11.csv',
    './source_data/2022_12.csv',
]
monthly_frames = [pd.read_csv(csv_path) for csv_path in monthly_csv_paths]
df_2022_09, df_2022_10, df_2022_11, df_2022_12 = monthly_frames

# Stack the four months into one frame with a fresh 0..n-1 index.
df_all = pd.concat(monthly_frames, ignore_index=True)


- We subset the `df_all` by the condition **both rent station and return station must be in our list of 104 stations inside or nearby NTU**.
- We also save the resulting dataframe `df_ntu` to a `.csv` file called  `ntu_station_data.csv`.

In [None]:
# Keep only trips whose rent station AND return station are both in name_list.
rented_in_list = df_all['rent_station'].isin(name_list)
returned_in_list = df_all['return_station'].isin(name_list)
df_ntu = df_all.loc[rented_in_list & returned_in_list]

# Persist the subset without the dataframe index.
df_ntu.to_csv('./final_data/ntu_station_data.csv', index = False)

#### Acquire all of the locations of the 104 stations.
- We get all geographical locations (latitude, longitude) for every station in our list.
- We use **Places API** provided by Google Maps
  - Documentation: https://developers.google.com/maps/documentation/places/web-service
- We use `requests` to fetch each result (in `json` format), store it in `res_json`, and then put the required elements into the dictionary `locations`.

In [None]:
# Google Maps API key appended to the request URLs in the cells below.
# Read it from the environment so the secret does not have to be pasted into
# (and saved with) the notebook; falls back to the previous empty default.
import os
api_key = os.environ.get("GOOGLE_MAPS_API_KEY", "")

In [None]:
#Get the geographical locations of the stations in name_list by Google API
import requests

# Places API "Find Place from Text" endpoint. The query is passed through
# `params` so that requests percent-encodes it; station names are non-ASCII
# and may contain characters such as '&' or '#' that would otherwise corrupt
# a hand-concatenated URL.
find_place_url = "https://maps.googleapis.com/maps/api/place/findplacefromtext/json"

# Maps station name -> [latitude, longitude]; filled in name_list order.
locations = {}
for name in name_list:
    query = {
        "input": "YouBike微笑單車 2.0: " + name,
        "inputtype": "textquery",
        "fields": "geometry",
        "key": api_key,
    }
    response = requests.get(find_place_url, params=query, timeout=30)
    response.raise_for_status()  # surface HTTP errors instead of parsing an error page
    res_json = response.json()

    candidates = res_json.get("candidates", [])
    if not candidates:
        # Without this check the failure is an opaque IndexError that does not
        # say which station (or which API status) caused it.
        raise LookupError(
            f"No place found for station {name!r} "
            f"(API status: {res_json.get('status')})"
        )

    # Only the first candidate returned by the API is used.
    point = candidates[0]["geometry"]["location"]
    locations[name] = [point['lat'], point['lng']]

print(locations)


- In the `for` loop:
  - We prepare the dictionary `dict_to_df` which has a more convenient format for `pandas`.
  - Also, we prepare the string `str_of_locs` containing all of the locations (in URL-encoded format).
- We create a dataframe `df_locations` to store the locations of all 104 stations. We also export the dataframe to a `.csv` file called `ntu_station_locations.csv`.
  

In [None]:

# Column-oriented view of `locations` (name -> [lat, lng]) for pandas.
dict_to_df = {
    "Name": list(locations),
    "Latitude": [coords[0] for coords in locations.values()],
    "Longitude": [coords[1] for coords in locations.values()],
}

# URL-encoded coordinate list: "lat%2Clng" per station ("%2C" is ','),
# with stations separated by "%7C" ('|').
str_of_locs = "%7C".join(
    str(coords[0]) + "%2C" + str(coords[1]) for coords in locations.values()
)

df_locations = pd.DataFrame(dict_to_df)
print(df_locations.head())
df_locations.to_csv('./final_data/ntu_station_locations.csv', index = False)

#### Acquire the distance and duration between each pair of stations.
- We use **Distance Matrix API** provided by Google Maps
  - Documentation: https://developers.google.com/maps/documentation/distance-matrix
- We create a dataframe `df_distances` to store the distance and duration between each pair of stations. We also export the dataframe to a `.csv` file called `ntu_station_distances.csv`.

In [None]:
#Get the distance matrix
import requests

# Column-oriented accumulator: one row per ordered (origin, destination) pair.
res = {"From":[], "To":[], "Distance (meters)":[], "Duration (seconds)":[]}

# One "lat%2Clng" string per station. The entries are assumed to be in the
# same order as name_list, because names are looked up by position below.
list_of_locs = str_of_locs.split('%7C')
for o_i, origin in enumerate(list_of_locs):
    for d_i, dist in enumerate(list_of_locs):
        # Skip only the station paired with itself. Comparing positions rather
        # than coordinate strings keeps pairs of distinct stations that happen
        # to share identical coordinates.
        if o_i == d_i:
            continue

        # origin/dist are already percent-encoded, so they are placed in the
        # URL as-is.
        url = "https://maps.googleapis.com/maps/api/distancematrix/json?origins="+ origin +"&destinations="+ dist  +"&mode=bicycling&region=tw&key=" + api_key
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        res_2 = response.json()

        # The API reports failures in the JSON body; check the request-level
        # status first, then the status of the single origin/destination element.
        if res_2.get("status") != "OK":
            raise RuntimeError(
                f"Distance Matrix request failed for {name_list[o_i]!r} -> "
                f"{name_list[d_i]!r}: {res_2.get('status')}"
            )
        element = res_2["rows"][0]["elements"][0]
        if element.get("status") != "OK":
            raise RuntimeError(
                f"No route for {name_list[o_i]!r} -> {name_list[d_i]!r}: "
                f"{element.get('status')}"
            )

        distance  = element["distance"]["value"]
        duration  = element["duration"]["value"]
        res["From"].append(name_list[o_i])
        res["To"].append(name_list[d_i])
        res["Distance (meters)"].append(distance)
        res["Duration (seconds)"].append(duration)
df_distances = pd.DataFrame(res)
df_distances

In [None]:
# Export the pairwise distance/duration table without the dataframe index.
distances_csv_path = './final_data/ntu_station_distances.csv'
df_distances.to_csv(distances_csv_path, index=False)

#### Add capacities to `ntu_station_locations.csv`

In [None]:
# Rebuild `ntu_station_locations.csv` with a "Capacity" column taken from the
# merged file's "Capacity_x" column.
# NOTE(review): `ntu_station_locations_merged.csv` is not written by any cell
# shown here; presumably it comes from merging the locations with the
# capacities elsewhere -- confirm where it is produced.
df_locations = pd.read_csv('./final_data/ntu_station_locations_merged.csv')

# Select the needed columns and rename in one chained step; renaming in place
# on a column subset can trigger pandas' SettingWithCopyWarning.
df_locations = df_locations[["Name", "Latitude", "Longitude", "Capacity_x"]].rename(
    columns={"Capacity_x": "Capacity"}
)

# index=False matches the other exports in this notebook and avoids adding an
# unnamed index column to the overwritten file.
df_locations.to_csv('./final_data/ntu_station_locations.csv', index=False)