## I. Concatenate files into a single one

The "data" folder contains many separate CSV files. We need to combine them into a single file.

First, mount Google Drive as usual.

In [None]:
!pip install geopandas

In [1]:
# Mount Google Drive at /content/drive so the files stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
cd /content/drive/My Drive/Data Analyst/Gody/data

/content/drive/My Drive/Data Analyst/Gody/data


In [3]:
# Import necessary libraries
import os
import pandas as pd
import numpy as np
import math

# Keep only the CSV inputs: the working directory also holds other files
# (e.g. the GeoJSON read below), which pd.read_csv must not be fed.
# Sorting makes the concatenation order reproducible; os.listdir() returns
# entries in arbitrary order.
files = sorted(name for name in os.listdir() if name.lower().endswith('.csv'))

In [4]:
# Concatenate the single files into one DataFrame.
# All files are read first and combined with one pd.concat call; concatenating
# inside the loop would re-copy every accumulated row on each iteration.
# ignore_index=True gives the result a fresh 0..n-1 index.
# Note: pd.concat raises ValueError if `files` is empty.
data = pd.concat([pd.read_csv(file) for file in files], ignore_index=True)

In [5]:
# Quick check of the combined frame's (rows, columns).
data.shape

(2728, 5)

## II. Clean data

In [6]:
# Preview the first rows of the combined data before any cleaning is applied.
data.head()

Unnamed: 0,Province,Tourists,Destination,Rate,Visitors
0,HÀ NỘI,143397,Cầu Long Biên (Long Bien bridge),4.6,1891
1,HÀ NỘI,143397,Phố bia Tạ Hiện,4.5,1385
2,HÀ NỘI,143397,Vườn quốc gia Ba Vì (Ba Vi National Park),4.5,867
3,HÀ NỘI,143397,Tháp Rùa (Thap Rua Tower),4.4,800
4,HÀ NỘI,143397,Cầu Thê Húc,4.7,765


In [8]:
# Load the GeoJSON file from the working directory and preview it.
# The `ten_tinh` column holds the names that are corrected in the following cell.
import geopandas as gpd
map_df = gpd.read_file('./diaphantinh.geojson')
map_df.head()

Unnamed: 0,gid,code,ten_tinh,geometry
0,1,AD01,An Giang,"MULTIPOLYGON (((105.11524 10.95566, 105.11463 ..."
1,2,AD01,Bà Rịa -Vũng Tàu,"MULTIPOLYGON (((106.08110 8.57754, 106.08069 8..."
2,3,AD01,Bắc Giang,"MULTIPOLYGON (((106.16542 21.62022, 106.16925 ..."
3,4,AD01,Bắc Kạn,"MULTIPOLYGON (((105.74420 22.73519, 105.74624 ..."
4,5,AD01,Bạc Liêu,"MULTIPOLYGON (((105.32591 9.60004, 105.32755 9..."


In [24]:
# Correct mistyped names in map_df['ten_tinh'].
# Keys are row labels of map_df; values are the names to store there.
name_corrections = {
    44: 'Quảng Bình',
    31: 'Kiên Giang',
    12: 'Cần Thơ',
    1: 'Bà Rịa - Vũng Tàu',
}
for row_label, corrected_name in name_corrections.items():
    map_df.loc[row_label, 'ten_tinh'] = corrected_name

In [14]:
# Normalise the casing of province names
def lowercase(s):
    """Lower-case every character except the first one of each word.

    A character is kept as-is when it is the first character of ``s`` or
    directly follows a space; every other character is lower-cased.
    Word-initial characters are never upper-cased, only left untouched.
    """
    pieces = []
    previous = ' '  # treat the start of the string like "after a space"
    for ch in s:
        pieces.append(ch if previous == ' ' else ch.lower())
        previous = ch
    return ''.join(pieces)

# Apply the casing helper to every province name.
data['Province'] = data['Province'].apply(lowercase)

In [22]:
# Fix a bit more
fixed_dict = {'Hồ Chí Minh': 'TP. Hồ Chí Minh', 'Quảng Bình': 'Quảng Bình', 'Đắk Lắk': 'Đăk Lăk', 'Đắk Nông': 'Đăk Nông'}
def fix(x):
    """Return the replacement registered for ``x`` in ``fixed_dict``, or ``x`` itself if there is none."""
    return fixed_dict.get(x, x)

# Replace the province names that have an entry in fixed_dict.
data['Province'] = data['Province'].apply(fix)

In [29]:
# Convert the Tourists column to int
def convert_tourists(s):
    """Parse a tourist count such as ``'143,397'`` into an ``int``.

    Commas (thousands separators) are removed before conversion. Values that
    are already integers are accepted too, so re-running the cell on an
    already-converted column no longer fails with ``TypeError``.

    Args:
        s: the count as text (optionally containing commas) or as an integer.

    Returns:
        The count as a Python ``int``.

    Raises:
        ValueError: if what remains after removing commas is not an integer.
    """
    # str() makes ints iterable-safe; str.replace drops every comma in one call.
    return int(str(s).replace(',', ''))

# Parse every Tourists value into an int.
data['Tourists'] = data['Tourists'].apply(convert_tourists)

In [34]:
# Preview the frame again after the Province and Tourists fixes.
# NOTE(review): the saved output below already shows Destination without the
# parenthesised part, although that cleanup cell appears later in the notebook.
# The cells were presumably executed out of order; confirm with Restart & Run All.
data.head()

Unnamed: 0,Province,Tourists,Destination,Rate,Visitors
0,Hà Nội,143397,Cầu Long Biên,4.6,1891
1,Hà Nội,143397,Phố bia Tạ Hiện,4.5,1385
2,Hà Nội,143397,Vườn quốc gia Ba Vì,4.5,867
3,Hà Nội,143397,Tháp Rùa,4.4,800
4,Hà Nội,143397,Cầu Thê Húc,4.7,765


In [33]:
# Clean the Destination column
def clean_destination(s):
    """Drop the parenthesised suffix from a destination name.

    Everything from the first ``'('`` onward is removed, together with any
    whitespace left in front of it, e.g. ``'Tháp Rùa (Thap Rua Tower)'``
    becomes ``'Tháp Rùa'``. Names without ``'('`` are returned unchanged.

    Args:
        s: the destination name.

    Returns:
        The name without its parenthesised part.
    """
    if '(' in s:
        # rstrip() instead of cutting at idx - 1: the old slice assumed exactly
        # one space before '(' and otherwise dropped a real character (or, with
        # '(' at position 0, kept almost the whole string).
        s = s[:s.index('(')].rstrip()
    return s

# Strip the parenthesised part from every destination name.
data['Destination'] = data['Destination'].apply(clean_destination)

In [35]:
# Store the Rate column as float; astype raises if a value cannot be converted.
data['Rate'] = data['Rate'].astype(float)

In [38]:
# Save the cleaned frame as CSV text without the index column.
# NOTE(review): './cleaned_data' lands in the same working directory the input
# files are read from; make sure the file-listing step does not pick it up as an
# input when the notebook is re-run.
data.to_csv('./cleaned_data', index=False)