# Exploratory data analysis (EDA) of Indonesia Tourism Destination

This notebook explores the Indonesia tourism dataset, which contains 437 tourism destinations together with user, rating and tour-package tables.


In [46]:
import pandas as pd

### Tourist attractions

In [47]:
# Load the tourist-attraction table from the local data folder.
tourism_with_id = pd.read_csv('./data/tourism_with_id.csv')

In [48]:
# Column dtypes and non-null counts: a quick check for missing data and stray columns.
tourism_with_id.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 437 entries, 0 to 436
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Place_Id      437 non-null    int64  
 1   Place_Name    437 non-null    object 
 2   Description   437 non-null    object 
 3   Category      437 non-null    object 
 4   City          437 non-null    object 
 5   Price         437 non-null    int64  
 6   Rating        437 non-null    float64
 7   Time_Minutes  205 non-null    float64
 8   Coordinate    437 non-null    object 
 9   Lat           437 non-null    float64
 10  Long          437 non-null    float64
 11  Unnamed: 11   0 non-null      float64
 12  Unnamed: 12   437 non-null    int64  
dtypes: float64(5), int64(3), object(5)
memory usage: 44.5+ KB


In [49]:
# Summary statistics of the numeric columns.
tourism_with_id.describe()

Unnamed: 0,Place_Id,Price,Rating,Time_Minutes,Lat,Long,Unnamed: 11,Unnamed: 12
count,437.0,437.0,437.0,205.0,437.0,437.0,0.0,437.0
mean,219.0,24652.173913,4.442792,82.609756,-7.095438,109.160142,,219.0
std,126.295289,66446.374709,0.208587,52.872339,0.727241,1.962848,,126.295289
min,1.0,0.0,3.4,10.0,-8.197894,103.931398,,1.0
25%,110.0,0.0,4.3,45.0,-7.74959,107.578369,,110.0
50%,219.0,5000.0,4.5,60.0,-7.020524,110.237468,,219.0
75%,328.0,20000.0,4.6,120.0,-6.829411,110.431869,,328.0
max,437.0,900000.0,5.0,360.0,1.07888,112.821662,,437.0


In [50]:
# Preview the first rows to see what each column holds.
tourism_with_id.head()

Unnamed: 0,Place_Id,Place_Name,Description,Category,City,Price,Rating,Time_Minutes,Coordinate,Lat,Long,Unnamed: 11,Unnamed: 12
0,1,Monumen Nasional,Monumen Nasional atau yang populer disingkat d...,Budaya,Jakarta,20000,4.6,15.0,"{'lat': -6.1753924, 'lng': 106.8271528}",-6.175392,106.827153,,1
1,2,Kota Tua,"Kota tua di Jakarta, yang juga bernama Kota Tu...",Budaya,Jakarta,0,4.6,90.0,"{'lat': -6.137644799999999, 'lng': 106.8171245}",-6.137645,106.817125,,2
2,3,Dunia Fantasi,Dunia Fantasi atau disebut juga Dufan adalah t...,Taman Hiburan,Jakarta,270000,4.6,360.0,"{'lat': -6.125312399999999, 'lng': 106.8335377}",-6.125312,106.833538,,3
3,4,Taman Mini Indonesia Indah (TMII),Taman Mini Indonesia Indah merupakan suatu kaw...,Taman Hiburan,Jakarta,10000,4.5,,"{'lat': -6.302445899999999, 'lng': 106.8951559}",-6.302446,106.895156,,4
4,5,Atlantis Water Adventure,Atlantis Water Adventure atau dikenal dengan A...,Taman Hiburan,Jakarta,94000,4.5,60.0,"{'lat': -6.12419, 'lng': 106.839134}",-6.12419,106.839134,,5


In [None]:
# Count the rows where `Unnamed: 12` equals `Place_Id`; a count equal to the
# total number of rows (437) means `Unnamed: 12` is a duplicate of `Place_Id`.
len(tourism_with_id[tourism_with_id['Unnamed: 12']==tourism_with_id['Place_Id']])


437

Note: 
- `Unnamed: 11` contains only NaN values, so we can drop it
- `Unnamed: 12` is a copy of `Place_Id`, so we can drop it as well

In [51]:
# Remove the all-NaN column and the duplicate of Place_Id; the raw frame is left untouched.
modified_tourism_with_id = tourism_with_id.drop(
    columns=['Unnamed: 11', 'Unnamed: 12']
)

In [52]:
def find_outliers(df, column, iqr_multiplier=1.5):
    """Return the rows of ``df`` whose ``column`` value is an IQR outlier.

    A value is flagged when it lies strictly below
    ``Q1 - iqr_multiplier * IQR`` or strictly above
    ``Q3 + iqr_multiplier * IQR``, where Q1 and Q3 are the 25th and 75th
    percentiles of the column and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scan. It is only read, never modified.
    column : str
        Name of the numeric column to test.
    iqr_multiplier : float, default 1.5
        Width of the fences in IQR units. The default reproduces the
        conventional 1.5 * IQR rule; a larger value flags fewer rows.

    Returns
    -------
    pandas.DataFrame
        The outlier rows with all columns and their original index labels.
        Rows where ``column`` is NaN are never returned.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - iqr_multiplier * iqr
    upper_bound = q3 + iqr_multiplier * iqr
    # NaN compares False on both sides, so missing values are never flagged.
    is_outlier = (values < lower_bound) | (values > upper_bound)
    return df[is_outlier]

In [57]:
# Run the check on the cleaned frame (the two "Unnamed" columns are already
# dropped there). Only `Time_Minutes` and `Category` are read below, so the
# reported values are the same as for the raw frame.
outliers = find_outliers(modified_tourism_with_id, 'Time_Minutes')
outlier_time = outliers['Time_Minutes'].unique()
outliers_category = outliers['Category'].unique()

print(f'Outliers in Time_Minutes: {outlier_time}')
print(f'Time Outliers categories: {outliers_category}')


Outliers in Time_Minutes: [360. 300. 240.]
Time Outliers categories: ['Taman Hiburan' 'Budaya']


All of these outliers are valid data points: they belong to the categories `Taman Hiburan` (amusement park) and `Budaya` (culture), where long visits are plausible.

`Time_Minutes` also has `232` missing values. Instead of filling them with the mean/median, we can impute them from the category and city using `KNN` or `decision tree` based imputation.

### User

In [8]:
# Load the user table from the local data folder.
user = pd.read_csv('./data/user.csv')


In [9]:
# Column dtypes and non-null counts for the user table.
user.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   User_Id   300 non-null    int64 
 1   Location  300 non-null    object
 2   Age       300 non-null    int64 
dtypes: int64(2), object(1)
memory usage: 7.2+ KB


In [10]:
# Summary statistics of the numeric user columns.
user.describe()

Unnamed: 0,User_Id,Age
count,300.0,300.0
mean,150.5,28.7
std,86.746758,6.393716
min,1.0,18.0
25%,75.75,24.0
50%,150.5,29.0
75%,225.25,34.0
max,300.0,40.0


In [11]:
# Preview the first user rows.
user.head()

Unnamed: 0,User_Id,Location,Age
0,1,"Semarang, Jawa Tengah",20
1,2,"Bekasi, Jawa Barat",21
2,3,"Cirebon, Jawa Barat",23
3,4,"Bekasi, Jawa Barat",21
4,5,"Lampung, Sumatera Selatan",20


In [67]:
# Rows of `user` that find_outliers flags for the Age column.
user_age_outlier = find_outliers(user, 'Age')
# Every returned row is an outlier, so the row count is the outlier count.
age_outliers = user_age_outlier.shape[0]

print(f'Age outlier count: {age_outliers}')

Age outlier count: 0


This dataset has no issues; it is straightforward.

### Tourism Ratings

In [12]:
# Load the user-to-place ratings table from the local data folder.
tourism_rating = pd.read_csv('./data/tourism_rating.csv')

In [13]:
# Preview the first rating rows.
tourism_rating.head()

Unnamed: 0,User_Id,Place_Id,Place_Ratings
0,1,179,3
1,1,344,2
2,1,5,5
3,1,373,3
4,1,101,4


In [14]:
# Summary statistics of the numeric rating columns.
tourism_rating.describe()

Unnamed: 0,User_Id,Place_Id,Place_Ratings
count,10000.0,10000.0,10000.0
mean,151.2927,219.4164,3.0665
std,86.137374,126.228335,1.379952
min,1.0,1.0,1.0
25%,77.0,108.75,2.0
50%,151.0,220.0,3.0
75%,226.0,329.0,4.0
max,300.0,437.0,5.0


In [16]:
# Column dtypes and non-null counts for the ratings table.
tourism_rating.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   User_Id        10000 non-null  int64
 1   Place_Id       10000 non-null  int64
 2   Place_Ratings  10000 non-null  int64
dtypes: int64(3)
memory usage: 234.5 KB


In [68]:
# Observed bounds of the ratings and of the place IDs they refer to.
min_rating = tourism_rating['Place_Ratings'].min()
max_rating = tourism_rating['Place_Ratings'].max()
min_place_id = tourism_rating['Place_Id'].min()
max_place_id = tourism_rating['Place_Id'].max()

print(f"Place ID Min: {min_place_id}, Max: {max_place_id}")
print(f"Rating Min: {min_rating}, Max: {max_rating}")

Place ID Min: 1, Max: 437
Rating Min: 1, Max: 5


All the values in the dataset are within the expected ranges (place IDs 1–437, ratings 1–5).

### Tourism package

In [17]:
# Load the tour-package table from the local data folder.
package_tourism = pd.read_csv("./data/package_tourism.csv")

In [20]:
# Preview the first package rows.
package_tourism.head()

Unnamed: 0,Package,City,Place_Tourism1,Place_Tourism2,Place_Tourism3,Place_Tourism4,Place_Tourism5
0,1,Jakarta,Pasar Tanah Abang,Taman Ayodya,Museum Tekstil,,
1,2,Jakarta,Pasar Tanah Abang,Pasar Taman Puring,Pasar Petak Sembilan,,
2,3,Jakarta,Perpustakaan Nasional,Monas,Masjid Istiqlal,,
3,4,Jakarta,Pulau Tidung,Pulau Bidadari,Pulau Pari,Pulau Pramuka,Pulau Pelangi
4,5,Jakarta,Museum Satria Mandala,Museum Wayang,Museum Bahari Jakarta,Museum Macan (Modern and Contemporary Art in N...,


In [18]:
# Column dtypes and non-null counts: shows how many packages fill each place slot.
package_tourism.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Package         100 non-null    int64 
 1   City            100 non-null    object
 2   Place_Tourism1  100 non-null    object
 3   Place_Tourism2  100 non-null    object
 4   Place_Tourism3  100 non-null    object
 5   Place_Tourism4  66 non-null     object
 6   Place_Tourism5  39 non-null     object
dtypes: int64(1), object(6)
memory usage: 5.6+ KB


In [19]:
# Summary statistics of the numeric package columns.
package_tourism.describe()

Unnamed: 0,Package
count,100.0
mean,50.5
std,29.011492
min,1.0
25%,25.75
50%,50.5
75%,75.25
max,100.0


In this dataset `Place_Tourism4` and `Place_Tourism5` contain NaN values, which is valid because packages have 3, 4 or 5 places. We need to treat NaN as a valid empty value in this case.