# Step By Step Pengerjaan (Deadline 3 September 2024)
- Cocokkan data byPlaceData.csv dengan scrapetable_wisata.xlsx
- Mengubah konten menjadi bahasa indonesia, dengan mengambil data dari byPlaceData.csv
- Data yang tidak ada di scrapetable_wisata.xlsx langsung di drop saja di byPlaceData.csv (mengurangi storage)

# Data Gathering

In [250]:
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from datetime import datetime
import re

DATA = 'data/'

dataBaruPath = f'{DATA}scrapetable_wisata.xlsx'
dataByPlace = f'{DATA}byPlaceData.csv'
dataNewFix = f'{DATA}new_fixedData.xlsx'

dfNew = pd.read_excel(dataBaruPath)
dfByPlace = pd.read_csv(dataByPlace)
dfNewFix = pd.read_excel(dataNewFix)

In [None]:
dfNew.shape[0]

In [None]:
dfNew.head()

In [None]:
dfByPlace.head()

# Data Assessing

## Cek Info DF

In [None]:
# Print the info() summary of every dataset, separated by a blank line.
# Note: DataFrame.info() prints directly and returns None, so the outer
# print() also emits a trailing "None" line for each dataset.
info_targets = [('By Place', dfByPlace), ('New', dfNew), ('New Fixed', dfNewFix)]
for position, (label, frame) in enumerate(info_targets):
    if position > 0:
        print()
    print(label)
    print(frame.info())

## Cek Nilai Null

In [None]:
# Print the per-column count of missing values for every dataset,
# separated by a blank line.
null_targets = [('By Place', dfByPlace), ('New', dfNew), ('New Fixed', dfNewFix)]
for position, (label, frame) in enumerate(null_targets):
    if position > 0:
        print()
    print(label)
    print(frame.isna().sum())

## Cek Nilai Duplikat

In [None]:
# Print the number of fully duplicated rows in every dataset,
# separated by a blank line.
duplicate_targets = [('By Place', dfByPlace), ('New', dfNew), ('New Fixed', dfNewFix)]
for position, (label, frame) in enumerate(duplicate_targets):
    if position > 0:
        print()
    print(label)
    print(frame.duplicated().sum())

## Pemilihan Kolom Data Bu Melany

In [None]:
dfNew.head(10)

In [None]:
print(dfNew.latitude.sample(1))
print(dfNew.longitude.sample(1))

print(dfByPlace.coordinates.sample(1))
print(dfByPlace.address.sample(1).values)

In [None]:
print(dfNew.timezone.unique())
print(dfNew.review_count.sample(1))
print(dfNew.place_id.sample(1))
print(dfNew.city.unique())
print(dfNew['validasi di jogja dan tetangga'].unique())

# Data Cleaning

In [None]:
# Dataset 1: drop the columns that are mostly missing.
dfByPlace_cleaned = dfByPlace.drop(columns=['is_rating_updated', 'is_reviews_updated'])

# Dataset 2: drop the mostly-missing column, then fill the remaining gaps.
dfNew_cleaned = dfNew.drop(columns=['price_level'])

# Numeric column: fill missing ratings with the median.
# The result is assigned back instead of calling
# `dfNew_cleaned[col].fillna(..., inplace=True)`: that chained in-place form
# operates on an intermediate Series and is not guaranteed to update the
# parent DataFrame (it has no effect under pandas Copy-on-Write).
dfNew_cleaned['rating'] = dfNew_cleaned['rating'].fillna(dfNew_cleaned['rating'].median())
# Filling 'review_count' with its median is still under consideration.

# Text columns: fill missing values with the placeholder 'Unknown'.
columns_to_fill = ['Friday', 'Saturday', 'Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'state']
for column in ['phone_number', 'website'] + columns_to_fill:
    dfNew_cleaned[column] = dfNew_cleaned[column].fillna('Unknown')

# Re-check the missing values after cleaning.
missing_values_dfByPlace_cleaned = dfByPlace_cleaned.isnull().sum()
missing_values_dfNew_cleaned = dfNew_cleaned.isnull().sum()

missing_values_dfByPlace_cleaned, missing_values_dfNew_cleaned

## Menggabungkan Data dan Filtering

In [None]:
# Keep only the rows whose address mentions Central Java / Yogyakarta in
# either dataset, then stack the two filtered datasets.
keywords = ["Jawa Tengah", "Central Java", "Yogyakarta", "Special Region of Yogyakarta"]
region_pattern = '|'.join(keywords)

# Dataset 1 stores the address in 'address'; missing addresses never match.
by_place_in_region = dfByPlace_cleaned['address'].str.contains(region_pattern, case=False, na=False)
filtered_dfByPlace = dfByPlace_cleaned[by_place_in_region]

# Dataset 2 stores the address in 'full_address'.
new_in_region = dfNew_cleaned['full_address'].str.contains(region_pattern, case=False, na=False)
filtered_dfNew = dfNew_cleaned[new_in_region]

# Stack both filtered datasets under a fresh 0..n-1 index.
combined_dataset = pd.concat([filtered_dfByPlace, filtered_dfNew], ignore_index=True)

# Show the (rows, columns) shape of the combined dataset.
combined_shape = combined_dataset.shape
combined_shape

## Filtering Data

In [None]:
# Keep only the byPlaceData rows whose name also appears in
# scrapetable_wisata. Names are compared in lowercase on the fly so the
# original 'name' columns are left untouched.
new_names_lower = dfNew['name'].str.lower()
name_matches = dfByPlace['name'].str.lower().isin(new_names_lower)

# Restrict to addresses in 'Jawa Tengah', 'Central Java' or 'Yogyakarta'.
# na=False makes rows with a missing address count as "no match"; without it
# the mask contains NaN and boolean indexing raises a ValueError.
in_region = dfByPlace['address'].str.contains(
    'Jawa Tengah|Central Java|Yogyakarta', case=False, na=False
)

filtered_data = dfByPlace[name_matches & in_region]

# Display the first few rows of the filtered data to verify.
filtered_data.head()

In [None]:
# # Save the filtered data to a new CSV file
# filtered_data_path = 'data/filtered_byPlaceData.csv'
# filtered_data.to_csv(filtered_data_path, index=False)

filtered_data.head()

## Ambil Data dari DF Baru
Pengambilan data yang mungkin masih merupakan wisata dari data baru, tapi tidak ada di data utama

In [264]:
# Rows of the new dataset whose (lowercased) name was not picked into
# filtered_data.
picked_names = filtered_data['name'].str.lower()
already_picked = dfNew['name'].str.lower().isin(picked_names)
unpickedData = dfNew[~already_picked]

# Column labels of both frames, kept for side-by-side comparison.
filteredCols, unpickedCols = filtered_data.columns, unpickedData.columns

### Get Data Fix

Data yang disini sudah hasil pengambilan dari data baru, yang tidak ada di data lama

In [None]:
fixedDf = pd.read_csv('data/fixed_data.csv')
fixedDf.info()

In [None]:
fixedDf.shape[0]

In [None]:
filtered_data.shape[0]

### Cek Nilai Null

In [None]:
fixedDf.isna().sum()

## Feature Selection

### Memindahkan Nilai dari Kolom ke Kolom

In [269]:
# Column-dropping helper
def dropCol(df, lsCol):
    """Drop source columns from ``df`` in place and return ``df``.

    ``lsCol`` holds two column names. For the exact pair
    ``['latitude', 'longitude']`` both columns are dropped; for any other
    pair only the second column is dropped.
    """
    first, second = lsCol[0], lsCol[1]

    is_coordinate_pair = first == 'latitude' and second == 'longitude'
    targets = [first, second] if is_coordinate_pair else [second]

    df.drop(columns=targets, inplace=True)
    return df

# Value-moving helper
def moveValues(col1, col2, df):
    """Fill missing values from a source column, then drop the source.

    Two modes, selected by the column names:

    * ``col1 == 'latitude'`` and ``col2 == 'longitude'``: rows whose
      ``coordinates`` is missing get ``"<latitude>,<longitude>"``; both
      source columns are then dropped. Rows where latitude or longitude is
      itself missing are left missing rather than being filled with the
      literal text ``"nan,nan"``, so later null checks still see them.
    * any other pair: missing values in ``col1`` are filled from ``col2``
      and only ``col2`` is dropped (``col1`` is the column that is kept).

    ``df`` is modified in place and also returned.
    """
    if col1 == 'latitude' and col2 == 'longitude':
        # Build "lat,lon" for every row, but only use it where coordinates
        # is missing and both parts are actually present.
        combined = df[col1].astype(str) + ',' + df[col2].astype(str)
        fillable = df['coordinates'].isna() & df[col1].notna() & df[col2].notna()
        df['coordinates'] = df['coordinates'].mask(fillable, combined)

        # latitude/longitude now live inside 'coordinates'.
        df.drop(columns=[col1, col2], inplace=True)
    else:
        if df[col1].isna().any():
            df[col1] = df[col1].fillna(df[col2])
        else:
            # Printed when col1 has no missing values, i.e. nothing to move.
            print('Data tidak ada isinya')

        # Only the source column is removed; col1 holds the merged values.
        df.drop(columns=[col2], inplace=True)

    return df

In [270]:
# Merge each (kept column, source column) pair in turn:
#   address  <- full_address
#   latitude / longitude (the pair moveValues handles as coordinates)
#   reviews  <- review_count
for kept_column, source_column in [
    ('address', 'full_address'),
    ('latitude', 'longitude'),
    ('reviews', 'review_count'),
]:
    fixedDf = moveValues(kept_column, source_column, fixedDf)

### Hapus Kolom Sisa yang Tidak Digunakan

In [271]:
delCols = ['is_rating_updated', 'is_reviews_updated', 'website', 
           'business_id', 'phone_number', 'Unnamed: 0.1', 'Unnamed: 0', 'price_level']

fixedDf.drop(columns=delCols, inplace=True)

In [None]:
fixedDf.info()

## Menangani Nilai Null

1. timezone (Faishal)
   
Diisi dengan nilai unique, yaitu Asia/Jakarta

In [None]:
fixedDf['timezone'] = fixedDf.timezone.fillna('Asia/Jakarta', axis=0)
print('Nilai unique: ', fixedDf.timezone.unique())

2. types (Akhdan)

Diisi sesuai dengan bidangnya, dilihat dari nilai unique

In [None]:
print('Nilai unique: ', fixedDf.types.unique())
# fixedDf.timezone.fillna('Asia/Jakarta', axis=0, inplace=True)

3. city (Faishal)
   
Diisi dari dua bagian terakhir kolom address (dipisahkan koma), lalu angka di dalamnya dihapus dengan regex

In [275]:
# Fill a missing city with the last two comma-separated parts of the address
def extractCity(df):
    """Fill missing ``city`` values from the tail of ``address``.

    For every row whose ``city`` is missing, the address is split on commas
    and its last two parts are re-joined with ',' and stripped of
    surrounding whitespace. Rows that already have a city are left
    untouched, and rows whose address is not a string (e.g. missing) are
    skipped instead of raising. ``df`` is modified in place and returned.
    """
    missing_city = df['city'].isna()

    for i, address in df.loc[missing_city, 'address'].items():
        # A missing address cannot be split; leave the city missing.
        if not isinstance(address, str):
            continue

        parts = address.split(',')
        # Join the last two parts (not three).
        df.at[i, 'city'] = ','.join(parts[-2:]).strip()

    return df

fixedDf = extractCity(fixedDf)

# Remove every digit run from the city text and trim the result.
# The vectorised .str accessor does this for the whole column at once and
# leaves missing values missing; the previous per-row
# `pd.Series([x]).str...` built a throwaway Series for every row and
# raised on a missing city.
fixedDf['city'] = fixedDf['city'].str.replace(r'\d+', '', regex=True).str.strip()

4. workday_timing dan closed_on (Faishal)

Diisi dari kolom-kolom hari (Sunday sampai Saturday)

In [276]:
# Convert AM/PM opening hours to 24-hour 'HH.MM-HH.MM'
# (example: '7:30 AM–6 PM' becomes '07.30-18.00').
def _to_24_hour(hour_text, minute_text, meridiem):
    """Return 'HH.MM' for a 12-hour clock time, or None if it is invalid."""
    hour = int(hour_text)
    minute = int(minute_text) if minute_text else 0
    if not 1 <= hour <= 12 or minute > 59:
        return None
    # 12 AM is 00:xx and 12 PM is 12:xx, hence the modulo before the offset.
    hour = hour % 12 + (12 if meridiem.upper() == 'PM' else 0)
    return f"{hour:02d}.{minute:02d}"


def normalize_am_pm_format(time_str):
    """Convert AM/PM time ranges in ``time_str`` to 24-hour notation.

    Times are read in pairs (start, end); each pair becomes
    ``'HH.MM-HH.MM'`` and several pairs are joined with ``', '``. When only
    one side of a pair carries AM/PM (e.g. ``'8–11 AM'``) the other side
    reuses it.

    The input is returned unchanged when it is not a string, contains no
    AM/PM marker, has an odd number of times, or contains a value that is
    not a valid 12-hour clock time.
    """
    if not isinstance(time_str, str):
        return time_str

    # Each token is (hour, minutes or '', AM/PM marker or '').
    tokens = re.findall(
        r'(?<!\d)(\d{1,2})(?:[:.](\d{2}))?(?!\d)(?:\s*([AaPp][Mm])\b)?',
        time_str,
    )
    has_meridiem = any(meridiem for _, _, meridiem in tokens)
    if not has_meridiem or len(tokens) % 2:
        return time_str

    converted_ranges = []
    for start, end in zip(tokens[::2], tokens[1::2]):
        # Borrow the marker from the other side of the range when missing.
        start_meridiem = start[2] or end[2]
        end_meridiem = end[2] or start[2]
        if not (start_meridiem and end_meridiem):
            return time_str

        start_24 = _to_24_hour(start[0], start[1], start_meridiem)
        end_24 = _to_24_hour(end[0], end[1], end_meridiem)
        if start_24 is None or end_24 is None:
            return time_str

        converted_ranges.append(f"{start_24}-{end_24}")

    return ', '.join(converted_ranges)

# Standardise time ranges (e.g. 5.00-18.00 becomes 05.00-18.00)
def standardize_time_range(time_str):
    """Zero-pad the hours of every time range found in ``time_str``.

    A range is ``H.MM-H.MM`` or ``H:MM-H:MM``, joined by a hyphen or an en
    dash with optional spaces around it. Every range is rewritten as
    ``HH.MM-HH.MM``; all other text, including any further ranges, is kept
    as is. Non-string input is returned unchanged.
    """
    if not isinstance(time_str, str):
        return time_str

    pattern = r'(?<!\d)(\d{1,2})[:.](\d{2})\s*[-–]\s*(\d{1,2})[:.](\d{2})(?!\d)'

    def _pad(match):
        start_hour, start_min, end_hour, end_min = match.groups()
        return f"{start_hour.zfill(2)}.{start_min}-{end_hour.zfill(2)}.{end_min}"

    # re.sub rewrites every range in the string; re.match would only see the
    # first one and the rest of the text would be lost.
    return re.sub(pattern, _pad, time_str)

# Workday Timing
def fillWorkdayTime(df, lsDays):
    """Fill missing ``workday_timing`` values from the per-day columns.

    For every row without a ``workday_timing`` value, the most frequent
    non-missing value among the ``lsDays`` columns is taken, passed through
    ``normalize_am_pm_format`` and ``standardize_time_range``, and stored.
    When all day columns are missing the row gets ``'Not Present'``.

    Rows that already have a ``workday_timing`` value are left untouched, so
    existing data is not replaced by ``'Not Present'`` just because the row
    has no per-day columns. ``df`` is modified in place and returned.
    """
    has_timing_column = 'workday_timing' in df.columns

    for i, row in df.iterrows():
        # Only fill gaps; never overwrite an existing value.
        if has_timing_column and not pd.isna(row['workday_timing']):
            continue

        day_times = [row[col] for col in lsDays if not pd.isna(row[col])]

        if day_times:
            # Most frequent value across the days (first seen wins ties).
            most_common_time = Counter(day_times).most_common(1)[0][0]

            # Normalise AM/PM notation, then zero-pad the hours.
            most_common_time = normalize_am_pm_format(most_common_time)
            most_common_time = standardize_time_range(most_common_time)

            df.at[i, 'workday_timing'] = most_common_time
        else:
            # No day has a value: use the default placeholder.
            df.at[i, 'workday_timing'] = 'Not Present'

    return df

# Closed On
# Fill the 'closed_on' column from the per-day columns
def fillClosedOn(df, lsDays, day_translation):
    """Fill missing ``closed_on`` values from the per-day columns.

    Rows that already have a non-empty ``closed_on`` are skipped. For the
    others, a day counts as closed when its column in ``lsDays`` is
    missing:

    * no day is missing (hours on every day)  -> ``'Open All Days'``
    * every day is missing                    -> ``'Open All Days'``
    * otherwise -> the closed days, translated through ``day_translation``,
      de-duplicated and ordered Senin..Minggu, joined with ``', '``.

    ``df`` is modified in place and returned.
    """
    # Display order of the translated days, Monday through Sunday.
    order_days = ['Senin', 'Selasa', 'Rabu', 'Kamis', 'Jumat', 'Sabtu', 'Minggu']

    for i, row in df.iterrows():
        existing = row['closed_on']

        # Keep whatever is already recorded for this row.
        if existing and not pd.isna(existing):
            continue

        closed_days = [day_translation[col] for col in lsDays if pd.isna(row[col])]

        if not closed_days:
            # Every day has hours, so the place is never closed. This case
            # used to fall through and leave the value missing.
            df.at[i, 'closed_on'] = 'Open All Days'
        elif len(closed_days) == len(lsDays):
            # NOTE(review): no hours for any day is also reported as
            # 'Open All Days' (kept from the original logic) - confirm this
            # is intended rather than "information not available".
            df.at[i, 'closed_on'] = 'Open All Days'
        else:
            unique_days = sorted(set(closed_days), key=order_days.index)
            df.at[i, 'closed_on'] = ', '.join(unique_days)

    return df


In [277]:
# English -> Indonesian day names, listed Sunday through Saturday.
dayTranslation = {
    'Sunday': 'Minggu',
    'Monday': 'Senin',
    'Tuesday': 'Selasa',
    'Wednesday': 'Rabu',
    'Thursday': 'Kamis',
    'Friday': 'Jumat',
    'Saturday': 'Sabtu'
}

# The per-day column names are the dictionary keys, in the same order.
lsDays = list(dayTranslation)

# Fill workday_timing first, then closed_on.
fixedDf = fillClosedOn(fillWorkdayTime(fixedDf, lsDays=lsDays), lsDays, dayTranslation)

In [278]:
# Drop Kolom-kolom Hari
fixedDf = fixedDf.drop(lsDays, axis=1)

5. reviews dan rating (Akhdan)
   
Manual aja ambil dari maps, kalau memungkinkan

In [None]:
# Load the manually collected values.
manualDf = pd.read_excel(f'{DATA}new_fixedData.xlsx')

# Keep one manual row per name: with a left merge, a name repeated in the
# manual sheet would otherwise repeat the matching fixedDf row.
manual_lookup = manualDf[['name', 'reviews', 'rating']].drop_duplicates(subset='name')

# Bring the manual values alongside the existing ones, matched on 'name'.
merged_df = pd.merge(fixedDf, manual_lookup, on='name', how='left', suffixes=('', '_manual'))

# Existing values win; manual values only fill the gaps.
merged_df['reviews'] = merged_df['reviews'].combine_first(merged_df['reviews_manual'])
merged_df['rating'] = merged_df['rating'].combine_first(merged_df['rating_manual'])

# Drop the helper columns used for the update.
fixedDf = merged_df.drop(columns=['reviews_manual', 'rating_manual'])

# Whatever is still missing becomes 0. The result is assigned back because
# `fixedDf.reviews.fillna(0, inplace=True)` works on an intermediate Series
# and is not guaranteed to update the DataFrame.
fixedDf['reviews'] = fixedDf['reviews'].fillna(0)
fixedDf['rating'] = fixedDf['rating'].fillna(0)

6. reviewer_name, rating_review, dan review_text (Akhdan)

Ambil dari Google Maps juga; untuk setiap tempat yang nilainya null, cukup isi dengan 1 data ulasan

In [280]:
def tfDataReviewer(df1, df2):
    """Fill missing reviewer fields in ``df1`` from ``df2``, matched on name.

    For each of ``reviewer_name``, ``rating_review`` and ``review_text``,
    values already present in ``df1`` are kept and only missing ones are
    taken from the row of ``df2`` with the same ``name``. When ``df2``
    repeats a name, its first row is used.

    Values are looked up by name for every ``df1`` row, so each row gets its
    own match. Assigning the columns of an inner-merge result back to
    ``df1`` would line rows up by position instead and shift values onto
    the wrong places whenever a name is unmatched or duplicated.

    ``df1`` is updated in place; the returned frame is ``df1`` without any
    leftover ``*_manual`` columns.
    """
    cols = ['reviewer_name', 'rating_review', 'review_text']

    # One row per name, indexed by name, so it can serve as a lookup table.
    lookup = (
        df2.dropna(subset=['name'])
        .drop_duplicates(subset='name')
        .set_index('name')
    )

    for col in cols:
        manual_values = df1['name'].map(lookup[col])
        df1[col] = df1[col].combine_first(manual_values)

    # Remove helper columns with the '_manual' suffix if any are present.
    cols_to_drop = [col + '_manual' for col in cols]
    df1 = df1.drop(columns=cols_to_drop, errors='ignore')

    return df1

fixedDf = tfDataReviewer(fixedDf, manualDf)

7. workday_timing & closed_on

In [281]:
fixedDf['workday_timing'] = fixedDf['workday_timing'].fillna('Not Present', axis=0)
fixedDf['closed_on'] = fixedDf['closed_on'].fillna('Not Present', axis=0)

8. most_popular_times & popular_times

In [282]:
fixedDf['most_popular_times'] = fixedDf['most_popular_times'].fillna('Not Present', axis=0)
fixedDf['popular_times'] = fixedDf['popular_times'].fillna('Not Present', axis=0)

9. validasi di jogja dan tetangga (Faishal)

Langsung diisi dengan nilai terbanyak ('YA'), karena sepertinya semua data ini sudah berada di Jogja dan sekitarnya

In [None]:
fixedDf['validasi di jogja dan tetangga'] = fixedDf['validasi di jogja dan tetangga'].fillna('YA', axis=0)
fixedDf['validasi di jogja dan tetangga'].unique()

10. accessibility_enabled dan children_enabled

In [None]:
print(fixedDf.accessibility_enabled.unique())
print(fixedDf.children_enabled.unique())

fixedDf['accessibility_enabled'] = fixedDf.accessibility_enabled.fillna('Informasi Tidak Tersedia')
fixedDf['children_enabled'] = fixedDf.children_enabled.fillna('Informasi Tidak Tersedia')

## Menghapus Kolom yang Tidak Diperlukan

In [285]:
fixedDf = fixedDf.drop(['timezone', 'place_id', 'place_link',
                        'verified', 'state', 
                        'validasi di jogja dan tetangga',
                        'published_at_date'], axis=1)

## Translate Ke Bahasa Indonesia

### city

In [None]:
# English -> Indonesian names for the regency/province text in 'city'.
# Each key appears once, and every Yogyakarta entry uses the same
# 'Daerah Istimewa Yogyakarta' spelling so the province is not split into
# two distinct values.
translationDict = {
    'Bantul Regency, Special Region of Yogyakarta': 'Kabupaten Bantul, Daerah Istimewa Yogyakarta',
    'Bantul Regency, Special Region of Yogyakarta, Indonesia': 'Kabupaten Bantul, Daerah Istimewa Yogyakarta',
    'Yogyakarta City, Special Region of Yogyakarta': 'Kota Yogyakarta, Daerah Istimewa Yogyakarta',
    'Sleman Regency, Special Region of Yogyakarta': 'Kabupaten Sleman, Daerah Istimewa Yogyakarta',
    'Magelang Regency, Central Java': 'Kabupaten Magelang, Jawa Tengah',
    'Magelang Regency, Central Java, Indonesia': 'Kabupaten Magelang, Jawa Tengah',
    'Kulon Progo Regency, Special Region of Yogyakarta, Indonesia': 'Kabupaten Kulon Progo, Daerah Istimewa Yogyakarta',
    'yogyakarta, Special Region of Yogyakarta, Indonesia': 'Kota Yogyakarta, Daerah Istimewa Yogyakarta',
    'Yogyakarta City, Special Region of Yogyakarta, Indonesia': 'Kota Yogyakarta, Daerah Istimewa Yogyakarta',
    'Klaten Regency, Central Java, Indonesia': 'Kabupaten Klaten, Jawa Tengah',
    'Sleman Regency, Special Region of Yogyakarta, Indonesia': 'Kabupaten Sleman, Daerah Istimewa Yogyakarta',
    # This source text says 'Jawa Tengah' but is mapped to Yogyakarta.
    'Bantul Regency, Jawa Tengah, Indonesia': 'Kabupaten Bantul, Daerah Istimewa Yogyakarta',
    'Gunung Kidul Regency, Special Region of Yogyakarta, Indonesia': 'Kabupaten Gunung Kidul, Daerah Istimewa Yogyakarta',
    'Purworejo Regency, Central Java, Indonesia': 'Kabupaten Purworejo, Jawa Tengah'
}

# Translate one city value using the dictionary above
def translate_city(city):
    """Return the Indonesian form of ``city``, or ``city`` itself if unknown."""
    return translationDict.get(city, city)

# Apply fungsi translasi ke kolom city
fixedDf['city'] = fixedDf['city'].apply(translate_city)
fixedDf.city.unique()

### workday_timing & closed_on

In [None]:
# list closed
# closed_list = ['Open All Days']

# English placeholder values and their Indonesian replacements.
Time_translationDict = dict([
    ('Open All Days', 'Buka Setiap Hari'),
    ('Not Present', 'Tidak Tersedia'),
])

# Translate one closed_on value using the dictionary above
def translate_closed_on(closed_on):
    """Return the Indonesian form of ``closed_on``, or the value itself if unknown."""
    if closed_on in Time_translationDict:
        return Time_translationDict[closed_on]
    return closed_on

# Apply fungsi translasi ke kolom closed_on
fixedDf['closed_on'] = fixedDf['closed_on'].apply(translate_closed_on)
fixedDf.closed_on.unique()

# Panggil fungsi fillClosedOn
# fixedDf = fillClosedOn(fixedDf, closed_list, Time_translationDict)

In [None]:
fixedDf.head(20)

### most_popular_times & popular_times

In [289]:
# Replace the English labels and day names in a text with Indonesian ones
def translateToIndo(text):
    """Translate known English labels inside ``text`` to Indonesian.

    Every occurrence of each English term is replaced, in the order listed
    below. Values that are not strings are returned unchanged.
    """
    # Nothing to translate unless the value is a string.
    if not isinstance(text, str):
        return text

    replacements = (
        ('Time Label', 'Label Waktu'),
        ('Average Popularity', 'Rata-rata Popularitas'),
        ('Not Present', 'Tidak Tersedia'),
        ('Sunday', 'Minggu'),
        ('Monday', 'Senin'),
        ('Tuesday', 'Selasa'),
        ('Wednesday', 'Rabu'),
        ('Thursday', 'Kamis'),
        ('Friday', 'Jumat'),
        ('Saturday', 'Sabtu'),
        ('Idle', 'Diam'),
    )

    translated = text
    for english, indonesian in replacements:
        translated = translated.replace(english, indonesian)

    return translated

# Terapkan fungsi ke kolom most_popular_times
fixedDf['most_popular_times'] = fixedDf['most_popular_times'].apply(translateToIndo)
fixedDf['popular_times'] = fixedDf['popular_times'].apply(translateToIndo)

In [None]:
fixedDf.head()

In [None]:
fixedDf.info()

## Drop Kolom hidden_gem dan planning_enabled

In [None]:
# Menghapus kolom 'hidden_gem' dan 'planning_enabled'
fixedDf = fixedDf.drop(columns=['Hidden_gem', 'planning_enabled'])

# Menampilkan hasil setelah penghapusan kolom
print(fixedDf.info())
print()
fixedDf.head()

## Cek Korelasi Kolom

In [293]:

# # Function to convert workday_timing to duration in hours (e.g., "07.00-21.00")
# def convert_to_hours(timing):
#     if pd.isna(timing) or "Buka" in timing:  # Handle missing or "Buka 24 jam"
#         return 24
#     try:
#         start, end = timing.split('-')
#         start_hours = int(start.split('.')[0]) + int(start.split('.')[1]) / 60
#         end_hours = int(end.split('.')[0]) + int(end.split('.')[1]) / 60
#         return (end_hours - start_hours) if end_hours > start_hours else (24 - start_hours + end_hours)
#     except:
#         return None

# # Apply conversion
# fixedDf['workday_hours'] = fixedDf['workday_timing'].apply(convert_to_hours)

# # Convert closed_on to binary: 1 for "Buka Setiap Hari", 0 otherwise
# fixedDf['closed_on_binary'] = fixedDf['closed_on'].apply(lambda x: 1 if x == "Buka Setiap Hari" else 0)

# # Ensure reviews and rating are numeric
# fixedDf['reviews'] = pd.to_numeric(fixedDf['reviews'], errors='coerce')
# fixedDf['rating'] = pd.to_numeric(fixedDf['rating'], errors='coerce')

# # Compute the correlation matrix for the relevant columns
# correlation_columns = ['workday_hours', 'closed_on_binary', 'reviews', 'rating']
# correlation_matrix = fixedDf[correlation_columns].corr()

# # Display the correlation matrix
# print(correlation_matrix)

# # Plot heatmap of the correlation matrix
# plt.figure(figsize=(8, 6))
# sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", linewidths=0.5)
# plt.title("Correlation Matrix Heatmap")
# plt.show()


In [294]:
# # Function to convert workday_timing to duration in hours (e.g., "07.00-21.00")
# def convert_to_hours(timing):
#     if pd.isna(timing) or "Buka" in timing:  # Handle missing or "Buka 24 jam"
#         return 24
#     try:
#         start, end = timing.split('-')
#         start_hours = int(start.split('.')[0]) + int(start.split('.')[1]) / 60
#         end_hours = int(end.split('.')[0]) + int(end.split('.')[1]) / 60
#         return (end_hours - start_hours) if end_hours > start_hours else (24 - start_hours + end_hours)
#     except:
#         return None

# # Apply conversion
# fixedDf['workday_hours'] = fixedDf['workday_timing'].apply(convert_to_hours)

# # Convert closed_on to binary: 1 for "Buka Setiap Hari", 0 otherwise
# fixedDf['closed_on_binary'] = fixedDf['closed_on'].apply(lambda x: 1 if x == "Buka Setiap Hari" else 0)

# # Ensure reviews and rating are numeric
# fixedDf['reviews'] = pd.to_numeric(fixedDf['reviews'], errors='coerce')
# fixedDf['rating'] = pd.to_numeric(fixedDf['rating'], errors='coerce')

# # Step 1: One-hot encode the 'types' column
# types_encoded = pd.get_dummies(fixedDf['types'], prefix='type')

# # Step 2: Combine the numeric columns with the one-hot encoded types
# data_combined = pd.concat([fixedDf[['workday_hours', 'closed_on_binary', 'reviews']], types_encoded], axis=1)

# # Step 3: Calculate the correlation matrix
# correlation_matrix = data_combined.corr()

# # Step 4: Extract correlations between numeric columns and the one-hot encoded types
# correlation_with_types = correlation_matrix.loc[['workday_hours', 'closed_on_binary', 'reviews'], types_encoded.columns]
# correlation_with_types

In [295]:
# Save without the row index: writing it adds an unnamed first column to the
# CSV, which comes back as 'Unnamed: 0' when the file is read again (such
# columns already had to be dropped from fixed_data.csv earlier).
fixedDf.to_csv('data/fixedData.csv', index=False)

# Exploratory Data Analysis (EDA)

In [None]:
# Plot style.
sns.set(style="whitegrid")

# 1. Missing values per column for each dataset.
missing_values_dfByPlace = dfByPlace.isnull().sum()
missing_values_dfNew = dfNew.isnull().sum()
missing_values_dfNewFix = dfNewFix.isnull().sum()

# The review-count column is 'reviews' in Dataset 1 and 'review_count' in
# Dataset 2. NOTE(review): Dataset 3 is read from the same file that is
# later used for its 'reviews' column, so use whichever of the two names it
# actually has - confirm against the file.
newfix_review_col = 'review_count' if 'review_count' in dfNewFix.columns else 'reviews'

# 2. Rating distribution and 3. review-count distribution.
# Each dataset gets its own panel (1 x 3 grid) and its own title; Dataset 3
# used to be drawn on top of Dataset 2's panel under the title 'Dataset 2'.
distribution_plots = [
    ('Distribusi Rating', 'Rating',
     [dfByPlace['rating'], dfNew['rating'], dfNewFix['rating']]),
    ('Distribusi Jumlah Review', 'Jumlah Review',
     [dfByPlace['reviews'], dfNew['review_count'], dfNewFix[newfix_review_col]]),
]

for plot_title, x_label, dataset_series in distribution_plots:
    plt.figure(figsize=(21, 6))

    for dataset_number, series in enumerate(dataset_series, start=1):
        plt.subplot(1, len(dataset_series), dataset_number)
        sns.histplot(series, bins=20, kde=True)
        plt.title(f'{plot_title} - Dataset {dataset_number}')
        plt.xlabel(x_label)
        plt.ylabel('Frekuensi')

    plt.tight_layout()
    plt.show()

missing_values_dfByPlace, missing_values_dfNew, missing_values_dfNewFix