In [2]:
import pandas as pd

# Sample dataset with missing values
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', None],
    'Age': [24, 30, None, 22, 35],
    'Salary': [48000, None, 57000, None, 60000]
}
df = pd.DataFrame(data)

# Fill missing values, then drop the rows that cannot be repaired.
# NOTE: `df[col].fillna(..., inplace=True)` is chained assignment on a column
# selection; it is deprecated in pandas 2.x and does not modify `df` under
# Copy-on-Write, so each filled column is assigned back explicitly.
# Fill missing Age values with the column mean.
df['Age'] = df['Age'].fillna(df['Age'].mean())
# Fill missing Salary values with the column median.
df['Salary'] = df['Salary'].fillna(df['Salary'].median())
# Drop rows whose 'Name' is missing.
df = df.dropna(subset=['Name'])

print('Sesudah Pembersihan:\n', df)

Sesudah Pembersihan:
       Name    Age   Salary
0    Alice  24.00  48000.0
1      Bob  30.00  57000.0
2  Charlie  27.75  57000.0
3    David  22.00  57000.0


In [4]:
# Sample dataset whose category labels differ only in letter case
data = dict(
    Product=['Laptop', 'Laptop', 'Desktop', 'Tablet', 'Tablet'],
    Category=['Electronics', 'electronics', 'Electronics', 'Gadgets', 'gadgets'],
)
df = pd.DataFrame(data)

# Rewrite every label as "first letter upper-case, remainder lower-case"
# so that case variants of the same category collapse into one value.
normalized_category = df['Category'].str.capitalize()
df = df.assign(Category=normalized_category)
print('data terstandardisasi:\n', df)

data terstandardisasi:
    Product     Category
0   Laptop  Electronics
1   Laptop  Electronics
2  Desktop  Electronics
3   Tablet      Gadgets
4   Tablet      Gadgets


In [2]:
!pip install seaborn

Defaulting to user installation because normal site-packages is not writeable
Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl (294 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.9/294.9 KB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m1m685.7 kB/s[0m eta [36m0:00:01[0m
Installing collected packages: seaborn
Successfully installed seaborn-0.13.2


In [7]:
# CELL 1: SETUP & DOWNLOAD DATA DARI KAGGLE (UBUNTU STYLE)
import os
import zipfile
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Konfigurasi Kaggle API agar membaca kaggle.json di folder saat ini
# Ini penting karena di Linux biasanya dia nyari di ~/.kaggle
os.environ['KAGGLE_CONFIG_DIR'] = os.getcwd()

print("--- Memulai Proses Download dari Kaggle API ---")

# 1. Cek file kaggle.json
if not os.path.exists('kaggle.json'):
    print("ERROR: File 'kaggle.json' tidak ditemukan di folder ini!")
    print("Upload dulu file kaggle.json ke folder project ini di Ubuntu.")
else:
    # 2. Ubah permission file (Standard keamanan Linux/Ubuntu)
    !chmod 600 kaggle.json
    
    # 3. Download Dataset Titanic
    # Tanda seru (!) artinya kita memerintah Terminal Linux lewat Jupyter
    !kaggle competitions download -c titanic
    
    # 4. Unzip file (biasanya turun dalam bentuk zip)
    if os.path.exists('titanic.zip'):
        with zipfile.ZipFile('titanic.zip', 'r') as zip_ref:
            zip_ref.extractall()
        print("Sukses! Dataset berhasil didownload dan diekstrak.")
    else:
        print("Gagal download atau file zip tidak ditemukan.")

print("Persiapan Selesai")

--- Memulai Proses Download dari Kaggle API ---
ERROR: File 'kaggle.json' tidak ditemukan di folder ini!
Upload dulu file kaggle.json ke folder project ini di Ubuntu.
Persiapan Selesai


In [None]:
# Load the Titanic training data.
# If the file is missing, stop the cell immediately: merely printing a message
# and carrying on would make the following statements either fail with a
# NameError or silently analyse a stale `df` left in the kernel by an earlier cell.
try:
    df = pd.read_csv('train.csv')
except FileNotFoundError as err:
    raise FileNotFoundError(
        "File tidak ditemukan. Pastikan Step Download di Cell 1 sukses."
    ) from err
else:
    print("File 'train.csv' berhasil dibaca!")

# Quick look at the frame: column names and the first ten rows.
print('\nDaftar kolom:', df.columns.tolist())
print('10 Data Awal:')
display(df.head(10))

# Missing-value table: absolute count and percentage per column,
# restricted to the columns that actually contain missing entries.
null_mask = df.isnull()
tbl_missing = pd.DataFrame({
    'Jumlah Missing': null_mask.sum(),
    'Persentase (%)': (null_mask.mean() * 100).round(2),
})
tbl_missing = tbl_missing.loc[tbl_missing['Jumlah Missing'] > 0]
print('Tabel Missing Value:')
display(tbl_missing)

# `describe()` output, transposed so that every described column becomes a row.
stat_num = df.describe().T
print('Statistik Numerik:')
display(stat_num)

# Summary of the object-dtype columns: number of distinct values,
# the first mode, and the count of the most frequent value.
cat_cols = df.select_dtypes(include='object').columns.tolist()
top_counts = [df[name].value_counts().iloc[0] for name in cat_cols]
kategori_summary = pd.DataFrame({
    'Unique Values': df[cat_cols].nunique(),
    'Most Frequent': df[cat_cols].mode().iloc[0],
    'Frequency': top_counts,
})
print('Statistik Kategori:')
display(kategori_summary)
# Standardize the casing of the categorical columns.
# Missing entries are left untouched: casting the whole column with
# `astype(str)` would turn NaN into the literal string 'nan', which would then
# show up as a bogus category and hide the missing values from later checks.
for col in ['Sex', 'Embarked']:
    if col in df.columns:
        lowered = df[col].astype(str).str.lower()
        # Keep the original (missing) value where it is NaN, otherwise use the lower-cased text.
        df[col] = df[col].where(df[col].isna(), lowered)
print('Contoh Data Kategori Standar:')
display(df[[col for col in ['Sex','Embarked'] if col in df.columns]].drop_duplicates().reset_index(drop=True))

# Remove fully duplicated rows and report the row count before and after.
jumlah_sebelum = len(df)
df = df.drop_duplicates()
jumlah_sesudah = len(df)
print(f'Jumlah sebelum drop duplikat: {jumlah_sebelum} | Jumlah sesudah: {jumlah_sesudah}')

# Min-max scale Fare and Age into new '<column>_norm' columns:
# (value - column minimum) / (column maximum - column minimum).
for col in ('Fare', 'Age'):
    if col not in df.columns:
        continue
    lowest = df[col].min()
    highest = df[col].max()
    df[col + '_norm'] = (df[col] - lowest) / (highest - lowest)
print('Contoh Normalisasi Angka:')
norm_preview_cols = [name for name in ('Fare', 'Fare_norm', 'Age', 'Age_norm') if name in df.columns]
display(df[norm_preview_cols].head(10))
# Survival summary tables; the 0/1 codes of 'Survived' are relabelled for display.
if 'Survived' not in df.columns:
    print('Kolom Survived tidak ditemukan.')
else:
    label_map = {0: 'Tidak Selamat', 1: 'Selamat'}

    # Overall counts and percentages.
    survived_table = df['Survived'].value_counts().rename(label_map)
    survival_rate = (
        df['Survived']
        .value_counts(normalize=True)
        .rename(label_map)
        .mul(100)
        .round(2)
    )
    print('Tabel Survival Rate:')
    display(pd.DataFrame({'Jumlah': survived_table, 'Persentase (%)': survival_rate}))

    # By gender: each row is normalized to 100 %.
    if 'Sex' in df.columns:
        tbl_gender = (
            pd.crosstab(df['Sex'], df['Survived'], normalize='index')
            .rename(columns=label_map)
            .mul(100)
            .round(2)
        )
        print('Survival Rate Berdasarkan Gender:')
        display(tbl_gender)

    # By passenger class: each row is normalized to 100 %.
    if 'Pclass' in df.columns:
        tbl_class = (
            pd.crosstab(df['Pclass'], df['Survived'], normalize='index')
            .rename(columns=label_map)
            .mul(100)
            .round(2)
        )
        print('Survival Rate Berdasarkan Kelas:')
        display(tbl_class)

# Each chart is drawn only when the columns it needs are present in the frame.
has_survived = 'Survived' in df.columns

# Count of passengers per 'Survived' value.
if has_survived:
    plt.figure()
    sns.countplot(data=df, x='Survived')
    plt.title('Jumlah Penumpang Selamat/tidak')
    plt.show()

# Count of passengers per gender, split by 'Survived'.
if has_survived and 'Sex' in df.columns:
    plt.figure()
    sns.countplot(data=df, x='Sex', hue='Survived')
    plt.title('Kelangsungan Hidup Berdasarkan Gender')
    plt.show()

# Histogram of the non-missing ages, using 25 bins.
if 'Age' in df.columns:
    known_ages = df['Age'].dropna()
    plt.figure()
    sns.histplot(known_ages, bins=25)
    plt.title('Distribusi Umur Penumpang')
    plt.xlabel('Umur')
    plt.show()

# Annotated correlation heatmap over all numeric columns.
def_corr_cols = df.select_dtypes(include='number').columns
corr_matrix = df.loc[:, def_corr_cols].corr()
plt.figure(figsize=(8, 5))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='Blues')
plt.title('Korelasi Variabel Numerik')
plt.show()