In [None]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix




<h1>DATA UNDERSTANDING AND CLEANING</h1>

In [None]:
# Load the H1N1 vaccine prediction CSV and preview the first five rows.
data = pd.read_csv(filepath_or_buffer='data/h1n1_vaccine_prediction.csv')
data.head(n=5)

In [None]:
# Print each column's dtype and non-null count to see where values are missing.
data.info()

In [None]:
# (rows, columns) of the frame as loaded.
data.shape

In [None]:
# Remove the `unique_id` column before modelling.
# Reassigning (rather than inplace=True) together with errors='ignore' makes
# this cell idempotent: re-running it no longer raises KeyError because the
# column is already gone.
data = data.drop(columns=['unique_id'], errors='ignore')

In [None]:
# Count missing values per column, order them from most to fewest, and chart them.
missing_counts = data.isna().sum()
missing_counts = missing_counts.sort_values(ascending=False)
missing_counts.plot(kind='bar', figsize=(10,5))

In [None]:
# Drop the columns that are excluded from the model.
# Reassigning (rather than inplace=True) together with errors='ignore' makes
# this cell idempotent: re-running it no longer raises KeyError because the
# columns are already gone.
data = data.drop(
    columns=['race','sex','income_level','housing_status','census_msa'],
    errors='ignore',
)

Menghilangkan kolom yang tidak akan berpengaruh pada hasil prediksi vaksin.

In [None]:
# Fill every missing value with the most frequent value (mode) of its column.
# The result is assigned back to the column: calling fillna(inplace=True) on
# `data[col]` is chained assignment, which emits a FutureWarning on pandas 2.x
# and silently leaves `data` unchanged once Copy-on-Write is enabled.
for col in data.columns:
    if data[col].isna().any():
        modes = data[col].mode()
        if modes.empty:
            # Column is entirely NaN: there is no mode to impute from.
            continue
        data[col] = data[col].fillna(modes.iloc[0])

In [None]:
# Sanity check after imputation: remaining missing values per column.
data.isna().sum()

In [None]:
# One-hot encode every object-dtype column.
# drop_first is left at its default (False), i.e. every category keeps its own
# dummy column; switch it to True to drop the first level of each feature if
# you want to compare model results.
categorical = data.select_dtypes(include=object).columns
data = pd.get_dummies(data, columns=categorical, drop_first=False)

In [None]:
# Preview the frame after one-hot encoding.
data.head()


In [None]:
# (rows, columns) after one-hot encoding.
data.shape

In [None]:
# Normalise column names: lower-case, trim surrounding whitespace, then turn
# every remaining space into an underscore.
normalized_columns = data.columns.str.lower()
normalized_columns = normalized_columns.str.strip()
data.columns = normalized_columns.str.replace(' ', '_')



<h1>EXPLORATORY DATA ANALYSIS</h1>

In [None]:

# Share of rows per `no_of_adults` value, with slices ordered by that value.
adult_counts = data['no_of_adults'].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(7, 7))
ax.pie(
    adult_counts,
    labels=adult_counts.index,
    autopct='%1.2f%%',
    startangle=90,
)
ax.set_title('Jumlah Orang Dewasa dalam Rumah Tangga', fontsize=14)
plt.show()

In [None]:
# 2x2 grid of count plots, one panel per (column, title) pair below.
# The four panels used to be copy-pasted; they are now driven by one loop.
panels = [
    ("no_of_adults", 'Jumlah Orang Dewasa dalam Rumah Tangga'),
    ("is_h1n1_vacc_effective", 'Penilaian Efektivitas Vaksin H1N1'),
    ("has_health_insur", 'Kepemilikan Asuransi Kesehatan'),
    ("antiviral_medication", 'Konsumsi Obat Antivirus'),
]

fig, axes = plt.subplots(2, 2, figsize=(12, 10))

for ax, (column, title) in zip(axes.flat, panels):
    # seaborn >= 0.13 deprecates `palette` without `hue`; mapping hue to the
    # x variable keeps one pastel colour per category. dodge=False keeps the
    # bars centred on older seaborn versions as well.
    sns.countplot(
        data=data, x=column, hue=column, palette="pastel", dodge=False, ax=ax
    )
    # The hue legend only repeats the x tick labels, so drop it if one was drawn.
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()
    ax.set_title(title)
    ax.set_xlabel(column)

# Keep titles and tick labels of neighbouring panels from overlapping.
fig.tight_layout()

# Render all panels.
plt.show()

In [None]:
# Count of rows per `marital_status_not_married` value, split by `h1n1_vaccine`.
# `plt.figure` used to be a bare attribute reference (no parentheses), so it
# never ran; call it so the plot starts on a fresh figure.
plt.figure()
sns.countplot(
    data=data,
    x='marital_status_not_married',
    hue='h1n1_vaccine',
    palette='pastel'
)

plt.show()

In [None]:
# Count of rows per `is_health_worker` value, split by `h1n1_vaccine`.
# `plt.figure` used to be a bare attribute reference (no parentheses), so it
# never ran; call it so the plot starts on a fresh figure.
plt.figure()
sns.countplot(
    data=data,
    x='is_health_worker',
    hue='h1n1_vaccine',
    palette='pastel'
)

plt.show()

In [None]:
# Count of rows per `h1n1_worry` value, split by `h1n1_vaccine`.
# `plt.figure` used to be a bare attribute reference (no parentheses), so it
# never ran; call it so the plot starts on a fresh figure.
plt.figure()
sns.countplot(
    data=data,
    x='h1n1_worry',
    hue='h1n1_vaccine',
    palette='pastel'
)

plt.show()

In [None]:
# Count of rows per `h1n1_worry` value, split by `wash_hands_frequently`.
# `plt.figure` used to be a bare attribute reference (no parentheses), so it
# never ran; call it so the plot starts on a fresh figure.
plt.figure()
sns.countplot(
    data=data,
    x='h1n1_worry',
    hue='wash_hands_frequently',
    palette='pastel'
)

plt.show()

In [None]:
# Class balance of the target column `h1n1_vaccine`.
# `plt.figure` used to be a bare attribute reference (no parentheses), so it
# never ran; call it so the plot starts on a fresh figure.
plt.figure()
# seaborn >= 0.13 deprecates `palette` without `hue`; mapping hue to the x
# variable keeps one pastel colour per class. dodge=False keeps the bars
# centred on older seaborn versions as well.
ax = sns.countplot(
    x="h1n1_vaccine", hue="h1n1_vaccine", data=data, palette="pastel", dodge=False
)
# The hue legend only repeats the x tick labels, so drop it if one was drawn.
legend = ax.get_legend()
if legend is not None:
    legend.remove()
plt.show()

Data yang ditampilkan menunjukkan bahwa data ini tidak seimbang (imbalanced), sehingga peneliti menggunakan teknik SMOTE untuk menyeimbangkan data.

In [None]:
# Separate the target column from the feature matrix.
TARGET_COLUMN = "h1n1_vaccine"
y = data[TARGET_COLUMN]
x = data.drop(columns=[TARGET_COLUMN])

In [None]:
# Hold out 20% of the rows for testing; the seed fixes which rows are held out.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Oversample the minority class on the training fold only, so the test set
# keeps its original class balance.
# random_state is fixed so the synthetic samples (and every metric computed
# from them) are reproducible across kernel restarts.
oversample = SMOTE(random_state=42)
x_smote, y_smote = oversample.fit_resample(x_train, y_train)

In [None]:
# (rows, columns) of the training features after SMOTE oversampling.
x_smote.shape

In [None]:
def report(actual,pred):
    """Print accuracy, confusion matrix and classification report.

    Args:
        actual: ground-truth labels.
        pred: predicted labels, in the same order as `actual`.

    Returns:
        None. The accuracy (as a percentage rounded to two decimals), the
        confusion matrix and the classification report are written to stdout.
    """
    accuracy_pct = round(accuracy_score(actual, pred) * 100, 2)
    matrix = confusion_matrix(actual, pred)
    summary = classification_report(actual, pred)

    print("Accuracy of model is", accuracy_pct, '%')
    print('\n ', matrix)
    print('\n', summary)

In [None]:
# Random forest experiment: split -> oversample the training fold -> fit -> report.

# Features and target
x = data.drop("h1n1_vaccine", axis=1)
y = data["h1n1_vaccine"]

# 1. Split data
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# 2. SMOTE on the training fold only, so the test set keeps its original class
#    balance. Seeded so the synthetic samples are reproducible.
oversample = SMOTE(random_state=42)
x_train, y_train = oversample.fit_resample(x_train, y_train)

# 3. Fit and evaluate on the data resampled in THIS cell. The previous version
#    trained and scored on `x_smote`/`y_smote`, stale variables left over from
#    an earlier cell, so the resampling above was never actually used.
rf = RandomForestClassifier(
    n_estimators=130, max_depth=12, min_samples_split=8, random_state=42
)
rf.fit(x_train, y_train)
train_rf = rf.predict(x_train)
test_rf = rf.predict(x_test)
report(y_train, train_rf)
report(y_test, test_rf)


In [None]:
# Gaussian Naive Bayes experiment: split -> scale -> oversample the training fold -> fit -> report.

# Features and target
x = data.drop("h1n1_vaccine", axis=1)
y = data["h1n1_vaccine"]

# 1. Split data
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=123
)

# 2. Scaling: fit the scaler on the training fold only, then apply it to both folds.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test  = scaler.transform(x_test)

# 3. SMOTE on the training fold only. Seeded so the synthetic samples, and the
#    metrics below, are reproducible.
oversample = SMOTE(random_state=123)
x_train, y_train = oversample.fit_resample(x_train, y_train)

nb = GaussianNB()
nb.fit(x_train, y_train)
pred_train_nb = nb.predict(x_train)
pred_test_nb = nb.predict(x_test)  # was misspelled `pred_tetst_nb`
report(y_train, pred_train_nb)
report(y_test, pred_test_nb)


In [None]:
# RBF-kernel SVM experiment: split -> scale -> oversample the training fold -> fit -> report.

# Features and target
x = data.drop("h1n1_vaccine", axis=1)
y = data["h1n1_vaccine"]

# 1. Split data
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=123
)

# 2. Scaling: fit the scaler on the training fold only, then apply it to both folds.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test  = scaler.transform(x_test)

# 3. SMOTE on the training fold only. Seeded so the synthetic samples, and the
#    metrics below, are reproducible.
oversample = SMOTE(random_state=123)
x_train, y_train = oversample.fit_resample(x_train, y_train)

# probability=True makes SVC run an internal, randomised cross-validation to
# calibrate probabilities, so random_state is pinned to keep the fit repeatable.
svm = SVC(kernel='rbf', C=10, gamma='scale', probability=True, random_state=123)
svm.fit(x_train, y_train)
train_svm = svm.predict(x_train)
test_svm = svm.predict(x_test)
report(y_train, train_svm)
report(y_test, test_svm)


In [None]:
# scaler = StandardScaler()
# x_train = scaler.fit_transform(x_train)
# x_test = scaler.transform(x_test)