In [1]:
# main.py
from lib import KNN
from lib import NaiveBayes
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB
from scipy import stats
# Kode aplikasi utama

In [2]:
df = pd.read_csv('data_train.csv')

In [3]:
df 

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,804,1,0.8,1,12,1,41,0.9,89,1,...,709,818,2027,11,5,11,1,0,0,1
1,1042,0,2.2,0,15,1,11,0.6,139,5,...,68,1018,2826,18,0,2,1,0,0,2
2,1481,1,2.0,1,0,0,35,0.5,105,3,...,249,522,2635,17,16,4,1,0,1,2
3,1104,0,1.7,0,1,1,60,0.4,199,2,...,653,1413,1229,6,0,3,1,1,1,0
4,652,0,0.5,1,1,0,58,0.6,142,3,...,464,781,565,18,12,9,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1395,536,1,1.4,0,0,1,53,0.7,135,3,...,547,705,1211,15,10,7,1,0,1,0
1396,1097,0,0.8,0,10,1,21,0.1,160,7,...,1277,1352,2219,15,6,12,1,0,1,2
1397,1179,1,0.5,0,7,1,32,0.3,182,2,...,85,1451,340,16,5,16,1,0,0,0
1398,719,1,0.5,1,0,1,23,0.4,113,6,...,431,1727,3990,14,9,12,1,1,1,3


## Data Preprocessing

In [4]:
#separate categorical and numerical features
numerical_features = ['battery_power', 'clock_speed', 'fc', 'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time']
categorical_features = ['blue', 'dual_sim', 'four_g', 'three_g', 'touch_screen', 'wifi']
target_feature = ['price_range']

In [5]:
def descriptive_statistic(df,features):
     ft = []
     mean = []
     median = []
     mode = []
     std_dev = []
     variance = []
     data_range = []
     min_val = []
     max_val = []
     q1 = []
     q3 = []
     iqr = []
     skewness = []
     kurtosis = []
     for feature in features:
          # Inisiasi kolom feature
          df_column = df[feature]
          ft.append(feature)
          # Hitung nilai mean
          mean.append(df_column.mean())

          # Hitung nilai median
          median.append(df_column.median())

          # Hitung nilai modus
          mode.append(df_column.mode()[0])

          # Hitung nilai standar deviasi
          std_dev.append(df_column.std())

          # Hitung nilai variansi
          variance.append(df_column.var())

          # Hitung nilai range
          data_range.append(df_column.max() - df_column.min())

          # Hitung nilai minimum
          min_val.append(df_column.min())

          # Hitung nilai maksimum
          max_val.append(df_column.max())

          # Hitung kuartil pertama dan ketiga
          q1.append(np.percentile(df_column, 25))
          q3.append(np.percentile(df_column, 75))

          # Hitung interquartile range (IQR)
          iqr.append(np.percentile(df_column, 75) - np.percentile(df_column, 25))

          # Hitung nilai skewness
          skewness.append(stats.skew(df_column))

          # Hitung nilai kurtosis
          kurtosis.append(stats.kurtosis(df_column))

     return pd.DataFrame({ 'fitur' : ft,
            'mean' : mean,
            'median' : median,
            'modus' : mode,
            'std_dev' : std_dev,
            'variansi' : variance,
            'range' : data_range,
            'min' : min_val,
            'max' : max_val,
            'q1' : q1,
            'q3' : q3,
            'IQR' : iqr,
            'skewness' : skewness,
            'kurtosis' : kurtosis
    }).round(3)

In [6]:

#rumus outlier : q1 - (1.5 * iqr) < data < q3 + (1.5 * iqr)
def outlier_range(q1,q3,iqr):
    lower = q1 - (1.5 * iqr)
    upper = q3 + (1.5 * iqr)
    return lower, upper

In [22]:
#plot the outlier
''' 
@params:
    descriptive_statistic = descriptive_statistic(df_train, numerical_features) , from descriptive_statistic function
    df = df_train
'''
def outlier_plot(descriptive_statistic, df):
    outliers = pd.DataFrame(columns=df.columns)  # Initialize an empty DataFrame to store outliers
    for idx, row in descriptive_statistic.iterrows():
        lower, upper = outlier_range(row['q1'], row['q3'], row['IQR'])
        feature = row['fitur']

        # Filter outliers for the current feature and append to the outlier DataFrame
        lower_outliers = df[df[feature] < lower]
        upper_outliers = df[df[feature] > upper]
        outliers = pd.concat([outliers, lower_outliers, upper_outliers])

    return outliers

In [8]:
desc_stats = descriptive_statistic(df, numerical_features)
desc_stats

Unnamed: 0,fitur,mean,median,modus,std_dev,variansi,range,min,max,q1,q3,IQR,skewness,kurtosis
0,battery_power,1237.146,1219.0,772.0,430.052,184944.538,1497.0,501.0,1998.0,864.75,1602.0,737.25,0.042,-1.168
1,clock_speed,1.522,1.5,0.5,0.815,0.664,2.5,0.5,3.0,0.7,2.2,1.5,0.166,-1.329
2,fc,4.275,3.0,0.0,4.324,18.698,19.0,0.0,19.0,1.0,7.0,6.0,1.019,0.288
3,int_memory,31.962,32.0,27.0,18.163,329.893,62.0,2.0,64.0,16.0,48.0,32.0,0.063,-1.227
4,m_dep,0.508,0.5,0.1,0.289,0.083,0.9,0.1,1.0,0.2,0.8,0.6,0.059,-1.267
5,mobile_wt,139.376,139.0,182.0,35.401,1253.217,120.0,80.0,200.0,108.0,169.0,61.0,0.02,-1.21
6,n_cores,4.481,4.0,4.0,2.28,5.198,7.0,1.0,8.0,2.0,7.0,5.0,0.02,-1.232
7,pc,9.917,10.0,10.0,6.08,36.967,20.0,0.0,20.0,5.0,15.0,10.0,0.029,-1.164
8,px_height,643.178,561.0,88.0,444.629,197694.93,1960.0,0.0,1960.0,273.75,950.25,676.5,0.659,-0.319
9,px_width,1251.717,1247.0,1247.0,428.983,184026.286,1498.0,500.0,1998.0,876.5,1627.5,751.0,0.004,-1.176


In [23]:
outlier = outlier_plot(desc_stats, df)
outlier

  outliers = pd.concat([outliers, lower_outliers, upper_outliers])


Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
53,1290,1,1.4,1,19,1,35,0.3,110,4,...,405,742,879,16,2,8,1,0,0,0
98,1731,1,2.3,1,18,0,60,0.5,171,4,...,142,1039,1220,9,3,20,0,1,0,1
413,946,1,2.6,1,17,0,5,0.1,166,3,...,1698,1771,3720,15,7,4,0,1,0,3
603,1348,0,2.0,0,18,0,52,0.3,98,3,...,1869,1942,955,18,11,7,1,1,1,1
778,1533,1,1.1,1,18,1,17,0.3,160,4,...,1054,1393,2520,8,2,11,1,0,1,2
797,1772,1,1.6,0,17,1,45,0.5,159,2,...,837,1405,1146,6,1,17,1,1,0,1
1035,1708,1,2.4,1,18,1,49,0.1,109,1,...,233,517,3388,6,4,16,1,1,1,3
1072,1137,1,1.0,0,18,0,7,1.0,196,3,...,942,1179,3616,13,5,12,1,1,1,3
1150,695,0,0.5,0,18,1,12,0.6,196,2,...,1649,1829,2855,16,13,7,1,1,1,2
1232,1957,0,1.2,1,18,1,36,0.8,151,2,...,1194,1727,1115,16,2,18,1,0,1,1


In [25]:
outlier.index

Index([53, 98, 413, 603, 778, 797, 1035, 1072, 1150, 1232, 1342], dtype='int64')

In [30]:
df.iloc[[53, 98, 413]]


Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
53,1290,1,1.4,1,19,1,35,0.3,110,4,...,405,742,879,16,2,8,1,0,0,0
98,1731,1,2.3,1,18,0,60,0.5,171,4,...,142,1039,1220,9,3,20,0,1,0,1
413,946,1,2.6,1,17,0,5,0.1,166,3,...,1698,1771,3720,15,7,4,0,1,0,3


In [35]:
df.drop(outlier.index, inplace=True)

KeyError: '[53, 98, 413, 603, 778, 797, 1035, 1072, 1150, 1232, 1342] not found in axis'

In [None]:
df


Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,804,1,0.8,1,12,1,41,0.9,89,1,...,709,818,2027,11,5,11,1,0,0,1
1,1042,0,2.2,0,15,1,11,0.6,139,5,...,68,1018,2826,18,0,2,1,0,0,2
2,1481,1,2.0,1,0,0,35,0.5,105,3,...,249,522,2635,17,16,4,1,0,1,2
3,1104,0,1.7,0,1,1,60,0.4,199,2,...,653,1413,1229,6,0,3,1,1,1,0
4,652,0,0.5,1,1,0,58,0.6,142,3,...,464,781,565,18,12,9,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1395,536,1,1.4,0,0,1,53,0.7,135,3,...,547,705,1211,15,10,7,1,0,1,0
1396,1097,0,0.8,0,10,1,21,0.1,160,7,...,1277,1352,2219,15,6,12,1,0,1,2
1397,1179,1,0.5,0,7,1,32,0.3,182,2,...,85,1451,340,16,5,16,1,0,0,0
1398,719,1,0.5,1,0,1,23,0.4,113,6,...,431,1727,3990,14,9,12,1,1,1,3


In [1]:
#Now, drop the column of fc that have strong correlation with pc
df_new = df.drop('fc',axis=1)

NameError: name 'df' is not defined

In [None]:
features = df_new.drop('')

#  Build The Model

## 1. K Nearest Neighbor

## 2. Naive Bayes