# Tugas Kecil IF3170 Inteligensi Buatan

Anggota Kelompok:
1. Kevin John Wesley Hutabarat (13521042)
2. Jericho Russel Sebastian (13521107)

In [99]:
import pandas as pd
from abc import ABC, abstractmethod

In [100]:
# Pembacaan data
data = pd.read_csv("data/data_train.csv")
data.columns = data.columns.str.strip()

In [101]:
# Ukuran data
data.shape

(1400, 21)

In [102]:
# Head data
data.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,804,1,0.8,1,12,1,41,0.9,89,1,...,709,818,2027,11,5,11,1,0,0,1
1,1042,0,2.2,0,15,1,11,0.6,139,5,...,68,1018,2826,18,0,2,1,0,0,2
2,1481,1,2.0,1,0,0,35,0.5,105,3,...,249,522,2635,17,16,4,1,0,1,2
3,1104,0,1.7,0,1,1,60,0.4,199,2,...,653,1413,1229,6,0,3,1,1,1,0
4,652,0,0.5,1,1,0,58,0.6,142,3,...,464,781,565,18,12,9,0,0,1,0


In [103]:
# Definisi kelas-kelas pembantu
STATS_VERTICAL_VIEW = True
ROUNDING_LIMIT = 3

class BaseStatistics(ABC):
    def __init__(self, col_name: str) -> None:
        self.col = data[col_name]
        self.mean = self.col.mean()
        self.median = self.col.median()
        self.mode = self._calc_mode()
        self.std = self.col.std()
        self.var = self.col.var()
        self.max = self.col.max()
        self.min = self.col.min()
    
    def range(self) -> float:
        return self.max - self.min
    
    def stats(self) -> pd.DataFrame:
        return pd.DataFrame({
            'Name': ['Mean', 'Median', 'Mode', 'Standard deviation', 'Variance', 'Range', 'Minimum value', 'Maximum value'],
            'Value': [round(self.mean, ROUNDING_LIMIT), self.median, self.mode, round(self.std, ROUNDING_LIMIT), round(self.var, ROUNDING_LIMIT), round(self.range(), ROUNDING_LIMIT), self.min, self.max]
        }) if STATS_VERTICAL_VIEW else pd.DataFrame({
            'Mean': [round(self.mean, ROUNDING_LIMIT)],
            'Median': [self.median],
            'Mode': [self.mode],
            'Standard deviation': [round(self.std, ROUNDING_LIMIT)],
            'Variance': [round(self.var, ROUNDING_LIMIT)],
            'Range': [round(self.range(), ROUNDING_LIMIT)],
            'Minimum value': [self.min],
            'Maximum value': [self.max]
        })
    
    @abstractmethod
    def visualize(self) -> None:
        pass

    @abstractmethod
    def _calc_mode(self) -> float:
        pass

    def __repr__(self) -> str:
        return (
            f'Mean = {round(self.mean, ROUNDING_LIMIT)}\n'
            f'Median = {self.median}\n'
            f'Mode = {self.mode}\n'
            f'Standard deviation = {round(self.std, ROUNDING_LIMIT)}\n'
            f'Variance = {round(self.var, ROUNDING_LIMIT)}\n'
            f'Range = {round(self.range(), ROUNDING_LIMIT)}\n'
            f'Minimum value = {self.min}\n'
            f'Maximum value = {self.max}'
        )

class NumericStatistics(BaseStatistics):
    def __init__(self, col_name: str) -> None:
        super().__init__(col_name)
        self.q4 = [q for q in self.col.quantile([.25, .5, .75]).to_list()]
        self.skew = self.col.skew()
        self.kurt = self.col.kurt()
    
    def iqr(self) -> float:
        return self.q4[2] - self.q4[0]
    
    def stats(self) -> pd.DataFrame:
        stats_df = super().stats()
        if STATS_VERTICAL_VIEW:
            stats_df.loc[len(stats_df.index)] = ['Q1', round(self.q4[0], ROUNDING_LIMIT)]
            stats_df.loc[len(stats_df.index)] = ['Q2', round(self.q4[1], ROUNDING_LIMIT)]
            stats_df.loc[len(stats_df.index)] = ['Q3', round(self.q4[2], ROUNDING_LIMIT)]
            stats_df.loc[len(stats_df.index)] = ['IQR', round(self.iqr(), ROUNDING_LIMIT)]
            stats_df.loc[len(stats_df.index)] = ['Skewness', round(self.skew, ROUNDING_LIMIT)]
            stats_df.loc[len(stats_df.index)] = ['Kurtosis', round(self.kurt, ROUNDING_LIMIT)]
        else:
            row = stats_df.loc[0].to_dict()
            row['Q1'] = [round(self.q4[0], ROUNDING_LIMIT)]
            row['Q2'] = [round(self.q4[1], ROUNDING_LIMIT)]
            row['Q3'] = [round(self.q4[2], ROUNDING_LIMIT)]
            row['IQR'] = [round(self.iqr(), ROUNDING_LIMIT)]
            row['Skewness'] = [round(self.skew, ROUNDING_LIMIT)]
            row['Kurtosis'] = [round(self.kurt, ROUNDING_LIMIT)]
            stats_df = pd.DataFrame(row)
        return stats_df
    
    def visualize(self) -> None:
        pass
    
    def _calc_mode(self) -> float:
        pass

    def __repr__(self) -> str:
        return (
            f'{super().__repr__()}\n'
            f'Quartile = {[round(q, ROUNDING_LIMIT) for q in self.q4]}\n'
            f'IQR = {round(self.iqr(), ROUNDING_LIMIT)}\n'
            f'Skewness = {round(self.skew, ROUNDING_LIMIT)}\n'
            f'Kurtosis = {round(self.kurt, ROUNDING_LIMIT)}'
        )

class NonnumericStatistics(BaseStatistics):
    def visualize(self) -> None:
        pass

    def _calc_mode(self) -> float:
        self.freq_table = {}
        for d in self.col.to_list():
            if d in self.freq_table.keys():
                self.freq_table[d] += 1
            else:
                self.freq_table[d] = 1
        (curr_mode, max_f) = (None, -1)
        for d in self.freq_table:
            if self.freq_table[d] > max_f:
                (curr_mode, max_f) = (d, self.freq_table[d])
        return curr_mode

## Analisis Data

### Battery Power

In [104]:
NumericStatistics('battery_power').stats()

Unnamed: 0,Name,Value
0,Mean,1237.146
1,Median,1219.0
2,Mode,
3,Standard deviation,430.052
4,Variance,184944.538
5,Range,1497.0
6,Minimum value,501.0
7,Maximum value,1998.0
8,Q1,864.75
9,Q2,1219.0


### Blue

In [105]:
NonnumericStatistics('blue').stats()

Unnamed: 0,Name,Value
0,Mean,0.494
1,Median,0.0
2,Mode,0.0
3,Standard deviation,0.5
4,Variance,0.25
5,Range,1.0
6,Minimum value,0.0
7,Maximum value,1.0


In [106]:
#Duplicate Value

In [107]:
#Missing Value

In [108]:
#Outlier

In [109]:
#Distribusi Data dan Histogram

In [110]:
#Korelasi dengan Kolom Target