# Lab 2: Vektorizacija i predprocesiranje podataka

## Učitavanja Penguins skupa podataka

In [29]:
import seaborn as sns
import pandas as pd
df = sns.load_dataset('penguins')
print(f'Originalni oblik: {df.shape}')
df.head()

Originalni oblik: (344, 7)


Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [30]:
df_encoded = pd.get_dummies(df, columns=['species','island', 'sex'], dummy_na=False)
print(f'Oblik nakon vektorizacije kategorickih kolona: {df_encoded.shape}')

Oblik nakon vektorizacije kategorickih kolona: (344, 12)


In [31]:
from sklearn.impute import SimpleImputer
imp = SimpleImputer(strategy='mean')
num_cols = df_encoded.select_dtypes(include=['float64','int64']).columns.tolist()
print('Nedostajuće vrednosti prije imputacije:')
print(df_encoded[num_cols].isnull().sum())
df_encoded[num_cols] = imp.fit_transform(df_encoded[num_cols])
print('Nedostajuće vrednosti nakon imputacije:')
print(df_encoded[num_cols].isnull().sum())

Nedostajuće vrednosti prije imputacije:
bill_length_mm       2
bill_depth_mm        2
flipper_length_mm    2
body_mass_g          2
dtype: int64
Nedostajuće vrednosti nakon imputacije:
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
dtype: int64


In [32]:
import numpy as np
mean_bl = df_encoded['bill_length_mm'].mean()
std_bl = df_encoded['bill_length_mm'].std()
mask = np.abs(df_encoded['bill_length_mm'] - mean_bl) < 3 * std_bl
df_clean = df_encoded[mask].copy()
print(f'Broj redova nakon uklanjanja outliera: {df_clean.shape[0]}')

Broj redova nakon uklanjanja outliera: 344


In [33]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df_clean[num_cols] = scaler.fit_transform(df_clean[num_cols])
df_clean.head()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,species_Adelie,species_Chinstrap,species_Gentoo,island_Biscoe,island_Dream,island_Torgersen,sex_Female,sex_Male
0,-0.8870812,0.7877425,-1.422488,-0.565789,True,False,False,False,False,True,False,True
1,-0.813494,0.1265563,-1.065352,-0.503168,True,False,False,False,False,True,True,False
2,-0.6663195,0.4317192,-0.422507,-1.192003,True,False,False,False,False,True,True,False
3,-1.307172e-15,1.806927e-15,0.0,0.0,True,False,False,False,False,True,False,False
4,-1.328605,1.092905,-0.565361,-0.941517,True,False,False,False,False,True,True,False


---

## Zadatak: priprema i obrada Adult skupa podataka

1. **Učitavanje realnog skupa**: Preuzmite Adult dataset sa UCI: https://archive.ics.uci.edu/ml/datasets/adult
2. **Vektorizacija kategorickih podataka**: Primijeniti `LabelEncoder` za ciljni atribut (`income`) i `pd.get_dummies` za ostale kategoričle kolone (npr. `education`, `marital-status`).
3. **Imputacija**: Zamenite `?` sa NaN i iskoristiti `SimpleImputer` sa strategijom `most_frequent` za kategorije i `mean` za numeričke atribute.
4. **Outlieri**: Definisati prag od 3 standardne devijacije za numeričke kolone (npr. `hours-per-week`) i ukloniti outliere.
5. **Skaliranje**: Primijeniti `StandardScaler` ili `MinMaxScaler` na numeričke atribute.
6. **Evaluacija**: Podijeliti podatke na training/test i uporediti performanse `KNeighborsClassifier` (Accuracy, Precision, Recall) prije i poslije obrade podataka (nakon koraka 2 i nakon koraka 5).

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [3]:
column_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education_num',
    'marital_status', 'occupation', 'relationship', 'race', 'sex',
    'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'income'
]
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
df = pd.read_csv(url, names=column_names, na_values=' ?', skipinitialspace=True)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
X_raw = df.drop('income', axis=1).copy()
y_raw = LabelEncoder().fit_transform(df['income'])

In [5]:
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
    X_raw, y_raw, test_size=0.2, random_state=42
)

In [6]:
cat_cols = X_raw.select_dtypes(include=['object']).columns
num_cols = X_raw.select_dtypes(include=['int64', 'float64']).columns

In [7]:
def prepare(df):
    df2 = df.copy()
    # impute numerical
    num_imp = SimpleImputer(strategy='mean')
    df2[num_cols] = num_imp.fit_transform(df2[num_cols])
    # encode categorical
    for col in cat_cols:
        df2[col] = df2[col].fillna('Missing')
        df2[col] = LabelEncoder().fit_transform(df2[col])
    return df2

X_train_raw_p = prepare(X_train_raw)
X_test_raw_p = prepare(X_test_raw)

In [8]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_raw_p, y_train_raw)
y_pred_raw = knn.predict(X_test_raw_p)

print('--- Raw Data Performance ---')
print(f'Accuracy:  {accuracy_score(y_test_raw, y_pred_raw):.4f}')
print(f'Precision: {precision_score(y_test_raw, y_pred_raw):.4f}')
print(f'Recall:    {recall_score(y_test_raw, y_pred_raw):.4f}')

--- Raw Data Performance ---
Accuracy:  0.7781
Precision: 0.5700
Recall:    0.3265


In [9]:
df_full = df.copy()
df_full.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [10]:
num_imp = SimpleImputer(strategy='mean')
df_full[num_cols] = num_imp.fit_transform(df_full[num_cols])
cat_imp = SimpleImputer(strategy='most_frequent')
df_full[cat_cols] = cat_imp.fit_transform(df_full[cat_cols])

In [11]:
hpw = df_full['hours_per_week']
mask = np.abs(hpw - hpw.mean()) < 3 * hpw.std()
df_full = df_full[mask]

In [12]:
df_full = pd.get_dummies(df_full, columns=cat_cols, drop_first=True)

In [13]:
scaler = StandardScaler()
df_full[num_cols] = scaler.fit_transform(df_full[num_cols])

In [14]:
X = df_full.drop('income', axis=1)
y = LabelEncoder().fit_transform(df_full['income'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [15]:
knn2 = KNeighborsClassifier(n_neighbors=5)
knn2.fit(X_train, y_train)
y_pred = knn2.predict(X_test)

print('\n--- Prepared Data Performance ---')
print(f'Accuracy:  {accuracy_score(y_test, y_pred):.4f}')
print(f'Precision: {precision_score(y_test, y_pred):.4f}')
print(f'Recall:    {recall_score(y_test, y_pred):.4f}')


--- Prepared Data Performance ---
Accuracy:  0.8333
Precision: 0.6798
Recall:    0.6115
