importando todos os pacotes necessários para a execução do código

In [4]:
# Importando os pacotes necessários
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# Etapa 1 (pré processamentos dos dados)

Definimos todos os dados cujo o valor é igual a "?" como nulos

In [5]:
train_set = pd.read_csv('train_data.csv', na_values="?")
test_set = pd.read_csv('test_data.csv', na_values="?")

In [6]:
train_set.head()


Unnamed: 0,Id,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,16280,34,Private,204991,Some-college,10,Divorced,Exec-managerial,Own-child,White,Male,0,0,44,United-States,<=50K
1,16281,58,Local-gov,310085,10th,6,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,40,United-States,<=50K
2,16282,25,Private,146117,Some-college,10,Never-married,Machine-op-inspct,Not-in-family,White,Male,0,0,42,United-States,<=50K
3,16283,24,Private,138938,Some-college,10,Divorced,Adm-clerical,Not-in-family,White,Female,0,0,40,United-States,<=50K
4,16284,57,Self-emp-inc,258883,HS-grad,9,Married-civ-spouse,Transport-moving,Husband,White,Male,5178,0,60,Hungary,>50K


In [7]:
test_set.head()

Unnamed: 0,Id,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
0,0,25,Private,120596,Bachelors,13,Never-married,Prof-specialty,Not-in-family,White,Male,0,0,44,United-States
1,1,64,State-gov,152537,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States
2,2,31,Private,100135,Masters,14,Divorced,Exec-managerial,Not-in-family,White,Female,0,0,40,United-States
3,3,45,Private,189123,HS-grad,9,Never-married,Machine-op-inspct,Own-child,White,Male,0,0,40,United-States
4,4,64,Self-emp-inc,487751,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,50,United-States


Verificamos que há valores nulos no dataset.

In [8]:
train_set.isnull().sum()

Id                   0
age                  0
workclass         1836
fnlwgt               0
education            0
education.num        0
marital.status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital.gain         0
capital.loss         0
hours.per.week       0
native.country     583
income               0
dtype: int64

Removemos todos os dados que sejam nulos do nosso dataset

In [9]:
train_set = train_set.dropna()

Fazendo a verificação se ainda existem dados nulos

In [10]:
train_set.isnull().sum()

Id                0
age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
income            0
dtype: int64

Retiramos do dataset as colunas "id" e "fnlwght"

In [11]:
train_set = train_set.drop(columns=["Id", "fnlwgt"])
test_set = test_set.drop(columns=["Id", "fnlwgt"])

Eliminamos 2 atributos (capital.gain e capital.loss) em decorrencia da criação do novo atributo capital.net, que é a subtração dos atributos anteriores

In [12]:
train_set["capital.net"] = train_set["capital.gain"] - train_set["capital.loss"]
train_set = train_set.drop(columns=["capital.gain", "capital.loss"])

test_set["capital.net"] = test_set["capital.gain"] - test_set["capital.loss"]
test_set = test_set.drop(columns=["capital.gain", "capital.loss"])

Nesta célula separamos as colunas restantes designadas para os valores dos atributos em X_train, enquanto y_train será responsável por indicar a classe pertecente

In [13]:
X_train = train_set[train_set.columns.to_list()]
X_train = X_train.drop(columns=["income"])
y_train = train_set[["income"]]
X_test = test_set[test_set.columns.to_list()]

Efetuando a normalização dos dados, para que nenhum atributo possua um peso muito maior que o outro

In [14]:
def encode_labels(df: pd.DataFrame) -> pd.DataFrame:
    return df.apply(LabelEncoder().fit_transform)

def standardization(df: pd.DataFrame) -> pd.DataFrame:
    return StandardScaler().fit_transform(df)

def preprocess(original: pd.DataFrame):
    df = original.copy()
    # Aplica o label enconder a todos os valores do dataframe 
    df = encode_labels(df)
    # Normaliza os valores obtidos anteriormente
    np_array = standardization(df)
    result = pd.DataFrame(np_array, columns=original.columns)
    return result

X_train = preprocess(X_train)
X_test = preprocess(X_test)
y_train = encode_labels(y_train)
display(X_train.head())
display(y_train.head())

Unnamed: 0,age,workclass,education,education.num,marital.status,occupation,relationship,race,sex,hours.per.week,native.country,capital.net
0,-0.337931,-0.208884,1.223988,-0.047537,-1.722362,-0.734609,0.987703,0.385055,0.692823,0.265757,0.264929,-0.173058
1,1.490591,-1.257334,-2.710661,-1.616201,-0.387243,1.747138,-0.885732,0.385055,0.692823,-0.074037,0.264929,-0.173058
2,-1.023627,-0.208884,1.223988,-0.047537,0.947877,0.009915,-0.261254,0.385055,0.692823,0.09586,0.264929,-0.173058
3,-1.099815,-0.208884,1.223988,-0.047537,-1.722362,-1.479133,-0.261254,0.385055,-1.44337,-0.074037,0.264929,-0.173058
4,1.414403,0.839567,0.174749,-0.439703,-0.387243,1.747138,-0.885732,0.385055,0.692823,1.624934,-3.174669,2.840563


Unnamed: 0,income
0,0
1,0
2,0
3,0
4,1


# Etapa 2 (KNN)

Iniciamos o algoritimo knn e definimos o número de "vizinhos" a serem verifivados  

In [15]:
knn = KNeighborsClassifier()
distributions = dict(n_neighbors = [11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33])
classifier = GridSearchCV(knn, distributions)

Checagem do melhor parâmetro a se considerar

In [16]:
knn_search = classifier.fit(X_train, y_train)
knn_search.best_params_

  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)


{'n_neighbors': 17}

Exibição da pontuação do melhor parâmetro

In [17]:
# acurácia do modelo
knn_search.best_score_

0.8421802653926832

In [18]:
results = knn_search.best_estimator_.predict(X_test)

In [19]:
X_test.head(n=10)

Unnamed: 0,age,workclass,education,education.num,marital.status,occupation,relationship,race,sex,hours.per.week,native.country,capital.net
0,-0.994408,-0.252895,-0.326974,1.140012,0.90529,0.582388,-0.274375,0.389725,0.706553,0.299522,0.24802,-0.168172
1,1.822584,1.616261,-0.326974,1.140012,-0.418711,-0.802953,-0.902178,0.389725,0.706553,0.381187,0.24802,-0.168172
2,-0.561025,-0.252895,0.445764,1.529506,-1.742712,-0.802953,-0.274375,0.389725,-1.415322,-0.027138,0.24802,-0.168172
3,0.450203,-0.252895,0.188185,-0.417965,0.90529,-0.110283,0.981232,0.389725,0.706553,-0.027138,0.24802,-0.168172
4,1.822584,0.370157,-0.326974,1.140012,-0.418711,-0.802953,-0.902178,0.389725,0.706553,0.789513,0.24802,-0.168172
5,-1.066639,2.862365,0.188185,-0.417965,0.90529,1.736839,-0.274375,-1.990589,0.706553,1.606163,0.24802,-0.168172
6,0.08905,0.993209,0.445764,1.529506,-0.418711,0.582388,-0.902178,0.389725,0.706553,0.789513,0.24802,-0.168172
7,0.08905,-0.252895,-0.069395,2.308495,-0.418711,-0.802953,-0.902178,0.389725,0.706553,-0.027138,0.24802,-0.168172
8,-0.705486,-0.252895,1.218502,-0.02847,-0.418711,-0.802953,-0.902178,0.389725,0.706553,1.197838,0.24802,-0.168172
9,-0.199872,-0.252895,-0.326974,1.140012,-1.742712,-1.495624,1.609035,-1.990589,-1.415322,-0.027138,0.24802,-0.168172


In [20]:
def to_income_string(item):
  return '<=50K' if item == 0 else '>50K'

predictions = [to_income_string(result) for result in results]
predictions = np.array(predictions, dtype='object')

In [21]:
predictions

array(['<=50K', '>50K', '>50K', ..., '<=50K', '<=50K', '>50K'],
      dtype=object)

In [22]:
submissions = pd.DataFrame()
submissions[0] = test_set.index
submissions[1] = predictions
submissions.columns = ['Id', 'income']

In [23]:
submissions.head(n=10)

Unnamed: 0,Id,income
0,0,<=50K
1,1,>50K
2,2,>50K
3,3,<=50K
4,4,>50K
5,5,<=50K
6,6,>50K
7,7,>50K
8,8,<=50K
9,9,<=50K


In [24]:
submissions.to_csv('submission.csv', index=False)