In [1]:
import pandas as pd
import sklearn

# Load the credit dataset from the CSV file next to the notebook
base = pd.read_csv(filepath_or_buffer='credit-data.csv')

# Summary statistics for every numeric column (count, mean, std, quartiles, ...)
base.describe()

Unnamed: 0,clientid,income,age,loan,default
count,2000.0,2000.0,1997.0,2000.0,2000.0
mean,1000.5,45331.600018,40.807559,4444.369695,0.1415
std,577.494589,14326.327119,13.624469,3045.410024,0.348624
min,1.0,20014.48947,-52.42328,1.37763,0.0
25%,500.75,32796.459717,28.990415,1939.708847,0.0
50%,1000.5,45789.117313,41.317159,3974.719419,0.0
75%,1500.25,57791.281668,52.58704,6432.410625,0.0
max,2000.0,69995.685578,63.971796,13766.051239,1.0


In [2]:
# Show the rows whose 'age' value is negative (invalid records)
base[base['age'].lt(0)]

Unnamed: 0,clientid,income,age,loan,default
15,16,50501.726689,-28.218361,3977.287432,0
21,22,32197.620701,-52.42328,4244.057136,0
26,27,63287.038908,-36.496976,9595.286289,0


In [3]:
# Alternative (disabled): drop the whole 'age' column from the DataFrame
#base.drop('age', 1, inplace=True)

In [4]:
# Alternative (disabled): drop the DataFrame rows whose age is negative
#base.drop(base[base.age < 0].index, inplace=True)

In [5]:
# When a column holds invalid values, one option is to replace them with the
# mean of the remaining values. Start by showing the mean of every column.
base.mean(axis=0)

clientid     1000.500000
income      45331.600018
age            40.807559
loan         4444.369695
default         0.141500
dtype: float64

In [6]:
# Mean of the 'age' column (still includes the negative records)
base.age.mean()


40.80755937840458

In [7]:
# Mean of 'age' computed only over the strictly positive values
# (the strict `> 0` also leaves out any age exactly equal to 0)
media = base.loc[base.age > 0, 'age'].mean()

In [8]:
# Overwrite every negative age with the mean stored in `media`
base.loc[base['age'].lt(0), 'age'] = media


In [9]:
# Sanity check: no row may still hold a negative age at this point.
# A bare `base.loc[base['age'] < 0]` expression placed before the last line
# would be silently discarded, because a notebook cell only displays the value
# of its final expression; an assertion makes the check actually happen.
# Missing ages (NaN) compare as False here, so they do not trip the assertion.
assert not (base['age'] < 0).any(), 'negative ages remain in base'

# Summary statistics after the correction
base.describe()


Unnamed: 0,clientid,income,age,loan,default
count,2000.0,2000.0,1997.0,2000.0,2000.0
mean,1000.5,45331.600018,40.9277,4444.369695,0.1415
std,577.494589,14326.327119,13.261825,3045.410024,0.348624
min,1.0,20014.48947,18.055189,1.37763,0.0
25%,500.75,32796.459717,29.072097,1939.708847,0.0
50%,1000.5,45789.117313,41.317159,3974.719419,0.0
75%,1500.25,57791.281668,52.58704,6432.410625,0.0
max,2000.0,69995.685578,63.971796,13766.051239,1.0


In [10]:
# Show every row whose 'age' value is missing (null / NaN)
base[base['age'].isnull()]


Unnamed: 0,clientid,income,age,loan,default
28,29,59417.805406,,2082.625938,0
30,31,48528.852796,,6155.78467,0
31,32,23526.302555,,2862.010139,0


In [11]:
# Predictor matrix: all rows, columns 'income' through 'loan'
# (positions 1 to 3 of the frame), as a NumPy array
previsores = base.loc[:, 'income':'loan'].values
previsores

array([[6.61559251e+04, 5.90170151e+01, 8.10653213e+03],
       [3.44151540e+04, 4.81171531e+01, 6.56474502e+03],
       [5.73171701e+04, 6.31080495e+01, 8.02095330e+03],
       ...,
       [4.43114493e+04, 2.80171669e+01, 5.52278669e+03],
       [4.37560566e+04, 6.39717958e+01, 1.62272260e+03],
       [6.94365796e+04, 5.61526170e+01, 7.37883360e+03]])

In [12]:
# Target vector: all rows of the last column ('default'), as a NumPy array
classe = base['default'].values
classe


array([0, 0, 0, ..., 1, 0, 0])

In [13]:
# Handle missing values (NaN) in the predictor array by mean imputation.
# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and
# removed in 0.22; `sklearn.impute.SimpleImputer` is its replacement.
# SimpleImputer treats NaN as the missing marker by default and always imputes
# column-wise, which matches the former `missing_values='NaN', axis=0` setup.
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean')

# Learn the per-column means from all rows of the three predictor columns
imputer = imputer.fit(previsores[:, 0:3])



In [14]:
# Fill the missing entries of the predictor array in place, using the values
# learned by the fitted imputer
previsores[:, :3] = imputer.transform(previsores[:, :3])
previsores

array([[6.61559251e+04, 5.90170151e+01, 8.10653213e+03],
       [3.44151540e+04, 4.81171531e+01, 6.56474502e+03],
       [5.73171701e+04, 6.31080495e+01, 8.02095330e+03],
       ...,
       [4.43114493e+04, 2.80171669e+01, 5.52278669e+03],
       [4.37560566e+04, 6.39717958e+01, 1.62272260e+03],
       [6.94365796e+04, 5.61526170e+01, 7.37883360e+03]])

In [15]:
# The columns live on very different scales, so they are standardized
# before modelling. The arguments below are StandardScaler's defaults,
# spelled out: centre each column and scale it to unit variance.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler(with_mean=True, with_std=True)

In [16]:
# Standardize every row and column of the predictor array:
# fit the scaler on the data, then transform that same data
previsores = scaler.fit(previsores).transform(previsores)
previsores

array([[ 1.45393393,  1.36538005,  1.20281942],
       [-0.76217555,  0.54265932,  0.69642695],
       [ 0.83682073,  1.67417101,  1.17471147],
       ...,
       [-0.07122592, -0.97448606,  0.35420081],
       [-0.11000289,  1.73936652, -0.92675625],
       [ 1.682986  ,  1.14917551,  0.96381038]])