In [1]:
import numpy as np #biblioteca utilizada para trabalhar com vetores
import pandas as pd #biblioteca para trabalhar com dataframes (formato de planilhas excel)
from sklearn.model_selection import train_test_split  #biblioteca para separar nossa base de dados deixando uma parte para treino do modelo e outra para teste


In [2]:
#biblioteca para realizar a construção do algoritmo supervisionado Random Forest
from sklearn.ensemble import RandomForestClassifier

### Problema: Detecção de transações fraudulentas

In [3]:
# abrir o arquivo csv que contém os dados a serem utilizados durante a prática - fonte: Kaggle
df_creditcard = pd.read_csv('creditcard.csv')

In [4]:
#visualizando as 5 primeiras linhas do banco de dados
df_creditcard.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


##### obs.: Exibindo as colunas do dateset. As colunas foram anonimizadas através do PCA para preservar as informações iniciais. Os únicos dados que estão de forma ‘original’ é a coluna Time (Tempo entre a primeira transação do dateset e próxima), Amout (Valor da transação) e Class (Flag para definir o que é fraude)

#### Quantos valores nulos (NAN) existem no dataset?


In [5]:
df_creditcard.isnull().sum() #função utilizada para contar a quantidade de valores nulos existente em cada uma das colunas (características) do dataset

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

#### Quantas colunas (características) e linhas (instâncias) existem no dataset?

In [6]:
df_creditcard.shape

(284807, 31)

#### Quantas fraude e não fraudes temos na nossa base de treino?

In [7]:
print('Quantidade de transações não fraudulentas: ', df_creditcard['Class'].value_counts().values[0])
print('Quantidade de transações fraudulentas: ', df_creditcard['Class'].value_counts().values[1])

Quantidade de transações não fraudulentas:  284315
Quantidade de transações fraudulentas:  492


#### Separando as features (características) da variável resposta(rótulo/classe)

In [8]:
X = df_creditcard.iloc[:, :-1] 
#iloc é usado quando queremos selecionar dados de linhas e colunas pelo seu valor numérico de índice.
# pega todas as linhas e todas as colunas - exceto a última coluna
#posições do final começam com -1

y = df_creditcard['Class']

#### Separando dados para treino e dados para teste

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0) 


In [10]:
X_train.shape, X_test.shape

((199364, 30), (85443, 30))

In [11]:
X_test.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
183484,125821.0,-0.323334,1.057455,-0.048341,-0.607204,1.259821,-0.091761,1.159101,-0.124335,-0.17464,...,0.186409,-0.207098,-0.43389,-0.261613,-0.046651,0.211512,0.008297,0.108494,0.161139,40.0
255448,157235.0,-0.349718,0.932619,0.142992,-0.657071,1.169784,-0.733369,1.009985,-0.071069,-0.302083,...,-0.096502,-0.271537,-0.833209,-0.03036,0.490035,-0.404816,0.13435,0.07683,0.175562,1.98
244749,152471.0,-1.614711,-2.40657,0.326194,0.66552,2.369268,-1.775367,-1.139049,0.329904,0.903813,...,0.419835,0.701399,1.134489,0.965054,0.640981,-1.801998,-1.041114,0.286285,0.437322,96.0
63919,50927.0,-2.477184,0.860613,1.44185,1.051019,-1.856621,2.078384,0.510828,-0.243399,-0.260691,...,-0.98779,0.810408,0.692245,0.150121,-0.260777,0.005183,-0.177847,-0.51006,-0.660533,308.0
11475,19899.0,1.338831,-0.547264,0.737389,-0.212383,-1.110039,-0.525744,-0.801403,-0.063672,0.997276,...,-0.126871,-0.139436,-0.074719,0.067055,0.333122,0.379087,-0.268706,-0.002769,0.003272,5.0


#### Treinando o modelo - lembre da base que separamos para treino

In [12]:
model = RandomForestClassifier()#instanciar o objeto do classificador RandomForest
model.fit(X_train, y_train)#treinando o modelo


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

#### Testando o modelo - lembre da base que separamos para teste

In [13]:
y_pred = model.predict(X_test)

#### Nossa base de teste já recebeu as classificações do modelo, vejamos:

In [14]:
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

#### Vamos juntar esses dados!

In [17]:
X_test_com_classificação = X_test #Criando um dataset que irá conter os dados da base de teste a também a classificação dada pelo modelo

In [18]:
 X_test_com_classificação['classificação'] = y_pred # só acrescentando uma coluna que irá conter as classificações dadas pelo modelo

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


#### Dê uma olhadinha na última coluna que acabamos de acrescentar!

In [20]:
X_test_com_classificação.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,classificação
183484,125821.0,-0.323334,1.057455,-0.048341,-0.607204,1.259821,-0.091761,1.159101,-0.124335,-0.17464,...,-0.207098,-0.43389,-0.261613,-0.046651,0.211512,0.008297,0.108494,0.161139,40.0,0
255448,157235.0,-0.349718,0.932619,0.142992,-0.657071,1.169784,-0.733369,1.009985,-0.071069,-0.302083,...,-0.271537,-0.833209,-0.03036,0.490035,-0.404816,0.13435,0.07683,0.175562,1.98,0
244749,152471.0,-1.614711,-2.40657,0.326194,0.66552,2.369268,-1.775367,-1.139049,0.329904,0.903813,...,0.701399,1.134489,0.965054,0.640981,-1.801998,-1.041114,0.286285,0.437322,96.0,0
63919,50927.0,-2.477184,0.860613,1.44185,1.051019,-1.856621,2.078384,0.510828,-0.243399,-0.260691,...,0.810408,0.692245,0.150121,-0.260777,0.005183,-0.177847,-0.51006,-0.660533,308.0,0
11475,19899.0,1.338831,-0.547264,0.737389,-0.212383,-1.110039,-0.525744,-0.801403,-0.063672,0.997276,...,-0.139436,-0.074719,0.067055,0.333122,0.379087,-0.268706,-0.002769,0.003272,5.0,0
