# Análisis y balanceo de datos

## Imports

In [None]:
from Bibliotecas import *
%matplotlib inline
from Modelos import *
from Comparador import *
from ParameterTuning import *

### Timing

In [None]:
# Timestamp taken at the start of the run (presumably used later to report
# the total elapsed time — confirm against the rest of the notebook).
inicio_ppio = time.time()

# ANÁLISIS DE DATOS

## Levantamos los datos

In [None]:
# Load ./Fraud.csv (relative to the working directory) into a DataFrame.
df = pd.read_csv("./Fraud.csv")

# Preview the first rows of the loaded data.
df.head()

## Tamaño del dataset

In [None]:
# Show the (rows, columns) dimensions of the dataset.
df.shape

## Tipos de los datos

In [None]:
# Show the dtype of every column.
df.dtypes

## Estadísticas descriptivas de los datos

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

## Se borran las columnas 'nameOrig' y 'nameDest'

In [None]:
# Remove the 'nameOrig' and 'nameDest' columns from df in place.
df.drop(columns=['nameOrig', 'nameDest'], inplace=True)

## Revisamos si hay valores perdidos (None, NaN) en el resto del dataset

In [None]:
# Number of missing values (None/NaN) in each column.
df.isna().sum()

## Correlación de los datos

### Correlación de los datos contra la variable 'isFraud'

In [None]:
# Correlation of every numeric column against 'isFraud', sorted ascending.
# The string column 'type' is still present at this point, and pandas >= 2.0
# raises ValueError when DataFrame.corr() meets non-numeric data, so the frame
# is restricted to numeric/bool columns first (works on older pandas as well).
df.select_dtypes(include=["number", "bool"]).corr()["isFraud"].sort_values()

### Correlación de los datos entre sí

In [None]:
# Annotated heatmap of the pairwise correlations between numeric columns.
# The string column 'type' is still present at this point, and pandas >= 2.0
# raises ValueError when DataFrame.corr() meets non-numeric data, so only
# numeric/bool columns are passed to corr().
plt.figure(figsize=(12, 8))
sns.heatmap(df.select_dtypes(include=["number", "bool"]).corr(), annot=True)

## Cantidad de fraudes y no fraudes que hay en el dataset

In [None]:
# Absolute number of rows per 'isFraud' value.
df["isFraud"].value_counts()

## Cantidad de fraudes y no fraudes que hay en el dataset (Normalizado)

In [None]:
# Relative frequency (fraction of rows) per 'isFraud' value.
df["isFraud"].value_counts(normalize=True)

## Cantidad de tipos de transacciones que hay en el dataset

In [None]:
# Number of rows per transaction type.
df['type'].value_counts()

## Transacciones fraudulentas y no fraudulentas diferenciadas por su tipo

In [None]:
# Bar chart of row counts per transaction type, split by the 'isFraud' value.
plt.figure(figsize=(15, 8))
sns.countplot(data=df, x="type", hue="isFraud")

## Porcentajes de transacciones fraudulentas de cada tipo de transacción

In [None]:
# https://docs.python.org/3/library/collections.html#collections.Counter
# Count transactions per type among the fraudulent rows only. Counting every
# row (as before) plotted the overall type distribution under a
# "fraudulent transactions" title and an 'IsFraud' column name.
df_type_fraud = pd.DataFrame(
    list(Counter(df.loc[df['isFraud'] == 1, 'type']).items()),
    columns=['type', 'IsFraud'],
)

# Pie chart with the share of fraudulent transactions for each type.
pie_porcentaje_transacciones_fraudulentas = px.pie(
    df_type_fraud,
    values="IsFraud",
    names='type',
    title='Transacciones Fraudulentas',
    color_discrete_sequence=px.colors.sequential.RdBu,
)
pie_porcentaje_transacciones_fraudulentas.show()

## Mapeo el type a números

In [None]:
# Integer code assigned to each transaction type.
mapping_type = {
    'CASH_IN': 0,
    'CASH_OUT': 1,
    'PAYMENT': 2,
    'TRANSFER': 3,
    'DEBIT': 4,
}
# Add the encoded column, then remove the original 'type' column in place.
df['type_numeric'] = df['type'].map(mapping_type)
df.drop(columns='type', inplace=True)

In [None]:
# Features: every column except the target.
X = df.drop(columns='isFraud')
# Target kept as a single-column DataFrame.
y = df.loc[:, ['isFraud']]

## Separación de los datos de entrenamiento (80%) y datos para testing (20%)

In [None]:
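# One-time 80/20 split, kept for reference. It is left commented out because the
# persisted partitions (*.joblib) are loaded in the next cell instead of re-splitting.
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)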

# joblib.dump(X_train, 'X_train.joblib')
# joblib.dump(X_test, 'X_test.joblib')
# joblib.dump(y_train, 'y_train.joblib')
# joblib.dump(y_test, 'y_test.joblib')

In [None]:
# Load the persisted train/test partitions from the working directory.
X_train, X_test, y_train, y_test = (
    joblib.load(path)
    for path in ('X_train.joblib', 'X_test.joblib', 'y_train.joblib', 'y_test.joblib')
)

# Pruebas con datos Balanceados

Armo el dataframe completo con solo los datos de entrenamiento (train).

In [None]:
# Copy of the training features with the training target attached as 'isFraud'.
df_train = X_train.assign(isFraud=y_train)

In [None]:
# Rows per class in the training set, printed and shown as a bar chart.
target_count = df_train.loc[:, "isFraud"].value_counts()
print('Class 0:', target_count[0])
print('Class 1:', target_count[1])
target_count.plot.bar(title='Count (isFraud)');

## Random undersampling

In [None]:
# Separate classes by label.
target_0 = df_train[df_train["isFraud"] == 0]
target_1 = df_train[df_train["isFraud"] == 1]
# Class counts taken per label. Unpacking value_counts() relied on its
# frequency ordering, which only matches (0, 1) while class 0 is the majority.
target_0_count, target_1_count = len(target_0), len(target_1)

# Downsample class 0 to the size of class 1. The seed is fixed (same value as
# the train/test split) so the persisted training set is reproducible.
target_0_undersample = target_0.sample(n=target_1_count, random_state=42)
# Merge back to a single df.
test_undersample = pd.concat([target_0_undersample, target_1], axis=0)
# Show counts and plot.
print('Random under-sampling:')
test_undersample["isFraud"].value_counts().plot(kind='bar', title='Count (target)');

In [None]:
# Split the undersampled frame into features and a single-column target frame.
X_undersample_train = test_undersample.drop(columns='isFraud')
y_undersample_train = test_undersample.loc[:, ['isFraud']]

In [None]:
# Persist the undersampled training features and target to the working directory.
joblib.dump(X_undersample_train, 'X_undersample_train.joblib')
joblib.dump(y_undersample_train, 'y_undersample_train.joblib')

In [None]:
# Number of rows in the undersampled training set.
len(X_undersample_train)

In [None]:
# Class distribution of the undersampled training target.
y_undersample_train.value_counts()

In [None]:
# Class distribution of the test target.
y_test.value_counts()

## Random oversampling

In [None]:
# Separate classes by label.
target_0 = df_train[df_train["isFraud"] == 0]
target_1 = df_train[df_train["isFraud"] == 1]
# Class counts taken per label. Unpacking value_counts() relied on its
# frequency ordering, which only matches (0, 1) while class 0 is the majority.
target_0_count, target_1_count = len(target_0), len(target_1)

# Sample class 1 with replacement up to the size of class 0. The seed is fixed
# (same value as the train/test split) so the persisted set is reproducible.
target_1_oversample = target_1.sample(n=target_0_count, replace=True, random_state=42)
# Merge back to a single df.
test_oversample = pd.concat([target_1_oversample, target_0], axis=0)
# Show counts and plot.
print('Random over-sampling:')
print(test_oversample["isFraud"].value_counts())
test_oversample["isFraud"].value_counts().plot(kind='bar', title='Count (isFraud)');


In [None]:
# Split the oversampled frame into features and a single-column target frame.
X_oversample_train = test_oversample.drop(columns='isFraud')
y_oversample_train = test_oversample.loc[:, ['isFraud']]

In [None]:
# Persist the oversampled training features and target to the working directory.
joblib.dump(X_oversample_train, 'X_oversample_train.joblib')
joblib.dump(y_oversample_train, 'y_oversample_train.joblib')

## SMOTE

In [None]:
# Fit SMOTE on the training partition only. Resampling the full X, y put the
# test rows (and synthetic points interpolated from them) into the "train"
# set, leaking test data into training. The seed is fixed (same value as the
# train/test split) so the persisted set is reproducible.
oversample = SMOTE(random_state=42)
X_smote_train, y_smote_train = oversample.fit_resample(X_train, y_train)

In [None]:
# Persist the SMOTE-resampled features and target to the working directory.
joblib.dump(X_smote_train, 'X_smote_train.joblib')
joblib.dump(y_smote_train, 'y_smote_train.joblib')