In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found in it.
for root_dir, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition's training and test CSV files from the Kaggle input directory.
df = pd.read_csv('/kaggle/input/anyas-gojo-revival-bondman/trainData.csv')
df_test = pd.read_csv('/kaggle/input/anyas-gojo-revival-bondman/testData.csv')
# Show the (rows, columns) shape of the training frame.
df.shape

In [None]:
df.head()

## EDA

In [None]:
df['is_fraud'].value_counts()

In [None]:
df.isnull().sum()

In [None]:
df.nunique()

In [None]:
df.dtypes

In [None]:
import matplotlib
# Font sizes for axes titles and tick labels; pushed into matplotlib's global
# rcParams so they apply to every figure drawn after this cell.
params = {'axes.titlesize':'32',
          'xtick.labelsize':'24',
          'ytick.labelsize':'24'}
matplotlib.rcParams.update(params)

In [None]:
df.hist(bins=15, figsize=(100, 100));

In [None]:
df['is_fraud'].hist(figsize = (50,30))

# Preprocessing

In [None]:
# Keep only the numeric-dtype columns of the training frame
numerical_data = df.select_dtypes(include='number')

# Record the names of those columns as a plain Python list
numerical_features = list(numerical_data.columns)

print(f'There are {len(numerical_features)} numerical features:', '\n')
numerical_features

In [None]:
# Finding correlation data
# Pairwise correlation between the numeric columns held in `numerical_data`
# (DataFrame.corr called with its default settings).
correlation=numerical_data.corr()
correlation

In [None]:
# Visualising the correlation matrix
import seaborn as sns
import matplotlib.pyplot as plt
# Enlarge seaborn's font scale, open a wide 16x5 figure and draw the
# correlation matrix as a heatmap with the coefficient printed in each cell.
sns.set(font_scale=1.3)
plt.figure(figsize = (16,5))
sns.heatmap(correlation,cmap="viridis",annot=True)

In [None]:
# Remove the columns that are not used as model features.
# Re-assigning the result (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: re-running it after the columns are already gone
# no longer raises a KeyError.
df = df.drop(
    columns=['first', 'last', 'trans_num', 'unix_time', 'cc_num', 'merch_lat', 'merch_long'],
    errors='ignore',
)
df.shape

In [None]:
def create_trans_time(df_train):
    """Replace the transaction timestamp with an hour-of-day feature.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Frame containing a ``trans_date_trans_time`` column that
        ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``trans_date_trans_time`` has been removed and a
        ``trans_time`` column holding the hour of each timestamp has been
        added. The frame passed in is not modified.
    """
    timestamps = pd.to_datetime(df_train['trans_date_trans_time'])

    # Take only the useful part of the timestamp: the hour. `.dt.hour` is the
    # vectorised equivalent of reading `.hour` from every row individually.
    return (
        df_train
        .assign(trans_time=timestamps.dt.hour)
        .drop(columns=['trans_date_trans_time'])
    )

In [None]:
def preprocess_dob(df_train, reference_year=2023):
    """Convert the ``dob`` column from a date of birth into an age in years.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Frame containing a ``dob`` column that ``pd.to_datetime`` can parse.
    reference_year : int, default 2023
        Year the age is measured against. The default reproduces the value
        that used to be hard-coded in this function.

    Returns
    -------
    pandas.DataFrame
        A new frame whose ``dob`` column holds ``reference_year - birth year``.
        Only the year is used, so month and day of birth are ignored. The frame
        passed in is not modified.
    """
    birth_years = pd.to_datetime(df_train['dob']).dt.year
    return df_train.assign(dob=reference_year - birth_years)

In [None]:
def preprocess_gender(df_train):  # gender
    """Encode the ``gender`` column as a 0/1 integer flag.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Frame containing a ``gender`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame whose ``gender`` column is 1 where the original value
        equals ``'F'`` and 0 for every other value. The frame passed in is not
        modified.
    """
    is_female = df_train['gender'].eq('F').astype('int64')
    return df_train.assign(gender=is_female)


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

def vectorize_and_drop(df_train, column_name, max_features=50):
    """Replace a text column with bag-of-words token-count columns.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Frame containing the text column ``column_name``.
    column_name : str
        Name of the column to vectorize.
    max_features : int, default 50
        Passed to ``CountVectorizer`` as the cap on vocabulary size.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``column_name`` and with one count column per
        vocabulary token appended, each named ``"<column_name>_<token>"``.
    """
    print(f'vectorizing {column_name} column')

    # The vocabulary is learned from the frame passed in. The earlier extra
    # `fit` on the global `df_test` was overwritten straight away by
    # `fit_transform`, so it only cost time and tied the function to notebook
    # state; it has been removed along with the unused `global vectorizers`.
    # NOTE(review): because a new vocabulary is fitted on every call, calling
    # this separately for train and test data can yield different columns.
    vectorizer = CountVectorizer(max_features=max_features)
    token_counts = vectorizer.fit_transform(df_train[column_name])

    # Prefix token names with the source column. Without it a token such as
    # "city" collides with an existing column of the same name, and dropping
    # the original column would silently remove the token column as well.
    feature_names = [
        f'{column_name}_{token}' for token in vectorizer.get_feature_names_out()
    ]

    # Reuse the caller's index so the column-wise concat lines rows up even
    # when the frame does not carry a default 0..n-1 index.
    vectors_df_train = pd.DataFrame(
        token_counts.toarray(), columns=feature_names, index=df_train.index
    )

    # Drop the original text column first, then append the token counts.
    return pd.concat([df_train.drop(columns=[column_name]), vectors_df_train], axis=1)

In [None]:
def train_test_to_numeric(df_train):
    """Vectorize each text column of the frame in turn.

    Calls ``vectorize_and_drop`` once per column, in the fixed order
    category, job, city, state, street, merchant, feeding the result of each
    call into the next, and returns the final frame.
    """
    text_columns = ('category', 'job', 'city', 'state', 'street', 'merchant')
    for text_column in text_columns:
        df_train = vectorize_and_drop(df_train, text_column)

    return df_train

In [None]:
# Run the feature-engineering steps on the training frame, each step
# receiving the output of the previous one.
df = (
    df
    .pipe(create_trans_time)
    .pipe(preprocess_dob)
    .pipe(preprocess_gender)
    .pipe(train_test_to_numeric)
)

In [None]:
# def encode(df):
#     categorical_columns = df.select_dtypes(include=['object']).columns
#     for col in categorical_columns:
#         count_map = df[col].value_counts().to_dict()

#         # Map the column with dictionary
#         df[col] = df[col].map(count_map)
    
# encode(df)

In [None]:
df.shape

In [None]:
# Split the frame into the label vector (y) and the feature matrix (X)
y = df['is_fraud']
X = df.drop(columns=['is_fraud'])

In [None]:
#standard scaling
from sklearn.preprocessing import StandardScaler

# Fit a StandardScaler on every column of X and keep the scaled result.
# NOTE(review): the scaler is fitted on all rows of X before the
# train/validation split in the next cell, so validation rows contribute to
# the scaling statistics — confirm this is intended.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X)

In [None]:
# Divide data into training and validation subsets
from sklearn.model_selection import train_test_split
# 80/20 split of the scaled features with a fixed random_state; stratify=y is
# passed so the split is stratified on the is_fraud labels.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X_train_scaled, y, train_size=0.8, test_size=0.2,
                                                                random_state=0, stratify=y)


# Model Preparation and Training

In [None]:
from xgboost import XGBClassifier

# from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
# Define the parameter grid
# param_grid = {
#     'n_estimators': [100, 200],
#     'max_depth': [3, 4]
# }

# Create the XGBoost model
# NOTE(review): `xgb` is only referenced by the commented-out grid search in
# this cell; the model that is actually fitted is `my_model`, built in a later cell.
xgb = XGBClassifier()

# Use GridSearchCV or RandomizedSearchCV
# grid_search = GridSearchCV(xgb, param_grid, cv=5, scoring='accuracy')
# grid_search.fit(X_train_full, y_train)


In [None]:
# Get the best parameters
# best_params = grid_search.best_params_

In [None]:
from sklearn.metrics import accuracy_score

# Use the best parameters to create the final model
# my_model = XGBClassifier(**best_params)
# The grid search above is commented out, so the classifier is built with
# fixed settings (n_estimators=100, random_state=42) and fitted on the training split.
my_model = XGBClassifier(n_estimators=100, random_state=42)
my_model.fit(X_train_full, y_train)

# # Make predictions on the test set
# y_pred = my_model.predict(X_test_scaled)

# # Evaluate the accuracy
# accuracy = accuracy_score(y_test, y_pred)

In [None]:
prediction = my_model.predict(X_valid_full)
prediction

In [None]:
from sklearn.metrics import roc_auc_score

# roc_auc_score takes (y_true, y_score): ground-truth labels first, scores
# second. The arguments were previously passed the other way round.
# Scoring with the predicted probability of the positive class, rather than
# hard 0/1 predictions, lets the AUC measure how well the model ranks rows.
# Assumes is_fraud is encoded 0/1 so that column 1 of predict_proba is the
# fraud class — TODO confirm against my_model.classes_.
valid_fraud_proba = my_model.predict_proba(X_valid_full)[:, 1]
roc_score = roc_auc_score(y_valid, valid_fraud_proba)
print(roc_score)

# Test Prediction

In [None]:
# Model prediction on the testData
# NOTE(review): this reads the same file that was already loaded into
# `df_test` in the data-loading cell near the top of the notebook.
test_df = pd.read_csv('/kaggle/input/anyas-gojo-revival-bondman/testData.csv')
test_df.shape

In [None]:
# preparing the test data
# Remove the same columns that were removed from the training frame.
# Re-assigning with errors='ignore' (instead of inplace=True) keeps the cell
# idempotent: re-running it no longer raises a KeyError.
test_df = test_df.drop(
    columns=['first', 'last', 'trans_num', 'unix_time', 'cc_num', 'merch_lat', 'merch_long'],
    errors='ignore',
)


In [None]:
# Run the same feature-engineering steps on the test frame, each step
# receiving the output of the previous one.
test_df = (
    test_df
    .pipe(create_trans_time)
    .pipe(preprocess_dob)
    .pipe(preprocess_gender)
    .pipe(train_test_to_numeric)
)

In [None]:
# The model was fitted on StandardScaler output computed from the columns of
# X, so the test frame has to be brought to that same column layout and passed
# through the same fitted scaler before predicting. Previously the raw,
# unscaled test frame was fed to the model.
# Columns that exist only in the test frame are dropped; training columns that
# are missing from it (e.g. tokens absent from the test vocabulary) are filled with 0.
test_features = (
    test_df
    .loc[:, ~test_df.columns.duplicated()]  # reindex needs unique source labels
    .reindex(columns=X.columns, fill_value=0)
)
test_predict = my_model.predict(scaler.transform(test_features))
df_pred = list(test_predict)

# One submission id per prediction, numbered from 1. Derived from the number
# of predictions instead of the hard-coded upper bound 555720.
# NOTE(review): if the test file's own `id` column is the submission key,
# prefer test_df['id'] here — confirm against the competition format.
id_col = list(range(1, len(test_predict) + 1))
type(id_col)

In [None]:
test_df['id']

In [None]:
len(test_predict)

In [None]:
len(id_col)

In [None]:
# Assemble the submission table: predicted label per id, indexed by id
data = dict(id=id_col, is_fraud=test_predict)
sub_df = pd.DataFrame(data).set_index('id')
sub_df

# Submission File

In [None]:
# Write the submission to the working directory; the `id` index of sub_df is
# written as the first CSV column (to_csv writes the index by default).
sub_df.to_csv('submission.csv')