In [4]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV

In [5]:
train_data=pd.read_csv('/kaggle/input/anyas-gojo-revival-bondman/trainData.csv')
train_fraud=train_data['is_fraud']
numerical_cols=[i for i in train_data.columns if train_data[i].dtype in ['int64', 'float64']]
categorical_cols=[i for i in train_data.columns if train_data[i].dtype=="object" ]
print(categorical_cols)
train_data.head()

['trans_date_trans_time', 'merchant', 'category', 'first', 'last', 'gender', 'street', 'city', 'state', 'job', 'dob', 'trans_num']


Unnamed: 0,id,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,1,2019-12-15 10:35:10,3597926034019603,fraud_Ruecker Group,misc_net,7.19,Derrick,Flores,M,83690 Nicholas Ports Apt. 846,...,35.2229,-89.5518,9496,Furniture conservator/restorer,1993-03-23,2642fec2e19c9e18e61fe9a88f6bc79a,1355567710,35.189455,-90.121666,0
1,2,2019-04-10 21:49:53,4873783502705038,fraud_Lynch Ltd,shopping_pos,6.32,Elizabeth,Maxwell,F,194 Goodman Fall Apt. 569,...,48.4786,-122.3345,14871,Public house manager,1974-03-10,61cec15bdbd3db31b12f886e8e181380,1334094593,49.004316,-122.745016,0
2,3,2020-02-16 10:56:18,213161869125933,fraud_Bauch-Raynor,grocery_pos,147.59,Monica,Lane,F,3270 Scott Islands,...,44.6084,-70.6993,190,Animal nutritionist,1970-04-17,c8fcf575540e37ce0822cd040148ff57,1361012178,44.455449,-70.809148,0
3,4,2020-04-30 16:14:16,374656033243756,"fraud_Effertz, Welch and Schowalter",entertainment,32.4,David,Lewis,M,1499 Michael Rue,...,38.8954,-77.1633,207410,Mudlogger,1984-07-03,9e9cde0e0963ac746a2cc9938e171c9a,1367338456,39.476941,-77.613438,0
4,5,2019-12-05 21:07:59,6011999606625827,fraud_Wilkinson LLC,personal_care,29.79,Ronald,Carson,M,870 Rocha Drive,...,40.9918,-73.98,4664,"Radiographer, diagnostic",1965-06-30,0742473d7f6261ce366b43ece063faf3,1354741679,40.863808,-74.674018,0


In [6]:
train_data.isnull().sum()

id                       0
trans_date_trans_time    0
cc_num                   0
merchant                 0
category                 0
amt                      0
first                    0
last                     0
gender                   0
street                   0
city                     0
state                    0
zip                      0
lat                      0
long                     0
city_pop                 0
job                      0
dob                      0
trans_num                0
unix_time                0
merch_lat                0
merch_long               0
is_fraud                 0
dtype: int64

In [7]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 523174 entries, 0 to 523173
Data columns (total 23 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   id                     523174 non-null  int64  
 1   trans_date_trans_time  523174 non-null  object 
 2   cc_num                 523174 non-null  int64  
 3   merchant               523174 non-null  object 
 4   category               523174 non-null  object 
 5   amt                    523174 non-null  float64
 6   first                  523174 non-null  object 
 7   last                   523174 non-null  object 
 8   gender                 523174 non-null  object 
 9   street                 523174 non-null  object 
 10  city                   523174 non-null  object 
 11  state                  523174 non-null  object 
 12  zip                    523174 non-null  int64  
 13  lat                    523174 non-null  float64
 14  long                   523174 non-nu

In [8]:
train_data.drop(columns=['id','trans_date_trans_time','cc_num','first','last','street','dob','trans_num'],inplace=True)
train_data.head()

Unnamed: 0,merchant,category,amt,gender,city,state,zip,lat,long,city_pop,job,unix_time,merch_lat,merch_long,is_fraud
0,fraud_Ruecker Group,misc_net,7.19,M,Oakland,TN,38060,35.2229,-89.5518,9496,Furniture conservator/restorer,1355567710,35.189455,-90.121666,0
1,fraud_Lynch Ltd,shopping_pos,6.32,F,Burlington,WA,98233,48.4786,-122.3345,14871,Public house manager,1334094593,49.004316,-122.745016,0
2,fraud_Bauch-Raynor,grocery_pos,147.59,F,East Andover,ME,4226,44.6084,-70.6993,190,Animal nutritionist,1361012178,44.455449,-70.809148,0
3,"fraud_Effertz, Welch and Schowalter",entertainment,32.4,M,Arlington,VA,22213,38.8954,-77.1633,207410,Mudlogger,1367338456,39.476941,-77.613438,0
4,fraud_Wilkinson LLC,personal_care,29.79,M,Harrington Park,NJ,7640,40.9918,-73.98,4664,"Radiographer, diagnostic",1354741679,40.863808,-74.674018,0


In [9]:
features=[i for i in train_data.columns if train_data[i].dtype=="object" ]
encoder = OrdinalEncoder(
        handle_unknown='use_encoded_value', 
        unknown_value=-1
    ).fit(train_data[features])
train_data[features]=encoder.transform(train_data[features])
train_data.head()
train_y=train_data['is_fraud']
train_data.pop('is_fraud')

0         0
1         0
2         0
3         0
4         0
         ..
523169    0
523170    0
523171    0
523172    0
523173    0
Name: is_fraud, Length: 523174, dtype: int64

In [19]:
param_grid = {
    'iterations': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'depth': [4, 6, 8],
    'l2_leaf_reg': [3, 5, 7],
}

catboost = CatBoostClassifier()
grid_search = GridSearchCV(catboost, param_grid, cv=5, n_jobs=-1)
catboost.fit(train_data, train_y)

Learning rate set to 0.149209
0:	learn: 0.3743262	total: 117ms	remaining: 1m 56s
1:	learn: 0.2305251	total: 184ms	remaining: 1m 31s
2:	learn: 0.1390354	total: 246ms	remaining: 1m 21s
3:	learn: 0.0899942	total: 308ms	remaining: 1m 16s
4:	learn: 0.0655781	total: 365ms	remaining: 1m 12s
5:	learn: 0.0525994	total: 430ms	remaining: 1m 11s
6:	learn: 0.0452281	total: 493ms	remaining: 1m 9s
7:	learn: 0.0413098	total: 553ms	remaining: 1m 8s
8:	learn: 0.0378434	total: 614ms	remaining: 1m 7s
9:	learn: 0.0355283	total: 675ms	remaining: 1m 6s
10:	learn: 0.0297327	total: 741ms	remaining: 1m 6s
11:	learn: 0.0291875	total: 804ms	remaining: 1m 6s
12:	learn: 0.0284275	total: 873ms	remaining: 1m 6s
13:	learn: 0.0260829	total: 934ms	remaining: 1m 5s
14:	learn: 0.0257974	total: 994ms	remaining: 1m 5s
15:	learn: 0.0255452	total: 1.05s	remaining: 1m 4s
16:	learn: 0.0252160	total: 1.12s	remaining: 1m 4s
17:	learn: 0.0241010	total: 1.19s	remaining: 1m 4s
18:	learn: 0.0239825	total: 1.25s	remaining: 1m 4s
19:	l

<catboost.core.CatBoostClassifier at 0x7d324c3ea440>

In [26]:
test_data=pd.read_csv('/kaggle/input/anyas-gojo-revival-bondman/testData.csv')
numerical_cols=[i for i in train_data.columns if train_data[i].dtype in ['int64', 'float64']]
categorical_cols=[i for i in train_data.columns if train_data[i].dtype=="object" ]
print(categorical_cols)
test_data.head()

[]


Unnamed: 0,id,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,zip,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long
0,1,2020-06-21 12:14:25,2291163933867244,fraud_Kirlin and Sons,personal_care,2.86,Jeff,Elliott,M,351 Darlene Green,...,29209,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714
1,2,2020-06-21 12:14:33,3573030041201292,fraud_Sporer-Keebler,personal_care,29.84,Joanne,Williams,F,3638 Marsh Union,...,84002,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431
2,3,2020-06-21 12:14:53,3598215285024754,"fraud_Swaniawski, Nitzsche and Welch",health_fitness,41.28,Ashley,Lopez,F,9333 Valentine Point,...,11710,40.6729,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111
3,4,2020-06-21 12:15:15,3591919803438423,fraud_Haley Group,misc_pos,60.05,Brian,Williams,M,32941 Krystal Mill Apt. 552,...,32780,28.5697,-80.8191,54767,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061
4,5,2020-06-21 12:15:17,3526826139003047,fraud_Johnston-Casper,travel,3.19,Nathan,Massey,M,5783 Evan Roads Apt. 465,...,49632,44.2529,-85.017,1126,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734


In [23]:
test_data.isnull().sum()

id                       0
trans_date_trans_time    0
cc_num                   0
merchant                 0
category                 0
amt                      0
first                    0
last                     0
gender                   0
street                   0
city                     0
state                    0
zip                      0
lat                      0
long                     0
city_pop                 0
job                      0
dob                      0
trans_num                0
unix_time                0
merch_lat                0
merch_long               0
dtype: int64

In [27]:
test_data.drop(columns=['id','trans_date_trans_time','cc_num','first','last','street','dob','trans_num'],inplace=True)
features=[i for i in test_data.columns if test_data[i].dtype=="object" ]
encoder = OrdinalEncoder(
        handle_unknown='use_encoded_value', 
        unknown_value=-1
    ).fit(test_data[features])
test_data[features]=encoder.transform(test_data[features])
test_data.head()

Unnamed: 0,merchant,category,amt,gender,city,state,zip,lat,long,city_pop,job,unix_time,merch_lat,merch_long
0,319.0,10.0,2.86,1.0,157.0,39.0,29209,33.9659,-80.9355,333497,275.0,1371816865,33.986391,-81.200714
1,591.0,10.0,29.84,0.0,16.0,43.0,84002,40.3207,-110.436,302,392.0,1371816873,39.450498,-109.960431
2,611.0,5.0,41.28,0.0,61.0,33.0,11710,40.6729,-73.5365,34496,259.0,1371816893,40.49581,-74.196111
3,222.0,9.0,60.05,1.0,764.0,8.0,32780,28.5697,-80.8191,54767,407.0,1371816915,28.812398,-80.883061
4,292.0,13.0,3.19,1.0,247.0,21.0,49632,44.2529,-85.017,1126,196.0,1371816917,44.959148,-85.884734


In [28]:
arr=catboost.predict(test_data)
print(arr)

[0 0 0 ... 0 0 0]


In [29]:
sum(arr)

2746

In [30]:
submission=pd.DataFrame({'id':range(1,len(arr)+1),'id_fraud':arr})
submission.to_csv('submission1.csv', index=False)