In [None]:
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split,StratifiedShuffleSplit,StratifiedKFold
from sklearn.preprocessing import LabelEncoder, OneHotEncoder



In [None]:
# Load fraud_merge_country.csv into `df`, the frame every later cell works on.
# NOTE(review): the path is absolute ("/content/..."), so the file must exist at
# exactly that location — presumably a hosted-runtime path; confirm before
# running this notebook anywhere else.
df = pd.read_csv("/content/fraud_merge_country.csv")

In [None]:
# Parse both timestamp columns to datetimes so the `.dt` accessor and the
# timestamp subtraction used further down work on them.
df['signup_time'] = pd.to_datetime(df['signup_time'])
df['purchase_time'] = pd.to_datetime(df['purchase_time'])

def time_convert(col, frame=None):
    """Add month, week-of-year and weekday columns derived from a datetime column.

    The new columns are named ``<prefix>_month``, ``<prefix>_weekofyear`` and
    ``<prefix>_weekday``, where ``<prefix>`` is ``col`` with its last five
    characters removed (e.g. ``"signup_time"`` -> ``"signup"``).

    Parameters
    ----------
    col : str
        Name of a datetime column in the frame.
    frame : pandas.DataFrame, optional
        Frame to modify in place. When omitted, the notebook-level ``df`` is
        used, which is how the original one-argument calls behave.

    Returns
    -------
    None
        The frame is modified in place.
    """
    if frame is None:
        frame = df  # notebook-level frame; keeps `time_convert(col)` working

    prefix = col[:-5]
    accessor = frame[col].dt

    # `Series.dt.weekofyear` is deprecated and absent from pandas 2.x;
    # `isocalendar().week` is its replacement. Fall back to the legacy
    # accessor on pandas versions that predate `isocalendar`.
    if hasattr(accessor, "isocalendar"):
        week = accessor.isocalendar().week
        # Missing timestamps yield <NA>; keep those rows as NaN instead of
        # failing the integer cast.
        week = week.astype("float64" if week.isna().any() else "int64")
    else:
        week = accessor.weekofyear

    frame[prefix + "_" + 'month'] = accessor.month
    frame[prefix + "_" + 'weekofyear'] = week
    frame[prefix + "_" + 'weekday'] = accessor.weekday

# Expand each timestamp column into its month / week-of-year / weekday parts.
for col in ('signup_time', 'purchase_time'):
    time_convert(col)

# Seconds elapsed between signing up and making the purchase.
df['signTOpurchase'] = df['purchase_time'].sub(df['signup_time']).dt.total_seconds()

# Sharing counts: number of rows in which each device id / IP address occurs,
# mapped back onto every row carrying that value.
device_shared = df['device_id'].value_counts()
ip_shared = df['ip_address'].value_counts()
df['device_shared'] = df['device_id'].map(device_shared)
df['ip_shared'] = df['ip_address'].map(ip_shared)

# Same idea per country: number of rows belonging to the row's country.
country_shared = df['country'].value_counts()
df['country_shared'] = df['country'].map(country_shared)


In [None]:
# Discard the month and weekday parts together with the raw timestamps; of the
# time-derived columns only the week-of-year pair and signTOpurchase remain.
df.drop(
    [
        "signup_month",
        "signup_weekday",
        "purchase_month",
        "purchase_weekday",
        "signup_time",
        "purchase_time",
    ],
    axis="columns",
    inplace=True,
)
df.head()

Unnamed: 0,user_id,purchase_value,device_id,source,browser,sex,age,ip_address,class,country,signup_weekofyear,purchase_weekofyear,signTOpurchase,device_shared,ip_shared,country_shared
0,22058,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0,Japan,9,16,4506682.0,1,1,7306
1,333320,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0,United States,23,24,17944.0,1,1,58049
2,1359,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621474000.0,1,United States,1,1,1.0,12,12,58049
3,150084,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542000.0,0,Unknown,18,19,492085.0,1,1,21966
4,221365,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583100.0,0,United States,30,37,4361461.0,1,1,58049


In [None]:
# Remove the identifier columns (user, device, IP address) from the frame.
df.drop(["user_id", "device_id", "ip_address"], axis="columns", inplace=True)
df.head()

Unnamed: 0,purchase_value,source,browser,sex,age,class,country,signup_weekofyear,purchase_weekofyear,signTOpurchase,device_shared,ip_shared,country_shared
0,34,SEO,Chrome,M,39,0,Japan,9,16,4506682.0,1,1,7306
1,16,Ads,Chrome,F,53,0,United States,23,24,17944.0,1,1,58049
2,15,SEO,Opera,M,53,1,United States,1,1,1.0,12,12,58049
3,44,SEO,Safari,M,41,0,Unknown,18,19,492085.0,1,1,21966
4,39,Ads,Safari,M,45,0,United States,30,37,4361461.0,1,1,58049


In [None]:
def _label_and_one_hot(frame, column, label_column):
    """Label-encode ``frame[column]`` and build its one-hot representation.

    The integer codes are written to ``frame[label_column]`` (in place). The
    one-hot frame has one column per original category, named after the
    category, and shares ``frame``'s index so that a later column-wise concat
    lines rows up by position regardless of what the index looks like.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame holding the categorical column; gains ``label_column``.
    column : str
        Name of the categorical column to encode.
    label_column : str
        Name of the integer-code column to create.

    Returns
    -------
    tuple
        ``(label_encoder, one_hot_encoder, features)`` — the two fitted
        encoders and the dense one-hot ``DataFrame``.
    """
    label_encoder = LabelEncoder()
    frame[label_column] = label_encoder.fit_transform(frame[column])

    one_hot_encoder = OneHotEncoder()
    dense = one_hot_encoder.fit_transform(frame[[label_column]]).toarray()

    features = pd.DataFrame(
        dense,
        columns=list(label_encoder.classes_),
        index=frame.index,
    )
    return label_encoder, one_hot_encoder, features


# transform and map source
source_le, source_ohe, source_features = _label_and_one_hot(df, "source", "source_label")

# transform and map browser
browser_le, brow_ohe, brow_features = _label_and_one_hot(df, "browser", "browser_label")

# transform and map sex
sex_le, sex_ohe, sex_features = _label_and_one_hot(df, "sex", "sex_label")

# transform and map country
# NOTE(review): coun_features is built but is not part of the concat below, so
# country reaches new_df only through coun_label and country_shared — confirm
# this is intended.
coun_le, coun_ohe, coun_features = _label_and_one_hot(df, "country", "coun_label")

new_df = pd.concat([df, source_features, brow_features, sex_features], axis=1)
new_df.head()

Unnamed: 0,purchase_value,source,browser,sex,age,class,country,signup_weekofyear,purchase_weekofyear,signTOpurchase,device_shared,ip_shared,country_shared,source_label,browser_label,sex_label,coun_label,Ads,Direct,SEO,Chrome,FireFox,IE,Opera,Safari,F,M
0,34,SEO,Chrome,M,39,0,Japan,9,16,4506682.0,1,1,7306,2,0,1,84,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,16,Ads,Chrome,F,53,0,United States,23,24,17944.0,1,1,58049,0,0,0,171,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,15,SEO,Opera,M,53,1,United States,1,1,1.0,12,12,58049,2,3,1,171,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,44,SEO,Safari,M,41,0,Unknown,18,19,492085.0,1,1,21966,2,4,1,172,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
4,39,Ads,Safari,M,45,0,United States,30,37,4361461.0,1,1,58049,0,4,1,171,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0


In [None]:
# Drop the raw text versions of the categorical columns from the model frame.
new_df.drop(["source", "browser", "sex", "country"], axis="columns", inplace=True)

# Target: single-column frame holding "class". Features: every other column.
y_df = new_df[["class"]].reset_index(drop=True)
x_df = new_df.drop("class", axis="columns").reset_index(drop=True)

In [None]:
# Hold out 30% of the rows for testing; the seed is fixed so the split repeats.
train_x, test_x, train_y, test_y = train_test_split(
    x_df,
    y_df,
    test_size=0.3,
    random_state=0,
)

# Oversampler applied to the training split later on (sampling_strategy 0.8).
sm = SMOTE(random_state=7, sampling_strategy=0.8)

In [None]:
# Disabled experiment, kept as a bare string literal: nothing in it runs, and
# the notebook merely echoes the text as this cell's output.
# It sketches a 2-fold StratifiedKFold split followed by SMOTE with
# sampling_strategy=1. As written, the loop rebinds train_x/test_x/train_y/
# test_y on every fold, so only the last fold's split would remain afterwards.
"""skf = StratifiedKFold(n_splits=2,random_state=0, shuffle=True)
for train_index, test_index in skf.split(x_df, y_df):
     train_x, test_x = x_df.iloc[train_index], x_df.iloc[test_index]
     train_y, test_y = y_df.iloc[train_index], y_df.iloc[test_index]
sm = SMOTE(sampling_strategy=1, random_state=7)"""

'skf = StratifiedKFold(n_splits=2,random_state=0, shuffle=True)\nfor train_index, test_index in skf.split(x_df, y_df):\n     train_x, test_x = x_df.iloc[train_index], x_df.iloc[test_index]\n     train_y, test_y = y_df.iloc[train_index], y_df.iloc[test_index]\nsm = SMOTE(sampling_strategy=1, random_state=7)'

In [None]:
# Shape check: (rows, feature columns) of the training split.
train_x.shape

(105778, 22)

In [None]:
# Shape check of the training target. It is 2-D with one column because y_df
# was selected with double brackets (a DataFrame, not a Series).
train_y.shape

(105778, 1)

In [None]:
# Oversample the training split only; the test split stays untouched.
# train_y is a single-column DataFrame, which makes the resampler emit a
# column-vector conversion warning, so hand it the 1-D "class" column instead.
over_X, over_y = sm.fit_resample(train_x, train_y["class"])

  y = column_or_1d(y, warn=True)


In [None]:
# Fit AdaBoost on the oversampled training data.
# - random_state is pinned so that repeated runs build the same ensemble.
# - The `algorithm` argument is left at the library default: "SAMME.R" was the
#   default in the scikit-learn release that produced the recorded output, and
#   recent releases reject that value when it is passed explicitly.
model = AdaBoostClassifier(n_estimators=600, learning_rate=1, random_state=0)
model.fit(over_X, over_y)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1,
                   n_estimators=600, random_state=None)

In [None]:
# Score (mean accuracy for a classifier) on the oversampled training data the
# model was fitted on; this set includes the rows added by the resampler.
model.score(over_X, over_y)

0.9680008816142821

In [None]:
# Score on the held-out test split, which was never passed through the
# resampler.
model.score(test_x, test_y)

0.9585741386156086

In [None]:
# Confusion matrix on the held-out split (rows: true class, columns: predicted).
# `confusion_matrix` is imported in the notebook's import cell.
# `t` and `p` are reused by the cells below.
t = test_y
p = model.predict(test_x)
# labels=[0, 1] pins the layout to 2x2 even if a class is missing from t or p.
confusion_matrix(t, p, labels=[0, 1])

array([[41175,     2],
       [ 1876,  2281]])

In [None]:
# Per-class precision / recall / F1 for the test predictions `p`.
# `classification_report` is already imported in the notebook's import cell,
# so it is not re-imported here. The report is a preformatted string, hence
# print().
print(classification_report(test_y, p))


              precision    recall  f1-score   support

           0       0.96      1.00      0.98     41177
           1       1.00      0.55      0.71      4157

    accuracy                           0.96     45334
   macro avg       0.98      0.77      0.84     45334
weighted avg       0.96      0.96      0.95     45334



In [None]:
# Flatten the 2x2 matrix row by row into true-negative, false-positive,
# false-negative and true-positive counts. labels=[0, 1] guarantees four
# cells, so the unpacking cannot fail when a class is absent from t or p.
tn, fp, fn, tp = confusion_matrix(t, p, labels=[0, 1]).ravel()