In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
data = pd.read_csv("raw_data.csv")
# print(data.columns)
if 'identifierHash' in data.columns:
    data.drop('identifierHash', axis=1, inplace=True)
if 'countryCode' in data.columns:
    data.drop('countryCode', axis=1, inplace=True)

#data.drop(['identifierHash','type','country','language','hasAnyApp','civilityTitle','civilityGenderId','seniorityAsMonths','seniorityAsYears','countryCode','productsWished','productsBought','hasAndroidApp','hasIosApp'],axis=1,inplace=True)
# Initialize encoders
label_encoder = LabelEncoder()
one_hot_encoder = OneHotEncoder(sparse_output=False)
#print(data['gender'].unique())
converted = pd.get_dummies(data['gender'], drop_first=1)
data = pd.concat([data, converted], axis = 1)
data.drop('gender',axis = 1, inplace=True)
data.rename(columns={'M': 'Male'}, inplace=True)
#print(data['Male'].unique())
data = data.fillna(0)

# Check the data types of each column in the DataFrame
#print(data.dtypes)

# Identify numeric columns (int and float types)
numeric_columns = data.select_dtypes(include=['int64', 'float64']).columns
boolean_columns = ['hasAnyApp', 'hasAndroidApp', 'hasIosApp', 'hasProfilePicture', 'Male']
# Handle non-numeric columns separately (e.g., keep them as they are, or convert them to category type)
# Example: Convert object-type columns to category type if necessary
object_columns = data.select_dtypes(include='object').columns
for col in object_columns:
    data[col] = data[col].astype('category')
# Convert boolean columns to integers
for col in boolean_columns:
    data[col] = data[col].astype(int)

# Convert only numeric columns to integer types
data[numeric_columns] = data[numeric_columns].astype(int)

categorical_columns = ['type', 'country', 'language']
# Convert categorical columns using one-hot encoding
encoded_data = one_hot_encoder.fit_transform(data[categorical_columns])

# Create a DataFrame with one-hot encoded columns
encoded_df = pd.DataFrame(encoded_data, columns=one_hot_encoder.get_feature_names_out(categorical_columns))

# Combine the one-hot encoded columns with the original data, excluding the original categorical columns
data = pd.concat([data, encoded_df], axis=1).drop(categorical_columns, axis=1)

def zscore(array):
    thr = 3
    mean = np.mean(array)
    std = np.std(array)
    z_scores = (array - mean) / std
    return np.abs(z_scores) > thr


combined_condition = ~(zscore(data['socialNbFollows']) | zscore(data['socialNbFollowers']) | zscore(data['productsListed']) | zscore(data['productsSold']) | zscore(data['socialProductsLiked']))
data = data[combined_condition]
data.reset_index(drop=True, inplace=True)
#print(new_data.head())
#print(data.describe())


def pure_round(num):
    integer = int(num)
    fraction = num - float(integer)
    if fraction >= 0.5:
        integer += 1
    return integer

data = data[data['productsListed'] != 0]
for i in data.index:
    case_no = data.loc[i,'productsSold']
    pass_no = pure_round((case_no * data.loc[i,'productsPassRate']) / 100)
    fail_no = case_no - pass_no
    data.loc[i,'productsPassed'] = pass_no
    data.loc[i,'productsFailed'] = fail_no
    if case_no == 0:
        data.drop(i, axis=0, inplace=True)
 
data.drop(['productsPassRate','productsSold'], axis=1, inplace=True)

# Encodedict = {}
# for i in ['Male','hasProfilePicture']:
#     key = '_{}'.format(i)
#     le = LabelEncoder()
#     data[key] = le.fit_transform(list(data[i]))
#     Encodedict[key] = le.classes_

# data.drop(['Male','hasProfilePicture'], axis=1, inplace=True)

dfdict = {}
for j in data.index:
    x = data.loc[j,'productsPassed']
    y = data.loc[j,'productsFailed']
    if x != 0:
        data.loc[j,'Fraud'] = 0
        df = pd.DataFrame(data.loc[j,:]).transpose()
        ldf = pd.concat([df]*int(x), ignore_index=True)
    
    if y != 0:
        data.loc[j,'Fraud'] = 1
        df2 = pd.DataFrame(data.loc[j,:]).transpose()
        ldf2 = pd.concat([df2]*int(y), ignore_index=True)
    
    if x != 0 and y != 0:
        dfdict[j] = pd.concat([ldf, ldf2], ignore_index=True)
    elif x != 0:
        dfdict[j] = ldf
    else:
        dfdict[j] = ldf2

data_new = pd.concat(dfdict.values(), ignore_index=True)
data_new.drop(['productsPassed','productsFailed'], axis=1, inplace=True)
#print(data_new.describe())


from sklearn.model_selection import train_test_split

# Assuming 'Fraud' is your target variable
X = data_new.drop('Fraud', axis=1)  # Features
y = data_new['Fraud']  # Target variable
# Step 1: Convert the 'Fraud' column to numeric values, coercing any non-numeric values to NaN
data_new['Fraud'] = pd.to_numeric(data_new['Fraud'], errors='coerce')

# Step 2: Convert the 'Fraud' column to int64
data_new['Fraud'] = data_new['Fraud'].astype('int64')
#plt.figure(figsize=(4, 4))
#sns.countplot(x='productsListed', hue='Fraud', data=data_new, palette = 'inferno')
#sns.boxplot(x='productsListed', y='socialNbFollowers', hue='Fraud', data=data, palette = 'inferno')
#sns.histplot(x = 'seniority', data = data_new, binwidth = 350, hue = 'Fraud', element = 'step')
# Splitting the data into train and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Optionally, you can reset index for train and test sets
X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)
#print(data_new.head())
# Ensure y_train contains only numeric values, and convert non-numeric values to NaN
y_train = pd.to_numeric(y_train, errors='coerce')

# Handle any missing values (NaN) as needed
# For example, you can drop rows with missing values or fill them with a default value:
y_train = y_train.dropna()  # Drop rows with NaN values

# Convert the series to integer type
y_train = y_train.astype(int)


X_train = X_train.apply(pd.to_numeric, errors='coerce')
X_test = X_test.apply(pd.to_numeric, errors='coerce')
y_test = y_test.astype(int)
X_test.fillna(X_test.mean(), inplace=True)
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")


X_train shape: (965, 223)
y_train shape: (965,)


In [26]:
# Import necessary libraries
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report
from bayes_opt import BayesianOptimization
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

import numpy as np

# Import other necessary libraries
import warnings
warnings.filterwarnings("ignore")


In [34]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from bayes_opt import BayesianOptimization
from sklearn.impute import SimpleImputer
import numpy as np

# Define your training and test data
# ...

# Create an imputer to fill missing values in the input data
imputer = SimpleImputer(strategy='mean')

# Impute missing values in the training and test data
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Define the optimization function for GaussianNB
def optimize_nb(priors):
    # Normalize priors so that their sum equals 1
    priors_normalized = np.array([priors] * len(np.unique(y_train)))
    priors_normalized /= priors_normalized.sum()
    
    # Create a GaussianNB model with the given priors
    clf = GaussianNB(priors=priors_normalized)
    
    # Train the model with imputed training data
    clf.fit(X_train_imputed, y_train)
    
    # Make predictions on the imputed test data
    y_pred = clf.predict(X_test_imputed)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

# Define the parameter bounds for the optimization
pbounds = {'priors': (0, 1)}  # Adjust the range as needed

# Instantiate Bayesian Optimization
nb_bo = BayesianOptimization(
    f=optimize_nb,
    pbounds=pbounds,
    random_state=42,
)

# Run Bayesian optimization
nb_bo.maximize(init_points=5, n_iter=10)

# Print the best hyperparameters and result
print("Best parameters for GaussianNB:", nb_bo.max)


|   iter    |  target   |  priors   |
-------------------------------------
| [0m1        [0m | [0m0.4339   [0m | [0m0.3745   [0m |
| [0m2        [0m | [0m0.4339   [0m | [0m0.9507   [0m |
| [0m3        [0m | [0m0.4339   [0m | [0m0.732    [0m |
| [0m4        [0m | [0m0.4339   [0m | [0m0.5987   [0m |
| [0m5        [0m | [0m0.4339   [0m | [0m0.156    [0m |
| [0m6        [0m | [0m0.4339   [0m | [0m0.5032   [0m |
| [0m7        [0m | [0m0.4339   [0m | [0m0.1075   [0m |
| [0m8        [0m | [0m0.4339   [0m | [0m0.3356   [0m |
| [0m9        [0m | [0m0.4339   [0m | [0m0.9998   [0m |
| [0m10       [0m | [0m0.4339   [0m | [0m1.674e-05[0m |
| [0m11       [0m | [0m0.4339   [0m | [0m0.7241   [0m |
| [0m12       [0m | [0m0.4339   [0m | [0m0.05731  [0m |
| [0m13       [0m | [0m0.4339   [0m | [0m0.6952   [0m |
| [0m14       [0m | [0m0.4339   [0m | [0m0.7577   [0m |
| [0m15       [0m | [0m0.4339   [0m | [0m0.6526   

In [29]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from bayes_opt import BayesianOptimization
import numpy as np
# Define the preprocessing pipeline
imputer = SimpleImputer(strategy='mean')  # Change strategy if needed
scaler = StandardScaler()  # Optional: standardizes the data

preprocessing_pipeline = Pipeline([
    ('imputer', imputer),
    ('scaler', scaler)
])
# Preprocess the training and test data
X_train_processed = preprocessing_pipeline.fit_transform(X_train)
X_test_processed = preprocessing_pipeline.transform(X_test)
def optimize_lr(C):
    clf = LogisticRegression(C=C, solver='liblinear')
    clf.fit(X_train_processed, y_train)
    y_pred = clf.predict(X_test_processed)
    return accuracy_score(y_test, y_pred)
# Define the Bayesian Optimization for LogisticRegression
lr_bo = BayesianOptimization(
    f=optimize_lr,
    pbounds={'C': (0.001, 10)},
    random_state=42,
)

# Run Bayesian Optimization
lr_bo.maximize(init_points=5, n_iter=10)

# Print the best hyperparameters
print("Best parameters for LogisticRegression:", lr_bo.max)


|   iter    |  target   |     C     |
-------------------------------------
| [0m1        [0m | [0m0.5909   [0m | [0m3.746    [0m |
| [0m2        [0m | [0m0.5868   [0m | [0m9.507    [0m |
| [0m3        [0m | [0m0.5909   [0m | [0m7.32     [0m |
| [0m4        [0m | [0m0.5909   [0m | [0m5.987    [0m |
| [0m5        [0m | [0m0.5909   [0m | [0m1.561    [0m |
| [0m6        [0m | [0m0.5702   [0m | [0m0.001    [0m |
| [0m7        [0m | [0m0.5909   [0m | [0m6.496    [0m |
| [0m8        [0m | [0m0.5909   [0m | [0m2.584    [0m |
| [0m9        [0m | [0m0.5909   [0m | [0m4.809    [0m |
| [0m10       [0m | [0m0.5868   [0m | [0m8.34     [0m |
| [0m11       [0m | [0m0.5909   [0m | [0m2.03     [0m |
| [0m12       [0m | [0m0.5909   [0m | [0m3.189    [0m |
| [0m13       [0m | [0m0.5909   [0m | [0m5.372    [0m |
| [0m14       [0m | [0m0.5909   [0m | [0m4.277    [0m |
| [0m15       [0m | [0m0.5868   [0m | [0m10.0     

In [31]:
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Create an imputer to handle missing values in the data
imputer = SimpleImputer(strategy='mean')  # or use other strategies like 'median' or 'most_frequent'

# Impute missing values in the training and test data
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)
# Define the best hyperparameter found
best_C = 3.7460266483547775
# Create a logistic regression model using the best hyperparameter
clf = LogisticRegression(C=best_C, solver='liblinear')

# Train the model on your imputed training data
clf.fit(X_train_imputed, y_train)

# Make predictions on the imputed test data
y_pred = clf.predict(X_test_imputed)

# Evaluate the model performance
accuracy = accuracy_score(y_test, y_pred)
print("Best Logistic Regression Model Accuracy:", accuracy)

# Print classification report for further insight
print("Classification Report:")
print(classification_report(y_test, y_pred))


Best Logistic Regression Model Accuracy: 0.5867768595041323
Classification Report:
              precision    recall  f1-score   support

           0       0.61      0.72      0.66       136
           1       0.54      0.42      0.47       106

    accuracy                           0.59       242
   macro avg       0.57      0.57      0.57       242
weighted avg       0.58      0.59      0.58       242



In [37]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from bayes_opt import BayesianOptimization
from sklearn.impute import SimpleImputer
import numpy as np

# Define your training and test data
# Ensure your data (X_train, X_test, y_train, y_test) is loaded and preprocessed
# ...

# Create an imputer to fill missing values in the input data
imputer = SimpleImputer(strategy='mean')

# Impute missing values in the training and test data
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Define the objective function for KNeighborsClassifier
def optimize_knn(n_neighbors, weights):
    # Cast n_neighbors to integer
    n_neighbors = int(n_neighbors)
    
    # Convert weights value to 'uniform' or 'distance' based on threshold
    if weights < 0.5:
        weights_str = 'uniform'
    else:
        weights_str = 'distance'
    
    # Create KNeighborsClassifier model
    clf = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights_str)
    
    # Fit the model on the imputed training data
    clf.fit(X_train_imputed, y_train)
    
    # Make predictions on the imputed test data
    y_pred = clf.predict(X_test_imputed)
    
    # Calculate accuracy
    return accuracy_score(y_test, y_pred)

# Define the Bayesian Optimization for KNeighborsClassifier
knn_bo = BayesianOptimization(
    f=optimize_knn,
    pbounds={'n_neighbors': (1, 20), 'weights': (0, 1)},
    random_state=42,
)

# Run Bayesian Optimization
knn_bo.maximize(init_points=5, n_iter=10)

# Convert the best weights value to its string equivalent ("uniform" or "distance")
best_weights = 'uniform' if knn_bo.max['params']['weights'] < 0.5 else 'distance'

# Print the best hyperparameters for KNeighborsClassifier
print("Best parameters for KNeighborsClassifier:", {'n_neighbors': int(knn_bo.max['params']['n_neighbors']), 'weights': best_weights})

# Define the objective function for RandomForestClassifier
def optimize_rf(n_estimators, max_depth):
    # Cast n_estimators and max_depth to integers
    n_estimators = int(n_estimators)
    max_depth = int(max_depth)
    
    # Create RandomForestClassifier model
    clf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth)
    
    # Fit the model on the imputed training data
    clf.fit(X_train_imputed, y_train)
    
    # Make predictions on the imputed test data
    y_pred = clf.predict(X_test_imputed)
    
    # Calculate accuracy
    return accuracy_score(y_test, y_pred)

# Define the Bayesian Optimization for RandomForestClassifier
rf_bo = BayesianOptimization(
    f=optimize_rf,
    pbounds={'n_estimators': (10, 200), 'max_depth': (3, 30)},
    random_state=42,
)

# Run Bayesian Optimization
rf_bo.maximize(init_points=5, n_iter=10)

# Print the best hyperparameters for RandomForestClassifier
print("Best parameters for RandomForestClassifier:", {'n_estimators': int(rf_bo.max['params']['n_estimators']), 'max_depth': int(rf_bo.max['params']['max_depth'])})


|   iter    |  target   | n_neig... |  weights  |
-------------------------------------------------
| [0m1        [0m | [0m0.7231   [0m | [0m8.116    [0m | [0m0.9507   [0m |
| [0m2        [0m | [0m0.7025   [0m | [0m14.91    [0m | [0m0.5987   [0m |
| [0m3        [0m | [0m0.657    [0m | [0m3.964    [0m | [0m0.156    [0m |
| [95m4        [0m | [95m0.7438   [0m | [95m2.104    [0m | [95m0.8662   [0m |
| [0m5        [0m | [0m0.7066   [0m | [0m12.42    [0m | [0m0.7081   [0m |
| [0m6        [0m | [0m0.7438   [0m | [0m2.161    [0m | [0m0.8446   [0m |
| [0m7        [0m | [0m0.7149   [0m | [0m1.475    [0m | [0m0.0      [0m |
| [0m8        [0m | [0m0.6074   [0m | [0m9.474    [0m | [0m0.0      [0m |
| [0m9        [0m | [0m0.7107   [0m | [0m6.83     [0m | [0m1.0      [0m |
| [0m10       [0m | [0m0.5785   [0m | [0m17.97    [0m | [0m0.04947  [0m |
| [0m11       [0m | [0m0.595    [0m | [0m13.64    [0m | [0m0.0005843

In [38]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer

# Use the best parameters for KNeighborsClassifier
best_n_neighbors = 2  # From Bayesian Optimization result
best_weights = 'distance'  # From Bayesian Optimization result

# Create an instance of KNeighborsClassifier with the best parameters
knn_clf = KNeighborsClassifier(n_neighbors=best_n_neighbors, weights=best_weights)

# Fit the model on the imputed training data
knn_clf.fit(X_train_imputed, y_train)

# Make predictions on the imputed test data
y_pred_knn = knn_clf.predict(X_test_imputed)

# Calculate accuracy for KNeighborsClassifier
knn_accuracy = accuracy_score(y_test, y_pred_knn)
print(f"Accuracy of KNeighborsClassifier: {knn_accuracy:.4f}")

# Use the best parameters for RandomForestClassifier
best_n_estimators = 144  # From Bayesian Optimization result
best_max_depth = 19  # From Bayesian Optimization result

# Create an instance of RandomForestClassifier with the best parameters
rf_clf = RandomForestClassifier(n_estimators=best_n_estimators, max_depth=best_max_depth)

# Fit the model on the imputed training data
rf_clf.fit(X_train_imputed, y_train)

# Make predictions on the imputed test data
y_pred_rf = rf_clf.predict(X_test_imputed)

# Calculate accuracy for RandomForestClassifier
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print(f"Accuracy of RandomForestClassifier: {rf_accuracy:.4f}")


Accuracy of KNeighborsClassifier: 0.7438
Accuracy of RandomForestClassifier: 0.7562
