In [19]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
# ---------------------------------------------------------------------------
# Load the raw table and turn it into a frame whose categorical columns
# ('gender', 'type', 'country', 'language') are encoded as indicator columns.
# ---------------------------------------------------------------------------
data = pd.read_csv("raw_data.csv")
# print(data.columns)

# Drop the two columns only when they are present; errors='ignore' replaces
# the explicit "if column in data.columns" guards.
data = data.drop(columns=['identifierHash', 'countryCode'], errors='ignore')

#data.drop(['identifierHash','type','country','language','hasAnyApp','civilityTitle','civilityGenderId','seniorityAsMonths','seniorityAsYears','countryCode','productsWished','productsBought','hasAndroidApp','hasIosApp'],axis=1,inplace=True)
# Initialize encoders
label_encoder = LabelEncoder()  # not used in this cell; kept in case another cell relies on it
one_hot_encoder = OneHotEncoder(sparse_output=False)
#print(data['gender'].unique())

# Replace 'gender' with indicator column(s); drop_first removes the first
# category, and the remaining 'M' indicator is renamed to 'Male'.
converted = pd.get_dummies(data['gender'], drop_first=True)
data = (
    pd.concat([data, converted], axis=1)
    .drop(columns='gender')
    .rename(columns={'M': 'Male'})
)
#print(data['Male'].unique())

# Missing values become 0 in every column (numeric and non-numeric alike).
data = data.fillna(0)

# Check the data types of each column in the DataFrame
#print(data.dtypes)

# Identify numeric columns (int and float types)
numeric_columns = data.select_dtypes(include=['int64', 'float64']).columns
boolean_columns = ['hasAnyApp', 'hasAndroidApp', 'hasIosApp', 'hasProfilePicture', 'Male']

# Object-typed columns are stored as pandas categoricals.
object_columns = data.select_dtypes(include='object').columns
data = data.astype({col: 'category' for col in object_columns})

# Flag columns become 0/1 integers.
data = data.astype({col: int for col in boolean_columns})

# NOTE(review): astype(int) truncates any fractional part held by the float
# columns selected above -- confirm that this loss is intended.
data = data.astype({col: int for col in numeric_columns})

categorical_columns = ['type', 'country', 'language']
# Convert categorical columns using one-hot encoding
encoded_data = one_hot_encoder.fit_transform(data[categorical_columns])

# Build the encoded frame on data's own index: pd.concat(axis=1) aligns rows by
# index label, so a default RangeIndex here would misalign the rows (and create
# NaN-padded extras) whenever data's index is not 0..n-1.
encoded_df = pd.DataFrame(
    encoded_data,
    columns=one_hot_encoder.get_feature_names_out(categorical_columns),
    index=data.index,
)

# Swap the original categorical columns for their one-hot encoded counterparts.
data = pd.concat([data.drop(columns=categorical_columns), encoded_df], axis=1)

def zscore(array, thr=3):
    """Flag the entries of ``array`` that lie far from its mean.

    Parameters
    ----------
    array : numpy.ndarray or pandas.Series
        Numeric values to screen. Must support element-wise subtraction of a
        scalar (a plain Python list does not).
    thr : float, default 3
        Number of standard deviations beyond which an entry is flagged.

    Returns
    -------
    Boolean mask with the same shape (and, for a Series, the same index) as
    ``array``; True marks entries whose absolute z-score exceeds ``thr``.
    """
    mean = np.mean(array)
    std = np.std(array)
    # |x - mean| / std > thr  is equivalent to  |x - mean| > thr * std  for
    # std > 0. The product form never divides, so a constant input (std == 0)
    # yields an all-False mask without a divide-by-zero warning.
    return np.abs(array - mean) > thr * std


# Columns screened for outliers. A row is discarded when zscore() flags it in
# at least one of these columns.
outlier_columns = [
    'socialNbFollows',
    'socialNbFollowers',
    'productsListed',
    'productsSold',
    'socialProductsLiked',
]
outlier_flags = [zscore(data[column]) for column in outlier_columns]

# OR the per-column masks together, then invert to obtain the rows to keep.
is_outlier = outlier_flags[0]
for column_flags in outlier_flags[1:]:
    is_outlier = is_outlier | column_flags
combined_condition = ~is_outlier

data = data[combined_condition].reset_index(drop=True)
#print(new_data.head())
#print(data.describe())


def pure_round(num):
    """Round ``num`` to the nearest integer, with ties going up.

    Unlike the built-in ``round`` (which rounds ties to the nearest even
    integer), a fractional part of exactly 0.5 always rounds toward +infinity:
    2.5 -> 3 and -1.5 -> -1.

    Parameters
    ----------
    num : int or float
        Finite number to round.

    Returns
    -------
    int
        The rounded value.
    """
    integer = int(num)
    # int() truncates toward zero; for negative non-integers that lands above
    # num, so step down to the true floor to keep the fraction in [0, 1).
    if integer > num:
        integer -= 1
    fraction = num - float(integer)
    if fraction >= 0.5:
        integer += 1
    return integer

# Keep only rows with at least one listed product and at least one sold
# product. Filtering up front (and taking an explicit copy) replaces dropping
# rows one at a time inside the loop and avoids writing into a filtered slice.
has_activity = (data['productsListed'] != 0) & (data['productsSold'] != 0)
data = data[has_activity].copy()

# productsPassRate is used as a percentage of productsSold: the rounded share
# is the number of passed products, the remainder the number of failed ones.
sold_counts = data['productsSold']
data['productsPassed'] = [
    pure_round(sold * pass_rate / 100)
    for sold, pass_rate in zip(sold_counts, data['productsPassRate'])
]
data['productsFailed'] = sold_counts - data['productsPassed']

# The two source columns are fully represented by the derived counts.
data = data.drop(columns=['productsPassRate', 'productsSold'])

# Encodedict = {}
# for i in ['Male','hasProfilePicture']:
#     key = '_{}'.format(i)
#     le = LabelEncoder()
#     data[key] = le.fit_transform(list(data[i]))
#     Encodedict[key] = le.classes_

# data.drop(['Male','hasProfilePicture'], axis=1, inplace=True)

# Expand every row of `data` into one row per product: 'productsPassed' copies
# labelled Fraud = 0 followed by 'productsFailed' copies labelled Fraud = 1.
# Rows whose two counts are both 0 contribute nothing (the previous loop reused
# the expansion left over from the preceding row in that case).
passed_counts = data['productsPassed'].astype(int).to_numpy()
failed_counts = data['productsFailed'].astype(int).to_numpy()

# The previous loop also left a per-row 'Fraud' flag on `data` itself
# (1 when the row has any failed product, else 0); keep that side effect.
data['Fraud'] = (failed_counts != 0).astype(float)

features = data.drop(columns=['productsPassed', 'productsFailed', 'Fraud'])

# Two "slots" per source row, interleaved so the output order is
# row0-passed, row0-failed, row1-passed, row1-failed, ...
#   slot 2k   -> row k, label 0, repeated passed_counts[k] times
#   slot 2k+1 -> row k, label 1, repeated failed_counts[k] times
slot_counts = np.column_stack([passed_counts, failed_counts]).ravel()
slot_ids = np.repeat(np.arange(2 * len(features)), slot_counts)

data_new = features.iloc[slot_ids // 2].reset_index(drop=True)
data_new['Fraud'] = slot_ids % 2

# The row-by-row construction used before produced object-typed columns, and the
# following steps re-coerce them with pd.to_numeric; keep that dtype so their
# behaviour does not change.
data_new = data_new.astype(object)
#print(data_new.describe())
#print(data_new.describe())


from sklearn.model_selection import train_test_split

# Clean the label BEFORE separating features and target, so X and y are cut
# from the same rows. Rows whose label cannot be parsed are removed from the
# whole frame; dropping them from y_train alone would leave X_train and
# y_train with different lengths.
data_new['Fraud'] = pd.to_numeric(data_new['Fraud'], errors='coerce')
data_new = data_new.dropna(subset=['Fraud']).reset_index(drop=True)
data_new['Fraud'] = data_new['Fraud'].astype('int64')

# 'Fraud' is the target variable
X = data_new.drop('Fraud', axis=1)  # Features
y = data_new['Fraud']  # Target variable

#plt.figure(figsize=(4, 4))
#sns.countplot(x='productsListed', hue='Fraud', data=data_new, palette = 'inferno')
#sns.boxplot(x='productsListed', y='socialNbFollowers', hue='Fraud', data=data, palette = 'inferno')
#sns.histplot(x = 'seniority', data = data_new, binwidth = 350, hue = 'Fraud', element = 'step')

# NOTE(review): data_new is built by repeating each source row once per product,
# so this random row-level split can place identical feature rows in both the
# train and the test set. If the goal is to generalise to unseen users, split by
# user instead -- confirm the intended evaluation setting.
# Splitting the data into train and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Coerce every feature to a numeric dtype (non-numeric entries become NaN) and
# give each split a fresh 0..n-1 index.
X_train = X_train.apply(pd.to_numeric, errors='coerce').reset_index(drop=True)
X_test = X_test.apply(pd.to_numeric, errors='coerce').reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)
#print(data_new.head())

# Fill gaps in the test features with TRAINING-set column means, so no statistic
# computed on the test set is used for imputation. Columns whose training mean
# is NaN (entirely non-numeric columns) are left untouched.
X_test = X_test.fillna(X_train.mean())

print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")


X_train shape: (965, 223)
y_train shape: (965,)


In [23]:
# Import necessary libraries
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
# Optional: import Bayesian Optimization library
from bayes_opt import BayesianOptimization
import numpy as np


# Seed shared by every estimator below that accepts one, so that re-running the
# cell reproduces the same scores.
RANDOM_STATE = 42

# Define imputation and encoding strategies for numerical and categorical features
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values with mean
    ('scaler', StandardScaler())  # Standardize the data
])

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values with most frequent
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-Hot Encoding for categorical data
])

# Define which features are numerical and categorical
numeric_features = X_train.select_dtypes(include=['float64', 'int64']).columns
categorical_features = X_train.select_dtypes(include=['object']).columns

# Create a ColumnTransformer to handle different feature types
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])


def build_model_pipeline(classifier):
    """Return preprocessing -> PCA (95% of variance) -> ``classifier`` as one Pipeline.

    The step names ('preprocessor', 'pca', 'clf') are part of the contract:
    hyper-parameter grids address the classifier through the 'clf__' prefix.
    Every pipeline built here shares the module-level ``preprocessor`` object.
    """
    return Pipeline([
        ('preprocessor', preprocessor),
        ('pca', PCA(n_components=0.95)),  # Apply PCA to retain 95% of variance
        ('clf', classifier)
    ])


def print_evaluation(title, y_true, y_pred):
    """Print ``title``, the accuracy and the classification report of ``y_pred``."""
    print(title)
    print(f"Accuracy: {accuracy_score(y_true, y_pred)}")
    print(classification_report(y_true, y_pred))


# Define the classifiers. The random forest draws random numbers while fitting,
# so it is seeded; the seed on LogisticRegression only matters for solvers that
# shuffle the data.
knn = KNeighborsClassifier()
rf = RandomForestClassifier(random_state=RANDOM_STATE)
log_reg = LogisticRegression(random_state=RANDOM_STATE)

# Create pipelines with preprocessor and classifier for each model
knn_pipeline = build_model_pipeline(knn)
rf_pipeline = build_model_pipeline(rf)
log_reg_pipeline = build_model_pipeline(log_reg)

# Fit each model pipeline to the training data
knn_pipeline.fit(X_train, y_train)
rf_pipeline.fit(X_train, y_train)
log_reg_pipeline.fit(X_train, y_train)

# Predict on the held-out test set with each fitted pipeline
y_pred_knn = knn_pipeline.predict(X_test)
y_pred_rf = rf_pipeline.predict(X_test)
y_pred_log_reg = log_reg_pipeline.predict(X_test)

# Calculate accuracy and print classification report
print_evaluation("KNN", y_test, y_pred_knn)
print_evaluation("\nRandom Forest", y_test, y_pred_rf)
print_evaluation("\nLogistic Regression", y_test, y_pred_log_reg)




KNN
Accuracy: 0.5950413223140496
              precision    recall  f1-score   support

           0       0.64      0.63      0.64       136
           1       0.54      0.55      0.54       106

    accuracy                           0.60       242
   macro avg       0.59      0.59      0.59       242
weighted avg       0.60      0.60      0.60       242


Random Forest
Accuracy: 0.7355371900826446
              precision    recall  f1-score   support

           0       0.75      0.80      0.77       136
           1       0.72      0.65      0.68       106

    accuracy                           0.74       242
   macro avg       0.73      0.73      0.73       242
weighted avg       0.73      0.74      0.73       242


Logistic Regression
Accuracy: 0.6157024793388429
              precision    recall  f1-score   support

           0       0.63      0.75      0.69       136
           1       0.58      0.44      0.50       106

    accuracy                           0.62       242
 



In [22]:
!pip install bayesian-optimization


Defaulting to user installation because normal site-packages is not writeable
Collecting bayesian-optimization
  Downloading bayesian_optimization-1.4.3-py3-none-any.whl (18 kB)
Collecting colorama>=0.4.6
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: colorama, bayesian-optimization
Successfully installed bayesian-optimization-1.4.3 colorama-0.4.6


In [24]:
# Hyper-parameter search spaces. Each key targets the pipeline step named 'clf'.
knn_param_grid = {
    'clf__n_neighbors': [3, 5, 7, 9, 11],
    'clf__weights': ['uniform', 'distance'],
}
rf_param_grid = {
    'clf__n_estimators': [100, 200, 300],
    'clf__max_depth': [None, 10, 20, 30],
    'clf__min_samples_split': [2, 5, 10],
}
log_reg_param_grid = {
    'clf__C': [0.1, 1, 10, 100],
    'clf__solver': ['liblinear', 'saga'],
}


def _fitted_grid_search(pipeline, param_grid):
    """Run a 5-fold, accuracy-scored grid search of ``pipeline`` on the training data."""
    search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
    search.fit(X_train, y_train)
    return search


# Exhaustive search per model, fitted in the order KNN, random forest, logistic regression.
grid_search_knn = _fitted_grid_search(knn_pipeline, knn_param_grid)
grid_search_rf = _fitted_grid_search(rf_pipeline, rf_param_grid)
grid_search_log_reg = _fitted_grid_search(log_reg_pipeline, log_reg_param_grid)

# Test-set predictions from the best configuration found for each model.
y_pred_knn_grid = grid_search_knn.best_estimator_.predict(X_test)
y_pred_rf_grid = grid_search_rf.best_estimator_.predict(X_test)
y_pred_log_reg_grid = grid_search_log_reg.best_estimator_.predict(X_test)

# Report the winning parameters, accuracy and classification report per model.
grid_search_results = (
    ("\nKNN - Grid Search", grid_search_knn, y_pred_knn_grid),
    ("\nRandom Forest - Grid Search", grid_search_rf, y_pred_rf_grid),
    ("\nLogistic Regression - Grid Search", grid_search_log_reg, y_pred_log_reg_grid),
)
for heading, fitted_search, predictions in grid_search_results:
    print(heading)
    print(f"Best parameters: {fitted_search.best_params_}")
    print(f"Accuracy: {accuracy_score(y_test, predictions)}")
    print(classification_report(y_test, predictions))





KNN - Grid Search
Best parameters: {'clf__n_neighbors': 5, 'clf__weights': 'distance'}
Accuracy: 0.6983471074380165
              precision    recall  f1-score   support

           0       0.70      0.81      0.75       136
           1       0.69      0.56      0.62       106

    accuracy                           0.70       242
   macro avg       0.70      0.68      0.68       242
weighted avg       0.70      0.70      0.69       242


Random Forest - Grid Search
Best parameters: {'clf__max_depth': 10, 'clf__min_samples_split': 2, 'clf__n_estimators': 300}
Accuracy: 0.731404958677686
              precision    recall  f1-score   support

           0       0.74      0.81      0.77       136
           1       0.72      0.63      0.67       106

    accuracy                           0.73       242
   macro avg       0.73      0.72      0.72       242
weighted avg       0.73      0.73      0.73       242


Logistic Regression - Grid Search
Best parameters: {'clf__C': 100, 'clf__sol

