In [39]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
# Load the raw export and discard the identifier-like columns when they are
# present; errors='ignore' makes the drop a no-op for any column that is absent.
data = pd.read_csv("raw_data.csv")
data.drop(columns=['identifierHash', 'countryCode'], errors='ignore', inplace=True)

#data.drop(['identifierHash','type','country','language','hasAnyApp','civilityTitle','civilityGenderId','seniorityAsMonths','seniorityAsYears','countryCode','productsWished','productsBought','hasAndroidApp','hasIosApp'],axis=1,inplace=True)
# Encoder instances; `one_hot_encoder` returns a dense array (sparse_output=False).
label_encoder = LabelEncoder()
one_hot_encoder = OneHotEncoder(sparse_output=False)

# Replace the 'gender' column by its dummy indicators. drop_first removes the
# first category's indicator; an indicator named 'M' is exposed as 'Male'.
converted = pd.get_dummies(data.pop('gender'), drop_first=True)
data = pd.concat([data, converted], axis=1)
data = data.rename(columns={'M': 'Male'})

# Every remaining missing value, in any column, becomes 0.
data = data.fillna(0)

# Column groups are captured up front, before any dtype is changed, so the
# groups reflect the dtypes the frame has at this point.
numeric_columns = data.select_dtypes(include=['int64', 'float64']).columns
boolean_columns = ['hasAnyApp', 'hasAndroidApp', 'hasIosApp', 'hasProfilePicture', 'Male']
object_columns = data.select_dtypes(include='object').columns

# The three casts are applied in sequence:
#   1. object columns        -> pandas 'category'
#   2. boolean flag columns  -> int
#   3. int64/float64 columns -> int (floats lose their fractional part)
data = data.astype({col: 'category' for col in object_columns})
data = data.astype({col: int for col in boolean_columns})
data = data.astype({col: int for col in numeric_columns})

categorical_columns = ['type', 'country', 'language']

# One-hot encode the categorical columns with the encoder created earlier.
encoded_data = one_hot_encoder.fit_transform(data[categorical_columns])

# Wrap the encoded array in a DataFrame that carries the SAME index as `data`.
# pd.concat(axis=1) aligns rows on index labels, so without this the encoded
# frame would get a fresh 0..n-1 index and silently misalign (introducing NaN
# rows) whenever `data` does not have a default RangeIndex.
encoded_df = pd.DataFrame(
    encoded_data,
    columns=one_hot_encoder.get_feature_names_out(categorical_columns),
    index=data.index,
)

# Swap the original categorical columns for their one-hot encoded counterparts.
data = pd.concat([data.drop(columns=categorical_columns), encoded_df], axis=1)

def zscore(array, thr=3):
    """Flag the elements of *array* that are z-score outliers.

    Args:
        array: Numeric NumPy array or pandas Series.
        thr: Absolute z-score above which a value counts as an outlier.
            Defaults to 3.

    Returns:
        Boolean mask with the same shape (and, for a Series, the same index)
        as *array*; True marks values whose absolute z-score is strictly
        greater than *thr*.
    """
    deviation = np.abs(array - np.mean(array))
    std = np.std(array)
    if not std > 0:
        # Zero (constant input) or NaN spread: no value can be an outlier, and
        # dividing by it would only yield NaN/inf plus a RuntimeWarning.
        # Comparing against +inf gives an all-False mask of the right type.
        return deviation > np.inf
    return deviation / std > thr


# A row is kept only if none of these count columns is flagged by zscore().
outlier_columns = [
    'socialNbFollows',
    'socialNbFollowers',
    'productsListed',
    'productsSold',
    'socialProductsLiked',
]
is_outlier = zscore(data[outlier_columns[0]])
for column in outlier_columns[1:]:
    is_outlier = is_outlier | zscore(data[column])

combined_condition = ~is_outlier
# Filter, then renumber the surviving rows 0..n-1.
data = data[combined_condition].reset_index(drop=True)
#print(new_data.head())
#print(data.describe())


def pure_round(num):
    """Round *num* to the nearest integer, with exact halves rounded up.

    Unlike the built-in ``round`` (which rounds halves to the nearest even
    integer), ``pure_round(2.5)`` is 3. The fractional part is measured from
    the floor rather than from the truncated value, so negative inputs are
    rounded correctly as well (``pure_round(-2.7)`` is -3).

    Args:
        num: A real number.

    Returns:
        The rounded value as an ``int``.
    """
    import math

    integer = math.floor(num)
    if num - integer >= 0.5:
        integer += 1
    return integer

# Keep only rows with at least one listed AND one sold product. Filtering once
# into an explicit copy avoids writing into a slice of the original frame and
# avoids dropping rows from the frame while iterating over it.
has_sales = (data['productsListed'] != 0) & (data['productsSold'] != 0)
data = data[has_sales].copy()

# Split productsSold into passed / failed counts: productsPassRate is applied
# as a percentage and the passed count is rounded half-up with pure_round().
passed_counts = (data['productsSold'] * data['productsPassRate'] / 100).map(pure_round)
data['productsPassed'] = passed_counts
data['productsFailed'] = data['productsSold'] - passed_counts

# The source columns are no longer needed once the two counts exist.
data = data.drop(columns=['productsPassRate', 'productsSold'])

# Encodedict = {}
# for i in ['Male','hasProfilePicture']:
#     key = '_{}'.format(i)
#     le = LabelEncoder()
#     data[key] = le.fit_transform(list(data[i]))
#     Encodedict[key] = le.classes_

# data.drop(['Male','hasProfilePicture'], axis=1, inplace=True)

# Expand every row of `data` into one row per product: 'productsPassed' copies
# labelled Fraud=0 followed by 'productsFailed' copies labelled Fraud=1.
# dfdict maps the source row label to its block of replicated rows.
dfdict = {}
for j in data.index:
    copies_by_label = (
        (0, int(data.loc[j, 'productsPassed'])),
        (1, int(data.loc[j, 'productsFailed'])),
    )
    replicated = []
    for label, copies in copies_by_label:
        if copies <= 0:
            continue
        # The label is written into `data` first so that the extracted row
        # carries it; the row is then turned into a one-row DataFrame.
        data.loc[j, 'Fraud'] = label
        labelled_row = pd.DataFrame(data.loc[j, :]).transpose()
        replicated.extend([labelled_row] * copies)
    # A row with neither passed nor failed products contributes nothing
    # (previously such a row would have reused the frames built for an
    # earlier row, or failed on an undefined name).
    if replicated:
        dfdict[j] = pd.concat(replicated, ignore_index=True)

data_new = pd.concat(list(dfdict.values()), ignore_index=True)
data_new.drop(['productsPassed', 'productsFailed'], axis=1, inplace=True)
#print(data_new.describe())
#print(data_new.describe())


from sklearn.model_selection import train_test_split

# Clean the target BEFORE deriving X and y, so that both come from the same
# rows: coerce 'Fraud' to numeric, drop rows whose label could not be parsed
# (features and label are removed together), then store the label as int64.
data_new['Fraud'] = pd.to_numeric(data_new['Fraud'], errors='coerce')
data_new = data_new.dropna(subset=['Fraud']).reset_index(drop=True)
data_new['Fraud'] = data_new['Fraud'].astype('int64')

X = data_new.drop('Fraud', axis=1)  # Features
y = data_new['Fraud']  # Target variable

# Optional exploratory plots (disabled):
#plt.figure(figsize=(4, 4))
#sns.countplot(x='productsListed', hue='Fraud', data=data_new, palette = 'inferno')
#sns.boxplot(x='productsListed', y='socialNbFollowers', hue='Fraud', data=data, palette = 'inferno')
#sns.histplot(x = 'seniority', data = data_new, binwidth = 350, hue = 'Fraud', element = 'step')

# Splitting the data into train and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Renumber each part 0..n-1 so features and labels stay positionally aligned.
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True).astype(int)
y_test = y_test.reset_index(drop=True).astype(int)
#print(data_new.head())

# Force every feature to numeric; values that cannot be parsed become NaN.
X_train = X_train.apply(pd.to_numeric, errors='coerce')
X_test = X_test.apply(pd.to_numeric, errors='coerce')

# NOTE(review): the test features are imputed here with the TEST set's own
# column means, while X_train is left with its NaNs. Confirm this is intended
# rather than imputing both parts with statistics learned from the train set.
X_test.fillna(X_test.mean(), inplace=True)
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")


X_train shape: (965, 223)
y_train shape: (965,)


In [41]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from scipy.stats import randint, uniform

# Search spaces, one per model. Each key is '<pipeline step name>__<parameter>'.
# randint(low, high) is a frozen scipy integer distribution; a plain list is a
# set of discrete choices.
knn_param_distributions = dict(
    knn__n_neighbors=randint(1, 21),
    knn__weights=['uniform', 'distance'],
)

rf_param_distributions = dict(
    rf__n_estimators=randint(10, 200),
    rf__max_depth=randint(3, 30),
)

dt_param_distributions = dict(
    dt__max_depth=randint(1, 30),
    dt__min_samples_split=randint(2, 10),
    dt__min_samples_leaf=randint(1, 10),
)

# scipy's uniform takes (loc, scale), i.e. it spans [loc, loc + scale].
nb_param_distributions = dict(
    nb__var_smoothing=uniform(1e-9, 1e-3),
)

def _make_pipeline(step_name, estimator):
    """Return a pipeline that mean-imputes missing values, then fits *estimator*.

    Args:
        step_name: Name given to the final pipeline step.
        estimator: The classifier placed in the final step.
    """
    return Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        (step_name, estimator),
    ])


# Create one pipeline per classifier. The step names ('knn', 'rf', 'dt', 'nb')
# are the prefixes used by the '<step>__<parameter>' search-space keys.
# The randomised estimators are seeded: random_state on RandomizedSearchCV only
# controls which parameter candidates are drawn, so unseeded forests/trees
# would still give different scores and "best" parameters on every run.
knn_pipeline = _make_pipeline('knn', KNeighborsClassifier())
rf_pipeline = _make_pipeline('rf', RandomForestClassifier(random_state=42))
dt_pipeline = _make_pipeline('dt', DecisionTreeClassifier(random_state=42))
nb_pipeline = _make_pipeline('nb', GaussianNB())

# Every search uses the same budget: 10 sampled candidates, 5-fold
# cross-validation, and a fixed seed for the candidate sampling.
_shared_search_options = dict(n_iter=10, cv=5, random_state=42)

knn_search = RandomizedSearchCV(
    estimator=knn_pipeline,
    param_distributions=knn_param_distributions,
    **_shared_search_options,
)
rf_search = RandomizedSearchCV(
    estimator=rf_pipeline,
    param_distributions=rf_param_distributions,
    **_shared_search_options,
)
dt_search = RandomizedSearchCV(
    estimator=dt_pipeline,
    param_distributions=dt_param_distributions,
    **_shared_search_options,
)
nb_search = RandomizedSearchCV(
    estimator=nb_pipeline,
    param_distributions=nb_param_distributions,
    **_shared_search_options,
)

# Run the four searches on the training data, in a fixed order.
_searches = (knn_search, rf_search, dt_search, nb_search)
for _search in _searches:
    _search.fit(X_train, y_train)

# Predict the held-out set with each search's best estimator.
y_pred_knn, y_pred_rf, y_pred_dt, y_pred_nb = [
    _search.best_estimator_.predict(X_test) for _search in _searches
]

# Accuracy of each set of predictions against the true test labels.
knn_accuracy, rf_accuracy, dt_accuracy, nb_accuracy = [
    accuracy_score(y_test, _predictions)
    for _predictions in (y_pred_knn, y_pred_rf, y_pred_dt, y_pred_nb)
]

# Report the winning hyperparameters and test accuracy, model by model.
print(f"Best parameters for KNeighborsClassifier: {knn_search.best_params_}")
print(f"Accuracy of KNeighborsClassifier: {knn_accuracy:.4f}")

print(f"Best parameters for RandomForestClassifier: {rf_search.best_params_}")
print(f"Accuracy of RandomForestClassifier: {rf_accuracy:.4f}")

print(f"Best parameters for DecisionTreeClassifier: {dt_search.best_params_}")
print(f"Accuracy of DecisionTreeClassifier: {dt_accuracy:.4f}")

print(f"Best parameters for GaussianNB: {nb_search.best_params_}")
print(f"Accuracy of GaussianNB: {nb_accuracy:.4f}")


Best parameters for KNeighborsClassifier: {'knn__n_neighbors': 7, 'knn__weights': 'distance'}
Accuracy of KNeighborsClassifier: 0.7107
Best parameters for RandomForestClassifier: {'rf__max_depth': 23, 'rf__n_estimators': 11}
Accuracy of RandomForestClassifier: 0.6942
Best parameters for DecisionTreeClassifier: {'dt__max_depth': 28, 'dt__min_samples_leaf': 5, 'dt__min_samples_split': 2}
Accuracy of DecisionTreeClassifier: 0.6736
Best parameters for GaussianNB: {'nb__var_smoothing': 0.0009507153064099162}
Accuracy of GaussianNB: 0.4959
