In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

data = pd.read_csv("raw_data.csv")
#remove duplicate columns
if 'identifierHash' in data.columns:
    data.drop('identifierHash', axis=1, inplace=True)
if 'countryCode' in data.columns:
    data.drop('countryCode', axis=1, inplace=True)

# Identify numeric columns (int and float types)
numeric_columns = data.select_dtypes(include=['int64', 'float64']).columns

# Initialize encoders and perform hot encoding on Gender column
label_encoder = LabelEncoder()
one_hot_encoder = OneHotEncoder(sparse_output=False)
converted = pd.get_dummies(data['gender'], drop_first=1)
data = pd.concat([data, converted], axis = 1)
data.drop('gender',axis = 1, inplace=True)
data.rename(columns={'M': 'Male'}, inplace=True)
data = data.fillna(0)

# Handle non-numeric columns separately (e.g., keep them as they are, or convert them to category type)
boolean_columns = ['hasAnyApp', 'hasAndroidApp', 'hasIosApp', 'hasProfilePicture', 'Male']
object_columns = data.select_dtypes(include='object').columns
for col in object_columns:
    data[col] = data[col].astype('category')
# Convert boolean columns to integers
for col in boolean_columns:
    data[col] = data[col].astype(int)

# Convert only numeric columns to integer types
data[numeric_columns] = data[numeric_columns].astype(int)

categorical_columns = ['type', 'country', 'language']
# Convert categorical columns using one-hot encoding
encoded_data = one_hot_encoder.fit_transform(data[categorical_columns])

# Create a DataFrame with one-hot encoded columns
encoded_df = pd.DataFrame(encoded_data, columns=one_hot_encoder.get_feature_names_out(categorical_columns))

# Combine the one-hot encoded columns with the original data, excluding the original categorical columns
data = pd.concat([data, encoded_df], axis=1).drop(categorical_columns, axis=1)

def zscore(array):
    thr = 3
    mean = np.mean(array)
    std = np.std(array)
    z_scores = (array - mean) / std
    return np.abs(z_scores) > thr


combined_condition = ~(zscore(data['socialNbFollows']) | zscore(data['socialNbFollowers']) | zscore(data['productsListed']) | zscore(data['productsSold']) | zscore(data['socialProductsLiked']))
data = data[combined_condition]
data.reset_index(drop=True, inplace=True)

def pure_round(num):
    integer = int(num)
    fraction = num - float(integer)
    if fraction >= 0.5:
        integer += 1
    return integer

data = data[data['productsListed'] != 0]
for i in data.index:
    case_no = data.loc[i,'productsSold']
    pass_no = pure_round((case_no * data.loc[i,'productsPassRate']) / 100)
    fail_no = case_no - pass_no
    data.loc[i,'productsPassed'] = pass_no
    data.loc[i,'productsFailed'] = fail_no
    if case_no == 0:
        data.drop(i, axis=0, inplace=True)
 
data.drop(['productsPassRate','productsSold'], axis=1, inplace=True)
dfdict = {}
for j in data.index:
    x = data.loc[j,'productsPassed']
    y = data.loc[j,'productsFailed']
    if x != 0:
        data.loc[j,'Fraud'] = 0
        df = pd.DataFrame(data.loc[j,:]).transpose()
        ldf = pd.concat([df]*int(x), ignore_index=True)
    
    if y != 0:
        data.loc[j,'Fraud'] = 1
        df2 = pd.DataFrame(data.loc[j,:]).transpose()
        ldf2 = pd.concat([df2]*int(y), ignore_index=True)
    
    if x != 0 and y != 0:
        dfdict[j] = pd.concat([ldf, ldf2], ignore_index=True)
    elif x != 0:
        dfdict[j] = ldf
    else:
        dfdict[j] = ldf2

data_new = pd.concat(dfdict.values(), ignore_index=True)
data_new.drop(['productsPassed','productsFailed'], axis=1, inplace=True)
#print(data_new.describe())
#print(data_new.head(20))
#print(data_new.info())


from sklearn.model_selection import train_test_split

# Assuming 'Fraud' is your target variable
X = data_new.drop('Fraud', axis=1)  # Features
y = data_new['Fraud']  # Target variable
data_new['Fraud'] = pd.to_numeric(data_new['Fraud'], errors='coerce')
data_new['Fraud'] = data_new['Fraud'].astype('int64')

#split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
y_train = pd.to_numeric(y_train, errors='coerce')

# Handle any missing values (NaN) as needed
# For example, you can drop rows with missing values or fill them with a default value:
y_train = y_train.dropna()  # Drop rows with NaN values

# Convert the series to integer type
y_train = y_train.astype(int)


X_train = X_train.apply(pd.to_numeric, errors='coerce')
X_test = X_test.apply(pd.to_numeric, errors='coerce')
y_test = y_test.astype(int)
X_test.fillna(X_test.mean(), inplace=True)
print(X_train)

      socialNbFollowers  socialNbFollows  socialProductsLiked  productsListed  \
1137                  4                8                    0               1   
566                   8                8                    0               2   
277                   3                8                    0               4   
803                   7               10                   19               5   
909                   4                8                    0               2   
...                 ...              ...                  ...             ...   
1044                  6                9                    2               1   
1095                  4                8                    0               1   
1130                  4                8                    2               1   
860                   5                8                    0               1   
1126                  4                8                    0               1   

      productsWished  produ

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc
from sklearn.metrics import classification_report


In [9]:
# Train the Logistic Regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [18]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
# Identify numeric and categorical features
numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X_train.select_dtypes(include=['object']).columns

# Define transformers for numeric and categorical data
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Create the preprocessor
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])


In [19]:

from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder, StandardScaler
# Define a second pipeline without PCA for comparison
pipeline_original_lr = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(max_iter=1000))
])

# Train the pipeline without PCA on the training data
pipeline_original_lr.fit(X_train, y_train)

# Predict on the testing data using the pipeline
y_pred_original_lr = pipeline_original_lr.predict(X_test)

# Calculate accuracy and print classification report
accuracy_original_lr = accuracy_score(y_test, y_pred_original_lr)
report_original_lr = classification_report(y_test, y_pred_original_lr)

print("\nLogistic Regression without PCA:")
print("Accuracy:", accuracy_original_lr)
print(report_original_lr)
# Define the regularization type (penalty) and regularization strength (C)
# 'l1' for L1 regularization and 'l2' for L2 regularization
penalty_type = 'l1'  # Change to 'l1' if you want L1 regularization
C_value =0.5  # Regularization strength; you can experiment with different values

# Define the logistic regression model with the appropriate solver for L1 regularization
solver_type = 'liblinear' if penalty_type == 'l1' else 'lbfgs'

# Create a pipeline without PCA, including preprocessing and logistic regression
pipeline_no_pca_lr = Pipeline([
    ('preprocessor', preprocessor),  # Column transformer for data preprocessing
    ('scaler', StandardScaler()),  # Standard scaling (optional but recommended)
    ('clf', LogisticRegression(
        penalty=penalty_type,
        C=C_value,
        solver=solver_type,
        max_iter=1000,
        random_state=42  # Seed for reproducibility
    ))
])

# Train the pipeline with logistic regression (no PCA) on the training data
pipeline_no_pca_lr.fit(X_train, y_train)

# Predict on the testing data using the pipeline
y_pred_no_pca_lr = pipeline_no_pca_lr.predict(X_test)

# Calculate accuracy and print classification report
accuracy_no_pca_lr = accuracy_score(y_test, y_pred_no_pca_lr)
report_no_pca_lr = classification_report(y_test, y_pred_no_pca_lr)

print("\nLogistic Regression without PCA but with regularization:")
print("Accuracy:", accuracy_no_pca_lr)
print(report_no_pca_lr)



Logistic Regression without PCA:
Accuracy: 0.6157024793388429
              precision    recall  f1-score   support

         0.0       0.63      0.79      0.70       136
         1.0       0.59      0.40      0.47       106

    accuracy                           0.62       242
   macro avg       0.61      0.59      0.59       242
weighted avg       0.61      0.62      0.60       242


Logistic Regression without PCA but with regularization:
Accuracy: 0.6322314049586777
              precision    recall  f1-score   support

         0.0       0.63      0.82      0.71       136
         1.0       0.63      0.40      0.49       106

    accuracy                           0.63       242
   macro avg       0.63      0.61      0.60       242
weighted avg       0.63      0.63      0.61       242



In [20]:
# Evaluate the model
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)

# Print the classification report
print("Logistic Regression model:\n", report)


Logistic Regression model:
               precision    recall  f1-score   support

         0.0       0.63      0.80      0.71       136
         1.0       0.61      0.40      0.48       106

    accuracy                           0.62       242
   macro avg       0.62      0.60      0.59       242
weighted avg       0.62      0.62      0.61       242



In [21]:
# creating a RF classifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators = 100) 

# Training the model on the training dataset
# fit function is used to train the model using the training sets as parameters
clf.fit(X_train, y_train)

# performing predictions on the test dataset
y_pred = clf.predict(X_test)

# metrics are used to find accuracy or error
from sklearn import metrics 
print()

# using metrics module for accuracy calculation
report = classification_report(y_test, y_pred)

# Print the classification report
print("Random Forest:\n", report)



Random Forest:
               precision    recall  f1-score   support

         0.0       0.77      0.77      0.77       136
         1.0       0.70      0.70      0.70       106

    accuracy                           0.74       242
   macro avg       0.74      0.74      0.74       242
weighted avg       0.74      0.74      0.74       242



In [22]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)
pred = knn.predict(X_test)
from sklearn import metrics 
print()

# using metrics module for accuracy calculation
#print("ACCURACY OF THE MODEL:", metrics.accuracy_score(y_test, y_pred))
report = classification_report(y_test, y_pred)

# Print the classification report
print("KNN:\n", report)


KNN:
               precision    recall  f1-score   support

         0.0       0.77      0.77      0.77       136
         1.0       0.70      0.70      0.70       106

    accuracy                           0.74       242
   macro avg       0.74      0.74      0.74       242
weighted avg       0.74      0.74      0.74       242



In [23]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Print the classification report
print("Naive Bayes:\n", report)

Naive Bayes:
               precision    recall  f1-score   support

         0.0       0.62      0.81      0.70       136
         1.0       0.59      0.36      0.45       106

    accuracy                           0.61       242
   macro avg       0.61      0.58      0.57       242
weighted avg       0.61      0.61      0.59       242



In [24]:
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier(n_estimators=301,
                                 learning_rate=0.02,
                                 random_state=100,
                                 max_features=7 )
# Fit to training set
gbc.fit(X_train,y_train)
 
# Predict on test set
pred_y = gbc.predict(X_test)
 
# accuracy
acc = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Print the classification report
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

         0.0       0.62      0.81      0.70       136
         1.0       0.59      0.36      0.45       106

    accuracy                           0.61       242
   macro avg       0.61      0.58      0.57       242
weighted avg       0.61      0.61      0.59       242

