In [3]:
# Data handling
import pandas as pd
import numpy as np
# Preprocessing
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
# Feature Selection
from sklearn.feature_selection import SelectKBest, chi2
# Machine Learning Models
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
# Evaluation
from sklearn.metrics import classification_report, confusion_matrix
# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
# Ignore warnings (optional)
import warnings
warnings.filterwarnings('ignore')

# Location of the raw CSV (absolute Windows path on the author's machine).
file_path = r"D:\projects\PROJET-DATA-MINING\data2.csv"

# Parse the CSV into the DataFrame that every later cell works on.
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows as the cell's displayed result.
data.head(n=5)


Unnamed: 0,Profession,Income,Credit_card_number,Expiry,Security_code,Fraud
0,DOCTOR,42509,3515418493460774,07/25,251,1
1,DOCTOR,80334,213134223583196,05/32,858,1
2,LAWYER,91552,4869615013764888,03/30,755,1
3,LAWYER,43623,341063356109385,01/29,160,1
4,DOCTOR,22962,4707418777543978402,11/30,102,0


In [4]:
# Check dataset structure.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit an extra "None" line.
data.info()

# Check for missing values (count of nulls per column)
print(data.isnull().sum())

# View descriptive statistics of the numeric columns
print(data.describe())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Profession          10000 non-null  object
 1   Income              10000 non-null  int64 
 2   Credit_card_number  10000 non-null  int64 
 3   Expiry              10000 non-null  object
 4   Security_code       10000 non-null  int64 
 5   Fraud               10000 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 468.9+ KB
None
Profession            0
Income                0
Credit_card_number    0
Expiry                0
Security_code         0
Fraud                 0
dtype: int64
            Income  Credit_card_number  Security_code         Fraud
count  10000.00000        1.000000e+04   10000.000000  10000.000000
mean   49761.20600        3.851363e+17     863.587800      0.501600
std    28837.72928        1.257950e+18    1484.424959      0.500022
min        1.00000     

In [7]:
# Drop rows with missing values, then renumber the index 0..n-1.
# Without the reset, any removed row leaves a gap in the index, and frames that
# are later built positionally and joined with pd.concat(axis=1) (which aligns
# on index labels) would no longer line up row-for-row with `data`.
data = data.dropna().reset_index(drop=True)

# Alternatively, fill missing values
# data['Income'] = data['Income'].fillna(data['Income'].mean())

# Remove the card number and expiry columns; they are not part of the feature
# set used by the following cells.
data = data.drop(columns=['Credit_card_number', 'Expiry'])

In [8]:
# Learn the set of Profession labels, then replace each label by its integer code.
le = LabelEncoder()
le.fit(data['Profession'])
data['Profession'] = le.transform(data['Profession'])

In [10]:
# Min-max scale the two numeric columns (default feature range of MinMaxScaler).
scaler = MinMaxScaler()
scaler.fit(data[['Income', 'Security_code']])
data[['Income', 'Security_code']] = scaler.transform(data[['Income', 'Security_code']])

In [11]:
# Target vector: the Fraud label column.
y = data['Fraud']
# Feature matrix: every remaining column.
X = data.drop(columns='Fraud')

In [12]:
# Keep at most the 5 best features by chi-squared score.
# k is clamped to the number of columns in X: asking for more features than
# exist is an error in some scikit-learn versions and only a warning in others
# (and warnings are silenced globally in this notebook), so an oversized k
# either fails or silently selects everything.
selector = SelectKBest(score_func=chi2, k=min(5, X.shape[1]))
X_new = selector.fit_transform(X, y)

# Print the selected feature names (boolean support mask applied to X's columns)
selected_features = X.columns[selector.get_support()]
print("Selected Features:", selected_features)

Selected Features: Index(['Profession', 'Income', 'Security_code'], dtype='object')


In [14]:
# Report whether a free-text 'Description' column is available for TF-IDF;
# if it is, show its first rows.
if 'Description' not in data.columns:
    print("No text column found for TF-IDF.")
else:
    print(data['Description'].head())

No text column found for TF-IDF.


In [15]:
# Synthetic free-text descriptions used to populate a 'Description' column.
descriptions = [
    "Doctor with high income",
    "Engineer specialized in software",
    "Lawyer focused on corporate law",
    "Engineer in mechanical systems",
    "Doctor working in cardiology",
    "Engineer in AI research",
    "Lawyer specializing in criminal law",
    "Doctor with expertise in pediatrics",
    "Engineer handling robotics projects",
    "Lawyer working in public defense",
]

# Cycle through the list so that row i receives descriptions[i % len(descriptions)],
# yielding exactly one text per row of `data`.
# NOTE(review): the texts are assigned purely by row position and are not tied
# to each row's Profession value - confirm this is intended.
data['Description'] = [
    descriptions[row % len(descriptions)] for row in range(len(data))
]

# Show the first rows including the new column
print(data.head())

   Profession    Income  Security_code  Fraud  \
0           0  0.425144       0.025125      1   
1           0  0.803451       0.085886      1   
2           2  0.915647       0.075576      1   
3           2  0.436285       0.016016      1   
4           0  0.229644       0.010210      0   

                        Description  
0           Doctor with high income  
1  Engineer specialized in software  
2   Lawyer focused on corporate law  
3    Engineer in mechanical systems  
4      Doctor working in cardiology  


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Apply TF-IDF to the Description column, keeping at most 100 terms
vectorizer = TfidfVectorizer(max_features=100)
tfidf_matrix = vectorizer.fit_transform(data['Description'])

# Convert the sparse TF-IDF matrix to a dense DataFrame, one column per term.
# The frame reuses data's index: pd.concat(axis=1) aligns rows by index label,
# so a default 0..n-1 index would misalign (and introduce NaN rows) whenever
# data's index is not a plain RangeIndex, e.g. after rows were dropped.
tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=vectorizer.get_feature_names_out(),
    index=data.index,
)

# Append the TF-IDF columns to the existing features
data = pd.concat([data, tfidf_df], axis=1)

# Drop the raw text column now that it has been vectorized
data = data.drop('Description', axis=1)

# Check the updated dataset
print(data.head())

   Profession    Income  Security_code  Fraud   ai  cardiology  corporate  \
0           0  0.425144       0.025125      1  0.0    0.000000   0.000000   
1           0  0.803451       0.085886      1  0.0    0.000000   0.000000   
2           2  0.915647       0.075576      1  0.0    0.000000   0.495685   
3           2  0.436285       0.016016      1  0.0    0.000000   0.000000   
4           0  0.229644       0.010210      0  0.0    0.668337   0.000000   

   criminal  defense    doctor  ...  projects  public  research  robotics  \
0       0.0      0.0  0.380943  ...       0.0     0.0       0.0       0.0   
1       0.0      0.0  0.000000  ...       0.0     0.0       0.0       0.0   
2       0.0      0.0  0.000000  ...       0.0     0.0       0.0       0.0   
3       0.0      0.0  0.000000  ...       0.0     0.0       0.0       0.0   
4       0.0      0.0  0.446087  ...       0.0     0.0       0.0       0.0   

   software  specialized  specializing   systems      with   working  
0  