# In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the data
data = pd.read_csv('your_dataset.csv')  # Replace 'your_dataset.csv' with the actual file name

# Step 1: Data Preprocessing

# The raw file marks missing values with '?'; turn them into NaN so pandas
# treats them as missing.
data = data.replace('?', np.nan)

# A row without a target label cannot be used for supervised learning.
# Drop such rows *before* label encoding: otherwise the missing label is
# either rejected by LabelEncoder or encoded as a class of its own, and the
# imputation further down can no longer repair it.
data = data.dropna(subset=['Class']).copy()

# Mark the nominal features as categorical. This only changes the dtype;
# the values themselves are not converted to numbers here.
categorical_cols = ['V2', 'V3', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13']
data = data.astype({col: 'category' for col in categorical_cols})

# Encode the 'Class' column as integer labels
label_encoder = LabelEncoder()
data['Class'] = label_encoder.fit_transform(data['Class'])

# Impute the remaining missing feature values with each column's most
# frequent value (row 0 of DataFrame.mode() holds the first mode per column).
data = data.fillna(data.mode().iloc[0])

# Step 2: Data Exploration and Analysis

# Coerce the numerical columns first, so the summaries and plots below
# describe exactly the rows and dtypes that remain in `data` afterwards.
# Values that cannot be parsed become NaN and those rows are dropped.
num_cols = ['V4']
data[num_cols] = data[num_cols].apply(pd.to_numeric, errors='coerce')
data = data.dropna(subset=num_cols)

# Explore the dataset (e.g., check for missing values, data statistics).
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print().
data.info()
print(data.describe())

# Visualize the data
sns.set(style="whitegrid")

# Countplot for the 'Class' distribution, drawn on its own axes
_, class_ax = plt.subplots(figsize=(6, 6))
sns.countplot(x='Class', data=data, ax=class_ax)
class_ax.set_title('Class Distribution')

# Pairplot for numerical columns.
# pairplot is a figure-level function: it always builds its own figure and
# cannot be placed into an existing subplot, so it gets a separate figure
# and is titled through that figure rather than through plt.title().
pair_grid = sns.pairplot(data, hue='Class', vars=num_cols)
pair_grid.figure.suptitle('Pairplot of Numerical Features', y=1.02)

plt.show()

# Step 3: Model Building

# scikit-learn estimators require a purely numeric feature matrix, so the
# categorical/object feature columns are one-hot encoded here; numeric
# columns pass through get_dummies unchanged.
X = pd.get_dummies(data.drop('Class', axis=1))
y = data['Class']

# Split the data into training and testing sets (80% / 20%, fixed seed for
# reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Choose a classification algorithm (Random Forest in this example)
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(report)
