In [1]:
import numpy as np
import pandas as pd
import joblib  # For saving the model, scaler, and features
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import RandomOverSampler

# Load the raw survey data from the CSV file in the working directory.
df = pd.read_csv('survey lung cancer.csv')

# Initial inspection: first rows, summary statistics, missing values per
# column, and the number of fully duplicated rows.
print(df.head())
print(df.describe())
print(df.isna().sum())
print(df.duplicated().sum())

# Remove exact duplicate rows, then re-check duplicates and missing values.
df.drop_duplicates(inplace=True)
print(df.duplicated().sum())
print(df.isna().sum())

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()

# Recode the two text columns as integers. Note that the target is encoded as
# YES -> 1 and NO -> 2.
label_codes = {
    'GENDER': {'M': 1, 'F': 2},
    'LUNG_CANCER': {'YES': 1, "NO": 2},
}
for column_name, codes in label_codes.items():
    df[column_name] = df[column_name].replace(codes)

# Force every column to a numeric dtype (values that cannot be parsed become
# NaN), then discard any row that contains a NaN.
df = df.apply(pd.to_numeric, errors='coerce').dropna()

# Define features and target
X = df.drop(columns=['LUNG_CANCER'])  # Features
y = df['LUNG_CANCER']  # Target

# Split FIRST, on the original (un-scaled, un-resampled) rows, so the test set
# contains only genuine observations the model has never been exposed to.
# Stratifying keeps the class ratio of the full data in both partitions.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training rows only; the test rows are transformed with
# the training statistics. Fitting on all rows would leak test-set mean and
# variance into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Oversample the minority class in the training partition only. Resampling
# before the split would place copies of the same row in both train and test
# and make the evaluation optimistic.
ros = RandomOverSampler(random_state=42)
X_train, y_train = ros.fit_resample(X_train_scaled, y_train_raw)

# Fit K-Means with two clusters, one per outcome (YES/NO for lung cancer).
# n_init is given explicitly because its default differs between scikit-learn
# releases; pinning it keeps the fit reproducible across versions.
kmeans_model = KMeans(n_clusters=2, n_init=10, random_state=42)
kmeans_model.fit(X_train)

# K-Means is unsupervised, so which cluster ends up with index 0 is arbitrary.
# Use the training labels to find the cluster that is richer in label 1 (YES)
# and, if that is not cluster 0, swap the two centres. After this step the
# fitted (and later saved) model follows one fixed convention:
#   cluster 0 -> label 1 (YES), cluster 1 -> label 2 (NO).
train_labels = np.asarray(y_train)
train_clusters = kmeans_model.predict(X_train)
yes_rate = [
    np.mean(train_labels[train_clusters == cluster] == 1)
    for cluster in (0, 1)
]
if yes_rate[0] < yes_rate[1]:
    # .copy() makes the reversed view a fresh contiguous array again.
    kmeans_model.cluster_centers_ = kmeans_model.cluster_centers_[::-1].copy()
    kmeans_model.labels_ = 1 - kmeans_model.labels_

# Make predictions on the test data
y_pred = kmeans_model.predict(X_test)

# Evaluate the model: translate cluster indices to the label encoding using
# the convention established above.
y_pred_mapped = np.where(y_pred == 0, 1, 2)
accuracy = accuracy_score(y_test, y_pred_mapped)
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred_mapped))

# Persist everything needed to reuse the model later: the fitted K-Means
# estimator, the fitted scaler, and the feature column names. Each object is
# written to its own file in the working directory, in the order listed.
artifacts = {
    'kmeans_lung_cancer_model.pkl': kmeans_model,
    'scaler.pkl': scaler,
    'features.pkl': X.columns,
}
for artifact_path, artifact in artifacts.items():
    joblib.dump(artifact, artifact_path)

print("Model, scaler, and features saved successfully.")


  GENDER AGE SMOKING YELLOW FINGERS ANXIETY PEER PRESSURE CHRONIC DISEASE  \
0      M  69       1              2       2             1               1   
1      M  74       2              1       1             1               2   
2      F  59       1              1       1             2               1   
3      M  63       2              2       2             1               1   
4      F  63       1              2       1             1               1   

  FATIGUE ALLERGY WHEEZING ALCOHOL CONSUMING COUGHING SHORTNESS OF BREATH  \
0       2       1        2                 2        2                   2   
1       2       2        1                 1        1                   2   
2       2       1        2                 1        2                   2   
3       1       1        1                 2        1                   1   
4       1       1        2                 1        2                   2   

  SWALLOWING DIFFICULTY CHEST PAIN LUNG_CANCER  
0                     2  