# Lung Cancer Prediction

Problem Statement: What causes lung cancer, and how should it be treated?

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
from matplotlib.cm import rainbow
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

Here we will be experimenting with 4 algorithms:
1. KNeighborsClassifier
2. DecisionTreeClassifier
3. RandomForestClassifier
4. Logistic Regression


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Read the patient records from the Excel workbook into the working DataFrame.
df = pd.read_excel(io='cancer_patient_data_sets.xlsx')

In [None]:
# Print a summary of the frame: columns, non-null counts and dtypes.
df.info()


In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Descriptive statistics computed separately for each value of 'Level'.
df.groupby('Level').describe()

In [None]:
# How many rows fall into each 'Level' class (checks class balance).
df['Level'].value_counts()


In [None]:
# How many rows there are for each distinct 'Gender' value.
df['Gender'].value_counts()

In [None]:

# Encode the ordinal severity labels as integers (Low < Medium < High).
level_codes = {'Low': 0, 'Medium': 1, 'High': 2}
df['Level'] = df['Level'].replace(level_codes)




In [None]:
# Preview the first rows of the frame.
df.head()

In [None]:
# Display the 'Level' column.
df.Level

In [None]:
# Preview the first rows of the frame again.
df.head()

In [None]:
# Pairwise correlation matrix of the numeric columns. corr must be *called*;
# numeric_only=True skips any non-numeric columns (presumably including
# 'Patient Id', which has not been dropped yet at this point).
corrmat = df.corr(numeric_only=True)

In [None]:
# Display the current value of corrmat.
corrmat

In [None]:
# Remove the identifier column. errors='ignore' keeps the cell safe to re-run
# once the column is already gone (a plain drop would raise KeyError).
df = df.drop(columns='Patient Id', errors='ignore')

In [None]:
import seaborn as sns

# Correlation matrix of the numeric columns, computed once and reused for the
# plot instead of being recomputed inside the heatmap call.
corrmat = df.corr(numeric_only=True)
top_corr_features = corrmat.index

plt.figure(figsize=(20, 20))
# Annotated heat map of the pairwise correlation coefficients.
sns.heatmap(corrmat, annot=True, cmap="RdYlGn",
            cbar_kws={'label': 'Correlation Coefficient'})

plt.title('Correlation Matrix')
plt.savefig('correlation.png')

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load dataset
df = pd.read_excel("cancer_patient_data_sets.xlsx")

# Ordinal encoding of the target, used only for the correlation below.
# It is deliberately NOT stored as a column of df: later cells build the
# feature matrix from df, and a numeric copy of the target among the features
# would leak the answer to every model.
level_num = df['Level'].map({'Low': 1, 'Medium': 2, 'High': 3})

# Correlation of every numeric column with the encoded target.
corr_with_level = (
    df.assign(Level_num=level_num)
    .corr(numeric_only=True)['Level_num']
    .drop('Level_num')
    .sort_values(ascending=False)
)

# Print correlations
print("Correlation of each column with Level:\n")
print(corr_with_level)

# Plot correlations as a bar chart
plt.figure(figsize=(8, 6))
sns.barplot(x=corr_with_level.values, y=corr_with_level.index, palette="RdYlGn")
plt.title("Correlation of Each Feature with Cancer Level (Low, Medium, High)")
plt.xlabel("Correlation Coefficient")
plt.ylabel("Features")
plt.tight_layout()
plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Plot only real predictors: leave out the target ('Level'), its numeric copy
# ('Level_num', if present) and the row identifier ('Patient Id', if present).
# errors='ignore' lets the cell run whichever of these columns currently exist.
non_feature_cols = ['Level', 'Level_num', 'Patient Id']
features = df.columns.drop(non_feature_cols, errors='ignore')
num_cols = 4
num_rows = -(-len(features) // num_cols)  # ceiling division: rows needed for the grid

plt.figure(figsize=(24, num_rows * 4))

# One stacked histogram per feature, coloured by cancer level.
for i, col in enumerate(features):
    plt.subplot(num_rows, num_cols, i + 1)
    sns.histplot(data=df, x=col, hue='Level', multiple='stack', palette='viridis')
    plt.title(f'{col} by Level')
    plt.xlabel(col)
    plt.ylabel('Count')

plt.tight_layout()
plt.show()


The histograms above show, for each feature, how the values are distributed across the
three levels (Low, Medium, High); the Y axis is the count of records.

In [None]:
# Bar chart of the number of records in each 'Level' class.
sns.set_style('whitegrid')
sns.countplot(data=df, x='Level', palette='RdBu_r')

Import `train_test_split` to separate the training data from the test data.


The split uses 70% of the data for training and 30% for testing.


In [None]:

# Target: the cancer severity label.
y = df['Level']
# Features: every column except the target, its numeric copy ('Level_num') and
# the row identifier ('Patient Id'). errors='ignore' tolerates columns that
# are not present in df.
X = df.drop(columns=['Level', 'Level_num', 'Patient Id'], errors='ignore')


Split the data with `train_test_split`: 30% of the data is used for testing and 70% for training.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [None]:
# Show the dtype of each training column (estimators need numeric inputs).
print(X_train.dtypes)


In [None]:


# Rebuild target and features from df. Besides the row identifier
# ('Patient Id'), the target itself ('Level') and its numeric copy
# ('Level_num') must be removed from X; otherwise the models are trained on
# the answer. errors='ignore' tolerates columns that are not present in df.
y = df['Level']
X = df.drop(columns=['Patient Id', 'Level', 'Level_num'], errors='ignore')

# Then split: 70% train / 30% test, fixed seed for a repeatable split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [None]:
# (rows, columns) of the feature matrix.
X.shape

In [None]:
# Shape of the target; its length should match the number of rows in X.
y.shape

In [None]:
# Report the dimensions of each split: the training pair first, then the test pair.
for caption, part in (
    ("Shape of X_train:", X_train),
    ("Shape of y_train:", y_train),
    ("Shape of X_test:", X_test),
    ("Shape of y_test:", y_test),
):
    print(caption, part.shape)

I have implemented a few machine learning models to check which one gives the highest score.

In [None]:


# Open a 10x6-inch figure and set seaborn's global grid style and colour palette.
# NOTE(review): nothing is drawn on this figure within this cell — confirm the
# plt.figure call is still needed.
plt.figure(figsize=(10,6))
sns.set_style('whitegrid')
sns.set_palette('Set2')


In [None]:

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report


# Candidate classifiers, keyed by the name shown in the results table.
# The tree-based models are randomised, so they get the same fixed seed as the
# train/test split; without it the table changes on every run.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'KNN': KNeighborsClassifier(),
}

# === Model evaluation ===
# Each model is fitted on the training split and scored on the held-out test
# split. Macro averaging weights the classes equally.
results = []

for name, model in models.items():
    print(f"Training: {name}")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')
    precision = precision_score(y_test, y_pred, average='macro')

    results.append({
        'Model': name,
        'Accuracy': acc,
        'F1-score': f1,
        'Recall': recall,
        'Precision': precision
    })

# === Final results as DataFrame sorted by Accuracy ===
results_df = pd.DataFrame(results).sort_values(by='Accuracy', ascending=False)
results_df

Plotting the confusion matrix for each algorithm

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Plot a confusion matrix for each fitted model, evaluated on the test split.
for name, model in models.items():
    print(f"\nConfusion Matrix: {name}")
    y_pred = model.predict(X_test)

    # Use the classes the model was trained on both to order the matrix and to
    # label the axes; without display_labels the ticks are just 0, 1, 2 and the
    # reader cannot tell which row/column is which level.
    class_labels = model.classes_
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)

    # Plot
    disp.plot(cmap='Blues')
    plt.title(f'Confusion Matrix - {name}')
    plt.grid(False)
    plt.show()

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt

# Candidate neighbour counts: k = 1 .. 30
k_values = range(1, 31)

# Mean 3-fold cross-validation accuracy for each k.
# The search runs on the training split only: choosing k on data that includes
# the test rows would make the later test-set score optimistic.
knn_scores = []
for k in k_values:
    knn_classifier = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn_classifier, X_train, y_train, cv=3)
    knn_scores.append(scores.mean())

# Find the best k (first k reaching the top score)
best_score = max(knn_scores)
best_k = knn_scores.index(best_score) + 1  # +1 because k_values starts at 1

print(f"Best k: {best_k}")
print(f"Best cross-validation score: {best_score:.4f}")

# Optional: Plotting k vs CV accuracy
plt.figure(figsize=(10, 5))
plt.plot(k_values, knn_scores, marker='o')
plt.xlabel('Number of Neighbors K')
plt.ylabel('Cross-Validated Accuracy')
plt.title('KNN: Number of Neighbors vs Accuracy')
plt.xticks(k_values)
plt.grid(True)
plt.show()


Cross-validation gives the best `n_neighbors` value, which I have used in the code below.


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report


# 1. Target and features. The target, its numeric copy ('Level_num') and the
#    row identifier ('Patient Id') are excluded from the predictors;
#    errors='ignore' tolerates columns that are not present in df.
y = df['Level']
X = df.drop(columns=['Level', 'Level_num', 'Patient Id'], errors='ignore')

# 2. Split into training and testing sets (70% / 30%, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 3. Initialize and train the KNN classifier with the k selected by the
#    cross-validation search instead of a hard-coded value.
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train, y_train)

# 4. Predict the levels of the test set
y_pred = knn.predict(X_test)

# EVALUATION
# 'weighted' averages the per-class scores weighted by each class's support.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

# Optional: detailed report
print("\nClassification Report:\n", classification_report(y_test, y_pred))


In [None]:

k_axis = range(1, 31)

plt.figure(figsize=(12, 6))

# Accuracy curve over the candidate neighbour counts
plt.plot(k_axis, knn_scores, color='red', marker='o', linestyle='-')

# Annotate only the five best-scoring k values to keep the chart readable
ranked = sorted(enumerate(knn_scores, start=1), key=lambda pair: pair[1], reverse=True)
top_k = ranked[:5]
for k, score in top_k:
    plt.text(k, score + 0.001, f'({k}, {score:.3f})', ha='center', fontsize=9, color='blue')

plt.title('K Neighbour Classifier Scores for Different K Values')
plt.xlabel('Number of Neighbors (K)')
plt.ylabel('Accuracy Score')
plt.xticks(k_axis)
plt.grid(True)
plt.tight_layout()
plt.show()


Decision Tree classifier 

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

# Single decision tree (so no n_estimators). A fixed random_state makes the
# cross-validation score reproducible from run to run.
decision_tree_classifier = DecisionTreeClassifier(random_state=42)

# Accuracy on each of the 5 cross-validation folds of X, y.
score = cross_val_score(decision_tree_classifier, X, y, cv=5)
print("Cross-validation score:", score.mean())


In [None]:
# Mean of the cross-validation fold scores.
score.mean()

Using this dataset and plotting the correlation matrix,
I noticed that the features most strongly associated with the lung cancer level are
Coughing of Blood and Obesity. Note that a correlation shows association, not proof of cause.