In [73]:
# Initial imports
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
%matplotlib inline

In [99]:
file_path = Path("Resources/cleaned_cancer_data.csv")
lung_cancer_prediction_df = pd.read_csv(file_path)
lung_cancer_prediction_df = lung_cancer_prediction_df[lung_cancer_prediction_df['Overall Survival Status'] != '0']
lung_cancer_prediction_df

Unnamed: 0.1,Unnamed: 0,Diagnosis Age,Fraction Genome Altered,Mutation Count,Overall Survival Status,TMB (nonsynonymous),Cancer Type Detailed,Prior Cancer Diagnosis Occurence,Smoking History,Sex,Person Cigarette Smoking History Pack Year Value
159,159,70,0.4565,189,0:LIVING,6.300000,1,0,1,0,38.0
160,160,67,0.2221,288,0:LIVING,9.633333,1,0,1,0,52.0
161,161,79,0.2362,296,1:DECEASED,9.833333,1,0,1,1,47.0
162,162,68,0.0854,1625,0:LIVING,54.233333,1,1,1,0,62.0
163,163,66,0.0661,122,0:LIVING,4.066667,1,1,1,0,20.0
...,...,...,...,...,...,...,...,...,...,...,...
1139,1139,75,0.2382,211,1:DECEASED,7.033333,0,0,1,1,1.0
1140,1140,63,0.5420,101,1:DECEASED,3.400000,0,1,1,0,2.5
1141,1141,71,0.4405,216,1:DECEASED,7.200000,0,0,1,1,2.5
1142,1142,68,0.0598,109,0:LIVING,3.633333,1,0,1,1,95.0


In [113]:
target = lung_cancer_prediction_df["Overall Survival Status"]
target_names = ["Living","Deceased"]

In [103]:
model_data = lung_cancer_prediction_df.drop("Overall Survival Status",axis=1)
feature_names = model_data.columns

In [105]:
X_train, X_test, y_train, y_test = train_test_split(model_data, target)

In [107]:
# Import the KMeans module from SKLearn
from sklearn.svm import SVC
svc_model = SVC(kernel='linear')
svc_model.fit(X_train,y_train)

In [115]:
print("Test Acc: %.3f" % svc_model.score(X_test,y_test))

Test Acc: 0.715


In [117]:
predictions = svc_model.predict(X_test)
print(classification_report(y_test, predictions,
                            target_names=target_names))

              precision    recall  f1-score   support

      Living       0.72      1.00      0.83       176
    Deceased       0.00      0.00      0.00        70

    accuracy                           0.72       246
   macro avg       0.36      0.50      0.42       246
weighted avg       0.51      0.72      0.60       246



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Random Forest Classifier

In [119]:
X = lung_cancer_prediction_df.copy()
X.drop("Overall Survival Status",axis=1,inplace=True)
y = lung_cancer_prediction_df["Overall Survival Status"]

In [121]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [123]:
# Creating StandardScaler instance
scaler = StandardScaler()

In [125]:
# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)

In [127]:
# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [129]:
# Create a random forest classifier
model = RandomForestClassifier(n_estimators=300, random_state=78)
# Model fitting
model = model.fit(X_train_scaled, y_train)

In [131]:
rf_predictions = model.predict(X_test_scaled)

In [66]:
# Calculating the accuracy score
acc_score = accuracy_score(y_test, rf_predictions)

In [133]:
print("Classification Report\n")
print(f"Accuracy score: {acc_score}")
print(classification_report(y_test,rf_predictions))

Classification Report

Accuracy score: 0.7027972027972028
              precision    recall  f1-score   support

    0:LIVING       0.77      0.94      0.85       175
  1:DECEASED       0.70      0.32      0.44        71

    accuracy                           0.76       246
   macro avg       0.74      0.63      0.65       246
weighted avg       0.75      0.76      0.73       246



In [135]:
# Import the KNeighborsClassifier module from sklearn
from sklearn.neighbors import KNeighborsClassifier

# Instantiate the KNeighborsClassifier model with n_neighbors = 3 
knn = KNeighborsClassifier(n_neighbors=3)

In [137]:
# Train the model using the training data
knn.fit(X_train_scaled, y_train)

In [139]:
# Create predictions using the testing data
y_pred = knn.predict(X_test_scaled)

In [141]:
# Print the classification report comparing the testing data to the model predictions
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    0:LIVING       0.75      0.83      0.79       175
  1:DECEASED       0.43      0.31      0.36        71

    accuracy                           0.68       246
   macro avg       0.59      0.57      0.57       246
weighted avg       0.66      0.68      0.67       246



In [None]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

In [None]:
## Decision tree model prediction

In [147]:
# Importing tree classifier
from sklearn import tree
# Creating the decision tree classifier instance
tree_model = tree.DecisionTreeClassifier()
# Fitting the model
tree_model = tree_model.fit(X_train_scaled, y_train)
# Making predictions
predictions = tree_model.predict(X_test_scaled)

In [149]:
acc_score = accuracy_score(y_test, predictions)
print(f"Accuracy Score: {acc_score}")
print(classification_report(y_test, predictions))

Accuracy Score: 0.7398373983739838
              precision    recall  f1-score   support

    0:LIVING       0.84      0.79      0.81       175
  1:DECEASED       0.54      0.62      0.58        71

    accuracy                           0.74       246
   macro avg       0.69      0.70      0.70       246
weighted avg       0.75      0.74      0.74       246



## Logistic Regression Model

In [153]:
# Import logistic regression model
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs',random_state=1)
classifier

In [155]:
classifier.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [157]:
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.720108695652174
Testing Data Score: 0.7195121951219512


In [165]:
# Predicting outcomes for testing
predictions = classifier.predict(X_test)
accuracy_score(y_test, predictions)

0.7195121951219512

In [172]:
lung_cancer_prediction_df.to_csv("Resources/cleaned_cancer_data.csv")