In [1]:
# Set dependencies
import pandas as pd
  # Import the train_test_split function
from sklearn.model_selection import train_test_split
  # Import the LogisticRegression class from scikit-learn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

# Logistic Regression Model

In [2]:
# Load the tab-separated clinical data file from the Resources folder
# into a Pandas DataFrame and preview the first rows
CLINICAL_DATA_PATH = '../Resources/nsclc_tcga_broad_2016_clinical_data.tsv'
lung_cancer_data_df = pd.read_csv(CLINICAL_DATA_PATH, sep='\t')
lung_cancer_data_df.head()

Unnamed: 0,Study ID,Patient ID,Sample ID,Diagnosis Age,Age At Surgery,Cancer Type,Cancer Type Detailed,Death from Initial Pathologic Diagnosis Date,Days to Last Followup,Fraction Genome Altered,...,Overall Survival (Months),Overall Survival Status,Number of Samples Per Patient,Sex,Smoking History,Person Cigarette Smoking History Pack Year Value,Somatic Status,Stage,TMB (nonsynonymous),T Stage
0,nsclc_tcga_broad_2016,LUAD-2GUGK,LUAD-2GUGK-Tumor,,68.0,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,,,0.0174,...,,,1,Female,"Current Reformed Smoker, Duration Not Specified",30.0,Matched,IIA,7.933333,
1,nsclc_tcga_broad_2016,LUAD-5O6B5,LUAD-5O6B5-Tumor,,56.0,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,,,0.0356,...,,,1,Female,Lifelong Non-Smoker,0.0,Matched,IB,1.033333,
2,nsclc_tcga_broad_2016,LUAD-5V8LT,LUAD-5V8LT-Tumor,,52.0,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,,,0.3895,...,,,1,Male,"Current Reformed Smoker, Duration Not Specified",6.0,Matched,IB,49.433333,
3,nsclc_tcga_broad_2016,LUAD-74TBW,LUAD-74TBW-Tumor,,73.0,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,,,0.1757,...,,,1,Male,"Current Reformed Smoker, Duration Not Specified",61.25,Matched,IB,7.8,
4,nsclc_tcga_broad_2016,LUAD-AEIUF,LUAD-AEIUF-Tumor,,60.0,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,,,0.1942,...,,,1,Female,"Current Reformed Smoker, Duration Not Specified",73.5,Matched,,5.333333,


In [3]:
# Encode the text-valued category columns as 0/1 codes.
# Several labels may share a code (every smoker category -> 1), and note
# that for 'Prior Cancer Diagnosis Occurence' the label 'No' is coded as 1.
BINARY_ENCODINGS = {
    'Cancer Type Detailed': {
        'Lung Adenocarcinoma': 0,
        'Lung Squamous Cell Carcinoma': 1,
    },
    'Prior Cancer Diagnosis Occurence': {
        'No': 1,
        'Yes, history of prior malignancy': 0,
        'Yes, history of synchronous and or bilateral malignancy': 0,
    },
    'Sex': {
        'Male': 0,
        'Female': 1,
    },
    'Smoking History': {
        'Lifelong Non-Smoker': 0,
        'Current Smoker': 1,
        'Current Reformed Smoker For < Or = 15 Years': 1,
        'Current Reformed Smoker For > 15 Years': 1,
        'Current Reformed Smoker, Duration Not Specified': 1,
    },
}

# Apply each mapping to its own column; values that are not keys of the
# mapping are left as they are by replace()
for column_name, label_codes in BINARY_ENCODINGS.items():
    lung_cancer_data_df[column_name] = lung_cancer_data_df[column_name].replace(label_codes)

  lung_cancer_data_df['Cancer Type Detailed'] = lung_cancer_data_df['Cancer Type Detailed'].replace(
  lung_cancer_data_df['Prior Cancer Diagnosis Occurence'] = lung_cancer_data_df['Prior Cancer Diagnosis Occurence'].replace(
  lung_cancer_data_df['Sex'] = lung_cancer_data_df['Sex'].replace(
  lung_cancer_data_df['Smoking History'] = lung_cancer_data_df['Smoking History'].replace(


In [4]:
# Keep only the columns used for modelling: the numeric measurements, the
# 0/1-encoded category columns, and the 'Overall Survival Status' label
MODEL_COLUMNS = [
    'Diagnosis Age',
    'Fraction Genome Altered',
    'Mutation Count',
    'Overall Survival Status',
    'TMB (nonsynonymous)',
    'Cancer Type Detailed',
    'Prior Cancer Diagnosis Occurence',
    'Smoking History',
    'Sex',
    'Person Cigarette Smoking History Pack Year Value',
]
lung_cancer_data_df_new = lung_cancer_data_df[MODEL_COLUMNS]

# Drop every row that has a missing value in ANY of the selected columns
# (not only in 'Overall Survival Status')
lung_cancer_data_df_cleaned = lung_cancer_data_df_new.dropna(how='any')
lung_cancer_data_df_cleaned

Unnamed: 0,Diagnosis Age,Fraction Genome Altered,Mutation Count,Overall Survival Status,TMB (nonsynonymous),Cancer Type Detailed,Prior Cancer Diagnosis Occurence,Smoking History,Sex,Person Cigarette Smoking History Pack Year Value
159,70.0,0.4565,189,0:LIVING,6.300000,0,1.0,1.0,0.0,38.0
160,67.0,0.2221,288,0:LIVING,9.633333,0,1.0,1.0,0.0,52.0
161,79.0,0.2362,296,1:DECEASED,9.833333,0,1.0,1.0,1.0,47.0
162,68.0,0.0854,1625,0:LIVING,54.233333,0,0.0,1.0,0.0,62.0
163,66.0,0.0661,122,0:LIVING,4.066667,0,0.0,1.0,0.0,20.0
...,...,...,...,...,...,...,...,...,...,...
1139,75.0,0.2382,211,1:DECEASED,7.033333,1,1.0,1.0,1.0,1.0
1140,63.0,0.5420,101,1:DECEASED,3.400000,1,0.0,1.0,0.0,2.5
1141,71.0,0.4405,216,1:DECEASED,7.200000,1,1.0,1.0,1.0,2.5
1142,68.0,0.0598,109,0:LIVING,3.633333,0,1.0,1.0,1.0,95.0


In [5]:
# Split the cleaned frame into the label Series (y) and the feature
# DataFrame (X)
TARGET_COLUMN = 'Overall Survival Status'

# y: the labels the model will learn to predict
y = lung_cancer_data_df_cleaned[TARGET_COLUMN]

# X: every remaining column, used as model features
X = lung_cancer_data_df_cleaned.drop(columns=[TARGET_COLUMN])


In [6]:
# Preview the first ten labels of the y Series
y.head(n=10)

159      0:LIVING
160      0:LIVING
161    1:DECEASED
162      0:LIVING
163      0:LIVING
164      0:LIVING
165      0:LIVING
167    1:DECEASED
168    1:DECEASED
169      0:LIVING
Name: Overall Survival Status, dtype: object

In [7]:
# Preview the first five rows of the X feature DataFrame
X.head(n=5)

Unnamed: 0,Diagnosis Age,Fraction Genome Altered,Mutation Count,TMB (nonsynonymous),Cancer Type Detailed,Prior Cancer Diagnosis Occurence,Smoking History,Sex,Person Cigarette Smoking History Pack Year Value
159,70.0,0.4565,189,6.3,0,1.0,1.0,0.0,38.0
160,67.0,0.2221,288,9.633333,0,1.0,1.0,0.0,52.0
161,79.0,0.2362,296,9.833333,0,1.0,1.0,1.0,47.0
162,68.0,0.0854,1625,54.233333,0,0.0,1.0,0.0,62.0
163,66.0,0.0661,122,4.066667,0,0.0,1.0,0.0,20.0


In [8]:
# Partition features and labels into training and testing sets.
# A fixed random_state of 1 keeps the split reproducible between runs.
SPLIT_RANDOM_STATE = 1
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=SPLIT_RANDOM_STATE,
)

In [9]:
# Instantiate the Logistic Regression model with the LBFGS solver, a cap of
# 1,200 iterations, and a random_state of 1.
# NOTE(review): random_state may have no effect with the 'lbfgs' solver -
# confirm against the scikit-learn documentation.
classifier = LogisticRegression(
    solver='lbfgs',
    max_iter=1200,
    random_state=1,
)

# Fit the model using the training data. This call is the last expression
# of the cell, so its result is what the notebook displays; the bare
# `classifier` line that used to sit mid-cell displayed nothing and was
# removed.
classifier.fit(X_train, y_train)

In [10]:
# Predict a label for every row of the testing features
test_predictions = classifier.predict(X_test)

# Put predictions next to the true labels, then discard the original row
# index so the comparison table is numbered from 0
prediction_columns = {"Prediction": test_predictions, "Actual": y_test}
results = pd.DataFrame(prediction_columns).reset_index(drop=True)
results.head(10)

Unnamed: 0,Prediction,Actual
0,0:LIVING,1:DECEASED
1,0:LIVING,0:LIVING
2,0:LIVING,0:LIVING
3,0:LIVING,1:DECEASED
4,0:LIVING,0:LIVING
5,0:LIVING,0:LIVING
6,0:LIVING,0:LIVING
7,0:LIVING,1:DECEASED
8,0:LIVING,0:LIVING
9,0:LIVING,1:DECEASED


In [11]:
# Compute the confusion matrix comparing the true test labels with the
# model's predictions
test_matrix = confusion_matrix(y_true=y_test, y_pred=test_predictions)

In [12]:
# Print the confusion matrix for the model
# (rows = actual class, columns = predicted class)
print(test_matrix)

# Create and save the testing classification report.
# zero_division=0 reports precision as 0 for a class the model never
# predicts (the same value as the default) without raising an
# UndefinedMetricWarning for each affected metric.
testing_report = classification_report(
    y_test,
    test_predictions,
    zero_division=0,
)

# Print the testing classification report
print(testing_report)

[[121   0]
 [ 61   0]]
              precision    recall  f1-score   support

    0:LIVING       0.66      1.00      0.80       121
  1:DECEASED       0.00      0.00      0.00        61

    accuracy                           0.66       182
   macro avg       0.33      0.50      0.40       182
weighted avg       0.44      0.66      0.53       182



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Precision "0:LIVING" / "1:DECEASED" scores when run with....

All factors:                        0.66 / 0.0

ONLY: 
Diagnosis Age:                      0.69 / 0.0

Smoking History:                    0.71 / 0.0

Fraction Genome Altered:            0.70 / 0.0

Mutation Count:                     0.70 / 0.0

TMB (nonsynonymous):                0.70 / 0.0

Cancer Type Detailed:               0.70 / 0.0

Prior Cancer Diagnosis Occurence:   0.74 / 0.0

Sex:                                0.70 / 0.0

Person Cigarette Smoking History Pack Year Value: 0.65 / 0.0

### Combo of

Smoking History & Prior Cancer Diagnosis Occurence: 0.74 / 0.0

### Summary
The Logistic Regression model was built on the lung cancer dataset in order to predict the 'Overall Survival Status' label.  The selected factors were cleaned by dropping every row that contained a null value, and any columns with binary string values were replaced with 1s & 0s, accordingly.

Using the LBFGS solver with the maximum number of iterations set to 1,200, the model results were as follows:

![Logistic Regression Results](../Images/LogisticRegressionResults.png)

The logistic regression model provides a fair means of predicting the surviving population (i.e. the class labeled '0:LIVING'): the reported precision is 0.66, meaning that approximately 2 out of every 3 patients the model predicted as living were actually living.  The recall reported for the surviving population is 1.00, meaning that every actual survivor in the test set was predicted as living.  Note, however, that the confusion matrix shows the model predicted '0:LIVING' for all 182 test patients, so this perfect recall reflects a model that always predicts the majority class rather than one that distinguishes between the two outcomes.

As for the model's ability to predict the non-surviving population (i.e. the class labeled '1:DECEASED'), the model did not perform well at all: it never predicted this class for any test patient, so both the precision and the recall are reported as 0.00.

Because the results for the model were lower than desired, the model was re-run several times using only one factor at a time.  These iterations showed that the two factors of 1) Smoking History and 2) Prior Cancer Diagnosis Occurence allowed for a higher precision of 0.74 for the survival prediction, although no benefit was observed for the non-survival prediction.

Overall, the model performed fairly for predicting survivors, although the model would most likely perform better with a significantly larger dataset than the one we sourced (i.e. a dataset with significantly more than 1200 patients).