### loading the packages

In [1]:
import pandas as pd
import numpy as np

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

In [240]:
csv_file_path = r'./training_wids2024C1.csv'


df = pd.read_csv(csv_file_path)


In [177]:
test_file_path = r'./test.csv'

test_df = pd.read_csv(test_file_path)

In [241]:
df.head()


Unnamed: 0,patient_id,patient_race,payer_type,patient_state,patient_zip3,patient_age,patient_gender,bmi,breast_cancer_diagnosis_code,breast_cancer_diagnosis_desc,...,disabled,poverty,limited_english,commute_time,health_uninsured,veteran,Ozone,PM25,N02,DiagPeriodL90D
0,475714,,MEDICAID,CA,924,84,F,,C50919,Malignant neoplasm of unsp site of unspecified...,...,12.871429,22.542857,10.1,27.814286,11.2,3.5,52.23721,8.650555,18.606528,1
1,349367,White,COMMERCIAL,CA,928,62,F,28.49,C50411,Malig neoplm of upper-outer quadrant of right ...,...,8.957576,10.109091,8.057576,30.606061,7.018182,4.10303,42.301121,8.487175,20.113179,1
2,138632,White,COMMERCIAL,TX,760,43,F,38.09,C50112,Malignant neoplasm of central portion of left ...,...,11.253333,9.663333,3.356667,31.394915,15.066667,7.446667,40.108207,7.642753,14.839351,1
3,617843,White,COMMERCIAL,CA,926,45,F,,C50212,Malig neoplasm of upper-inner quadrant of left...,...,8.845238,8.688095,5.280952,27.561905,4.404762,4.809524,42.070075,7.229393,15.894123,0
4,817482,,COMMERCIAL,ID,836,55,F,,1749,"Malignant neoplasm of breast (female), unspeci...",...,15.276,11.224,1.946,26.170213,12.088,13.106,41.356058,4.110749,11.722197,0


### checking the null values

In [217]:
pd.set_option('display.max_rows', None)


nullcounts = df.isnull().sum()
null_counts_filtered = nullcounts[nullcounts <10]

# Print the filtered result
print(null_counts_filtered)

pd.reset_option('display.max_rows')


patient_id                          0
patient_zip3                        0
patient_age                         0
patient_gender                      0
breast_cancer_diagnosis_code        0
breast_cancer_diagnosis_desc        0
metastatic_cancer_diagnosis_code    0
population                          1
density                             1
age_median                          1
age_under_10                        1
age_10_to_19                        1
age_20s                             1
age_30s                             1
age_40s                             1
age_50s                             1
age_60s                             1
age_70s                             1
age_over_80                         1
male                                1
female                              1
married                             1
divorced                            1
never_married                       1
widowed                             1
family_size                         4
family_dual_

In [25]:
df.shape

(12906, 83)

### Data Cleaning

dropping patient_gender as it is not a useful feature and droppping all the metrics such as household income and such as it might not be an important feature for now

In [242]:
columns_to_sample = ['patient_id', 'patient_race', 'payer_type', 'patient_state', 'patient_age',
                     'breast_cancer_diagnosis_code','patient_gender', 'metastatic_cancer_diagnosis_code', 'metastatic_first_novel_treatment',
                      'Region', 'Division', 'population','density', 'Ozone','PM25', 'N02', 'DiagPeriodL90D']


sampled_df = df[columns_to_sample]


In [243]:
sampled_df.shape

(12906, 17)

In [244]:
sampled_df.isnull().sum()

patient_id                              0
patient_race                         6385
payer_type                           1803
patient_state                          51
patient_age                             0
breast_cancer_diagnosis_code            0
patient_gender                          0
metastatic_cancer_diagnosis_code        0
metastatic_first_novel_treatment    12882
Region                                 52
Division                               52
population                              1
density                                 1
Ozone                                  29
PM25                                   29
N02                                    29
DiagPeriodL90D                          0
dtype: int64

### preprocessing rule: drop the columns with more than 30% missing values in this case we drop patient race, metastatic_first novel treatment

In [245]:
sampled_df.drop('patient_race', axis = 1,inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sampled_df.drop('patient_race', axis = 1,inplace=True)


In [246]:
sampled_df.drop('patient_gender', axis = 1,inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sampled_df.drop('patient_gender', axis = 1,inplace=True)


In [247]:
sampled_df['payer_type'].fillna('UNKNOWN', inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sampled_df['payer_type'].fillna('UNKNOWN', inplace=True)


In [248]:
payer_type_counts = sampled_df.groupby('payer_type').size()

# Print the result
print(sum(payer_type_counts))
print(payer_type_counts)

12906
payer_type
COMMERCIAL            6032
MEDICAID              2569
MEDICARE ADVANTAGE    2502
UNKNOWN               1803
dtype: int64


In [249]:
non_null_values = df['metastatic_first_novel_treatment'].dropna()

# Print non-null values
print(len(non_null_values))

24


In [250]:
#since the values are so less we have to drop this column too 
sampled_df.drop('metastatic_first_novel_treatment', axis = 1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sampled_df.drop('metastatic_first_novel_treatment', axis = 1,inplace=True)


In [108]:
rows_with_nan = sampled_df[sampled_df['patient_state'].isnull()]

# Print rows with NaN values in the 'payer_type' column
print(rows_with_nan[:10])

      patient_id          payer_type patient_state  patient_age  \
8         994014  MEDICARE ADVANTAGE           NaN           82   
33        744114             UNKNOWN           NaN           83   
316       178547             UNKNOWN           NaN           39   
879       472026             UNKNOWN           NaN           79   
912       333582            MEDICAID           NaN           66   
1180      265749             UNKNOWN           NaN           58   
1271      491813             UNKNOWN           NaN           63   
1292      988642            MEDICAID           NaN           66   
1878      378130             UNKNOWN           NaN           63   
2406      160039            MEDICAID           NaN           61   

     breast_cancer_diagnosis_code patient_gender  \
8                            1744              F   
33                         C50111              F   
316                        C50912              F   
879                        C50911              F   
91

In [251]:
sampled_df.isnull().sum()

patient_id                           0
payer_type                           0
patient_state                       51
patient_age                          0
breast_cancer_diagnosis_code         0
metastatic_cancer_diagnosis_code     0
Region                              52
Division                            52
population                           1
density                              1
Ozone                               29
PM25                                29
N02                                 29
DiagPeriodL90D                       0
dtype: int64

### it looks like the division and region that were appended didnt know the patient state so they couldnt add their regiion division hence region division and sate are collective null and we can drop these as we cant add missing values to them 

In [252]:
df_without_nulls = sampled_df.dropna(subset=['patient_state'])

In [253]:
df_without_nulls.isnull().sum()

patient_id                           0
payer_type                           0
patient_state                        0
patient_age                          0
breast_cancer_diagnosis_code         0
metastatic_cancer_diagnosis_code     0
Region                               1
Division                             1
population                           1
density                              1
Ozone                               28
PM25                                28
N02                                 28
DiagPeriodL90D                       0
dtype: int64

In [254]:
sampled_df = sampled_df.dropna(subset=['patient_state','Region','Division','population','density','Ozone','PM25','N02'])

In [255]:
sampled_df.isnull().sum()

patient_id                          0
payer_type                          0
patient_state                       0
patient_age                         0
breast_cancer_diagnosis_code        0
metastatic_cancer_diagnosis_code    0
Region                              0
Division                            0
population                          0
density                             0
Ozone                               0
PM25                                0
N02                                 0
DiagPeriodL90D                      0
dtype: int64

In [256]:
sampled_df.head()

Unnamed: 0,patient_id,payer_type,patient_state,patient_age,breast_cancer_diagnosis_code,metastatic_cancer_diagnosis_code,Region,Division,population,density,Ozone,PM25,N02,DiagPeriodL90D
0,475714,MEDICAID,CA,84,C50919,C7989,West,Pacific,31437.75,1189.5625,52.23721,8.650555,18.606528,1
1,349367,COMMERCIAL,CA,62,C50411,C773,West,Pacific,39121.87879,2295.939394,42.301121,8.487175,20.113179,1
2,138632,COMMERCIAL,TX,43,C50112,C773,South,West South Central,21996.68333,626.236667,40.108207,7.642753,14.839351,1
3,617843,COMMERCIAL,CA,45,C50212,C773,West,Pacific,32795.32558,1896.22093,42.070075,7.229393,15.894123,0
4,817482,COMMERCIAL,ID,55,1749,C773,West,Mountain,10886.26,116.886,41.356058,4.110749,11.722197,0


In [257]:
sampled_df.shape

(12825, 14)

In [258]:
sampled_df.dtypes

patient_id                            int64
payer_type                           object
patient_state                        object
patient_age                           int64
breast_cancer_diagnosis_code         object
metastatic_cancer_diagnosis_code     object
Region                               object
Division                             object
population                          float64
density                             float64
Ozone                               float64
PM25                                float64
N02                                 float64
DiagPeriodL90D                        int64
dtype: object

## Now that data is cleaned, moving on to fitting models on the data

In [259]:
sampled_df.shape

(12825, 14)

In [280]:
X = sampled_df.drop(columns=['DiagPeriodL90D'])  # Excluding patient_id from features
y = sampled_df['DiagPeriodL90D']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)


numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()), 
    ('normalizer', MinMaxScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Append classifier to preprocessing pipeline
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression())])

# Fit model
clf.fit(X_train, y_train)

# Predictions
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)

# Evaluate model
train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred_test)

print(f"Train Accuracy: {train_accuracy}")
print(f"Test Accuracy: {test_accuracy}")

Train Accuracy: 0.8122157244964262
Test Accuracy: 0.81364522417154


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [261]:
X_train.shape

(7695, 13)

In [270]:
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from catboost import CatBoostClassifier

In [273]:
classifiers = {
    'Logistic Regression': LogisticRegression(),
    'SVM': SVC(),
    'Random Forest': RandomForestClassifier(),
    'AdaBoost' : AdaBoostClassifier(),
    'CatBoost' : CatBoostClassifier()
}

results = {}

for name, clf in classifiers.items():
    classifier_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                          ('classifier', clf)])
    
    classifier_pipeline.fit(X_train, y_train)
    if name == 'CatBoost':
        probabilities = classifier_pipeline.predict_proba(df_test)

        test_patient_ids = df_test['patient_id']

        results_df = pd.DataFrame({'patient_id': test_patient_ids, 'probability_DiagPeriodL90D': probabilities[:, 1]})

        results_df.to_csv("submission_test.csv", index=False)
    y_pred_train = classifier_pipeline.predict(X_train)
    y_pred_test = classifier_pipeline.predict(X_test)
    
    train_accuracy = accuracy_score(y_train, y_pred_train)
    test_accuracy = accuracy_score(y_test, y_pred_test)
    print(name, 'train_accuracy: ', train_accuracy, 'test_accuracy: ', test_accuracy)
    results[name] = {'train_accuracy': train_accuracy, 'test_accuracy': test_accuracy}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression train_accuracy:  0.8122157244964262 test_accuracy:  0.81364522417154
SVM train_accuracy:  0.8105263157894737 test_accuracy:  0.8138401559454191
Random Forest train_accuracy:  1.0 test_accuracy:  0.8001949317738791
AdaBoost train_accuracy:  0.8123456790123457 test_accuracy:  0.8116959064327486
Learning rate set to 0.024623
0:	learn: 0.6825353	total: 2.12ms	remaining: 2.12s
1:	learn: 0.6726086	total: 3.67ms	remaining: 1.83s
2:	learn: 0.6630852	total: 5.5ms	remaining: 1.83s
3:	learn: 0.6539856	total: 7.49ms	remaining: 1.86s
4:	learn: 0.6455576	total: 9.48ms	remaining: 1.89s
5:	learn: 0.6383466	total: 11.7ms	remaining: 1.93s
6:	learn: 0.6303153	total: 13.7ms	remaining: 1.94s
7:	learn: 0.6230782	total: 15.8ms	remaining: 1.96s
8:	learn: 0.6161082	total: 17.9ms	remaining: 1.97s
9:	learn: 0.6095795	total: 20ms	remaining: 1.98s
10:	learn: 0.6035152	total: 22.1ms	remaining: 1.99s
11:	learn: 0.5979737	total: 24.1ms	remaining: 1.99s
12:	learn: 0.5927974	total: 26.1ms	remaining:

181:	learn: 0.4610751	total: 375ms	remaining: 1.69s
182:	learn: 0.4609489	total: 377ms	remaining: 1.68s
183:	learn: 0.4608459	total: 379ms	remaining: 1.68s
184:	learn: 0.4606442	total: 381ms	remaining: 1.68s
185:	learn: 0.4605104	total: 383ms	remaining: 1.68s
186:	learn: 0.4604010	total: 385ms	remaining: 1.68s
187:	learn: 0.4603057	total: 388ms	remaining: 1.67s
188:	learn: 0.4601747	total: 390ms	remaining: 1.67s
189:	learn: 0.4600419	total: 392ms	remaining: 1.67s
190:	learn: 0.4599059	total: 394ms	remaining: 1.67s
191:	learn: 0.4597851	total: 396ms	remaining: 1.67s
192:	learn: 0.4595961	total: 398ms	remaining: 1.66s
193:	learn: 0.4594370	total: 400ms	remaining: 1.66s
194:	learn: 0.4592447	total: 402ms	remaining: 1.66s
195:	learn: 0.4590285	total: 404ms	remaining: 1.66s
196:	learn: 0.4588687	total: 406ms	remaining: 1.65s
197:	learn: 0.4587854	total: 408ms	remaining: 1.65s
198:	learn: 0.4586480	total: 410ms	remaining: 1.65s
199:	learn: 0.4584970	total: 412ms	remaining: 1.65s
200:	learn: 

373:	learn: 0.4367860	total: 772ms	remaining: 1.29s
374:	learn: 0.4365528	total: 774ms	remaining: 1.29s
375:	learn: 0.4364607	total: 776ms	remaining: 1.29s
376:	learn: 0.4363144	total: 778ms	remaining: 1.29s
377:	learn: 0.4362085	total: 780ms	remaining: 1.28s
378:	learn: 0.4361090	total: 782ms	remaining: 1.28s
379:	learn: 0.4358997	total: 784ms	remaining: 1.28s
380:	learn: 0.4358151	total: 786ms	remaining: 1.28s
381:	learn: 0.4356447	total: 788ms	remaining: 1.27s
382:	learn: 0.4354988	total: 791ms	remaining: 1.27s
383:	learn: 0.4354133	total: 793ms	remaining: 1.27s
384:	learn: 0.4353147	total: 795ms	remaining: 1.27s
385:	learn: 0.4351082	total: 797ms	remaining: 1.27s
386:	learn: 0.4349854	total: 799ms	remaining: 1.26s
387:	learn: 0.4348334	total: 801ms	remaining: 1.26s
388:	learn: 0.4347186	total: 803ms	remaining: 1.26s
389:	learn: 0.4344860	total: 805ms	remaining: 1.26s
390:	learn: 0.4343451	total: 807ms	remaining: 1.26s
391:	learn: 0.4342723	total: 809ms	remaining: 1.25s
392:	learn: 

566:	learn: 0.4132931	total: 1.17s	remaining: 894ms
567:	learn: 0.4131117	total: 1.17s	remaining: 892ms
568:	learn: 0.4131007	total: 1.17s	remaining: 890ms
569:	learn: 0.4129752	total: 1.18s	remaining: 888ms
570:	learn: 0.4128492	total: 1.18s	remaining: 886ms
571:	learn: 0.4127429	total: 1.18s	remaining: 884ms
572:	learn: 0.4125854	total: 1.18s	remaining: 882ms
573:	learn: 0.4125073	total: 1.19s	remaining: 880ms
574:	learn: 0.4124499	total: 1.19s	remaining: 878ms
575:	learn: 0.4121747	total: 1.19s	remaining: 876ms
576:	learn: 0.4120325	total: 1.19s	remaining: 873ms
577:	learn: 0.4120213	total: 1.19s	remaining: 871ms
578:	learn: 0.4119117	total: 1.2s	remaining: 869ms
579:	learn: 0.4118228	total: 1.2s	remaining: 867ms
580:	learn: 0.4116718	total: 1.2s	remaining: 865ms
581:	learn: 0.4115253	total: 1.2s	remaining: 863ms
582:	learn: 0.4114185	total: 1.2s	remaining: 861ms
583:	learn: 0.4112917	total: 1.21s	remaining: 859ms
584:	learn: 0.4111357	total: 1.21s	remaining: 857ms
585:	learn: 0.411

755:	learn: 0.3923031	total: 1.56s	remaining: 504ms
756:	learn: 0.3922261	total: 1.56s	remaining: 502ms
757:	learn: 0.3921714	total: 1.56s	remaining: 500ms
758:	learn: 0.3920717	total: 1.57s	remaining: 497ms
759:	learn: 0.3918280	total: 1.57s	remaining: 495ms
760:	learn: 0.3916634	total: 1.57s	remaining: 493ms
761:	learn: 0.3915114	total: 1.57s	remaining: 491ms
762:	learn: 0.3913463	total: 1.57s	remaining: 489ms
763:	learn: 0.3912504	total: 1.58s	remaining: 487ms
764:	learn: 0.3911452	total: 1.58s	remaining: 485ms
765:	learn: 0.3910624	total: 1.58s	remaining: 483ms
766:	learn: 0.3909031	total: 1.58s	remaining: 481ms
767:	learn: 0.3908183	total: 1.58s	remaining: 479ms
768:	learn: 0.3907230	total: 1.59s	remaining: 477ms
769:	learn: 0.3905691	total: 1.59s	remaining: 475ms
770:	learn: 0.3904634	total: 1.59s	remaining: 473ms
771:	learn: 0.3903388	total: 1.59s	remaining: 471ms
772:	learn: 0.3902259	total: 1.59s	remaining: 469ms
773:	learn: 0.3901120	total: 1.6s	remaining: 466ms
774:	learn: 0

947:	learn: 0.3725580	total: 1.96s	remaining: 107ms
948:	learn: 0.3724417	total: 1.96s	remaining: 105ms
949:	learn: 0.3723243	total: 1.96s	remaining: 103ms
950:	learn: 0.3722380	total: 1.96s	remaining: 101ms
951:	learn: 0.3721648	total: 1.97s	remaining: 99.2ms
952:	learn: 0.3719883	total: 1.97s	remaining: 97.1ms
953:	learn: 0.3718927	total: 1.97s	remaining: 95.1ms
954:	learn: 0.3717289	total: 1.97s	remaining: 93ms
955:	learn: 0.3715869	total: 1.98s	remaining: 90.9ms
956:	learn: 0.3714626	total: 1.98s	remaining: 88.9ms
957:	learn: 0.3713403	total: 1.98s	remaining: 86.8ms
958:	learn: 0.3712005	total: 1.98s	remaining: 84.7ms
959:	learn: 0.3711256	total: 1.98s	remaining: 82.7ms
960:	learn: 0.3709964	total: 1.99s	remaining: 80.6ms
961:	learn: 0.3708751	total: 1.99s	remaining: 78.5ms
962:	learn: 0.3707860	total: 1.99s	remaining: 76.5ms
963:	learn: 0.3706793	total: 1.99s	remaining: 74.4ms
964:	learn: 0.3705596	total: 1.99s	remaining: 72.3ms
965:	learn: 0.3705012	total: 2s	remaining: 70.3ms
96

In [284]:
sampled_df.shape

(12825, 14)

In [308]:
from sklearn.model_selection import GridSearchCV



clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', CatBoostClassifier(learning_rate=0.95))])

# Fit model
clf.fit(X_train, y_train)

# Predictions
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)

# Evaluate model
train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred_test)

print(f"Train Accuracy: {train_accuracy}")
print(f"Test Accuracy: {test_accuracy}")

0:	learn: 0.4982147	total: 3.06ms	remaining: 3.06s
1:	learn: 0.4747828	total: 5.84ms	remaining: 2.92s
2:	learn: 0.4666341	total: 8.54ms	remaining: 2.84s
3:	learn: 0.4604580	total: 11ms	remaining: 2.75s
4:	learn: 0.4572190	total: 13.4ms	remaining: 2.66s
5:	learn: 0.4537930	total: 15.4ms	remaining: 2.56s
6:	learn: 0.4512339	total: 17.4ms	remaining: 2.46s
7:	learn: 0.4463034	total: 19.4ms	remaining: 2.4s
8:	learn: 0.4447579	total: 21.4ms	remaining: 2.35s
9:	learn: 0.4400672	total: 23.5ms	remaining: 2.33s
10:	learn: 0.4354699	total: 25.6ms	remaining: 2.31s
11:	learn: 0.4318494	total: 27.7ms	remaining: 2.28s
12:	learn: 0.4290539	total: 29.6ms	remaining: 2.25s
13:	learn: 0.4250914	total: 31.8ms	remaining: 2.24s
14:	learn: 0.4246422	total: 34ms	remaining: 2.23s
15:	learn: 0.4210263	total: 35.9ms	remaining: 2.21s
16:	learn: 0.4171120	total: 37.9ms	remaining: 2.19s
17:	learn: 0.4143589	total: 39.9ms	remaining: 2.18s
18:	learn: 0.4100025	total: 42.1ms	remaining: 2.17s
19:	learn: 0.4051823	total:

188:	learn: 0.1224439	total: 398ms	remaining: 1.71s
189:	learn: 0.1217950	total: 400ms	remaining: 1.71s
190:	learn: 0.1209139	total: 403ms	remaining: 1.71s
191:	learn: 0.1200391	total: 405ms	remaining: 1.7s
192:	learn: 0.1192747	total: 407ms	remaining: 1.7s
193:	learn: 0.1181854	total: 409ms	remaining: 1.7s
194:	learn: 0.1177479	total: 412ms	remaining: 1.7s
195:	learn: 0.1166835	total: 414ms	remaining: 1.7s
196:	learn: 0.1157974	total: 416ms	remaining: 1.7s
197:	learn: 0.1150940	total: 418ms	remaining: 1.69s
198:	learn: 0.1146529	total: 420ms	remaining: 1.69s
199:	learn: 0.1142171	total: 422ms	remaining: 1.69s
200:	learn: 0.1135736	total: 425ms	remaining: 1.69s
201:	learn: 0.1131697	total: 427ms	remaining: 1.69s
202:	learn: 0.1121474	total: 429ms	remaining: 1.68s
203:	learn: 0.1112115	total: 431ms	remaining: 1.68s
204:	learn: 0.1103895	total: 433ms	remaining: 1.68s
205:	learn: 0.1094886	total: 435ms	remaining: 1.68s
206:	learn: 0.1089147	total: 438ms	remaining: 1.68s
207:	learn: 0.1079

375:	learn: 0.0439357	total: 795ms	remaining: 1.32s
376:	learn: 0.0438561	total: 797ms	remaining: 1.32s
377:	learn: 0.0434701	total: 799ms	remaining: 1.31s
378:	learn: 0.0432758	total: 802ms	remaining: 1.31s
379:	learn: 0.0431301	total: 804ms	remaining: 1.31s
380:	learn: 0.0427527	total: 806ms	remaining: 1.31s
381:	learn: 0.0423453	total: 808ms	remaining: 1.31s
382:	learn: 0.0420761	total: 810ms	remaining: 1.3s
383:	learn: 0.0418291	total: 812ms	remaining: 1.3s
384:	learn: 0.0416324	total: 815ms	remaining: 1.3s
385:	learn: 0.0414558	total: 817ms	remaining: 1.3s
386:	learn: 0.0411719	total: 820ms	remaining: 1.3s
387:	learn: 0.0409503	total: 822ms	remaining: 1.3s
388:	learn: 0.0408630	total: 824ms	remaining: 1.29s
389:	learn: 0.0405364	total: 826ms	remaining: 1.29s
390:	learn: 0.0403586	total: 828ms	remaining: 1.29s
391:	learn: 0.0401856	total: 830ms	remaining: 1.29s
392:	learn: 0.0400407	total: 832ms	remaining: 1.28s
393:	learn: 0.0398359	total: 834ms	remaining: 1.28s
394:	learn: 0.0396

573:	learn: 0.0233836	total: 1.19s	remaining: 882ms
574:	learn: 0.0233684	total: 1.19s	remaining: 880ms
575:	learn: 0.0233677	total: 1.19s	remaining: 877ms
576:	learn: 0.0233669	total: 1.19s	remaining: 875ms
577:	learn: 0.0233663	total: 1.2s	remaining: 873ms
578:	learn: 0.0233416	total: 1.2s	remaining: 871ms
579:	learn: 0.0233410	total: 1.2s	remaining: 869ms
580:	learn: 0.0233403	total: 1.2s	remaining: 867ms
581:	learn: 0.0233401	total: 1.2s	remaining: 864ms
582:	learn: 0.0232777	total: 1.21s	remaining: 862ms
583:	learn: 0.0232739	total: 1.21s	remaining: 860ms
584:	learn: 0.0232563	total: 1.21s	remaining: 858ms
585:	learn: 0.0231928	total: 1.21s	remaining: 856ms
586:	learn: 0.0230663	total: 1.21s	remaining: 854ms
587:	learn: 0.0230253	total: 1.22s	remaining: 852ms
588:	learn: 0.0230252	total: 1.22s	remaining: 849ms
589:	learn: 0.0230228	total: 1.22s	remaining: 847ms
590:	learn: 0.0229544	total: 1.22s	remaining: 845ms
591:	learn: 0.0229340	total: 1.22s	remaining: 844ms
592:	learn: 0.022

776:	learn: 0.0176066	total: 1.58s	remaining: 454ms
777:	learn: 0.0175881	total: 1.58s	remaining: 452ms
778:	learn: 0.0175881	total: 1.59s	remaining: 450ms
779:	learn: 0.0175878	total: 1.59s	remaining: 448ms
780:	learn: 0.0175878	total: 1.59s	remaining: 446ms
781:	learn: 0.0175874	total: 1.59s	remaining: 444ms
782:	learn: 0.0175873	total: 1.59s	remaining: 442ms
783:	learn: 0.0175559	total: 1.6s	remaining: 440ms
784:	learn: 0.0175552	total: 1.6s	remaining: 438ms
785:	learn: 0.0175551	total: 1.6s	remaining: 436ms
786:	learn: 0.0175481	total: 1.6s	remaining: 434ms
787:	learn: 0.0175479	total: 1.6s	remaining: 432ms
788:	learn: 0.0175475	total: 1.6s	remaining: 429ms
789:	learn: 0.0175120	total: 1.61s	remaining: 427ms
790:	learn: 0.0174433	total: 1.61s	remaining: 425ms
791:	learn: 0.0174429	total: 1.61s	remaining: 423ms
792:	learn: 0.0174027	total: 1.61s	remaining: 421ms
793:	learn: 0.0174025	total: 1.61s	remaining: 419ms
794:	learn: 0.0173528	total: 1.62s	remaining: 417ms
795:	learn: 0.0173

986:	learn: 0.0143586	total: 1.98s	remaining: 26.1ms
987:	learn: 0.0143584	total: 1.98s	remaining: 24.1ms
988:	learn: 0.0143584	total: 1.98s	remaining: 22.1ms
989:	learn: 0.0143583	total: 1.99s	remaining: 20.1ms
990:	learn: 0.0143583	total: 1.99s	remaining: 18ms
991:	learn: 0.0143583	total: 1.99s	remaining: 16ms
992:	learn: 0.0143582	total: 1.99s	remaining: 14ms
993:	learn: 0.0143582	total: 1.99s	remaining: 12ms
994:	learn: 0.0143581	total: 1.99s	remaining: 10ms
995:	learn: 0.0143580	total: 2s	remaining: 8.02ms
996:	learn: 0.0143580	total: 2s	remaining: 6.01ms
997:	learn: 0.0143578	total: 2s	remaining: 4.01ms
998:	learn: 0.0143578	total: 2s	remaining: 2ms
999:	learn: 0.0143577	total: 2s	remaining: 0us
Train Accuracy: 0.9998700454840805
Test Accuracy: 0.7569200779727095


In [309]:
probabilities = clf.predict_proba(df_test)

test_patient_ids = df_test['patient_id']

results_df = pd.DataFrame({'patient_id': test_patient_ids, 'probability_DiagPeriodL90D': probabilities[:, 1]})

results_df.to_csv("submission_test.csv", index=False)

In [312]:
from xgboost import XGBClassifier

# Define XGBoost classifier with chosen hyperparameters

clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', XGBClassifier())])

# Fit model
clf.fit(X_train, y_train)

# Predictions
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)

# Evaluate model
train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred_test)

print(f"Train Accuracy: {train_accuracy}")
print(f"Test Accuracy: {test_accuracy}")


Train Accuracy: 0.8820012995451592
Test Accuracy: 0.7984405458089668


In [314]:
!pip install --no-binary lightgbm lightgbm

Collecting lightgbm
  Using cached lightgbm-4.3.0.tar.gz (1.7 MB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: lightgbm
  Building wheel for lightgbm (pyproject.toml) ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mBuilding wheel for lightgbm [0m[1;32m([0m[32mpyproject.toml[0m[1;32m)[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[42 lines of output][0m
  [31m   [0m 2024-02-22 23:04:16,586 - scikit_build_core - INFO - RUN: /private/var/folders/x8/8bvby9d128q0p10dt1mp3wgh0000gn/T/pip-build-env-qs7utcit/normal/lib/python3.11/site-packages/cmake/data/bin/cmake --version
  [31m   [0m 2024-02-22 23:04:16,592 - scikit_build_core - INFO - CMake version: 3.28.3
  [31m   [0m [92m***[0

## Preparing the test data for kaggle submission

In [209]:
test_file_path = r'./test.csv'

df_test = pd.read_csv(test_file_path)


In [272]:

probabilities = classifiers[4].predict_proba(df_test)

test_patient_ids = df_test['patient_id']

results_df = pd.DataFrame({'patient_id': test_patient_ids, 'probability_DiagPeriodL90D': probabilities[:, 1]})

results_df.to_csv("submission_test.csv", index=False)


KeyError: 4

In [199]:
sampled_df_test.dtypes

patient_id                            int64
payer_type                           object
patient_state                        object
patient_age                           int64
breast_cancer_diagnosis_code         object
metastatic_cancer_diagnosis_code     object
Region                               object
Division                             object
population                          float64
density                             float64
Ozone                               float64
PM25                                float64
N02                                 float64
dtype: object

In [185]:
sampled_df_test.shape

(5757, 13)