In [1]:
import numpy as np 
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt 
from sklearn.preprocessing import LabelEncoder


In [2]:
data = pd.read_csv("/kaggle/input/smoking-related-lung-cancers/lung_cancer.csv")
data.head(5)

Unnamed: 0,pid,age,gender,race,smoker,days_to_cancer,stage_of_cancer
0,100001,70,Male,White,Current,,
1,100002,66,Male,White,Current,,
2,100003,64,Male,White,Current,,
3,100004,60,Male,White,Former,,
4,100005,64,Male,White,Former,,


In [3]:
data.tail()

Unnamed: 0,pid,age,gender,race,smoker,days_to_cancer,stage_of_cancer
53422,218890,73,Female,White,Current,,
53423,218891,66,Male,White,Current,,
53424,218892,56,Male,White,Former,,
53425,218893,69,Male,White,Former,,
53426,218894,57,Male,White,Current,,


In [4]:
kişi_sayısı = data['pid'].nunique()
print("Kişi Sayısı:", kişi_sayısı)

Kişi Sayısı: 53427


In [5]:
data.columns

Index(['pid', 'age', 'gender', 'race', 'smoker', 'days_to_cancer',
       'stage_of_cancer'],
      dtype='object')

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53427 entries, 0 to 53426
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   pid              53427 non-null  int64  
 1   age              53427 non-null  int64  
 2   gender           53427 non-null  object 
 3   race             53166 non-null  object 
 4   smoker           53427 non-null  object 
 5   days_to_cancer   2033 non-null   float64
 6   stage_of_cancer  2032 non-null   object 
dtypes: float64(1), int64(2), object(4)
memory usage: 2.9+ MB


In [7]:
data.describe()

Unnamed: 0,pid,age,days_to_cancer
count,53427.0,53427.0,2033.0
mean,149778.048346,61.417448,1047.207083
std,44860.640458,5.023601,743.052899
min,100001.0,43.0,0.0
25%,113363.5,57.0,398.0
50%,126734.0,60.0,928.0
75%,205491.5,65.0,1722.0
max,218894.0,79.0,2618.0


In [8]:
data['age'].max()

79

In [9]:
data['stage_of_cancer'].value_counts()

stage_of_cancer
IA      621
IV      596
IIIB    246
IIIA    213
IB      210
IIB      79
IIA      67
Name: count, dtype: int64

In [10]:
# days_to_cancer sütunundaki benzersiz değerleri döndürmeyi sağlar.
data['days_to_cancer'].value_counts()

days_to_cancer
98.0      8
454.0     7
49.0      7
29.0      7
17.0      6
         ..
110.0     1
2536.0    1
471.0     1
2209.0    1
1632.0    1
Name: count, Length: 1293, dtype: int64

In [11]:
data['days_to_cancer'].value_counts().sum()

2033

In [12]:
data.days_to_cancer.median()

928.0

In [13]:
data.age.median()

60.0

In [14]:
data = data.fillna(0)
print(data)
data.isnull()

          pid  age  gender   race   smoker  days_to_cancer stage_of_cancer
0      100001   70    Male  White  Current             0.0               0
1      100002   66    Male  White  Current             0.0               0
2      100003   64    Male  White  Current             0.0               0
3      100004   60    Male  White   Former             0.0               0
4      100005   64    Male  White   Former             0.0               0
...       ...  ...     ...    ...      ...             ...             ...
53422  218890   73  Female  White  Current             0.0               0
53423  218891   66    Male  White  Current             0.0               0
53424  218892   56    Male  White   Former             0.0               0
53425  218893   69    Male  White   Former             0.0               0
53426  218894   57    Male  White  Current             0.0               0

[53427 rows x 7 columns]


Unnamed: 0,pid,age,gender,race,smoker,days_to_cancer,stage_of_cancer
0,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...
53422,False,False,False,False,False,False,False
53423,False,False,False,False,False,False,False
53424,False,False,False,False,False,False,False
53425,False,False,False,False,False,False,False


In [15]:
data['stage_changed'] = data['stage_of_cancer'].map({
    '0': 0,  # NaN değerleri
    'IA': 1,
    'IB': 1,
    'IIA': 2,
    'IIB': 2,
    'IIIA': 3,
    'IIIB': 3,
    'IV': 4,
})

print("Kategorik Değişkenler:")
print(data[['stage_of_cancer']])
print("Nümerik Değişkenler:")
print(data[['stage_changed']])

Kategorik Değişkenler:
      stage_of_cancer
0                   0
1                   0
2                   0
3                   0
4                   0
...               ...
53422               0
53423               0
53424               0
53425               0
53426               0

[53427 rows x 1 columns]
Nümerik Değişkenler:
       stage_changed
0                NaN
1                NaN
2                NaN
3                NaN
4                NaN
...              ...
53422            NaN
53423            NaN
53424            NaN
53425            NaN
53426            NaN

[53427 rows x 1 columns]


In [16]:
print(data)

          pid  age  gender   race   smoker  days_to_cancer stage_of_cancer  \
0      100001   70    Male  White  Current             0.0               0   
1      100002   66    Male  White  Current             0.0               0   
2      100003   64    Male  White  Current             0.0               0   
3      100004   60    Male  White   Former             0.0               0   
4      100005   64    Male  White   Former             0.0               0   
...       ...  ...     ...    ...      ...             ...             ...   
53422  218890   73  Female  White  Current             0.0               0   
53423  218891   66    Male  White  Current             0.0               0   
53424  218892   56    Male  White   Former             0.0               0   
53425  218893   69    Male  White   Former             0.0               0   
53426  218894   57    Male  White  Current             0.0               0   

       stage_changed  
0                NaN  
1                

In [18]:
data['stage_changed'].fillna(0, inplace=True)
print(data)

          pid  age  gender   race   smoker  days_to_cancer stage_of_cancer  \
0      100001   70    Male  White  Current             0.0               0   
1      100002   66    Male  White  Current             0.0               0   
2      100003   64    Male  White  Current             0.0               0   
3      100004   60    Male  White   Former             0.0               0   
4      100005   64    Male  White   Former             0.0               0   
...       ...  ...     ...    ...      ...             ...             ...   
53422  218890   73  Female  White  Current             0.0               0   
53423  218891   66    Male  White  Current             0.0               0   
53424  218892   56    Male  White   Former             0.0               0   
53425  218893   69    Male  White   Former             0.0               0   
53426  218894   57    Male  White  Current             0.0               0   

       stage_changed  
0                0.0  
1                

In [19]:
data['race'].fillna(0, inplace=True)

race_mapping = {
    'nan':0,
    'White': 1,
    'Native Hawaiian or Other Pacific Islander': 2,
    'More than one race': 3,
    'Asian': 4,
    'Black or African-American': 5,
    'American Indian or Alaskan Native': 6,
    'Participant refused to answer': 7,
}

data['race_changed'] = data['race'].map(race_mapping)

print("Kategorik Değişkenler:")
print(data[['race']])
print("Nümerik Değişkenler:")
print(data[['race_changed']])

Kategorik Değişkenler:
        race
0      White
1      White
2      White
3      White
4      White
...      ...
53422  White
53423  White
53424  White
53425  White
53426  White

[53427 rows x 1 columns]
Nümerik Değişkenler:
       race_changed
0               1.0
1               1.0
2               1.0
3               1.0
4               1.0
...             ...
53422           1.0
53423           1.0
53424           1.0
53425           1.0
53426           1.0

[53427 rows x 1 columns]


In [20]:
data['race_changed'].fillna(0, inplace=True)
unique_smoker = data['race_changed'].unique()
for i, smoker in enumerate(unique_smoker, 1):
    print(f"{i}) {smoker}")

1) 1.0
2) 2.0
3) 3.0
4) 4.0
5) 0.0
6) 5.0
7) 6.0
8) 7.0


In [21]:
smoker_mapping = {'Current': 0, 'Former': 1}

data['smoker_changed'] = data['smoker'].map(smoker_mapping)
print("Kategorik Değişkenler:")
print(data[['smoker']])
print("Nümerik Değişkenler:")
print(data[['smoker_changed']])


Kategorik Değişkenler:
        smoker
0      Current
1      Current
2      Current
3       Former
4       Former
...        ...
53422  Current
53423  Current
53424   Former
53425   Former
53426  Current

[53427 rows x 1 columns]
Nümerik Değişkenler:
       smoker_changed
0                   0
1                   0
2                   0
3                   1
4                   1
...               ...
53422               0
53423               0
53424               1
53425               1
53426               0

[53427 rows x 1 columns]


In [22]:
gender_mapping= {'Male':0,'Female':1}

data['gender_changed'] = data['gender'].map(gender_mapping)

print("Kategorik Değişkenler:")
print(data[['gender']])
print("Nümerik Değişkenler:")
print(data[['gender_changed']])

Kategorik Değişkenler:
       gender
0        Male
1        Male
2        Male
3        Male
4        Male
...       ...
53422  Female
53423    Male
53424    Male
53425    Male
53426    Male

[53427 rows x 1 columns]
Nümerik Değişkenler:
       gender_changed
0                   0
1                   0
2                   0
3                   0
4                   0
...               ...
53422               1
53423               0
53424               0
53425               0
53426               0

[53427 rows x 1 columns]


In [23]:
print(data)

          pid  age  gender   race   smoker  days_to_cancer stage_of_cancer  \
0      100001   70    Male  White  Current             0.0               0   
1      100002   66    Male  White  Current             0.0               0   
2      100003   64    Male  White  Current             0.0               0   
3      100004   60    Male  White   Former             0.0               0   
4      100005   64    Male  White   Former             0.0               0   
...       ...  ...     ...    ...      ...             ...             ...   
53422  218890   73  Female  White  Current             0.0               0   
53423  218891   66    Male  White  Current             0.0               0   
53424  218892   56    Male  White   Former             0.0               0   
53425  218893   69    Male  White   Former             0.0               0   
53426  218894   57    Male  White  Current             0.0               0   

       stage_changed  race_changed  smoker_changed  gender_chan

In [24]:
# Bağımsız değişkenler (X) 
X = data[['smoker_changed', 'age', 'gender_changed','stage_changed','race_changed']]

# Bağımlı değişken 
y = data['days_to_cancer']

# X ve y'yi yazdır
print("Bağımsız Değişkenler (X):")
print(X)

print("Bağımlı Değişken (y):")
print(y)



Bağımsız Değişkenler (X):
       smoker_changed  age  gender_changed  stage_changed  race_changed
0                   0   70               0            0.0           1.0
1                   0   66               0            0.0           1.0
2                   0   64               0            0.0           1.0
3                   1   60               0            0.0           1.0
4                   1   64               0            0.0           1.0
...               ...  ...             ...            ...           ...
53422               0   73               1            0.0           1.0
53423               0   66               0            0.0           1.0
53424               1   56               0            0.0           1.0
53425               1   69               0            0.0           1.0
53426               0   57               0            0.0           1.0

[53427 rows x 5 columns]
Bağımlı Değişken (y):
0        0.0
1        0.0
2        0.0
3        0.0
4        0

In [25]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score

In [26]:
# Veriyi eğitim ve test setlerine bölelim
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=25)

print("X_train")
print(X_train)
print("********")
print("X_test")
print(X_test)
print("********")
print("y_train")
print(y_train)
print("********")
print("y_test")
print(y_test)

X_train
       smoker_changed  age  gender_changed  stage_changed  race_changed
2019                1   73               0            0.0           1.0
38886               0   63               0            0.0           1.0
13146               1   56               1            0.0           1.0
8215                1   61               0            0.0           4.0
46085               0   65               1            0.0           1.0
...               ...  ...             ...            ...           ...
35702               1   69               0            0.0           1.0
26767               1   73               0            0.0           1.0
6618                0   70               0            0.0           1.0
24894               0   70               0            0.0           5.0
29828               1   58               0            0.0           1.0

[42741 rows x 5 columns]
********
X_test
       smoker_changed  age  gender_changed  stage_changed  race_changed
15701         

In [27]:
from sklearn.linear_model import LinearRegression
lr_model = LinearRegression()

# Modeli eğit
lr_model.fit(X_train, y_train)

# Tahmin yap
y_pred = lr_model.predict(X_test)


# Modelin performansını değerlendir
mse = mean_squared_error(y_test, y_pred)
mae =mean_absolute_error(y_test, y_pred)

r2 = r2_score(y_test, y_pred)


evs = explained_variance_score(y_test, y_pred)

# Sonuçları yazdır
print('Linear Regression')
print("Mse ",mse)
print('Mae',mae)
print("R-square",r2)
print('Explained Variance Score',evs)

Linear Regression
Mse  24441.833022139166
Mae 30.873206700945858
R-square 0.6077662166141695
Explained Variance Score 0.6078103934683772


In [28]:
from sklearn.tree import DecisionTreeRegressor
dcsmodel =DecisionTreeRegressor()

dcsmodel.fit(X_train , y_train)

y_pred = dcsmodel.predict(X_test)

mse = mean_squared_error(y_test , y_pred)
mae =mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test , y_pred)
evs = explained_variance_score(y_test, y_pred)

print('Decision Tree Regressor')
print('Mse ',mse)
print('Mae ',mae)
print('R-square ',r2)
print('Explained Variance Score',evs)

Decision Tree Regressor
Mse  26098.983769090173
Mae  25.796504250111777
R-square  0.5811728548753632
Explained Variance Score 0.581172877586118


In [29]:
from sklearn.ensemble import RandomForestRegressor
rndFrstModel = RandomForestRegressor()

rndFrstModel.fit(X_train , y_train)

y_pred= rndFrstModel.predict(X_test)

mse = mean_squared_error(y_test , y_pred)
mae =mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test , y_pred)
evs = explained_variance_score(y_test, y_pred)

print('Random Forest Regressor')
print('Mse ',mse)
print('Mae ',mae)
print('R-square ',r2)
print('Explained Variance Score',evs)

Random Forest Regressor
Mse  23097.480566014594
Mae  24.529627645267432
R-square  0.6293399033991234
Explained Variance Score 0.6293553680525777


In [30]:
from sklearn.linear_model import Ridge 
rdg = Ridge()

rdg.fit(X_train , y_train)

y_pred = rdg.predict(X_test)

mse = mean_squared_error(y_test , y_pred)
mae =mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test , y_pred)
evs =explained_variance_score(y_test, y_pred)

print('Ridge')
print('Mse ',mse)
print('Mae ',mae)
print('R-square ',r2)
print('Explained Variance Score' , evs)


Ridge
Mse  24442.143671159392
Mae  30.876007274539145
R-square  0.6077612314299405
Explained Variance Score 0.6078054051297885


In [31]:
from sklearn.linear_model import  Lasso 

las = Lasso()

las.fit(X_train, y_train)

y_pred =las.predict(X_test)

mse = mean_squared_error(y_test , y_pred)
mae =mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test , y_pred)
evs = explained_variance_score(y_test, y_pred)

print('Lasso')
print('Mse ',mse)
print('Mae ',mae)
print('R-square ',r2)
print('Explained Variance Score',evs)

Lasso
Mse  24479.116385984977
Mae  31.246863004949144
R-square  0.6071679065428525
Explained Variance Score 0.6072097025243645


In [32]:
from sklearn.linear_model import  ElasticNet

elsc = ElasticNet() 

elsc.fit(X_train ,y_train )

y_pred =elsc.predict(X_test)

mse = mean_squared_error(y_test , y_pred)
mae =mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test , y_pred)
evs = explained_variance_score(y_test, y_pred)

print('ElasticNet')
print('Mse ',mse)
print('Mae ',mae)
print('R-square ',r2)
print('Explained Variance Score',evs)

ElasticNet
Mse  41137.961620909686
Mae  58.044955979625676
R-square  0.33983272397225894
Explained Variance Score 0.3398552358579444


In [33]:
from sklearn.ensemble import GradientBoostingRegressor
grdb =GradientBoostingRegressor()

grdb.fit(X_train,y_train)

y_pred=grdb.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae =mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test , y_pred)
evs = explained_variance_score(y_test, y_pred)

print('GradientBoostingRegressor')
print('Mse ',mse)
print('Mae ',mae)
print('R-square ',r2)
print('Explained Variance Score',evs)

GradientBoostingRegressor
Mse  19520.51742135561
Mae  23.2073193673695
R-square  0.6867417269853676
Explained Variance Score 0.686780991886916


In [None]:
from sklearn.svm import SVR

svr =SVR(kernel='linear')

svr.fit(X_train,y_train)

y_pred = svr.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae =mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test , y_pred)
evs = explained_variance_score(y_test, y_pred)

print('SVR')
print('Mse ',mse)
print('Mae ',mae)
print('R-square ',r2)
print('Explained Variance Score',evs)


In [None]:
from sklearn.neighbors import KNeighborsRegressor
knr =KNeighborsRegressor()

knr.fit(X_train,y_train)

y_pred = svr.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae =mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test , y_pred)
evs = explained_variance_score(y_test, y_pred)

print('KNeighborsRegressor')
print('Mse ',mse)
print('Mae ',mae)
print('R-square ',r2)
print('Explained Variance Score',evs)

In [None]:
from xgboost import XGBRegressor
xgb=XGBRegressor( )

xgb.fit(X_train, y_train)

y_pred=xgb.predict(X_test) 

mse = mean_squared_error(y_test, y_pred)
mae =mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test , y_pred)
evs = explained_variance_score(y_test, y_pred)

print('XGBRegressor')
print('Mse ',mse)
print('Mae ',mae)
print('R-square ',r2)
print('Explained Variance Score',evs)

In [None]:
from lightgbm import LGBMRegressor
lgb= LGBMRegressor()

lgb.fit(X_train , y_train)

y_pred = lgb.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae =mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test , y_pred)
evs = explained_variance_score(y_test, y_pred)


print('LGBMRegressor')
print('Mse ',mse)
print('Mae ',mae)
print('R-square ',r2)
print('Explained Variance Score',evs)

In [None]:
from catboost import CatBoostRegressor  
cat =CatBoostRegressor(  )

cat.fit(X_train,y_train)

y_pred=cat.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae =mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test , y_pred)
evs = explained_variance_score(y_test, y_pred)

print('CatBoostRegressor')
print('Mse ',mse)
print('Mae ',mae)
print('R-square ',r2)
print('Explained Variance Score',evs)

In [None]:
from tabulate import tabulate

# Sonuçları bir sözlükte topla
results = {
    'Model': ['Linear Regression', 'Decision Tree Regressor', 'Random Forest Regressor', 'Ridge', 'Lasso', 'ElasticNet', 
              'GradientBoostingRegressor', 'SVR', 'KNeighborsRegressor', 'XGBRegressor', 'LGBMRegressor', 'CatBoostRegressor'],
    'MSE': [24441.8330, 26218.2411, 23172.5671, 24442.1437, 24479.1164, 41137.9616, 19520.5174, 25020.4630, 25020.4630, 23628.6613, 20389.6034, 21933.3423],
    'MAE': [30.8732, 25.9022, 24.5815, 30.8760, 31.2469, 58.0450, 23.2073, 24.6157, 24.6157, 24.6172, 23.5475, 24.9133],
    'R-square': [0.6078, 0.5793, 0.6281, 0.6078, 0.6072, 0.3398, 0.6867, 0.5985, 0.5985, 0.6208, 0.6728, 0.6480],
    'Explained Variance Score': [0.6078, 0.5793, 0.6281, 0.6078, 0.6072, 0.3398, 0.6868, 0.6004, 0.6004, 0.6208, 0.6728, 0.6480]
}

# Tabloyu oluştur
table = tabulate(results, headers='keys', tablefmt='fancy_grid', showindex=False)

# Tabloyu yazdır
print(table)
