### 2. Data Pre-processing ( Veri Ön İşleme )

In [2]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, mean_squared_error , r2_score , mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score

In [3]:
df=pd.read_csv("Breast_Cancer.csv")

In [4]:
data2=df.copy()
data2.head(2)

Unnamed: 0,Age,Race,Marital Status,T Stage,N Stage,6th Stage,differentiate,Grade,A Stage,Tumor Size,Estrogen Status,Progesterone Status,Regional Node Examined,Reginol Node Positive,Survival Months,Status
0,68,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,4,Positive,Positive,24,1,60,Alive
1,50,White,Married,T2,N2,IIIA,Moderately differentiated,2,Regional,35,Positive,Positive,14,5,62,Alive


In [5]:
for index, satir in data2.iterrows():
    if isinstance(satir['Grade'], str):
        if satir['Grade'] == ' anaplastic; Grade IV':
            data2.at[index, 'Grade'] = '4'

data2['Grade'] = data2['Grade'].astype(int)
data2["Grade"].unique()

array([3, 2, 1, 4])

##### Keşifci veri analizinden sonra:
- Ordinal kategorik değişkenlerim => 'T Stage', 'N Stage', '6th Stage'
- Nominal değişkenlerim => 'Race', 'Marital Status', 'differentiate', 'A Stage', 'Estrogen Status', 'Progesterone Status'
- 'Status' değişkenin ne olduğunu anlayamadım için veri setinden çıkaracağım

In [6]:
label_encoder = LabelEncoder()

data2['T Stage'] = label_encoder.fit_transform(data2['T Stage '])
data2['N Stage'] = label_encoder.fit_transform(data2['N Stage'])
data2['6th Stage'] = label_encoder.fit_transform(data2['6th Stage'])

In [7]:
data2.drop(["Status", "T Stage "], axis=1, inplace=True)

In [8]:
data2.head()

Unnamed: 0,Age,Race,Marital Status,N Stage,6th Stage,differentiate,Grade,A Stage,Tumor Size,Estrogen Status,Progesterone Status,Regional Node Examined,Reginol Node Positive,Survival Months,T Stage
0,68,White,Married,0,0,Poorly differentiated,3,Regional,4,Positive,Positive,24,1,60,0
1,50,White,Married,1,2,Moderately differentiated,2,Regional,35,Positive,Positive,14,5,62,1
2,58,White,Divorced,2,4,Moderately differentiated,2,Regional,63,Positive,Positive,14,7,75,2
3,58,White,Married,0,0,Poorly differentiated,3,Regional,18,Positive,Positive,2,1,84,0
4,47,White,Married,0,1,Poorly differentiated,3,Regional,41,Positive,Positive,3,1,50,1


In [9]:
data_encoded = pd.get_dummies(data2, columns=['Race', 'Marital Status', 'A Stage', 'Estrogen Status', 'Progesterone Status','differentiate'], dtype=int)

In [10]:
data_encoded.rename(columns={'Marital Status_Separated': 'Separated',
                   'Marital Status_Married': 'Married',
                   'Marital Status_Single ': 'Single',
                   'Marital Status_Widowed':'Widowed',
                   'Marital Status_Divorced':'Divorced',
                   'differentiate_Moderately differentiated':'Moderately',
                   'differentiate_Poorly differentiated':'Poorly',
                   'differentiate_Undifferentiated':'Undifferentiated',
                   'differentiate_Well differentiated':'Well',
                   'Race_Black':'Black',
                   'Race_Other':'Other',
                   'Race_White':'White'
                   }, inplace=True)

In [11]:
data_encoded.head()

Unnamed: 0,Age,N Stage,6th Stage,Grade,Tumor Size,Regional Node Examined,Reginol Node Positive,Survival Months,T Stage,Black,...,A Stage_Distant,A Stage_Regional,Estrogen Status_Negative,Estrogen Status_Positive,Progesterone Status_Negative,Progesterone Status_Positive,Moderately,Poorly,Undifferentiated,Well
0,68,0,0,3,4,24,1,60,0,0,...,0,1,0,1,0,1,0,1,0,0
1,50,1,2,2,35,14,5,62,1,0,...,0,1,0,1,0,1,1,0,0,0
2,58,2,4,2,63,14,7,75,2,0,...,0,1,0,1,0,1,1,0,0,0
3,58,0,0,3,18,2,1,84,0,0,...,0,1,0,1,0,1,0,1,0,0
4,47,0,1,3,41,3,1,50,1,0,...,0,1,0,1,0,1,0,1,0,0


In [12]:
data_encoded.columns

Index(['Age', 'N Stage', '6th Stage', 'Grade', 'Tumor Size',
       'Regional Node Examined', 'Reginol Node Positive', 'Survival Months',
       'T Stage', 'Black', 'Other', 'White', 'Divorced', 'Married',
       'Separated', 'Single', 'Widowed', 'A Stage_Distant', 'A Stage_Regional',
       'Estrogen Status_Negative', 'Estrogen Status_Positive',
       'Progesterone Status_Negative', 'Progesterone Status_Positive',
       'Moderately', 'Poorly', 'Undifferentiated', 'Well'],
      dtype='object')

#### Train/Test Split

In [13]:
X = data_encoded.drop('Survival Months', axis=1).values
y = data_encoded["Survival Months"]

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
column_names = ['Age', 'N Stage', '6th Stage', 'Grade', 'Tumor Size',
       'Regional Node Examined', 'Reginol Node Positive', 'T Stage', 'Black', 'Other', 'White', 'Divorced', 'Married',
       'Separated', 'Single', 'Widowed', 'A Stage_Distant', 'A Stage_Regional',
       'Estrogen Status_Negative', 'Estrogen Status_Positive',
       'Progesterone Status_Negative', 'Progesterone Status_Positive',
       'Moderately', 'Poorly', 'Undifferentiated', 'Well']

In [16]:
# Örnek: X_train ve X_test'i NumPy dizilerine dönüştürme
X_train_np = np.array(X_train)
X_test_np = np.array(X_test)

# Y_train ve Y_test'i NumPy dizilerine dönüştürme
y_train_np = np.array(y_train)
y_test_np = np.array(y_test)

In [17]:
# Örnek: X_train_np ve X_test_np'yi DataFrame'e dönüştürme
X_train_df = pd.DataFrame(X_train_np, columns=column_names)
X_test_df = pd.DataFrame(X_test_np, columns=column_names)

# y_train_np ve y_test_np'yi DataFrame'e dönüştürme
y_train_df = pd.DataFrame(y_train_np, columns=['target_column_name'])
y_test_df = pd.DataFrame(y_test_np, columns=['target_column_name'])

In [18]:
X_train_df=X_train_df.sort_index() # verileri sıraladık (indekse göre sıralanıyor)
X_test_df=X_test_df.sort_index()
y_train_df=y_train_df.sort_index()
y_test_df=y_test_df.sort_index()

In [19]:
print ('Train set:', X_train_df.shape,  y_train_df.shape)
print ('Test set:', X_test_df.shape,  y_test_df.shape)

Train set: (3219, 26) (3219, 1)
Test set: (805, 26) (805, 1)


### 1-Linear Regression

In [20]:
model = LinearRegression()
model.fit(X_train_df, y_train_df)

In [21]:
prediction = model.predict(X_test_df)
print(prediction[0:5])

[[74.97226519]
 [73.83854074]
 [71.67316013]
 [71.2093602 ]
 [75.57495473]]


In [22]:
print(y_train_df.head(3))

   target_column_name
0                  69
1                  64
2                  67


In [23]:
print(y_test_df.head(3))

   target_column_name
0                  80
1                  48
2                  61


### Evaluation-LR

In [24]:
mae = mean_absolute_error(y_test_df, prediction)
mse = mean_squared_error(y_test_df, prediction)
rmse = np.sqrt(mse)
r2 = r2_score(y_test_df, prediction)

print(f'MAE: {mae}')
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'R-squared: {r2}')

MAE: 18.378565175295783
MSE: 502.45535313509424
RMSE: 22.415515901604724
R-squared: 0.059889525617893846


- MAE, MSE ve RMSE değerleri, modelin tahminlerinin gerçek değerlere göre ortalama hata miktarını ölçer. Düşük değerler, modelin genellikle iyi bir performans sergilediğini gösterir.

- R-squared değeri düşük olduğu için, modelin bağımsız değişkenlerin bağımlı değişkeni açıklamakta sınırlı başarıya sahip olduğunu söyleyebiliriz. Bu, modelin veri setindeki değişkenliği açıklamakta zayıf olduğunu gösterir.

### 2-KNN

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [26]:
k=4
knn_neigh = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)

In [27]:
knn_neigh

In [28]:
y_pred = knn_neigh.predict(X_test)

In [29]:
print("Train set accuracy:", metrics.accuracy_score(y_train, knn_neigh.predict(X_train)))
print("Test set accuracy:", metrics.accuracy_score(y_test, y_pred))

Train set accuracy: 0.2736874805840323
Test set accuracy: 0.016149068322981366


- Eğitim seti doğruluğu test seti doğruluğundan çok daha yüksek, bu durum aşırı uyuma işaret edebilir. Model eğitim setinde iyi performans gösteriyor olabilir, ancak yeni gözlemler üzerinde kötü performans gösteriyor olabilir.

- Test seti doğruluğu oldukça düşük. Bu durum, modelin veri setindeki desenleri öğrenmekte zorlandığını veya doğru tahminler yapmak için yeterli bilgiye sahip olmadığını gösterir.

### Evaluation-KNN

In [30]:
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"R-squared (R2): {r2:.4f}")

Mean Squared Error (MSE): 1303.6311
Mean Absolute Error (MAE): 29.1491
R-squared (R2): -1.4391


### Model Tune-KNN

In [32]:
knn_params = {
    'n_neighbors': [3, 5, 7, 10],
    'metric': ['euclidean', 'manhattan', 'minkowski']
}

# 'euclidean'= öklidyen mesafesi
# 'manhattan distance'= manhattan mesafesi (şehir blokları) aynı düzlemde iki nokta
# 'minkowski distance'= öklidyen ve manhattan mesafelerini genelleyen formüldür;

In [34]:
knn_tune= GridSearchCV(knn_neigh, knn_params, cv=5, scoring='neg_mean_squared_error', verbose=1)

In [35]:
knn_tune.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits




In [37]:
# En iyi modelin hiperparametrelerini görüntüle
print(f'En iyi modelin hiperparametreleri: {knn_tune.best_params_}')

# Test setinde tahmin yapın
predictions = knn_tune.best_estimator_.predict(X_test)

# Hata metriklerini hesaplayın
mse = mean_squared_error(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, predictions)

print(f'Mean Squared Error (MSE): {mse}')
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f'Root Mean Squared Error (RMSE): {rmse}')
print(f"R-squared (R2): {r2:.4f}")

En iyi modelin hiperparametreleri: {'metric': 'manhattan', 'n_neighbors': 3}
Mean Squared Error (MSE): 1148.4534161490683
Mean Absolute Error (MAE): 27.2087
Root Mean Squared Error (RMSE): 33.88883910890233
R-squared (R2): -1.1488
