In [37]:
# Kütüphaneler
import pandas as pd
import numpy as np

# Load the dataset from the CSV file (path is relative to the working directory)
# and preview the first rows.
df = pd.read_csv("CCPP_data.csv")
df.head()


Unnamed: 0,AT,V,AP,RH,PE
0,14.96,41.76,1024.07,73.17,463.26
1,25.18,62.96,1020.04,59.08,444.37
2,5.11,39.4,1012.16,92.14,488.56
3,20.86,57.32,1010.24,76.64,446.48
4,10.82,37.5,1009.23,96.62,473.9


In [38]:
# Column dtypes and non-null counts
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9568 entries, 0 to 9567
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   AT      9568 non-null   float64
 1   V       9568 non-null   float64
 2   AP      9568 non-null   float64
 3   RH      9568 non-null   float64
 4   PE      9568 non-null   float64
dtypes: float64(5)
memory usage: 373.9 KB


In [40]:
# Number of rows and columns, as a (rows, columns) tuple.
# A notebook cell renders only its last expression, so this is the single
# statement in the cell.
df.shape


(9568, 5)

In [41]:
# Summary statistics, transposed so that each column of df becomes one row
df.describe().transpose()


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
AT,9568.0,19.651231,7.452473,1.81,13.51,20.345,25.72,37.11
V,9568.0,54.305804,12.707893,25.36,41.74,52.08,66.54,81.56
AP,9568.0,1013.259078,5.938784,992.89,1009.1,1012.94,1017.26,1033.3
RH,9568.0,73.308978,14.600269,25.56,63.3275,74.975,84.83,100.16
PE,9568.0,454.365009,17.066995,420.26,439.75,451.55,468.43,495.76


In [42]:
# Missing-value check: number of NaN entries per column
df.isna().sum()


AT    0
V     0
AP    0
RH    0
PE    0
dtype: int64

In [44]:
# Create artificial missing values on a copy, so df itself is left untouched.
# .loc label slices include both endpoints: 50..55 is 6 rows, 5..15 is 11 rows.
df_missing = df.copy()
df_missing.loc[50:55, "RH"] = np.nan
df_missing.loc[5:15, "AT"] = np.nan

# Per-column NaN counts after the injection
df_missing.isna().sum()


AT    11
V      0
AP     0
RH     6
PE     0
dtype: int64

In [45]:
# Fill the missing values: AT with its column mean, RH with its column median.
# fillna with a column -> value mapping returns a new frame; df_missing is unchanged.
df_filled = df_missing.fillna(
    {
        "AT": df_missing["AT"].mean(),
        "RH": df_missing["RH"].median(),
    }
)

# Confirm that no NaN entries remain
df_filled.isna().sum()

AT    0
V     0
AP    0
RH    0
PE    0
dtype: int64

In [None]:
# Filtering (selecting specific rows with boolean masks).
# A notebook cell renders only its last expression, so each filtered frame is
# given a name and shown explicitly instead of being computed and discarded.
hot_rows = df[df["AT"] > 20]  # days hotter than 20 degrees
hot_dry_rows = df[(df["AT"] > 20) & (df["RH"] < 50)]  # temperature > 20 and humidity < 50
below_mean_pe_rows = df[df["PE"] < df["PE"].mean()]  # samples below the mean energy output

# `display` is provided by IPython in notebook sessions; previews are limited
# to the first rows to keep the output short.
display(hot_rows.head())
display(hot_dry_rows.head())
below_mean_pe_rows.head()


Unnamed: 0,AT,V,AP,RH,PE,AT_level,AP_level
1,25.18,62.96,1020.04,59.08,444.37,High,High
3,20.86,57.32,1010.24,76.64,446.48,High,Mid
5,26.27,59.44,1012.23,58.77,443.67,High,Mid
10,17.99,43.72,1008.64,75.04,453.02,Low,Mid
11,20.14,46.93,1014.66,64.22,453.99,High,Mid


In [46]:
# Sorting: the five rows with the highest AT, in descending order
df.sort_values("AT", ascending=False).head(5)


Unnamed: 0,AT,V,AP,RH,PE
5349,37.11,68.94,1006.23,31.15,429.25
6435,35.77,73.56,1006.36,36.31,430.14
89,35.56,68.94,1006.56,38.75,429.69
8423,35.2,73.56,1006.56,45.72,434.37
2866,35.1,68.27,1006.96,43.51,426.3


In [51]:
# Creating new columns.
# Temperature level: "High" when AT > 20, otherwise "Low". The comparison is
# vectorized with np.where instead of calling a Python lambda per row; NaN
# compares False and therefore still maps to "Low", as before.
df["AT_level"] = np.where(df["AT"] > 20, "High", "Low")
# Pressure level: AP split into 3 equal-width bins labelled Low / Mid / High
df["AP_level"] = pd.cut(df["AP"], bins=3, labels=["Low", "Mid", "High"])
df.head()




Unnamed: 0,AT,V,AP,RH,PE,AT_level,AP_level
0,14.96,41.76,1024.07,73.17,463.26,Low,High
1,25.18,62.96,1020.04,59.08,444.37,High,High
2,5.11,39.4,1012.16,92.14,488.56,Low,Mid
3,20.86,57.32,1010.24,76.64,446.48,High,Mid
4,10.82,37.5,1009.23,96.62,473.9,Low,Mid


In [54]:
# Aggregation with groupby: mean energy output (PE) per temperature level
df.groupby("AT_level")["PE"].agg("mean")


AT_level
High    440.431948
Low     469.070301
Name: PE, dtype: float64

In [68]:
# Mean energy output (PE) per pressure level.
# observed=False keeps every category of AP_level in the result.
df.groupby("AP_level", observed=False)["PE"].agg("mean")

AP_level
Low     443.960411
Mid     452.804191
High    470.904665
Name: PE, dtype: float64

In [57]:
# Check for duplicated rows: count rows that repeat an earlier row
df.duplicated().sum()

np.int64(41)

In [58]:
# Remove duplicated rows, keeping the first occurrence of each.
# NOTE: this rebinds df, so every later cell works on the de-duplicated frame;
# the original row labels are kept (the index is not reset).
df = df.drop_duplicates()

In [None]:
# Data type of each column
df.dtypes


AT           float64
V            float64
AP           float64
RH           float64
PE           float64
AT_level      object
AP_level    category
dtype: object

In [60]:
# IQR-based outlier limits for every numeric column

numeric_df = df.select_dtypes("number")

# Lower and upper quartiles, one value per column
Q1, Q3 = (numeric_df.quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1

# Fences placed 1.5 * IQR below Q1 and above Q3
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

print("Aykırı değer alt sınırları:\n", lower_bound)
print("\nAykırı değer üst sınırları:\n", upper_bound)



Aykırı değer alt sınırları:
 AT     -4.7400
V       4.5850
AP    996.9125
RH     31.1625
PE    396.8275
dtype: float64

Aykırı değer üst sınırları:
 AT      43.9800
V      103.6650
AP    1029.3725
RH     117.0625
PE     511.2875
dtype: float64


In [62]:
# Pairwise Pearson correlation between the numeric columns
df_numeric = df.select_dtypes("number")
df_numeric.corr(method="pearson")



Unnamed: 0,AT,V,AP,RH,PE
AT,1.0,0.843689,-0.508222,-0.543947,-0.947908
V,0.843689,1.0,-0.415718,-0.312214,-0.8699
AP,-0.508222,-0.415718,1.0,0.101631,0.518687
RH,-0.543947,-0.312214,0.101631,1.0,0.391175
PE,-0.947908,-0.8699,0.518687,0.391175,1.0


In [64]:
# Min-max scaling of the numeric columns: (x - min) / (max - min), per column
df_numeric = df.select_dtypes("number")

df_norm = df_numeric.sub(df_numeric.min()).div(df_numeric.max().sub(df_numeric.min()))
df_norm.head()


Unnamed: 0,AT,V,AP,RH,PE
0,0.372521,0.291815,0.771591,0.638204,0.569536
1,0.66204,0.669039,0.671863,0.44933,0.319338
2,0.093484,0.249822,0.476862,0.892493,0.904636
3,0.53966,0.568683,0.429349,0.684718,0.347285
4,0.255241,0.216014,0.404355,0.952547,0.710464


In [65]:
# Build the clean frame by removing the two derived categorical columns
df_clean = df.drop(["AT_level", "AP_level"], axis="columns")
df_clean.head()


Unnamed: 0,AT,V,AP,RH,PE
0,14.96,41.76,1024.07,73.17,463.26
1,25.18,62.96,1020.04,59.08,444.37
2,5.11,39.4,1012.16,92.14,488.56
3,20.86,57.32,1010.24,76.64,446.48
4,10.82,37.5,1009.23,96.62,473.9


In [66]:
# Save the cleaned dataset as a CSV file; index=False leaves the row index out of the file
df_clean.to_csv("cleaned_CCPP.csv", index=False)


In [67]:
# Read the saved clean dataset back and preview its first rows

pd.read_csv("cleaned_CCPP.csv").head()


Unnamed: 0,AT,V,AP,RH,PE
0,14.96,41.76,1024.07,73.17,463.26
1,25.18,62.96,1020.04,59.08,444.37
2,5.11,39.4,1012.16,92.14,488.56
3,20.86,57.32,1010.24,76.64,446.48
4,10.82,37.5,1009.23,96.62,473.9
