# Verilerin İstatistiksel Olarak İncelenmesi ve Aykırı Değerlerin Ayıklanması

In [138]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [139]:
# Read the input CSV into the working DataFrame used by every later cell.
csv_path = "temizlenmis_arac_verisi.csv"
df = pd.read_csv(csv_path)

In [140]:
# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly; wrapping it in print() only adds a stray "None".
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1551 entries, 0 to 1550
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   model        1551 non-null   object
 1   model_detay  1551 non-null   object
 2   model_yili   1551 non-null   int64 
 3   fiyat        1551 non-null   int64 
 4   sehir        1551 non-null   object
 5   ilce         1551 non-null   object
 6   ilan_turu    1551 non-null   object
 7   tip          1551 non-null   object
 8   marka        1551 non-null   object
 9   motor_hacmi  1551 non-null   object
 10  motor_tipi   1551 non-null   object
 11  donanim      1551 non-null   object
dtypes: int64(2), object(10)
memory usage: 145.5+ KB
None


In [141]:
# Convert the text columns below to the pandas "category" dtype, one column
# at a time.
for categorical_column in (
    "sehir",
    "ilce",
    "ilan_turu",
    "tip",
    "marka",
    "motor_hacmi",
    "motor_tipi",
    "donanim",
):
    df[categorical_column] = df[categorical_column].astype("category")


In [142]:
# Re-inspect the dtypes after the category conversion. DataFrame.info()
# prints its own report and returns None, so no print() wrapper is used
# (the wrapper would append a stray "None" line).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1551 entries, 0 to 1550
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   model        1551 non-null   object  
 1   model_detay  1551 non-null   object  
 2   model_yili   1551 non-null   int64   
 3   fiyat        1551 non-null   int64   
 4   sehir        1551 non-null   category
 5   ilce         1551 non-null   category
 6   ilan_turu    1551 non-null   category
 7   tip          1551 non-null   category
 8   marka        1551 non-null   category
 9   motor_hacmi  1551 non-null   category
 10  motor_tipi   1551 non-null   category
 11  donanim      1551 non-null   category
dtypes: category(8), int64(2), object(2)
memory usage: 97.8+ KB
None


In [143]:
# Count the missing values in each column and print the per-column totals.
missing_per_column = df.isnull().sum()
print(missing_per_column)

model          0
model_detay    0
model_yili     0
fiyat          0
sehir          0
ilce           0
ilan_turu      0
tip            0
marka          0
motor_hacmi    0
motor_tipi     0
donanim        0
dtype: int64


In [144]:
def _iqr_bounds(values):
    """Return the (lower, upper) limits Q1 - 1.5*IQR and Q3 + 1.5*IQR for a Series."""
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile
    return first_quartile - 1.5 * spread, third_quartile + 1.5 * spread


# 1. Turn negative prices into positive ones.
df["fiyat"] = df["fiyat"].abs()

# 2. Select only the numeric columns.
columns = df.select_dtypes(include=[np.number]).columns

# 3. Compute the min/max limits per column (IQR method). The two lists are
#    kept in the same order as `columns`.
min_values = []
max_values = []

for column in columns:
    min_value, max_value = _iqr_bounds(df[column])
    min_values.append(min_value)
    max_values.append(max_value)
    print(f"📊 Sütun: {column}, Min sınır: {min_value:.2f}, Max sınır: {max_value:.2f}")


📊 Sütun: model_yili, Min sınır: 1997.50, Max sınır: 2033.50
📊 Sütun: fiyat, Min sınır: -505625.00, Max sınır: 2267375.00


# Aykırı Değerlerin Temizlenmesi

In [145]:
# Keep only the rows whose value lies inside [lower, upper] (both ends
# inclusive) for every numeric column, narrowing df one column at a time.
for column, lower_limit, upper_limit in zip(columns, min_values, max_values):
    within_limits = (df[column] >= lower_limit) & (df[column] <= upper_limit)
    df = df[within_limits]


In [146]:
# Check the row count and dtypes after the outlier filter. DataFrame.info()
# prints the report itself and returns None, so it is not wrapped in print()
# (that would emit an extra "None" line).
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1419 entries, 0 to 1550
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   model        1419 non-null   object  
 1   model_detay  1419 non-null   object  
 2   model_yili   1419 non-null   int64   
 3   fiyat        1419 non-null   int64   
 4   sehir        1419 non-null   category
 5   ilce         1419 non-null   category
 6   ilan_turu    1419 non-null   category
 7   tip          1419 non-null   category
 8   marka        1419 non-null   category
 9   motor_hacmi  1419 non-null   category
 10  motor_tipi   1419 non-null   category
 11  donanim      1419 non-null   category
dtypes: category(8), int64(2), object(2)
memory usage: 103.1+ KB
None


In [147]:
# Lift the column display limit so wide frames are printed without truncation.
pd.set_option("display.max_columns", None)

# Then print summary statistics for every column (include='all');
# a plain df.describe() would be the alternative.
summary = df.describe(include='all')
print(summary)


                              model  \
count                          1419   
unique                          622   
top     Fiat Egea 1.3 Multijet Easy   
freq                             21   
mean                            NaN   
std                             NaN   
min                             NaN   
25%                             NaN   
50%                             NaN   
75%                             NaN   
max                             NaN   

                                           model_detay   model_yili  \
count                                             1419  1419.000000   
unique                                             884          NaN   
top     OSCAR'dan DÜŞÜK KM DEĞİŞENSİZ TİGUAN 4 EMİTİON          NaN   
freq                                                 9          NaN   
mean                                               NaN  2014.846371   
std                                                NaN     6.001675   
min                               

In [148]:
# Write the filtered frame to disk, leaving the index out of the file.
output_path = "cleaned_data.csv"
df.to_csv(output_path, index=False)