# Aykırı Gözlem Analizi

## Aykırı Değerleri Yakalamak

In [None]:
import seaborn as sns

# Load the diamonds dataset, keep only the float64/int64 columns and
# discard every row that still contains a missing value.
df = (
    sns.load_dataset('diamonds')
    .select_dtypes(include=['float64', 'int64'])
    .dropna()
)
df.head()

In [None]:
df_table = df["table"]

In [None]:
df_table.head()

In [None]:
sns.boxplot(x = df_table);

In [None]:
# First and third quartiles of the column, and the interquartile range.
Q1, Q3 = (df_table.quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1

In [None]:
Q1

In [None]:
Q3

In [None]:
IQR

In [None]:
# Lower / upper bounds placed 1.5 * IQR below Q1 and above Q3; the cells that
# follow compare the column against these bounds to flag outliers.
alt_sinir, ust_sinir = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

In [None]:
alt_sinir

In [None]:
ust_sinir

In [None]:
(df_table < alt_sinir) | (df_table > ust_sinir)

In [None]:
# Boolean mask of the observations below the lower bound. Only the lower side
# is flagged here; values above `ust_sinir` are not part of this mask.
aykiri_tf = alt_sinir > df_table

In [None]:
aykiri_tf.head()

In [None]:
df_table[aykiri_tf]

In [None]:
df_table[aykiri_tf].index

## Aykırı Değer Problemini Çözmek

In [None]:
df_table[aykiri_tf]

### Silme

In [None]:
import pandas as pd

In [None]:
type(df_table)

In [None]:
df_table = pd.DataFrame(df_table)

In [None]:
df_table.shape

In [None]:
# Keep only the rows in which no value lies below `alt_sinir` or above `ust_sinir`.
t_df = df_table.loc[
    ~((df_table < alt_sinir) | (df_table > ust_sinir)).any(axis=1)
]

In [None]:
t_df.shape

### Ortalama ile Doldurma

In [None]:
import seaborn as sns

# Reload a fresh numeric-only, NaN-free copy of the diamonds data so this
# section starts from unmodified values.
df = (
    sns.load_dataset('diamonds')
    .select_dtypes(include=['float64', 'int64'])
    .dropna()
)
df.head()

In [None]:
# Work on an independent copy of the column. The flagged values are overwritten
# a few cells below, and assigning into a Series taken straight from `df` is a
# chained assignment: older pandas emits SettingWithCopyWarning and whether the
# write also reaches `df` depends on the pandas version.
df_table = df["table"].copy()

In [None]:
aykiri_tf.head()

In [None]:
df_table[aykiri_tf]

In [None]:
df_table.mean()

In [None]:
# Replace the flagged observations with the column mean (the mean is computed
# on the right-hand side first, i.e. it still includes the flagged values).
df_table.loc[aykiri_tf] = df_table.mean()

In [None]:
df_table[aykiri_tf]

### Baskılama Yöntemi

In [None]:
import seaborn as sns

# Reload a fresh numeric-only, NaN-free copy of the diamonds data so the
# suppression example starts from unmodified values.
df = (
    sns.load_dataset('diamonds')
    .select_dtypes(include=['float64', 'int64'])
    .dropna()
)
df.head()

In [None]:
# Work on an independent copy of the column. The flagged values are overwritten
# with `alt_sinir` a few cells below, and assigning into a Series taken straight
# from `df` is a chained assignment: older pandas emits SettingWithCopyWarning
# and whether the write also reaches `df` depends on the pandas version.
df_table = df["table"].copy()

In [None]:
df_table[aykiri_tf]

In [None]:
alt_sinir

In [None]:
# Suppression: pull every flagged observation up to the lower bound.
df_table.loc[aykiri_tf] = alt_sinir

In [None]:
df_table[aykiri_tf]

## Çok Değişkenli Aykırı Gözlem Analizi

### Local Outlier Factor

Local Outlier Factor (LOF), gözlemleri bulundukları konumda yoğunluk tabanlı olarak skorlar ve bu skorlara göre aykırı değer olabilecek gözlemleri tanımlamamıza imkan sağlar.

Bir noktanın lokal yoğunluğu, komşularının lokal yoğunluğu ile karşılaştırılır. Eğer bir noktanın yoğunluğu komşularının yoğunluğundan anlamlı şekilde düşükse, bu noktanın komşularına göre daha seyrek bir bölgede bulunduğu yorumu yapılabilir. Dolayısıyla burada bir komşuluk yapısı söz konusudur: bir gözlemin çevresi komşularınınkine kıyasla yoğun değilse, bu gözlem aykırı değer olarak değerlendirilir.

In [2]:
import seaborn as sns

# `diamonds` keeps the numeric-only frame as loaded; `df` is an independent
# copy of it with the incomplete rows removed.
diamonds = sns.load_dataset('diamonds').select_dtypes(include=['float64', 'int64'])
df = diamonds.copy().dropna()
df.head()

Unnamed: 0,carat,depth,table,price,x,y,z
0,0.23,61.5,55.0,326,3.95,3.98,2.43
1,0.21,59.8,61.0,326,3.89,3.84,2.31
2,0.23,56.9,65.0,327,4.05,4.07,2.31
3,0.29,62.4,58.0,334,4.2,4.23,2.63
4,0.31,63.3,58.0,335,4.34,4.35,2.75


In [3]:
import numpy as np
from sklearn.neighbors import LocalOutlierFactor

In [4]:
clf = LocalOutlierFactor(n_neighbors = 20, contamination = 0.1)

In [5]:
clf.fit_predict(df)

  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


array([-1, -1, -1, ...,  1,  1,  1])

In [6]:
df_scores = clf.negative_outlier_factor_

In [7]:
df_scores[0:10]

array([-1.58352526, -1.59732899, -1.62278873, -1.33002541, -1.30712521,
       -1.28408436, -1.28428162, -1.26458706, -1.28422952, -1.27351342])

In [8]:
np.sort(df_scores)[0:20]

array([-8.60430658, -8.20889984, -5.86084355, -4.98415175, -4.81502092,
       -4.81502092, -4.61522833, -4.37081214, -4.29842288, -4.10492387,
       -4.0566648 , -4.01831733, -3.94882806, -3.82378797, -3.80135297,
       -3.75680919, -3.65947378, -3.59249261, -3.55564138, -3.47157375])

In [9]:
# Cut-off score: np.sort orders ascending, so index 13 picks the 14th most
# negative value of `df_scores`. The index 13 is hard-coded.
esik_deger = np.sort(df_scores)[13]

In [10]:
# NOTE: despite its name, this mask is True for scores strictly above the
# threshold, i.e. for the rows that are kept; the rows treated as outliers are
# selected further down with `~aykiri_tf`.
aykiri_tf = esik_deger < df_scores

In [11]:
aykiri_tf

array([ True,  True,  True, ...,  True,  True,  True])

In [12]:
### silme yöntemi

In [13]:
# Deletion: keep only the rows whose score is strictly above the threshold.
yeni_df = df.loc[df_scores > esik_deger]

In [14]:
yeni_df

Unnamed: 0,carat,depth,table,price,x,y,z
0,0.23,61.5,55.0,326,3.95,3.98,2.43
1,0.21,59.8,61.0,326,3.89,3.84,2.31
2,0.23,56.9,65.0,327,4.05,4.07,2.31
3,0.29,62.4,58.0,334,4.20,4.23,2.63
4,0.31,63.3,58.0,335,4.34,4.35,2.75
5,0.24,62.8,57.0,336,3.94,3.96,2.48
6,0.24,62.3,57.0,336,3.95,3.98,2.47
7,0.26,61.9,55.0,337,4.07,4.11,2.53
8,0.22,65.1,61.0,337,3.87,3.78,2.49
9,0.23,59.4,61.0,338,4.00,4.05,2.39


In [15]:
df[df_scores < esik_deger]

Unnamed: 0,carat,depth,table,price,x,y,z
6341,1.0,44.0,53.0,4032,6.31,6.24,4.12
10377,1.09,43.0,54.0,4778,6.53,6.55,4.12
24067,2.0,58.9,57.0,12210,8.09,58.9,8.06
35633,0.29,62.8,44.0,474,4.2,4.24,2.65
36503,0.3,51.0,67.0,945,4.67,4.62,2.37
38840,0.73,70.8,55.0,1049,5.51,5.34,3.84
41918,1.03,78.2,54.0,1262,5.72,5.59,4.42
45688,0.7,71.6,55.0,1696,5.47,5.28,3.85
48410,0.51,61.8,54.7,1970,5.12,5.15,31.8
49189,0.51,61.8,55.0,2075,5.15,31.8,5.12


In [16]:
df[df_scores == esik_deger]

Unnamed: 0,carat,depth,table,price,x,y,z
31230,0.45,68.6,57.0,756,4.73,4.5,3.19


In [None]:
### baskilama

In [17]:
baski_deger = df[df_scores == esik_deger]

In [18]:
aykirilar = df[~aykiri_tf]

In [19]:
aykirilar

Unnamed: 0,carat,depth,table,price,x,y,z
6341,1.0,44.0,53.0,4032,6.31,6.24,4.12
10377,1.09,43.0,54.0,4778,6.53,6.55,4.12
24067,2.0,58.9,57.0,12210,8.09,58.9,8.06
31230,0.45,68.6,57.0,756,4.73,4.5,3.19
35633,0.29,62.8,44.0,474,4.2,4.24,2.65
36503,0.3,51.0,67.0,945,4.67,4.62,2.37
38840,0.73,70.8,55.0,1049,5.51,5.34,3.84
41918,1.03,78.2,54.0,1262,5.72,5.59,4.42
45688,0.7,71.6,55.0,1696,5.47,5.28,3.85
48410,0.51,61.8,54.7,1970,5.12,5.15,31.8


In [None]:
aykirilar.to_records(index = False)

In [None]:
res = aykirilar.to_records(index = False)

In [None]:
# Overwrite every record in `res` with the row(s) stored in `baski_deger`.
# NOTE(review): this relies on NumPy broadcasting, so `baski_deger` presumably
# holds exactly one row (the observation whose score equals `esik_deger`) — confirm.
res[:] = baski_deger.to_records(index = False)

In [None]:
res

In [None]:
df[~aykiri_tf]

In [None]:
import pandas as pd
# Write the suppressed records back: build a DataFrame from `res` that carries
# the row labels of the masked rows, then assign it onto those same rows of `df`.
df[~aykiri_tf] = pd.DataFrame(res, index = df[~aykiri_tf].index)

In [None]:
df[~aykiri_tf]

# Eksik Veri Analizi

## Hızlı Çözüm

In [None]:
import numpy as np
import pandas as pd

# Toy data set: three numeric variables with missing entries.
# `np.nan` is used instead of the `np.NaN` alias, which NumPy 2.0 removed.
V1 = np.array([1, 3, 6, np.nan, 7, 1, np.nan, 9, 15])
V2 = np.array([7, np.nan, 5, 8, 12, np.nan, np.nan, 2, 3])
V3 = np.array([np.nan, 12, 5, 6, 14, 7, np.nan, 2, 31])
df = pd.DataFrame(
        {"V1" : V1,
         "V2" : V2,
         "V3" : V3}
)

df

In [None]:
df.isnull().sum()

In [None]:
df.notnull().sum()

In [None]:
df.isnull().sum().sum()

In [None]:
df.isnull()

In [None]:
df[df.isnull().any(axis = 1)]

In [None]:
df[df.notnull().all(axis = 1)]

In [None]:
df[df["V1"].notnull() & df["V2"].notnull()& df["V3"].notnull()]

In [None]:
#eksik degerlerin direk silinmesi

In [None]:
df.dropna()

In [None]:
df

In [None]:
#basit deger atama

In [None]:
df["V1"]

In [None]:
df["V1"].mean()

In [None]:
df["V1"].fillna(df["V1"].mean())

In [None]:
df["V2"].fillna(0)

In [None]:
df.apply(lambda x: x.fillna(x.mean()), axis = 0)

## Eksik Değerlerin Saptanması

In [None]:
# number of non-missing values in each variable
df.notnull().sum()

In [None]:
# number of missing values in each variable
df.isnull().sum()

In [None]:
# total number of missing values in the whole data set
df.isnull().sum().sum()

In [None]:
# observations (rows) that have at least one missing value
df[df.isnull().any(axis=1)]

In [None]:
# observations (rows) in which every value is present
df[df.notnull().all(axis=1)]

## Eksik Veri Yapısının Görselleştirilmesi

In [None]:
!pip install missingno

In [None]:
import missingno as msno

In [None]:
msno.bar(df);

In [None]:
msno.matrix(df);

In [None]:
df

In [None]:
import seaborn as sns
df = sns.load_dataset('planets')
df.head()

In [None]:
df.isnull().sum()

In [None]:
msno.matrix(df);

In [None]:
msno.heatmap(df);

## Silme Yöntemleri

In [None]:
import numpy as np
import pandas as pd
V1 = np.array([1,3,6,np.NaN,7,1,np.NaN,9,15])
V2 = np.array([7,np.NaN,5,8,12,np.NaN,np.NaN,2,3])
V3 = np.array([np.NaN,12,5,6,14,7,np.NaN,2,31])

df = pd.DataFrame(
        {"V1" : V1,
         "V2" : V2,
         "V3" : V3}        
)

df

In [None]:
df.dropna()

In [None]:
df

In [None]:
df.dropna(how = "all")

In [None]:
df.dropna(axis = 1)

In [None]:
df.dropna(axis = 1, how = "all")

In [None]:
df["sil_beni"] = np.nan

In [None]:
df

In [None]:
df.dropna(axis = 1, how = "all")

In [None]:
df

In [None]:
df.dropna(axis = 1, how = "all", inplace = True)

In [None]:
df

## Değer Atama Yöntemleri

In [None]:
import numpy as np
import pandas as pd

# Toy data set for the imputation examples: three numeric variables with gaps.
# `np.nan` is used instead of the `np.NaN` alias, which NumPy 2.0 removed.
V1 = np.array([1, 3, 6, np.nan, 7, 1, np.nan, 9, 15])
V2 = np.array([7, np.nan, 5, 8, 12, np.nan, np.nan, 2, 3])
V3 = np.array([np.nan, 12, 5, 6, 14, 7, np.nan, 2, 31])

df = pd.DataFrame(
        {"V1" : V1,
         "V2" : V2,
         "V3" : V3}
)

df

In [None]:
#sayısal degiskenlerde atama 

In [None]:
df["V1"].fillna(0)

In [None]:
df

In [None]:
df["V1"].fillna(df["V1"].mean())

In [None]:
# first approach for all variables: fill each column with that column's own mean
df.apply(lambda sutun: sutun.fillna(sutun.mean()))

In [None]:
#ikinci yol

In [None]:
# Fill every column's gaps with that column's mean (all columns at once).
df.fillna(df.mean())

In [None]:
# Same idea restricted to the label range V1..V2: only those columns are filled.
df.fillna(df.mean().loc["V1":"V2"])

In [None]:
df["V3"].fillna(df["V3"].median())

In [None]:
#ucuncu yol

In [None]:
# Keep the values that are present; elsewhere take the column mean,
# aligning the Series of means with the frame's columns.
df.where(pd.notna(df), other=df.mean(), axis=1)

## Kategorik Değişken Kırılımında Değer Atama

In [None]:
# Toy data set with a categorical grouping variable (`departman`).
# `np.nan` is used instead of the `np.NaN` alias, which NumPy 2.0 removed.
V1 = np.array([1, 3, 6, np.nan, 7, 1, np.nan, 9, 15])
V2 = np.array([7, np.nan, 5, 8, 12, np.nan, np.nan, 2, 3])
V3 = np.array([np.nan, 12, 5, 6, 14, 7, np.nan, 2, 31])
V4 = np.array(["IT", "IT", "IK", "IK", "IK", "IK", "IK", "IT", "IT"])

df = pd.DataFrame(
        {"maas" : V1,
         "V2" : V2,
         "V3" : V3,
         "departman" : V4}
)

df

In [None]:
df.groupby("departman")["maas"].mean()

In [None]:
df["maas"].fillna(df.groupby("departman")["maas"].transform("mean"))

## Kategorik Değişkenler için Eksik Değer Atama

In [1]:
import numpy as np
import pandas as pd

# Toy data set with a missing entry in the categorical variable `departman`.
# `np.nan` is used instead of the `np.NaN` alias, which NumPy 2.0 removed.
V1 = np.array([1, 3, 6, np.nan, 7, 1, np.nan, 9, 15])
# dtype=object keeps the NaN as a real missing value instead of the string "nan".
V4 = np.array(["IT", np.nan, "IK", "IK", "IK", "IK", "IK", "IT", "IT"], dtype=object)

df = pd.DataFrame(
        {"maas" : V1,
         "departman" : V4}
)

df

Unnamed: 0,maas,departman
0,1.0,IT
1,3.0,
2,6.0,IK
3,,IK
4,7.0,IK
5,1.0,IK
6,,IK
7,9.0,IT
8,15.0,IT


In [2]:
df["departman"].mode()[0]

'IK'

In [3]:
df["departman"].fillna(df["departman"].mode()[0])

0    IT
1    IK
2    IK
3    IK
4    IK
5    IK
6    IK
7    IT
8    IT
Name: departman, dtype: object

In [5]:
df

Unnamed: 0,maas,departman
0,1.0,IT
1,3.0,
2,6.0,IK
3,,IK
4,7.0,IK
5,1.0,IK
6,,IK
7,9.0,IT
8,15.0,IT


In [6]:
# Backward fill: each missing value takes the next valid value after it.
# `Series.bfill()` replaces `fillna(method="bfill")`, whose `method` argument
# is deprecated in recent pandas releases.
df["departman"].bfill()

0    IT
1    IK
2    IK
3    IK
4    IK
5    IK
6    IK
7    IT
8    IT
Name: departman, dtype: object

In [7]:
# Forward fill: each missing value takes the last valid value before it.
# `Series.ffill()` replaces `fillna(method="ffill")`, whose `method` argument
# is deprecated in recent pandas releases.
df["departman"].ffill()

0    IT
1    IT
2    IK
3    IK
4    IK
5    IK
6    IK
7    IT
8    IT
Name: departman, dtype: object

## Tahmine Dayalı Değer Atama Yöntemleri

In [1]:
import seaborn as sns
import missingno as msno
df = sns.load_dataset('titanic')
df = df.select_dtypes(include = ['float64', 'int64'])
print(df.head())
df.isnull().sum()

   survived  pclass   age  sibsp  parch     fare
0         0       3  22.0      1      0   7.2500
1         1       1  38.0      1      0  71.2833
2         1       3  26.0      0      0   7.9250
3         1       1  35.0      1      0  53.1000
4         0       3  35.0      0      0   8.0500


survived      0
pclass        0
age         177
sibsp         0
parch         0
fare          0
dtype: int64

In [21]:
!pip install ycimpute

Collecting ycimpute
  Using cached ycimpute-0.2-py3-none-any.whl (35 kB)


ERROR: Could not find a version that satisfies the requirement torch>=1.1.0 (from ycimpute) (from versions: 0.1.2, 0.1.2.post1, 0.1.2.post2)
ERROR: No matching distribution found for torch>=1.1.0 (from ycimpute)


In [22]:
from ycimpute.imputer import knnimput

ModuleNotFoundError: No module named 'ycimpute'

In [23]:
var_names = list(df)

In [24]:
import numpy as np
n_df = np.array(df)

In [25]:
n_df[0:10]

array([[2.30e-01, 6.15e+01, 5.50e+01, 3.26e+02, 3.95e+00, 3.98e+00,
        2.43e+00],
       [2.10e-01, 5.98e+01, 6.10e+01, 3.26e+02, 3.89e+00, 3.84e+00,
        2.31e+00],
       [2.30e-01, 5.69e+01, 6.50e+01, 3.27e+02, 4.05e+00, 4.07e+00,
        2.31e+00],
       [2.90e-01, 6.24e+01, 5.80e+01, 3.34e+02, 4.20e+00, 4.23e+00,
        2.63e+00],
       [3.10e-01, 6.33e+01, 5.80e+01, 3.35e+02, 4.34e+00, 4.35e+00,
        2.75e+00],
       [2.40e-01, 6.28e+01, 5.70e+01, 3.36e+02, 3.94e+00, 3.96e+00,
        2.48e+00],
       [2.40e-01, 6.23e+01, 5.70e+01, 3.36e+02, 3.95e+00, 3.98e+00,
        2.47e+00],
       [2.60e-01, 6.19e+01, 5.50e+01, 3.37e+02, 4.07e+00, 4.11e+00,
        2.53e+00],
       [2.20e-01, 6.51e+01, 6.10e+01, 3.37e+02, 3.87e+00, 3.78e+00,
        2.49e+00],
       [2.30e-01, 5.94e+01, 6.10e+01, 3.38e+02, 4.00e+00, 4.05e+00,
        2.39e+00]])

In [9]:
n_df.shape

(891, 6)

In [26]:
dff = knnimput.KNN(k = 4).complete(n_df)

NameError: name 'knnimput' is not defined

In [11]:
type(dff)

numpy.ndarray

In [13]:
import pandas as pd
dff = pd.DataFrame(dff, columns = var_names)

In [14]:
type(dff)

pandas.core.frame.DataFrame

In [15]:
dff.isnull().sum()

survived    0
pclass      0
age         0
sibsp       0
parch       0
fare        0
dtype: int64

In [None]:
#random forests

In [16]:
import seaborn as sns
import missingno as msno
df = sns.load_dataset('titanic')
df = df.select_dtypes(include = ['float64', 'int64'])

In [17]:
df.isnull().sum()

survived      0
pclass        0
age         177
sibsp         0
parch         0
fare          0
dtype: int64

In [18]:
var_names = list(df)

In [19]:
import numpy as np
n_df = np.array(df)

In [20]:
from ycimpute.imputer import iterforest
dff = iterforest.IterImput().complete(n_df)



In [21]:
dff = pd.DataFrame(dff, columns = var_names)

In [22]:
dff.isnull().sum()

survived    0
pclass      0
age         0
sibsp       0
parch       0
fare        0
dtype: int64

In [None]:
#EM 

In [23]:
import seaborn as sns
import missingno as msno
df = sns.load_dataset('titanic')
df = df.select_dtypes(include = ['float64', 'int64'])

In [24]:
from ycimpute.imputer import EM

In [25]:
var_names = list(df)

In [26]:
import numpy as np
n_df = np.array(df)

In [27]:
dff = EM().complete(n_df)

In [28]:
dff = pd.DataFrame(dff, columns = var_names)

In [29]:
dff.isnull().sum()

survived    0
pclass      0
age         0
sibsp       0
parch       0
fare        0
dtype: int64

# Değişken Standardizasyonu (Veri Standardizasyonu)

In [27]:
import numpy as np
import pandas as pd

# Three small integer variables, gathered into one frame of float columns.
V1 = np.array([1, 3, 6, 5, 7])
V2 = np.array([7, 7, 5, 8, 12])
V3 = np.array([6, 12, 5, 6, 14])
df = pd.DataFrame(dict(V1=V1, V2=V2, V3=V3)).astype(float)
df

Unnamed: 0,V1,V2,V3
0,1.0,7.0,6.0
1,3.0,7.0,12.0
2,6.0,5.0,5.0
3,5.0,8.0,6.0
4,7.0,12.0,14.0


## Standardizasyon

In [28]:
from sklearn import preprocessing 

In [29]:
preprocessing.scale(df)

array([[-1.57841037, -0.34554737, -0.70920814],
       [-0.64993368, -0.34554737,  0.92742603],
       [ 0.74278135, -1.2094158 , -0.98198051],
       [ 0.27854301,  0.08638684, -0.70920814],
       [ 1.2070197 ,  1.81412369,  1.47297076]])

In [30]:
df

Unnamed: 0,V1,V2,V3
0,1.0,7.0,6.0
1,3.0,7.0,12.0
2,6.0,5.0,5.0
3,5.0,8.0,6.0
4,7.0,12.0,14.0


## Normalizasyon

In [31]:
preprocessing.normalize(df)

array([[0.10783277, 0.75482941, 0.64699664],
       [0.21107926, 0.49251828, 0.84431705],
       [0.64699664, 0.53916387, 0.53916387],
       [0.4472136 , 0.71554175, 0.53665631],
       [0.35491409, 0.60842415, 0.70982818]])

## Min-Max Dönüşümü

In [34]:
scaler = preprocessing.MinMaxScaler(feature_range = (100,200))

In [33]:
scaler.fit_transform(df)

array([[100.        , 128.57142857, 111.11111111],
       [133.33333333, 128.57142857, 177.77777778],
       [183.33333333, 100.        , 100.        ],
       [166.66666667, 142.85714286, 111.11111111],
       [200.        , 200.        , 200.        ]])

## Değişken Dönüşümleri

In [16]:
import seaborn as sns
df = sns.load_dataset('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


### 0-1 Dönüşümü

In [17]:
from sklearn.preprocessing import LabelEncoder

lbe = LabelEncoder()

In [20]:
lbe.fit_transform(df["sex"])

array([0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
       1, 0])

In [22]:
df["yeni_sex"] = lbe.fit_transform(df["sex"])

In [23]:
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,yeni_sex
0,16.99,1.01,Female,No,Sun,Dinner,2,0
1,10.34,1.66,Male,No,Sun,Dinner,3,1
2,21.01,3.50,Male,No,Sun,Dinner,3,1
3,23.68,3.31,Male,No,Sun,Dinner,2,1
4,24.59,3.61,Female,No,Sun,Dinner,4,0
5,25.29,4.71,Male,No,Sun,Dinner,4,1
6,8.77,2.00,Male,No,Sun,Dinner,2,1
7,26.88,3.12,Male,No,Sun,Dinner,4,1
8,15.04,1.96,Male,No,Sun,Dinner,2,1
9,14.78,3.23,Male,No,Sun,Dinner,2,1


### "1 ve Diğerleri (0) " Dönüşümü

In [24]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,yeni_sex
0,16.99,1.01,Female,No,Sun,Dinner,2,0
1,10.34,1.66,Male,No,Sun,Dinner,3,1
2,21.01,3.5,Male,No,Sun,Dinner,3,1
3,23.68,3.31,Male,No,Sun,Dinner,2,1
4,24.59,3.61,Female,No,Sun,Dinner,4,0


In [27]:
df["day"].str.contains("Sun")

0       True
1       True
2       True
3       True
4       True
5       True
6       True
7       True
8       True
9       True
10      True
11      True
12      True
13      True
14      True
15      True
16      True
17      True
18      True
19     False
20     False
21     False
22     False
23     False
24     False
25     False
26     False
27     False
28     False
29     False
       ...  
214    False
215    False
216    False
217    False
218    False
219    False
220    False
221    False
222    False
223    False
224    False
225    False
226    False
227    False
228    False
229    False
230    False
231    False
232    False
233    False
234    False
235    False
236    False
237    False
238    False
239    False
240    False
241    False
242    False
243    False
Name: day, Length: 244, dtype: bool

In [25]:
import numpy as np 
df["yeni_day"] = np.where(df["day"].str.contains("Sun"), 1, 0)

In [26]:
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,yeni_sex,yeni_day
0,16.99,1.01,Female,No,Sun,Dinner,2,0,1
1,10.34,1.66,Male,No,Sun,Dinner,3,1,1
2,21.01,3.50,Male,No,Sun,Dinner,3,1,1
3,23.68,3.31,Male,No,Sun,Dinner,2,1,1
4,24.59,3.61,Female,No,Sun,Dinner,4,0,1
5,25.29,4.71,Male,No,Sun,Dinner,4,1,1
6,8.77,2.00,Male,No,Sun,Dinner,2,1,1
7,26.88,3.12,Male,No,Sun,Dinner,4,1,1
8,15.04,1.96,Male,No,Sun,Dinner,2,1,1
9,14.78,3.23,Male,No,Sun,Dinner,2,1,1


### Çok Sınıflı Dönüşüm

In [28]:
from sklearn.preprocessing import LabelEncoder
lbe = LabelEncoder()

In [29]:
lbe.fit_transform(df["day"])

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 3])

In [None]:
#cok dikkat 

### One-Hot Dönüşümü ve Dummy Değişken Tuzağı

In [38]:
df.head()

Unnamed: 0,V1,V2,V3
0,1.0,7.0,6.0
1,3.0,7.0,12.0
2,6.0,5.0,5.0
3,5.0,8.0,6.0
4,7.0,12.0,14.0


In [39]:
df_one_hot = pd.get_dummies(df, columns = ["sex"], prefix = ["sex"])

KeyError: "['sex'] not in index"

In [40]:
df_one_hot.head()

NameError: name 'df_one_hot' is not defined

In [41]:
pd.get_dummies(df, columns = ["day"], prefix = ["day"]).head()

KeyError: "['day'] not in index"

# Bölüm Sonu Değerlendirmesi

Veri Ön İşleme 

Aykırı Gözlem Analizi

Eksik Gözlem Analizi

Değişken Standartlaştırma

Değişken Dönüştürme