___

<p style="text-align: center;"><img src="https://docs.google.com/uc?id=1lY0Uj5R04yMY3-ZppPWxqCr5pvBLYPnV" class="img-fluid" alt="CLRSWY"></p>

___

<h1 style="text-align: center;">Pandas Lesson, Session - 4</h1>
    

# Data Frames

 - ### ``DataFrames`` are the workhorse of pandas and are directly inspired by the R programming language. We can think of a DataFrame as a bunch of Series objects put together to share the same index. Let's use pandas to explore this topic!

In [1]:
import numpy as np
import pandas as pd

 ## Creating a DataFrame from ``list``s of data and column names

pd.DataFrame(data=None, index=None, columns=None, dtype=None, copy=False)

Two-dimensional, size-mutable, potentially heterogeneous tabular data.

In [2]:
# Plain Python list used as the raw data for the constructors below
data = [1, 3, 5, 7, 9, 18]
data

[1, 3, 5, 7, 9, 18]

In [3]:
pd.DataFrame(data)  # A flat list becomes a single column with the default label 0

Unnamed: 0,0
0,1
1,3
2,5
3,7
4,9
5,18


In [4]:
pd.Series(data)  # The same list as a Series, for comparison with the DataFrame above

0     1
1     3
2     5
3     7
4     9
5    18
dtype: int64

In [5]:
pd.DataFrame(data, columns=["col_1"])  # Same frame, with an explicit column label

Unnamed: 0,col_1
0,1
1,3
2,5
3,7
4,9
5,18


 ## Creating a DataFrame using a ``NumPy array``

In [6]:
# Odd numbers from 1 to 23 arranged as 3 rows by 4 columns
m = np.arange(1, 24, 2).reshape((3, 4))
m

array([[ 1,  3,  5,  7],
       [ 9, 11, 13, 15],
       [17, 19, 21, 23]])

In [7]:
df=pd.DataFrame(data=m, columns=['var1','var2','var3','var4'])  # One name is required per column, otherwise an error is raised. Careful: using a keyword name other than `columns` did not work!
df

Unnamed: 0,var1,var2,var3,var4
0,1,3,5,7
1,9,11,13,15
2,17,19,21,23


In [8]:
df.head(1)  # Returns as many rows from the top as the number passed in

Unnamed: 0,var1,var2,var3,var4
0,1,3,5,7


In [9]:
df.head(2)

Unnamed: 0,var1,var2,var3,var4
0,1,3,5,7
1,9,11,13,15


In [10]:
df.sample(2)  # Returns 2 randomly chosen rows

Unnamed: 0,var1,var2,var3,var4
2,17,19,21,23
1,9,11,13,15


In [11]:
df.columns  # Returns the column names

Index(['var1', 'var2', 'var3', 'var4'], dtype='object')

In [12]:
for col_name in df.columns:  # The column labels can be visited one at a time
    print(col_name)

var1
var2
var3
var4


In [13]:
for col_name in df.columns:  # Visit the columns one at a time and run an operation on each
    print(df[col_name].sum())

27
33
39
45


In [14]:
df.columns=["new1", "new2", "new3", "new4"]  # Renamed the columns by assigning a new list of names
df

Unnamed: 0,new1,new2,new3,new4
0,1,3,5,7
1,9,11,13,15
2,17,19,21,23


In [15]:
df.index=["a", "b", "c"]  # Relabelled the rows the same way, by assigning to .index
df

Unnamed: 0,new1,new2,new3,new4
a,1,3,5,7
b,9,11,13,15
c,17,19,21,23


In [16]:
df.rename(columns = {"new1": "a", "new2": "b"})  # Renames only the mapped columns in the returned frame; df itself is unchanged (see the df cell below)

Unnamed: 0,a,b,new3,new4
a,1,3,5,7
b,9,11,13,15
c,17,19,21,23


In [17]:
df.rename(index= {"a": 1, "b": 2})  # Same idea for row labels; labels missing from the mapping ("c") are kept as they are

Unnamed: 0,new1,new2,new3,new4
1,1,3,5,7
2,9,11,13,15
c,17,19,21,23


In [18]:
df

Unnamed: 0,new1,new2,new3,new4
a,1,3,5,7
b,9,11,13,15
c,17,19,21,23


In [19]:
# If we want a change to be permanent, we pass inplace=True.

In [20]:
df.shape

(3, 4)

In [21]:
df.shape[1]

4

In [22]:
df.ndim

2

In [23]:
df.size

12

In [24]:
len(df)  # Returns the number of rows

3

In [25]:
df.values

array([[ 1,  3,  5,  7],
       [ 9, 11, 13, 15],
       [17, 19, 21, 23]])

In [26]:
type(df)

pandas.core.frame.DataFrame

In [27]:
type(df.values)

numpy.ndarray

 ## Creating a DataFrame using a ``dict``

In [28]:
# Three arrays of 4 random integers each, used as the columns of the next frame.
# NOTE(review): no seed is set here, so the values will differ on every run — confirm whether that is intended.
s1 = np.random.randint(2,10, size = 4)
s2 = np.random.randint(3,10, size = 4)
s3 = np.random.randint(4,15, size = 4)

In [29]:
s1

array([5, 3, 3, 8])

In [30]:
s2

array([6, 9, 8, 6])

In [31]:
s3

array([ 5,  7, 11,  9])

In [32]:
# Map each column name to the array that will fill that column
my_dict = dict(var1=s1, var2=s2, var3=s3)
my_dict

{'var1': array([5, 3, 3, 8]),
 'var2': array([6, 9, 8, 6]),
 'var3': array([ 5,  7, 11,  9])}

In [33]:
df1 = pd.DataFrame(my_dict)
df1

Unnamed: 0,var1,var2,var3
0,5,6,5
1,3,9,7
2,3,8,11
3,8,6,9


In [34]:
df1.index

RangeIndex(start=0, stop=4, step=1)

In [35]:
list(df1.index)  # Materialise the index labels as a plain Python list

[0, 1, 2, 3]

In [36]:
"var2" in df1  # For operations like this, the columns of a DataFrame can be thought of as playing the role of the index of a Series

True

### Now, let's examine again the ***indexing, selection*** and ***slicing*** methods and several ***attributes*** using a different DataFrame

In [37]:
from numpy.random import randn

In [38]:
np.random.seed(101)  # Fix the seed before drawing the random values
df3 = pd.DataFrame(randn(5,4), index = 'A B C D E'.split(), columns = 'W X Y Z'.split())  # Careful: it does not work when I try names other than `columns` and `index` for these keywords!
df3

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


## Selection and Indexing

Let's learn the various methods to grab data from a DataFrame

In [39]:
df3["W"]

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

#### DataFrame Columns are just Series

In [40]:
type(df3["W"])  # It is of type Series

pandas.core.series.Series

In [41]:
df3[["W"]]

Unnamed: 0,W
A,2.70685
B,0.651118
C,-2.018168
D,0.188695
E,0.190794


In [42]:
type(df3[["W"]])  # It is of type DataFrame

pandas.core.frame.DataFrame

In [43]:
df3

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [44]:
df3[["W", "Z"]]  # Selecting two columns at once

Unnamed: 0,W,Z
A,2.70685,0.503826
B,0.651118,0.605965
C,-2.018168,-0.589001
D,0.188695,0.955057
E,0.190794,0.683509


In [45]:
df3["W":"Y"]  # Careful, this is a misuse! The slice is interpreted as a row selection, so all columns are returned (and, as the output shows, no rows)

Unnamed: 0,W,X,Y,Z


In [46]:
df3["A":"C"]  # Returns rows A, B and C

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001


In [47]:
df3["A":"C"]["W"]

A    2.706850
B    0.651118
C   -2.018168
Name: W, dtype: float64

In [48]:
df3["A":"C"][["W","Y"]]  # Selected rows A, B, C and columns W, Y

Unnamed: 0,W,Y
A,2.70685,0.907969
B,0.651118,-0.848077
C,-2.018168,0.528813


**Creating a new column:**

In [49]:
df3["new"] = df3["X"] * df3["Y"]  # New column holding the row-by-row product of X and Y
df3

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,0.570325
B,0.651118,-0.319318,-0.848077,0.605965,0.270806
C,-2.018168,0.740122,0.528813,-0.589001,0.391387
D,0.188695,-0.758872,-0.933237,0.955057,0.708208
E,0.190794,1.978757,2.605967,0.683509,5.156577


In [50]:
df3["new2"] = list(range(1, 6))  # A column can also be filled from a list with one value per row
df3

Unnamed: 0,W,X,Y,Z,new,new2
A,2.70685,0.628133,0.907969,0.503826,0.570325,1
B,0.651118,-0.319318,-0.848077,0.605965,0.270806,2
C,-2.018168,0.740122,0.528813,-0.589001,0.391387,3
D,0.188695,-0.758872,-0.933237,0.955057,0.708208,4
E,0.190794,1.978757,2.605967,0.683509,5.156577,5


In [51]:
df3 = df3[["new", "new2", "W", "X", "Y", "Z"]]  # Reorder the columns by selecting them in the desired order
df3

Unnamed: 0,new,new2,W,X,Y,Z
A,0.570325,1,2.70685,0.628133,0.907969,0.503826
B,0.270806,2,0.651118,-0.319318,-0.848077,0.605965
C,0.391387,3,-2.018168,0.740122,0.528813,-0.589001
D,0.708208,4,0.188695,-0.758872,-0.933237,0.955057
E,5.156577,5,0.190794,1.978757,2.605967,0.683509


## [Removing Columns & Rows](http://localhost:8888/notebooks/pythonic/DAwPythonSessions/w3resource-pandas-dataframe-drop.ipynb)

 ### Removing Columns

In [52]:
df3.drop("new2", axis=1)  # Dropped the new2 column (in the returned frame)

Unnamed: 0,new,W,X,Y,Z
A,0.570325,2.70685,0.628133,0.907969,0.503826
B,0.270806,0.651118,-0.319318,-0.848077,0.605965
C,0.391387,-2.018168,0.740122,0.528813,-0.589001
D,0.708208,0.188695,-0.758872,-0.933237,0.955057
E,5.156577,0.190794,1.978757,2.605967,0.683509


In [53]:
df3  # The drop call did not make a permanent change

Unnamed: 0,new,new2,W,X,Y,Z
A,0.570325,1,2.70685,0.628133,0.907969,0.503826
B,0.270806,2,0.651118,-0.319318,-0.848077,0.605965
C,0.391387,3,-2.018168,0.740122,0.528813,-0.589001
D,0.708208,4,0.188695,-0.758872,-0.933237,0.955057
E,5.156577,5,0.190794,1.978757,2.605967,0.683509


In [54]:
df3.drop(["new", "new2"], axis=1)  # axis=1 means columns, axis=0 means rows

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [55]:
# df3.drop(["new", "new2"], axis=1, inplace=True)  # Makes the change permanent.

 ### Removing rows

In [56]:
df3.drop("C", axis=0)  # axis=0 drops a row by its label; df3 itself still contains row C afterwards

Unnamed: 0,new,new2,W,X,Y,Z
A,0.570325,1,2.70685,0.628133,0.907969,0.503826
B,0.270806,2,0.651118,-0.319318,-0.848077,0.605965
D,0.708208,4,0.188695,-0.758872,-0.933237,0.955057
E,5.156577,5,0.190794,1.978757,2.605967,0.683509


In [57]:
df4 = df3.drop("C", axis=0)  # The other way to make the change permanent: assign the result to a variable
df4

Unnamed: 0,new,new2,W,X,Y,Z
A,0.570325,1,2.70685,0.628133,0.907969,0.503826
B,0.270806,2,0.651118,-0.319318,-0.848077,0.605965
D,0.708208,4,0.188695,-0.758872,-0.933237,0.955057
E,5.156577,5,0.190794,1.978757,2.605967,0.683509


## Selecting Rows

### First, let's take a quick look at [`.loc[]`](http://localhost:8888/notebooks/pythonic/DAwPythonSessions/w3resource-pandas-dataframe-loc.ipynb) | [`.iloc[]`](http://localhost:8888/notebooks/pythonic/DAwPythonSessions/w3resource-pandas-dataframe-iloc.ipynb)

#### `.loc[]` → allows us to select data using **labels** (names) of rows (index) & columns

#### `.iloc[]` → allows us to select data using **index numbers** (integer positions) of rows (index) & columns. It works like classic positional indexing.

In [68]:
# 8x4 frame of random integers; this rebinds the names `m` and `df4` used in earlier cells.
# NOTE(review): no seed is set, so a re-run will not reproduce the values shown in the outputs below — confirm whether that is intended.
m = np.random.randint(1,40, size=(8,4))
df4 = pd.DataFrame(m, columns = ["var1","var2","var3",'var4'])
df4

Unnamed: 0,var1,var2,var3,var4
0,12,11,31,36
1,29,4,20,21
2,15,6,6,7
3,25,38,8,38
4,5,24,36,16
5,35,4,19,14
6,4,38,30,23
7,22,22,18,24


In [69]:
df4.loc[4]  # Returns the row labelled 4  # loc works with the names (labels) of the index and the columns.

var1     5
var2    24
var3    36
var4    16
Name: 4, dtype: int32

In [70]:
df4.loc[[4]]  # With the extra square brackets (a list), the result comes back as a DataFrame.

Unnamed: 0,var1,var2,var3,var4
4,5,24,36,16


In [71]:
df4.loc[2:5]  # loc selects by the labels stored in the index, not by position; as the output shows, the end label 5 is included

Unnamed: 0,var1,var2,var3,var4
2,15,6,6,7
3,25,38,8,38
4,5,24,36,16
5,35,4,19,14


In [72]:
df4.iloc[2:5]  # iloc selects by integer position; as the output shows, position 5 is excluded

Unnamed: 0,var1,var2,var3,var4
2,15,6,6,7
3,25,38,8,38
4,5,24,36,16


In [90]:
df4.index='a b c d e f g h'.split()  # `index` refers to the row labels and `columns` to the column labels. These attribute names are fixed and cannot be changed.
df4

Unnamed: 0,var1,var2,var3,var4
a,12,11,31,36
b,29,4,20,21
c,15,6,6,7
d,25,38,8,38
e,5,24,36,16
f,35,4,19,14
g,4,38,30,23
h,22,22,18,24


In [91]:
df4.iloc[1:4]  # iloc works with the integer positions kept behind the scenes, whereas loc looks at the index labels.

Unnamed: 0,var1,var2,var3,var4
b,29,4,20,21
c,15,6,6,7
d,25,38,8,38


In [92]:
# If I want a DataFrame, I use double brackets. The output above is a DataFrame.

In [93]:
# df4.loc[1:4] # Raises an error, because my labels are letters, not numbers. loc looks up exactly the labels it is given, and the index values are no longer numeric.

In [95]:
df4.loc["a":"c"]

Unnamed: 0,var1,var2,var3,var4
a,12,11,31,36
b,29,4,20,21
c,15,6,6,7


In [None]:
# df4.iloc["a":"c"]  # With iloc we must pass integer values, otherwise an error is raised. A mnemonic: the "i" in (i)loc stands for int.

In [97]:
df4

Unnamed: 0,var1,var2,var3,var4
a,12,11,31,36
b,29,4,20,21
c,15,6,6,7
d,25,38,8,38
e,5,24,36,16
f,35,4,19,14
g,4,38,30,23
h,22,22,18,24


In [98]:
df4.iloc[3,1]  # Written using integer positions [row, column] to fetch the value 38

38

In [99]:
df4.loc["d","var2"]  # [row, column]

38

In [100]:
df4.loc["d":"g","var3"]  

d     8
e    36
f    19
g    30
Name: var3, dtype: int32

In [102]:
df4.loc["d":"g"]["var3"]  

d     8
e    36
f    19
g    30
Name: var3, dtype: int32

In [107]:
df4.loc["d":"g"][["var3"]]  # Returns a DataFrame, because the column is passed inside an extra [] (a list).

Unnamed: 0,var3
d,8
e,36
f,19
g,30


In [109]:
df4.loc["d":"g", ["var3"]]

Unnamed: 0,var3
d,8
e,36
f,19
g,30


In [113]:
df4.iloc[2:5,2]

c     6
d     8
e    36
Name: var3, dtype: int32

In [115]:
df4.iloc[2:5][["var2"]]

Unnamed: 0,var2
c,6
d,38
e,24


#### Let's continue to examine `.loc[]` and `.iloc[]` using ``df3`` again

In [116]:
df3

Unnamed: 0,new,new2,W,X,Y,Z
A,0.570325,1,2.70685,0.628133,0.907969,0.503826
B,0.270806,2,0.651118,-0.319318,-0.848077,0.605965
C,0.391387,3,-2.018168,0.740122,0.528813,-0.589001
D,0.708208,4,0.188695,-0.758872,-0.933237,0.955057
E,5.156577,5,0.190794,1.978757,2.605967,0.683509


In [117]:
df3.loc["C"]

new     0.391387
new2    3.000000
W      -2.018168
X       0.740122
Y       0.528813
Z      -0.589001
Name: C, dtype: float64

In [118]:
df3.iloc[2]

new     0.391387
new2    3.000000
W      -2.018168
X       0.740122
Y       0.528813
Z      -0.589001
Name: C, dtype: float64

In [119]:
df3.loc[["C"]]

Unnamed: 0,new,new2,W,X,Y,Z
C,0.391387,3,-2.018168,0.740122,0.528813,-0.589001


In [120]:
df3.iloc[[2]]

Unnamed: 0,new,new2,W,X,Y,Z
C,0.391387,3,-2.018168,0.740122,0.528813,-0.589001


### Selecting subset of rows and columns

 - ### `.loc[[row labels|names], [column labels|names]]`

 - ### `.iloc[[row index numbers], [column index numbers]]`

In [123]:
df3

Unnamed: 0,new,new2,W,X,Y,Z
A,0.570325,1,2.70685,0.628133,0.907969,0.503826
B,0.270806,2,0.651118,-0.319318,-0.848077,0.605965
C,0.391387,3,-2.018168,0.740122,0.528813,-0.589001
D,0.708208,4,0.188695,-0.758872,-0.933237,0.955057
E,5.156577,5,0.190794,1.978757,2.605967,0.683509


In [125]:
df3.loc["C","Z"]

-0.5890005332865824

In [128]:
df3.loc[["C"],["Z"]]

Unnamed: 0,Z
C,-0.589001


In [129]:
df3.loc[["A", "C"], ["X", "Z"]]  

Unnamed: 0,X,Z
A,0.628133,0.503826
C,0.740122,-0.589001


In [131]:
df3.iloc[[0,2], [0,3]]

Unnamed: 0,new,X
A,0.570325,0.628133
C,0.391387,0.740122


### Conditional Selection

An important feature of pandas is conditional selection using bracket notation, very similar to numpy:

In [132]:
df3

Unnamed: 0,new,new2,W,X,Y,Z
A,0.570325,1,2.70685,0.628133,0.907969,0.503826
B,0.270806,2,0.651118,-0.319318,-0.848077,0.605965
C,0.391387,3,-2.018168,0.740122,0.528813,-0.589001
D,0.708208,4,0.188695,-0.758872,-0.933237,0.955057
E,5.156577,5,0.190794,1.978757,2.605967,0.683509


In [133]:
df3>2  # True where a value of df3 is greater than 2, False everywhere else

Unnamed: 0,new,new2,W,X,Y,Z
A,False,False,True,False,False,False
B,False,False,False,False,False,False
C,False,True,False,False,False,False
D,False,True,False,False,False,False
E,True,True,False,False,True,False


In [134]:
df3[df3>2]  # Shows the values at the positions where the mask is True (the other cells appear as missing in the output)

Unnamed: 0,new,new2,W,X,Y,Z
A,,,2.70685,,,
B,,,,,,
C,,3.0,,,,
D,,4.0,,,,
E,5.156577,5.0,,,2.605967,


In [135]:
df3[df3["Z"]>0.5]  # Returns only the rows whose value in column Z is greater than 0.5. That is why row C is missing.

Unnamed: 0,new,new2,W,X,Y,Z
A,0.570325,1,2.70685,0.628133,0.907969,0.503826
B,0.270806,2,0.651118,-0.319318,-0.848077,0.605965
D,0.708208,4,0.188695,-0.758872,-0.933237,0.955057
E,5.156577,5,0.190794,1.978757,2.605967,0.683509


In [136]:
df3[df3["Z"]>0.5]["X"]

A    0.628133
B   -0.319318
D   -0.758872
E    1.978757
Name: X, dtype: float64

In [137]:
df3[df3["Z"]>0.5][["X"]]

Unnamed: 0,X
A,0.628133
B,-0.319318
D,-0.758872
E,1.978757


#### For two conditions you can use **|** → `or`,  **&** →  `and`, with each condition wrapped in parentheses:

In [138]:
df3

Unnamed: 0,new,new2,W,X,Y,Z
A,0.570325,1,2.70685,0.628133,0.907969,0.503826
B,0.270806,2,0.651118,-0.319318,-0.848077,0.605965
C,0.391387,3,-2.018168,0.740122,0.528813,-0.589001
D,0.708208,4,0.188695,-0.758872,-0.933237,0.955057
E,5.156577,5,0.190794,1.978757,2.605967,0.683509


In [139]:
df3[(df3['W']>0) & (df3['Y']<1)]

Unnamed: 0,new,new2,W,X,Y,Z
A,0.570325,1,2.70685,0.628133,0.907969,0.503826
B,0.270806,2,0.651118,-0.319318,-0.848077,0.605965
D,0.708208,4,0.188695,-0.758872,-0.933237,0.955057


In [144]:
df3[(df3['W']>0) & (df3['Y']<1)] = 0  # Assigned 0 to the values selected by the condition (every column of the matching rows)
df3

Unnamed: 0,new,new2,W,X,Y,Z
A,0.0,0,0.0,0.0,0.0,0.0
B,0.0,0,0.0,0.0,0.0,0.0
C,0.391387,3,-2.018168,0.740122,0.528813,-0.589001
D,0.0,0,0.0,0.0,0.0,0.0
E,5.156577,5,0.190794,1.978757,2.605967,0.683509


### Conditional selection using ``.loc[]`` and ``.iloc[]``

In [145]:
# Rebuild df3 with the same seed as before, since the previous cell overwrote some of its rows with 0
np.random.seed(101)
df3 = pd.DataFrame(randn(5,4), index = 'A B C D E'.split(), columns = 'W X Y Z'.split())
df3

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [146]:
df3.loc[(df3.X>0), ["X", "Y"]]  # Now we did it with loc

Unnamed: 0,X,Y
A,0.628133,0.907969
C,0.740122,0.528813
E,1.978757,2.605967


In [148]:
df3.loc[((df3.W>1) | (df3.Y<1)), ['X','Z']]  # Selection with two conditions

Unnamed: 0,X,Z
A,0.628133,0.503826
B,-0.319318,0.605965
C,0.740122,-0.589001
D,-0.758872,0.955057


In [150]:
df3.loc[((df3.W>1) | (df3.Y<1)), ['X','Z']]  = 1  # Assigned a value to the selection
df3

Unnamed: 0,W,X,Y,Z
A,2.70685,1.0,0.907969,1.0
B,0.651118,1.0,-0.848077,1.0
C,-2.018168,1.0,0.528813,1.0
D,0.188695,1.0,-0.933237,1.0
E,0.190794,1.978757,2.605967,0.683509


## More Index Details

Let's discuss some more features of indexing, including resetting the index or setting it to something else. We'll also talk about index hierarchy!

In [151]:
df3

Unnamed: 0,W,X,Y,Z
A,2.70685,1.0,0.907969,1.0
B,0.651118,1.0,-0.848077,1.0
C,-2.018168,1.0,0.528813,1.0
D,0.188695,1.0,-0.933237,1.0
E,0.190794,1.978757,2.605967,0.683509


In [153]:
df3.reset_index()  # Replaced the index labels (A,B,C,D,E) with the default numbering (0,1,2,3,4); the old labels appear as an `index` column

Unnamed: 0,index,W,X,Y,Z
0,A,2.70685,1.0,0.907969,1.0
1,B,0.651118,1.0,-0.848077,1.0
2,C,-2.018168,1.0,0.528813,1.0
3,D,0.188695,1.0,-0.933237,1.0
4,E,0.190794,1.978757,2.605967,0.683509


In [154]:
df3.reset_index(drop = True)  # Dropped the `index` column so that only the new numbering from above remains

Unnamed: 0,W,X,Y,Z
0,2.70685,1.0,0.907969,1.0
1,0.651118,1.0,-0.848077,1.0
2,-2.018168,1.0,0.528813,1.0
3,0.188695,1.0,-0.933237,1.0
4,0.190794,1.978757,2.605967,0.683509


In [155]:
df3.set_index("Z")  # Made column Z the index. It does not make the change permanently.

Unnamed: 0_level_0,W,X,Y
Z,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,2.70685,1.0,0.907969
1.0,0.651118,1.0,-0.848077
1.0,-2.018168,1.0,0.528813
1.0,0.188695,1.0,-0.933237
0.683509,0.190794,1.978757,2.605967


In [157]:
df3.reset_index(drop=True, inplace=True)  # Reset the index and threw away the old index column. Because of inplace=True the change became permanent.
df3

Unnamed: 0,W,X,Y,Z
0,2.70685,1.0,0.907969,1.0
1,0.651118,1.0,-0.848077,1.0
2,-2.018168,1.0,0.528813,1.0
3,0.188695,1.0,-0.933237,1.0
4,0.190794,1.978757,2.605967,0.683509


In [160]:
df3  # The change was permanent because we passed inplace=True

Unnamed: 0,W,X,Y,Z
0,2.70685,1.0,0.907969,1.0
1,0.651118,1.0,-0.848077,1.0
2,-2.018168,1.0,0.528813,1.0
3,0.188695,1.0,-0.933237,1.0
4,0.190794,1.978757,2.605967,0.683509


## Multi-Index and Index Hierarchy

Let us go over how to work with a Multi-Index. First, we'll create a quick example of what a Multi-Indexed DataFrame looks like:

In [2]:
# Index levels: pair each outer label with its inner label
outside = ["M1"] * 3 + ["M2"] * 3 + ["M3"] * 3
inside = [1, 2, 3, 1, 2, 3, 5, 6, 7]
multi_index = [(outer, inner) for outer, inner in zip(outside, inside)]
multi_index

[('M1', 1),
 ('M1', 2),
 ('M1', 3),
 ('M2', 1),
 ('M2', 2),
 ('M2', 3),
 ('M3', 5),
 ('M3', 6),
 ('M3', 7)]

In [3]:
hier_index=pd.MultiIndex.from_tuples(multi_index)  # Build the MultiIndex from the list of (outer, inner) tuples

In [4]:
df=pd.DataFrame(np.random.randn(9,4), index = hier_index, columns=['A','B','C','D'])  # Specified our index (rows) and our columns and created a DataFrame. NOTE(review): no seed is set in this cell, so the values may differ on re-run — confirm.
df

Unnamed: 0,Unnamed: 1,A,B,C,D
M1,1,0.330884,1.51759,-0.730905,0.697123
M1,2,-0.388579,0.566543,-0.268668,-0.805886
M1,3,-0.60366,-1.031962,0.305158,-0.022935
M2,1,-0.607576,0.927547,1.866629,0.180731
M2,2,1.320468,-0.027368,-0.621695,-0.650515
M2,3,0.839227,-0.279267,-0.487259,0.584501
M3,5,1.573225,0.744275,0.723667,0.772572
M3,6,-0.289975,1.662345,-0.574503,-0.027542
M3,7,-1.944851,-0.82777,-2.3017,0.276485


In [5]:
df.loc["M1"]

Unnamed: 0,A,B,C,D
1,0.330884,1.51759,-0.730905,0.697123
2,-0.388579,0.566543,-0.268668,-0.805886
3,-0.60366,-1.031962,0.305158,-0.022935


In [6]:
df.loc["M1"].loc[2]

A   -0.388579
B    0.566543
C   -0.268668
D   -0.805886
Name: 2, dtype: float64

In [7]:
df.loc["M1"].loc[[2]]  # Fetched the same row as above, but as a DataFrame

Unnamed: 0,A,B,C,D
2,-0.388579,0.566543,-0.268668,-0.805886


In [8]:
df.index.names  # Shows the names of the index levels (returns None because ours have no names)

FrozenList([None, None])

In [9]:
df.index.names = ["Group", "Num"]  # Gave names to our index levels
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C,D
Group,Num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
M1,1,0.330884,1.51759,-0.730905,0.697123
M1,2,-0.388579,0.566543,-0.268668,-0.805886
M1,3,-0.60366,-1.031962,0.305158,-0.022935
M2,1,-0.607576,0.927547,1.866629,0.180731
M2,2,1.320468,-0.027368,-0.621695,-0.650515
M2,3,0.839227,-0.279267,-0.487259,0.584501
M3,5,1.573225,0.744275,0.723667,0.772572
M3,6,-0.289975,1.662345,-0.574503,-0.027542
M3,7,-1.944851,-0.82777,-2.3017,0.276485


### Let's take a quick look at [``.xs()``](http://localhost:8888/notebooks/pythonic/DAwPythonSessions/w3resource-pandas-dataframe-xs.ipynb)

In [15]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C,D
Group,Num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
M1,1,0.330884,1.51759,-0.730905,0.697123
M1,2,-0.388579,0.566543,-0.268668,-0.805886
M1,3,-0.60366,-1.031962,0.305158,-0.022935
M2,1,-0.607576,0.927547,1.866629,0.180731
M2,2,1.320468,-0.027368,-0.621695,-0.650515
M2,3,0.839227,-0.279267,-0.487259,0.584501
M3,5,1.573225,0.744275,0.723667,0.772572
M3,6,-0.289975,1.662345,-0.574503,-0.027542
M3,7,-1.944851,-0.82777,-2.3017,0.276485


In [14]:
df.xs("M1")  

Unnamed: 0_level_0,A,B,C,D
Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.330884,1.51759,-0.730905,0.697123
2,-0.388579,0.566543,-0.268668,-0.805886
3,-0.60366,-1.031962,0.305158,-0.022935


In [11]:
df.loc["M1"]  # Did the same operation as above, this time with loc

Unnamed: 0_level_0,A,B,C,D
Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.330884,1.51759,-0.730905,0.697123
2,-0.388579,0.566543,-0.268668,-0.805886
3,-0.60366,-1.031962,0.305158,-0.022935


In [21]:
df.loc["M1"].loc[2]

A   -0.388579
B    0.566543
C   -0.268668
D   -0.805886
Name: 2, dtype: float64

In [12]:
df.xs(("M1",2))  # Writing it this way is more practical than calling loc twice as above

A   -0.388579
B    0.566543
C   -0.268668
D   -0.805886
Name: (M1, 2), dtype: float64

In [25]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C,D
Group,Num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
M1,1,0.330884,1.51759,-0.730905,0.697123
M1,2,-0.388579,0.566543,-0.268668,-0.805886
M1,3,-0.60366,-1.031962,0.305158,-0.022935
M2,1,-0.607576,0.927547,1.866629,0.180731
M2,2,1.320468,-0.027368,-0.621695,-0.650515
M2,3,0.839227,-0.279267,-0.487259,0.584501
M3,5,1.573225,0.744275,0.723667,0.772572
M3,6,-0.289975,1.662345,-0.574503,-0.027542
M3,7,-1.944851,-0.82777,-2.3017,0.276485


In [13]:
df.xs(key=("M1",2), level=[0,1])  # Same row, with the levels given explicitly; as the output shows, the result here is a DataFrame that keeps both index levels

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C,D
Group,Num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
M1,2,-0.388579,0.566543,-0.268668,-0.805886


In [26]:
df.xs(key=("M3", 5), level=[0,1])  # 0 and 1 are the level numbers of the things we wrote in the key (M3 and 5)

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C,D
Group,Num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
M3,5,1.573225,0.744275,0.723667,0.772572


In [16]:
df.xs(key=("M1",2), level=["Group","Num"])  # We can write the level numbers as above, or write the level names directly.

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C,D
Group,Num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
M1,2,-0.388579,0.566543,-0.268668,-0.805886


In [27]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C,D
Group,Num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
M1,1,0.330884,1.51759,-0.730905,0.697123
M1,2,-0.388579,0.566543,-0.268668,-0.805886
M1,3,-0.60366,-1.031962,0.305158,-0.022935
M2,1,-0.607576,0.927547,1.866629,0.180731
M2,2,1.320468,-0.027368,-0.621695,-0.650515
M2,3,0.839227,-0.279267,-0.487259,0.584501
M3,5,1.573225,0.744275,0.723667,0.772572
M3,6,-0.289975,1.662345,-0.574503,-0.027542
M3,7,-1.944851,-0.82777,-2.3017,0.276485


In [28]:
#df.loc[2]  # Raises an error. An inner-level value cannot be fetched with loc like this
#df.xs(2)  # Raises an error
df.xs(2, level="Num")  # Fetched by an inner level: returns the rows whose Num is 2

Unnamed: 0_level_0,A,B,C,D
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M1,-0.388579,0.566543,-0.268668,-0.805886
M2,1.320468,-0.027368,-0.621695,-0.650515


In [18]:
df.xs(5, level=1)  # Returns the rows whose level 1 (i.e. Num) equals 5

Unnamed: 0_level_0,A,B,C,D
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M3,1.573225,0.744275,0.723667,0.772572


In [19]:
df["C"]  # Returns column C

Group  Num
M1     1     -0.730905
       2     -0.268668
       3      0.305158
M2     1      1.866629
       2     -0.621695
       3     -0.487259
M3     5      0.723667
       6     -0.574503
       7     -2.301700
Name: C, dtype: float64

In [20]:
df.xs("C", axis=1)  # Mentioned as extra information. The version above is more practical.

Group  Num
M1     1     -0.730905
       2     -0.268668
       3      0.305158
M2     1      1.866629
       2     -0.621695
       3     -0.487259
M3     5      0.723667
       6     -0.574503
       7     -2.301700
Name: C, dtype: float64

### Let's learn new functions/attributes/methods on "iris dataset" 

In [29]:
import seaborn as sns  # It ships some datasets that are useful for our lesson, which is why we import it.

In [30]:
df = sns.load_dataset("iris")  # Loaded the iris dataset (this rebinds `df`)
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [31]:
df.head()  # The default for head is 5 rows.

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [32]:
df.head(8)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
5,5.4,3.9,1.7,0.4,setosa
6,4.6,3.4,1.4,0.3,setosa
7,5.0,3.4,1.5,0.2,setosa


In [33]:
df.shape  # 150 rows, 5 columns

(150, 5)

In [34]:
df.info()  # Gives summary information about the DataFrame

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [35]:
df.sample(5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
30,4.8,3.1,1.6,0.2,setosa
128,6.4,2.8,5.6,2.1,virginica
86,6.7,3.1,4.7,1.5,versicolor
10,5.4,3.7,1.5,0.2,setosa
89,5.5,2.5,4.0,1.3,versicolor


In [51]:
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [36]:
df.describe()  # By default it returns the numeric columns

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [37]:
df.describe().transpose()  # Same as ".T": the columns become the index and the index becomes the columns

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
sepal_length,150.0,5.843333,0.828066,4.3,5.1,5.8,6.4,7.9
sepal_width,150.0,3.057333,0.435866,2.0,2.8,3.0,3.3,4.4
petal_length,150.0,3.758,1.765298,1.0,1.6,4.35,5.1,6.9
petal_width,150.0,1.199333,0.762238,0.1,0.3,1.3,1.8,2.5


In [38]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
sepal_length,150.0,5.843333,0.828066,4.3,5.1,5.8,6.4,7.9
sepal_width,150.0,3.057333,0.435866,2.0,2.8,3.0,3.3,4.4
petal_length,150.0,3.758,1.765298,1.0,1.6,4.35,5.1,6.9
petal_width,150.0,1.199333,0.762238,0.1,0.3,1.3,1.8,2.5


In [39]:
df.describe(include="all")  # "number" and "object" can also be used as the include/exclude parameter
# Shows all columns, both categorical and numeric

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
count,150.0,150.0,150.0,150.0,150
unique,,,,,3
top,,,,,setosa
freq,,,,,50
mean,5.843333,3.057333,3.758,1.199333,
std,0.828066,0.435866,1.765298,0.762238,
min,4.3,2.0,1.0,0.1,
25%,5.1,2.8,1.6,0.3,
50%,5.8,3.0,4.35,1.3,
75%,6.4,3.3,5.1,1.8,


In [42]:
df.describe(include="object")  # Returns information about the categorical columns only

Unnamed: 0,species
count,150
unique,3
top,setosa
freq,50


In [46]:
df.describe(include="number")  # Returns information about the numeric columns only

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [40]:
# Correlation matrix of the numeric columns.
# `df` still contains the string column `species`; recent pandas versions no longer
# drop non-numeric columns silently in corr() and raise instead, so select the
# numeric columns explicitly. This form works on old and new pandas alike.
df.select_dtypes(include="number").corr()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
sepal_length,1.0,-0.11757,0.871754,0.817941
sepal_width,-0.11757,1.0,-0.42844,-0.366126
petal_length,0.871754,-0.42844,1.0,0.962865
petal_width,0.817941,-0.366126,0.962865,1.0


In [47]:
# Correlation of every numeric column with sepal_length only.
# The numeric columns are selected first because `df` also holds the string column
# `species`, which makes a bare corr() raise on recent pandas versions.
df.select_dtypes(include="number").corr()[["sepal_length"]]

Unnamed: 0,sepal_length
sepal_length,1.0
sepal_width,-0.11757
petal_length,0.871754
petal_width,0.817941


In [56]:
df["sepal_length"].corr(df["petal_width"])  # Correlation of these two columns with each other

0.8179411262715757

In [50]:
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [49]:
df["species"].value_counts()  # Gives each distinct value together with its count

setosa        50
versicolor    50
virginica     50
Name: species, dtype: int64

In [58]:
df["species"].value_counts(dropna=False)  # If there were NaN values, their count would be reported too (because of dropna=False).

setosa        50
versicolor    50
virginica     50
Name: species, dtype: int64

In [59]:
df["species"].value_counts(dropna=False, normalize = True)  # Gives each value's share of the column as a fraction (because of normalize=True); the shares are identical here because the counts are equal

setosa        0.333333
versicolor    0.333333
virginica     0.333333
Name: species, dtype: float64

In [60]:
df.species.nunique()  # number of unique (shows how many unique values there are)

3

In [61]:
df.species.unique()  # Returns the unique values

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [63]:
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [62]:
df.loc[df["species"]=="setosa", "sepal_length"]  # Returns the sepal_length values of the rows whose species is setosa

0     5.1
1     4.9
2     4.7
3     4.6
4     5.0
5     5.4
6     4.6
7     5.0
8     4.4
9     4.9
10    5.4
11    4.8
12    4.8
13    4.3
14    5.8
15    5.7
16    5.4
17    5.1
18    5.7
19    5.1
20    5.4
21    5.1
22    4.6
23    5.1
24    4.8
25    5.0
26    5.0
27    5.2
28    5.2
29    4.7
30    4.8
31    5.4
32    5.2
33    5.5
34    4.9
35    5.0
36    5.5
37    4.9
38    4.4
39    5.1
40    5.0
41    4.5
42    4.4
43    5.0
44    5.1
45    4.8
46    5.1
47    4.6
48    5.3
49    5.0
Name: sepal_length, dtype: float64

In [65]:
df.loc[(df.sepal_length>4) & (df.sepal_length<5)]  # Returns the rows whose sepal_length is between 4 and 5 (both bounds excluded)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
6,4.6,3.4,1.4,0.3,setosa
8,4.4,2.9,1.4,0.2,setosa
9,4.9,3.1,1.5,0.1,setosa
11,4.8,3.4,1.6,0.2,setosa
12,4.8,3.0,1.4,0.1,setosa
13,4.3,3.0,1.1,0.1,setosa
22,4.6,3.6,1.0,0.2,setosa


In [66]:
df.loc[(df.species == "virginica") & (df.sepal_length>4)  & (df.sepal_length<5)]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
106,4.9,2.5,4.5,1.7,virginica


In [68]:
df.sort_values("sepal_length")  # Sorts the rows by sepal_length in ascending order (the default direction)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
13,4.3,3.0,1.1,0.1,setosa
42,4.4,3.2,1.3,0.2,setosa
38,4.4,3.0,1.3,0.2,setosa
8,4.4,2.9,1.4,0.2,setosa
41,4.5,2.3,1.3,0.3,setosa
...,...,...,...,...,...
122,7.7,2.8,6.7,2.0,virginica
118,7.7,2.6,6.9,2.3,virginica
117,7.7,3.8,6.7,2.2,virginica
135,7.7,3.0,6.1,2.3,virginica


# End of the Session