### Building DataFrames

In [1]:
import pandas as pd

#### 1. From Dict

In [2]:
# Column-oriented input: each key is a column label and each list holds
# that column's values.
data = dict(
    name=["Ali", "Behcet", "Cemal", "Davut"],
    city=["Rome", "Ankara", "London", "Paris"],
    age=[60, 90, 80, 40],
)

users = pd.DataFrame(data)
users

Unnamed: 0,name,city,age
0,Ali,Rome,60
1,Behcet,Ankara,90
2,Cemal,London,80
3,Davut,Paris,40


#### 2. From Zipped Lists (labels + columns → dict)

In [3]:
names = ["Ali", "Behcet", "Cemal", "Davut"]
cities = ["Rome", "Ankara", "London", "Paris"]
ages = [60, 90, 80, 40]

list_labels = ["name", "city", "age"]
list_col_entries = [names, cities, ages]

# Pair every column label with its list of values:
# the result is a list of (label, values) tuples.
zipped = [(label, entries) for label, entries in zip(list_labels, list_col_entries)]
zipped

[('name', ['Ali', 'Behcet', 'Cemal', 'Davut']),
 ('city', ['Rome', 'Ankara', 'London', 'Paris']),
 ('age', [60, 90, 80, 40])]

In [4]:
# Turn the (label, values) pairs into a {label: values} mapping.
data_1 = {label: entries for label, entries in zipped}
data_1

{'name': ['Ali', 'Behcet', 'Cemal', 'Davut'],
 'city': ['Rome', 'Ankara', 'London', 'Paris'],
 'age': [60, 90, 80, 40]}

In [5]:
# Build the frame from the dict assembled out of the zipped lists.
users_1 = pd.DataFrame(data=data_1)
users_1

Unnamed: 0,name,city,age
0,Ali,Rome,60
1,Behcet,Ankara,90
2,Cemal,London,80
3,Davut,Paris,40


#### Broadcasting

In [6]:
# Assigning a scalar broadcasts it to every row of the new "wages" column.

users["wages"] = 0
users

Unnamed: 0,name,city,age,wages
0,Ali,Rome,60,0
1,Behcet,Ankara,90,0
2,Cemal,London,80,0
3,Davut,Paris,40,0


#### Broadcasting with a dict

In [7]:
# "gender" is supplied as a single scalar instead of a list.

f_names = ["Ali", "Veli", "Cem"]
l_names = ["İlk", "Yedi", "Sekiz"]

# The scalar "M" is broadcast to every row of the "gender" column.
data = dict(first=f_names, last=l_names, gender="M")

people = pd.DataFrame(data)
people

Unnamed: 0,first,last,gender
0,Ali,İlk,M
1,Veli,Yedi,M
2,Cem,Sekiz,M


#### change column and index labels

In [8]:
# Replace the column labels and the row (index) labels of `people` in place.
# Each new label list must have exactly as many entries as the axis it
# replaces (three columns and three rows here).

people.columns = ["first_name", "last_name", "gender"]
people.index = ["a", "b", "c"]
people

Unnamed: 0,first_name,last_name,gender
a,Ali,İlk,M
b,Veli,Yedi,M
c,Cem,Sekiz,M


#### reading file, ignore header, assign column names

In [9]:
# header=0 tells pandas that the file's first row is a header row; because
# `names` is passed as well, that row is replaced by these labels.
labels = ["AGE", "ALL_DEVS", "PYTHON", "JAVASCRIPT"]
df = pd.read_csv(
    "Data/dev_salaries.csv",
    names=labels,
    header=0,
)
df.head()


Unnamed: 0,AGE,ALL_DEVS,PYTHON,JAVASCRIPT
0,18,17784,20046,16446
1,19,16500,17100,16791
2,20,18012,20000,18942
3,21,20628,24744,21780
4,22,25206,30500,25704


#### select columns with all non-zeros

In [10]:
import numpy as np

# Add three columns mixing real values, zeros and missing values so the
# selections that follow have something to filter on.
users["grade"] = np.nan  # scalar NaN broadcast to every row
users["height"] = [180, 0, 170, np.nan]  # one zero and one NaN
users["weight"] = [80, 60, 70, np.nan]  # one NaN, no zeros
users

Unnamed: 0,name,city,age,wages,grade,height,weight
0,Ali,Rome,60,0,,180.0,80.0
1,Behcet,Ankara,90,0,,0.0,60.0
2,Cemal,London,80,0,,170.0,70.0
3,Davut,Paris,40,0,,,


In [16]:
# Keep only the columns in which no value is zero.
# A single 0 anywhere in a column is enough to exclude that column.
# Careful: NaN does not count as zero, so columns holding NaN are still kept.
# The result is a new selection; `users` itself is not modified.

all_nonzero_cols = users.all()
users.loc[:, all_nonzero_cols]

Unnamed: 0,name,city,age,grade,weight
0,Ali,Rome,60,,80.0
1,Behcet,Ankara,90,,60.0
2,Cemal,London,80,,70.0
3,Davut,Paris,40,,


In [12]:
# Keep the columns that contain at least one non-zero value.
# "wages" (all zeros) is excluded; per the recorded output the all-NaN
# "grade" column is excluded too.
# The result is a new selection; `users` itself is not modified.

any_nonzero_cols = users.any()
users.loc[:, any_nonzero_cols]

Unnamed: 0,name,city,age,height,weight
0,Ali,Rome,60,180.0,80.0
1,Behcet,Ankara,90,0.0,60.0
2,Cemal,London,80,170.0,70.0
3,Davut,Paris,40,,


In [14]:
# Keep only the columns in which every value is NaN.
# Here that is just "grade".

all_nan_cols = users.isnull().all()
users.loc[:, all_nan_cols]

Unnamed: 0,grade
0,
1,
2,
3,


In [17]:
# Keep the columns that contain at least one NaN.
# A single missing value is enough for a column to be selected.

any_nan_cols = users.isnull().any()
users.loc[:, any_nan_cols]

Unnamed: 0,grade,height,weight
0,,180.0,80.0
1,,0.0,60.0
2,,170.0,70.0
3,,,


In [18]:
# Keep only the columns that are completely free of NaN,
# i.e. drop every column that contains at least one NaN.

fully_populated_cols = users.notnull().all()
users.loc[:, fully_populated_cols]

Unnamed: 0,name,city,age,wages
0,Ali,Rome,60,0
1,Behcet,Ankara,90,0
2,Cemal,London,80,0
3,Davut,Paris,40,0


In [19]:
# Keep the columns that hold at least one non-NaN value,
# i.e. drop only the columns that consist entirely of NaN.

partly_populated_cols = users.notnull().any()
users.loc[:, partly_populated_cols]

Unnamed: 0,name,city,age,wages,height,weight
0,Ali,Rome,60,0,180.0,80.0
1,Behcet,Ankara,90,0,0.0,60.0
2,Cemal,London,80,0,170.0,70.0
3,Davut,Paris,40,0,,


### remove rows and columns with missing data

In [47]:
# Show the current state of `users` before dropping rows or columns.
users

Unnamed: 0,name,city,age,wages,grade,height,weight
0,Ali,Rome,60,0,,180.0,80.0
1,Behcet,Ankara,90,0,,0.0,60.0
2,Cemal,London,80,0,,170.0,70.0
3,Davut,Paris,40,0,,,


In [51]:
# exclude rows with any NaNs

# "grade" was created as an all-NaN float column. Cast it to object
# explicitly before storing a string label: relying on pandas to upcast the
# column implicitly during assignment is deprecated since pandas 2.1.
# The cast is a no-op when the column is already object, so the cell can be
# re-run safely.
users["grade"] = users["grade"].astype("object")
users.loc[0, "grade"] = "A"

# how="any" drops every row that still has at least one NaN; the result is a
# new frame and `users` keeps all of its rows.
users.dropna(how="any")

Unnamed: 0,name,city,age,wages,grade,height,weight
0,Ali,Rome,60,0,A,180.0,80.0


In [55]:
# .loc slices by label and includes both end points, so rows 0 and 1 are set.
users.loc[0:1, "grade"] = "B"
users

Unnamed: 0,name,city,age,wages,grade,height,weight
0,Ali,Rome,60,0,B,180.0,80.0
1,Behcet,Ankara,90,0,B,0.0,60.0
2,Cemal,London,80,0,,170.0,70.0
3,Davut,Paris,40,0,,,


In [58]:
# Drop every column that has fewer than `min_non_null` non-NaN values.
# "grade" holds only two non-NaN entries at this point, so it is removed.

min_non_null = 3
users.dropna(axis="columns", thresh=min_non_null)

Unnamed: 0,name,city,age,wages,height,weight
0,Ali,Rome,60,0,180.0,80.0
1,Behcet,Ankara,90,0,0.0,60.0
2,Cemal,London,80,0,170.0,70.0
3,Davut,Paris,40,0,,


### Transforming DataFrames

In [61]:
# Create a 10 x 5 DataFrame of random integers in the range [0, 100).
# A seeded generator is used so that every fresh run of the notebook
# produces the same table.

rng = np.random.default_rng(42)
df = pd.DataFrame(rng.integers(0, 100, size=(10, 5)), columns=list("ABCDF"))
df

Unnamed: 0,A,B,C,D,F
0,74,22,56,1,0
1,50,29,64,62,22
2,43,10,41,89,45
3,4,92,87,1,86
4,27,52,97,69,59
5,9,9,7,46,80
6,94,26,97,54,65
7,2,48,54,43,16
8,22,99,60,19,43
9,18,54,16,46,62


In [62]:
# add some NaN values

# NaN cannot be stored in an integer column. Convert "C" and "D" to float
# explicitly first: letting the assignment upcast them implicitly is
# deprecated since pandas 2.1. The resulting dtypes are the same as before,
# and the cast is a no-op when the cell is re-run.
df = df.astype({"C": "float64", "D": "float64"})

# Label-based slices include both end points: rows 5..7, columns "C".."D".
df.loc[5:7, "C":"D"] = np.nan
df

Unnamed: 0,A,B,C,D,F
0,74,22,56.0,1.0,0
1,50,29,64.0,62.0,22
2,43,10,41.0,89.0,45
3,4,92,87.0,1.0,86
4,27,52,97.0,69.0,59
5,9,9,,,80
6,94,26,,,65
7,2,48,,,16
8,22,99,60.0,19.0,43
9,18,54,16.0,46.0,62


In [64]:
# Convert every entry to whole dozens: floor division by 12, applied
# element-wise to the entire DataFrame. NaN entries stay NaN.

df.floordiv(12)

Unnamed: 0,A,B,C,D,F
0,6,1,4.0,0.0,0
1,4,2,5.0,5.0,1
2,3,0,3.0,7.0,3
3,0,7,7.0,0.0,7
4,2,4,8.0,5.0,4
5,0,0,,,6
6,7,2,,,5
7,0,4,,,1
8,1,8,5.0,1.0,3
9,1,4,1.0,3.0,5


In [65]:
# Alternative: NumPy's floor_divide function applied to the whole DataFrame.
# NOTE(review): the recorded output shows every column as float and includes
# a warning fragment -- presumably specific to the library versions used;
# verify on re-run.

np.floor_divide(df, 12)

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,A,B,C,D,F
0,6.0,1.0,4.0,0.0,0.0
1,4.0,2.0,5.0,5.0,1.0
2,3.0,0.0,3.0,7.0,3.0
3,0.0,7.0,7.0,0.0,7.0
4,2.0,4.0,8.0,5.0,4.0
5,0.0,0.0,,,6.0
6,7.0,2.0,,,5.0
7,0.0,4.0,,,1.0
8,1.0,8.0,5.0,1.0,3.0
9,1.0,4.0,1.0,3.0,5.0


In [66]:
# Alternatively 
# using python custom function

def dozens(n):
    """Return the number of whole dozens in ``n`` (floor division by 12).

    ``n`` may be anything that supports ``//`` with an integer, e.g. a plain
    number or a pandas Series.
    """
    items_per_dozen = 12
    return n // items_per_dozen

# DataFrame.apply hands each column to `dozens` and assembles the results.
df.apply(dozens)

Unnamed: 0,A,B,C,D,F
0,6,1,4.0,0.0,0
1,4,2,5.0,5.0,1
2,3,0,3.0,7.0,3
3,0,7,7.0,0.0,7
4,2,4,8.0,5.0,4
5,0,0,,,6
6,7,2,,,5
7,0,4,,,1
8,1,8,5.0,1.0,3
9,1,4,1.0,3.0,5


In [74]:
# Alternative: the same floor division by 12 written as an inline lambda
# and passed to DataFrame.apply.
# NOTE(review): the recorded output has lowercase labels and a "z" column,
# so this cell appears to have been executed after the rename / "z" cells
# that follow it in the notebook.

df.apply(lambda n: n//12)

Unnamed: 0,a,b,c,d,f,z
0,6,1,4.0,0.0,0,4.0
1,4,2,5.0,5.0,1,7.0
2,3,0,3.0,7.0,3,7.0
3,0,7,7.0,0.0,7,14.0
4,2,4,8.0,5.0,4,13.0
5,0,0,,,6,
6,7,2,,,5,
7,0,4,,,1,
8,1,8,5.0,1.0,3,8.0
9,1,4,1.0,3.0,5,6.0


In [71]:
# Lowercase every column label.
# df.columns is an Index, which offers .map() but has no .apply() method.

df.columns = df.columns.map(str.lower)
df

Unnamed: 0,a,b,c,d,f
0,74,22,56.0,1.0,0
1,50,29,64.0,62.0,22
2,43,10,41.0,89.0,45
3,4,92,87.0,1.0,86
4,27,52,97.0,69.0,59
5,9,9,,,80
6,94,26,,,65
7,2,48,,,16
8,22,99,60.0,19.0,43
9,18,54,16.0,46.0,62


In [72]:
# Derive a new column from existing ones: z = c + f, computed row by row.
# Rows where "c" is NaN yield NaN in "z".

df["z"] = df["c"].add(df["f"])
df

Unnamed: 0,a,b,c,d,f,z
0,74,22,56.0,1.0,0,56.0
1,50,29,64.0,62.0,22,86.0
2,43,10,41.0,89.0,45,86.0
3,4,92,87.0,1.0,86,173.0
4,27,52,97.0,69.0,59,156.0
5,9,9,,,80,
6,94,26,,,65,
7,2,48,,,16,
8,22,99,60.0,19.0,43,103.0
9,18,54,16.0,46.0,62,78.0


### Using .map() with a dictionary

In [76]:
# Show `users` before a column derived from "city" is added.
users

Unnamed: 0,name,city,age,wages,grade,height,weight
0,Ali,Rome,60,0,B,180.0,80.0
1,Behcet,Ankara,90,0,B,0.0,60.0
2,Cemal,London,80,0,,170.0,70.0
3,Davut,Paris,40,0,,,


In [78]:
# Lookup table from city name to color; "Paris" has no entry.
# NOTE(review): "greay" looks like a typo for "gray"/"grey" -- confirm before
# changing, since the recorded output contains the same spelling.
color_dict = dict(
    [
        ("Rome", "blue"),
        ("Ankara", "yellow"),
        ("London", "greay"),
    ]
)

# Series.map looks each city up in the dict; cities without an entry get NaN.
city_colors = users["city"].map(color_dict)
users["color"] = city_colors
users

Unnamed: 0,name,city,age,wages,grade,height,weight,color
0,Ali,Rome,60,0,B,180.0,80.0,blue
1,Behcet,Ankara,90,0,B,0.0,60.0,yellow
2,Cemal,London,80,0,,170.0,70.0,greay
3,Davut,Paris,40,0,,,,
