# Pandas Tutorial

In [52]:
import pandas as pd
import numpy as np

## Loading data

In [3]:
df = pd.read_csv('Downloads/friend_list.csv')

In [4]:
df

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager
5,Chris,25,intern


In [5]:
df.head()

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager


In [6]:
df.tail()

Unnamed: 0,name,age,job
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager
5,Chris,25,intern


tab으로 구분되는 경우 pd.read_csv, delimiter = '\t'

In [7]:
df = pd.read_csv('Downloads/friend_list_no_head.csv', header = None)

df

Unnamed: 0,0,1,2
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager
5,Chris,25,intern


In [8]:
df.columns = ['name','age','job']

df

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager
5,Chris,25,intern


In [9]:
df = pd.read_csv('Downloads/friend_list_no_head.csv',
                 header = None, names = ['name','age','job'])

df

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager
5,Chris,25,intern


## Make DataFrame

In [10]:
friend_dict_list = [
    {'name':'john', 'age':25, 'job':'student'},
    {'name':'nate', 'age':30, 'job':'teacher'},
    
]

In [11]:
df = pd.DataFrame(friend_dict_list)

In [12]:
df.head()

# dictionary는 키 값의 순서가 보장되지 않음

Unnamed: 0,age,job,name
0,25,student,john
1,30,teacher,nate


In [13]:
df = df[['name','age','job']]

df

Unnamed: 0,name,age,job
0,john,25,student
1,nate,30,teacher


In [14]:
# key의 순서를 보장

from collections import OrderedDict

In [15]:
friend_ordered_dict = OrderedDict(
    [
        ('name',['john', 'nate']),
        ('age', [25,30]),
        ('job', ['student','teacher'])
    ]
)

In [16]:
df = pd.DataFrame.from_dict(friend_ordered_dict)
df

Unnamed: 0,name,age,job
0,john,25,student
1,nate,30,teacher


In [17]:
freind_list = [
    ['john',20,'student'],
    ['nate', 30, 'teacher']
]

In [18]:
columns_name = ['name','age','job']

In [19]:
df = pd.DataFrame.from_records(freind_list, columns = columns_name)

df

Unnamed: 0,name,age,job
0,john,20,student
1,nate,30,teacher


한 번에 하기

In [20]:
freind_list = [
    ['name', ['john','nate']],
    ['age', [20,30]],
    ['job', ['student','teacher']]
]

In [21]:
df = pd.DataFrame.from_items(freind_list)

df

  """Entry point for launching an IPython kernel.


Unnamed: 0,name,age,job
0,john,20,student
1,nate,30,teacher


## Write DataFrame To File

In [22]:
df.to_csv('freind.csv')
          
# index = False로 하면 index 값 제거
# na_rep = '-' : NaN은 -로 표시

## Select, Filter Row and Columns

In [23]:
freind_list = OrderedDict(
    [
    ('name', ['john','nate', 'jenny']),
    ('age', [20,30, 30]),
    ('job', ['student','teacher','developer'])
    ])

df = pd.DataFrame.from_dict(freind_list)

In [24]:
df

Unnamed: 0,name,age,job
0,john,20,student
1,nate,30,teacher
2,jenny,30,developer


슬라이싱

In [25]:
df[1:3]

Unnamed: 0,name,age,job
1,nate,30,teacher
2,jenny,30,developer


불연속적인 추출 : loc

In [26]:
df.loc[[0,2]]

Unnamed: 0,name,age,job
0,john,20,student
2,jenny,30,developer


### by colmuns condition

In [27]:
df[(df.age > 25) & (df.name == 'nate')]

Unnamed: 0,name,age,job
1,nate,30,teacher


### Filter Columns

### by Index

In [28]:
freind_list = [
    ['john',20,'student'],
    ['jenny', 30, 'developer'],
    ['nate', 30, 'teacher']
]

df = pd.DataFrame.from_records(freind_list)
df

Unnamed: 0,0,1,2
0,john,20,student
1,jenny,30,developer
2,nate,30,teacher


In [29]:
df.iloc[:,0:2]

Unnamed: 0,0,1
0,john,20
1,jenny,30
2,nate,30


### by column name

In [30]:
df = pd.read_csv("Downloads/friend_list_no_head.csv", header = None,
                names = ['name','age','job'])

In [31]:
df

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager
5,Chris,25,intern


In [32]:
df_filtered = df[['name','age']]

df_filtered

Unnamed: 0,name,age
0,John,20
1,Jenny,30
2,Nate,30
3,Julia,40
4,Brian,45
5,Chris,25


In [33]:
# 효율적으로 filter 가능

df.filter(items = ['age','job'])

Unnamed: 0,age,job
0,20,student
1,30,developer
2,30,teacher
3,40,dentist
4,45,manager
5,25,intern


In [34]:
df.filter(like='a',axis = 1)

Unnamed: 0,name,age
0,John,20
1,Jenny,30
2,Nate,30
3,Julia,40
4,Brian,45
5,Chris,25


정규식을 사용

In [35]:
df.filter(regex = 'b$', axis = 1)

Unnamed: 0,job
0,student
1,developer
2,teacher
3,dentist
4,manager
5,intern


##  Drop row or columns

In [36]:
friends = [{'age' : 15, 'job': 'student'},
          {'age': 25, 'job': 'developer'},
          {'age':30, 'job': 'teacher'}]

df= pd.DataFrame(friends, index = ['John', 'jenny', 'nate'],
                columns = ['age','job'])

df

Unnamed: 0,age,job
John,15,student
jenny,25,developer
nate,30,teacher


drop()

In [37]:
df.drop(['John','nate'])

Unnamed: 0,age,job
jenny,25,developer


inplace = True를 사용하면  assign 할 필요가 없음

In [38]:
df.drop(['John','nate'], inplace = True)

In [39]:
friends = [{'name': 'john','age': 15, 'job': 'student'},
          {'name': 'ben','age': 25, 'job': 'developer'},
          {'name': 'jenny','age': 30, 'job': 'teacher'}]

df = pd.DataFrame(friends, columns = ['name','age','job'])

In [40]:
df

Unnamed: 0,name,age,job
0,john,15,student
1,ben,25,developer
2,jenny,30,teacher


index 사용

In [41]:
df = df.drop(df.index[[0,2]])

In [42]:
df

Unnamed: 0,name,age,job
1,ben,25,developer


In [48]:
friends = [{'name': 'john','age': 15, 'job': 'student'},
          {'name': 'ben','age': 25, 'job': 'developer'},
          {'name': 'jenny','age': 30, 'job': 'teacher'}]

df = pd.DataFrame(friends, columns = ['name','age','job'])

In [44]:
df = df[df.age > 20]

df

Unnamed: 0,name,age,job
1,ben,25,developer
2,jenny,30,teacher


columns 삭제

In [45]:
df.drop('age', axis = 1)

Unnamed: 0,name,job
1,ben,developer
2,jenny,teacher


In [46]:
df.drop('age', axis = 1) # inplace 사용 가능

Unnamed: 0,name,job
1,ben,developer
2,jenny,teacher


## row, coolumn create, update

In [51]:
df['salary'] = 0

df

Unnamed: 0,name,age,job,salary
0,john,15,student,0
1,ben,25,developer,0
2,jenny,30,teacher,0


In [54]:
df['salary'] = np.where(df['job'] != 'student', 'yes','no')

df

Unnamed: 0,name,age,job,salary
0,john,15,student,no
1,ben,25,developer,yes
2,jenny,30,teacher,yes


In [56]:
friends = [{'name': 'john','midterm': 95, 'final': 85},
          {'name': 'ben','midterm': 80, 'final': 70},
          {'name': 'jenny','midterm': 30, 'final': 10}]

df = pd.DataFrame(friends, columns = ['name','midterm','final'])

df

Unnamed: 0,name,midterm,final
0,john,95,85
1,ben,80,70
2,jenny,30,10


In [58]:
df['total'] = df['midterm'] + df['final']

df

Unnamed: 0,name,midterm,final,total
0,john,95,85,180
1,ben,80,70,150
2,jenny,30,10,40


In [60]:
df['average'] = df['total'] / 2

df

Unnamed: 0,name,midterm,final,total,average
0,john,95,85,180,90.0
1,ben,80,70,150,75.0
2,jenny,30,10,40,20.0


In [65]:
grades = []

for row in df['average']:
    if row >= 90:
        grades.append('A')
    elif row >= 70:
        grades.append('B')
    else:
        grades.append('F')
        
df['grade'] = grades

In [66]:
df

Unnamed: 0,name,midterm,final,total,average,grade
0,john,95,85,180,90.0,A
1,ben,80,70,150,75.0,B
2,jenny,30,10,40,20.0,F


In [67]:
def pass_or_fail(row):
    if row != 'F':
        return "Pass"
    else:
        return "Fail"

In [68]:
df.grade = df.grade.apply(pass_or_fail)

In [69]:
df

Unnamed: 0,name,midterm,final,total,average,grade
0,john,95,85,180,90.0,Pass
1,ben,80,70,150,75.0,Pass
2,jenny,30,10,40,20.0,Fail


feature extraction

In [71]:
date_list = [
    {
        'yyyy-mm-dd' : '2006-06-27'
    },
    {
        'yyyy-mm-dd' : '2007-10-27'
    }
]

df = pd.DataFrame(date_list, columns = ['yyyy-mm-dd'])

In [72]:
df

Unnamed: 0,yyyy-mm-dd
0,2006-06-27
1,2007-10-27


In [3]:
def extract_year(row):
    return row.split('-')[0]

In [2]:
df['year'] = df['yyyy-mm-dd'].apply(extract_year)

NameError: name 'df' is not defined

In [79]:
df

Unnamed: 0,yyyy-mm-dd,year
0,2006-06-27,2006
1,2007-10-27,2007


In [80]:
friends = [{'name': 'john','midterm': 95, 'final': 85},
          {'name': 'ben','midterm': 80, 'final': 70},
          {'name': 'jenny','midterm': 30, 'final': 10}]

df = pd.DataFrame(friends, columns = ['name','midterm','final'])

df

Unnamed: 0,name,midterm,final
0,john,95,85
1,ben,80,70
2,jenny,30,10


행 추가

In [82]:
df2 = pd.DataFrame([
    ['ben', 50,50]
], columns = ['name','midterm','final'])

In [83]:
df2

Unnamed: 0,name,midterm,final
0,ben,50,50


In [84]:
df.append(df2, ignore_index= True)

Unnamed: 0,name,midterm,final
0,john,95,85
1,ben,80,70
2,jenny,30,10
3,ben,50,50


## Group by

In [87]:
student_list = [{'name' : 'john', 'major': 'Computer Science', 'sex' : 'male'},
                {'name' : 'nate', 'major': 'Computer Science', 'sex' : 'male'},
                {'name' : 'abraham', 'major': 'physics', 'sex' : 'male'},
                {'name' : 'brian', 'major': 'psychology', 'sex' : 'female'},
                {'name' : 'janny', 'major': 'economics', 'sex' : 'female'},
                {'name' : 'yuna', 'major': 'economics', 'sex' : 'female'},
                {'name' : 'jennifer', 'major': 'Computer Science', 'sex' : 'female'},
                {'name' : 'edwards', 'major': 'Computer Science', 'sex' : 'male'},
                {'name' : 'zara', 'major': 'psychology', 'sex' : 'female'},
                {'name' : 'wendy', 'major': 'economics', 'sex' : 'female'},
                {'name' : 'sera', 'major': 'psychology', 'sex' : 'female'},
]

df = pd.DataFrame(student_list, columns = ['name', 'major','sex'])

df

Unnamed: 0,name,major,sex
0,john,Computer Science,male
1,nate,Computer Science,male
2,abraham,physics,male
3,brian,psychology,female
4,janny,economics,female
5,yuna,economics,female
6,jennifer,Computer Science,female
7,edwards,Computer Science,male
8,zara,psychology,female
9,wendy,economics,female


In [91]:
groupby_major = df.groupby('major')

In [92]:
for name, group in groupby_major:
    print(name + " : " + str(len(group)))
    print(group)
    print()

Computer Science : 4
       name             major     sex
0      john  Computer Science    male
1      nate  Computer Science    male
6  jennifer  Computer Science  female
7   edwards  Computer Science    male

economics : 3
    name      major     sex
4  janny  economics  female
5   yuna  economics  female
9  wendy  economics  female

physics : 1
      name    major   sex
2  abraham  physics  male

psychology : 3
     name       major     sex
3   brian  psychology  female
8    zara  psychology  female
10   sera  psychology  female



In [95]:
df_major_cnt = pd.DataFrame( {'count' : groupby_major.size()}).reset_index()

df_major_cnt

Unnamed: 0,major,count
0,Computer Science,4
1,economics,3
2,physics,1
3,psychology,3


In [96]:
groupby_sex = df.groupby('sex')

In [97]:
for name, group in groupby_sex:
    print(name + " : " + str(len(group)))
    print(group)
    print()

female : 7
        name             major     sex
3      brian        psychology  female
4      janny         economics  female
5       yuna         economics  female
6   jennifer  Computer Science  female
8       zara        psychology  female
9      wendy         economics  female
10      sera        psychology  female

male : 4
      name             major   sex
0     john  Computer Science  male
1     nate  Computer Science  male
2  abraham           physics  male
7  edwards  Computer Science  male



## How to remove duplicate row in data frame

In [99]:
student_list = [{'name' : 'john', 'major': 'Computer Science', 'sex' : 'male'},
                {'name' : 'nate', 'major': 'Computer Science', 'sex' : 'male'},
                {'name' : 'abraham', 'major': 'physics', 'sex' : 'male'},
                {'name' : 'brian', 'major': 'psychology', 'sex' : 'female'},
                {'name' : 'john', 'major': 'Computer Science', 'sex' : 'male'}
]

df = pd.DataFrame(student_list, columns = ['name', 'major','sex'])

df

Unnamed: 0,name,major,sex
0,john,Computer Science,male
1,nate,Computer Science,male
2,abraham,physics,male
3,brian,psychology,female
4,john,Computer Science,male


In [100]:
df.duplicated()

0    False
1    False
2    False
3    False
4     True
dtype: bool

In [101]:
df.drop_duplicates()

Unnamed: 0,name,major,sex
0,john,Computer Science,male
1,nate,Computer Science,male
2,abraham,physics,male
3,brian,psychology,female


In [102]:
student_list = [{'name' : 'john', 'major': 'Computer Science', 'sex' : 'male'},
                {'name' : 'nate', 'major': 'Computer Science', 'sex' : 'male'},
                {'name' : 'abraham', 'major': 'physics', 'sex' : 'male'},
                {'name' : 'brian', 'major': 'psychology', 'sex' : 'female'},
                {'name' : 'john', 'major': 'Economics', 'sex' : 'male'},
                {'name' : 'yuna', 'major': 'economics', 'sex' : 'female'},
                {'name' : 'jennifer', 'major': 'Computer Science', 'sex' : 'female'},
                {'name' : 'nate', 'major': None, 'sex' : 'female'}
]

df = pd.DataFrame(student_list, columns = ['name', 'major','sex'])

df

Unnamed: 0,name,major,sex
0,john,Computer Science,male
1,nate,Computer Science,male
2,abraham,physics,male
3,brian,psychology,female
4,john,Economics,male
5,yuna,economics,female
6,jennifer,Computer Science,female
7,nate,,female


In [107]:
df.duplicated()

# 모든 행이 일치하지는 않으므로.

0    False
1    False
2    False
3    False
4    False
5    False
6    False
7    False
dtype: bool

In [109]:
# key value를 인자로 지정

df.duplicated(['name'])

0    False
1    False
2    False
3    False
4     True
5    False
6    False
7     True
dtype: bool

In [111]:
df.drop_duplicates(['name'], keep = 'first')

# keep = last는 마지막 행.

Unnamed: 0,name,major,sex
0,john,Computer Science,male
1,nate,Computer Science,male
2,abraham,physics,male
3,brian,psychology,female
5,yuna,economics,female
6,jennifer,Computer Science,female


## Find NaN, update what I want to.

In [114]:
school_id_list = [
    {'name' : 'john', 'job': 'teacher', 'age' : 40},
    {'name' : 'nate', 'job': 'teacher', 'age' : 35},
    {'name' : 'yuna', 'job': 'teacher', 'age' : 37},
    {'name' : 'abraham', 'job': 'student', 'age' : 10},
    {'name' : 'brian', 'job': 'student', 'age' : 12},
    {'name' : 'janny', 'job': 'student', 'age' : 11},
    {'name' : 'nate', 'job': 'teacher', 'age' : None},
    {'name' : 'john', 'job': 'student', 'age' : None}
]

df = pd.DataFrame(school_id_list, columns = ['name','job','age'])
df

Unnamed: 0,name,job,age
0,john,teacher,40.0
1,nate,teacher,35.0
2,yuna,teacher,37.0
3,abraham,student,10.0
4,brian,student,12.0
5,janny,student,11.0
6,nate,teacher,
7,john,student,


In [115]:
df.shape

(8, 3)

In [116]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 3 columns):
name    8 non-null object
job     8 non-null object
age     6 non-null float64
dtypes: float64(1), object(2)
memory usage: 272.0+ bytes


In [118]:
df.isna()

# df.innull()

Unnamed: 0,name,job,age
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False
5,False,False,False
6,False,False,True
7,False,False,True


In [119]:
df.age = df.age.fillna(0)

In [120]:
df

Unnamed: 0,name,job,age
0,john,teacher,40.0
1,nate,teacher,35.0
2,yuna,teacher,37.0
3,abraham,student,10.0
4,brian,student,12.0
5,janny,student,11.0
6,nate,teacher,0.0
7,john,student,0.0


In [121]:
school_id_list = [
    {'name' : 'john', 'job': 'teacher', 'age' : 40},
    {'name' : 'nate', 'job': 'teacher', 'age' : 35},
    {'name' : 'yuna', 'job': 'teacher', 'age' : 37},
    {'name' : 'abraham', 'job': 'student', 'age' : 10},
    {'name' : 'brian', 'job': 'student', 'age' : 12},
    {'name' : 'janny', 'job': 'student', 'age' : 11},
    {'name' : 'nate', 'job': 'teacher', 'age' : None},
    {'name' : 'john', 'job': 'student', 'age' : None}
]

df = pd.DataFrame(school_id_list, columns = ['name','job','age'])
df

Unnamed: 0,name,job,age
0,john,teacher,40.0
1,nate,teacher,35.0
2,yuna,teacher,37.0
3,abraham,student,10.0
4,brian,student,12.0
5,janny,student,11.0
6,nate,teacher,
7,john,student,


In [122]:
df['age'].fillna(df.groupby('job')['age'].transform('median'), inplace =True)

In [123]:
df

Unnamed: 0,name,job,age
0,john,teacher,40.0
1,nate,teacher,35.0
2,yuna,teacher,37.0
3,abraham,student,10.0
4,brian,student,12.0
5,janny,student,11.0
6,nate,teacher,37.0
7,john,student,11.0


## Apply

In [128]:
date_list = [
    {'yyyy-mm-dd' : '2006-06-27'},
    {'yyyy-mm-dd' : '2002-9-24'},
    {'yyyy-mm-dd' : '2005-12-20'},
]

df = pd.DataFrame(date_list, columns = ['yyyy-mm-dd'])
df

Unnamed: 0,yyyy-mm-dd
0,2006-06-27
1,2002-9-24
2,2005-12-20


In [129]:
def extract_year(column):
    return column.split('-')[0]

In [130]:
df['year'] = df['yyyy-mm-dd'].apply(extract_year)

In [131]:
df

Unnamed: 0,yyyy-mm-dd,year
0,2006-06-27,2006
1,2002-9-24,2002
2,2005-12-20,2005


parameter를 보내기

In [134]:
def get_age(year, current_year):
    return current_year - int(year)

In [135]:
df['age'] = df['year'].apply(get_age, current_year = 2018)

In [136]:
df

Unnamed: 0,yyyy-mm-dd,year,age
0,2006-06-27,2006,12
1,2002-9-24,2002,16
2,2005-12-20,2005,13


In [138]:
def get_introduce(age, prefix, suffix):
    return prefix + str(age) + suffix

In [139]:
df['introduce'] = df['age'].apply(get_introduce, prefix = "I am", suffix = "years old")

In [140]:
df

Unnamed: 0,yyyy-mm-dd,year,age,introduce
0,2006-06-27,2006,12,I am12years old
1,2002-9-24,2002,16,I am16years old
2,2005-12-20,2005,13,I am13years old


In [144]:
def get_introduce2(row):
    return "I was born in" + str(row.year) + " my age is" + str(row.age)

In [145]:
df.introduce = df.apply(get_introduce2, axis = 1)

df

Unnamed: 0,yyyy-mm-dd,year,age,introduce
0,2006-06-27,2006,12,I was born in2006 my age is12
1,2002-9-24,2002,16,I was born in2002 my age is16
2,2005-12-20,2005,13,I was born in2005 my age is13


## Map, Applymap

In [152]:
date= [
    {'date' : '2006-06-27'},
    {'date' : '2002-9-24'},
    {'date' : '2005-12-20'},
]

df = pd.DataFrame(date, columns = ['date'])
df

Unnamed: 0,date
0,2006-06-27
1,2002-9-24
2,2005-12-20


In [155]:
def extract_year(date):
    return date.split('-')[0]

In [157]:
df['year'] = df['date'].map(extract_year)

df

Unnamed: 0,date,year
0,2006-06-27,2006
1,2002-9-24,2002
2,2005-12-20,2005


map은 다른 방법으로도 사용가능

In [164]:
friends = [{'name': 'john','age': 15, 'job': 'student'},
          {'name': 'ben','age': 25, 'job': 'developer'},
          {'name': 'jenny','age': 30, 'job': 'teacher'}]

df = pd.DataFrame(friends, columns = ['name','age','job'])

df

Unnamed: 0,name,age,job
0,john,15,student
1,ben,25,developer
2,jenny,30,teacher


In [165]:
df.job = df.job.map({'student' : 1, 'developer' : 2, 'teacher' : 3})

df

Unnamed: 0,name,age,job
0,john,15,1
1,ben,25,2
2,jenny,30,3


factor형으로 변경이 쉽다.

In [168]:
x_y = [
    {'x' : 5.5, 'y' :  -5.6, 'z' : -1.1},
    {'x' : -5.2, 'y' : 5.5, 'z' : -2.2},
    {'x': -1.6, 'y' : -4.5, 'z': -3.3}
]

df = pd.DataFrame(x_y)

df

Unnamed: 0,x,y,z
0,5.5,-5.6,-1.1
1,-5.2,5.5,-2.2
2,-1.6,-4.5,-3.3


모든 칼럼에 적용하고 싶다면 apply.map 사용

In [172]:
df = df.applymap(np.around) # 반올림

In [171]:
df

Unnamed: 0,x,y,z
0,6.0,-6.0,-1.0
1,-5.0,6.0,-2.0
2,-2.0,-4.0,-3.0


## Unique, Value_counts

In [175]:
job_list = [
    {'name' : 'john', 'job':'teacher'},
    {'name' : 'nate', 'job':'teacher'},
    {'name' : 'fred', 'job':'teacher'},
    {'name' : 'abraham', 'job':'student'},
    {'name' : 'janny', 'job':'developer'},
    {'name' : 'nate', 'job':'teacher'},
    {'name' : 'ian', 'job':'teacher'},
    {'name' : 'chris', 'job':'banker'},
    {'name' : 'philip', 'job':'lawyer'},
    {'name' : 'janny', 'job':'basketball player'},
    {'name' : 'gwen', 'job':'teacher'},
    {'name' : 'jessy', 'job':'student'},
]

df = pd.DataFrame(job_list, columns = ['name', 'job'])

df

Unnamed: 0,name,job
0,john,teacher
1,nate,teacher
2,fred,teacher
3,abraham,student
4,janny,developer
5,nate,teacher
6,ian,teacher
7,chris,banker
8,philip,lawyer
9,janny,basketball player


In [176]:
df.job.unique()

array(['teacher', 'student', 'developer', 'banker', 'lawyer',
       'basketball player'], dtype=object)

In [177]:
df.job.value_counts()

teacher              6
student              2
basketball player    1
banker               1
developer            1
lawyer               1
Name: job, dtype: int64

## Concat, Append

In [180]:
l1 = [
    {'name' : 'john', 'job' : 'teacher'},
    {'name' : 'nate', 'job' : 'student'},
    {'name' : 'fred', 'job' : 'developer'},
]

l2 = [
    {'name' : 'ed', 'job' : 'dentist'},
    {'name' : 'jack', 'job' : 'farmer'},
    {'name' : 'ted', 'job' : 'dentist'},
]

l3 = [
    {'name' : 'john', 'job' : 'teacher'},
    {'name' : 'nate', 'job' : 'student'},
    {'name' : 'jack', 'job' : 'fatmer'},
]

l4 = [
    {'age':25, 'country' : 'us'},
    {'age':30, 'country' : 'uk'},
    {'age':45, 'country' : 'korea'}
]

In [182]:
df1 = pd.DataFrame(l1, columns = ['name','job'])
df2 = pd.DataFrame(l2, columns = ['name','job'])

In [183]:
df1

Unnamed: 0,name,job
0,john,teacher
1,nate,student
2,fred,developer


In [184]:
df2

Unnamed: 0,name,job
0,ed,dentist
1,jack,farmer
2,ted,dentist


column name이 같음

In [188]:
result = pd.concat([df1,df2], ignore_index=True)

In [189]:
result

Unnamed: 0,name,job
0,john,teacher
1,nate,student
2,fred,developer
3,ed,dentist
4,jack,farmer
5,ted,dentist


In [192]:
result = df1.append(df2, ignore_index=True)

In [193]:
result

Unnamed: 0,name,job
0,john,teacher
1,nate,student
2,fred,developer
3,ed,dentist
4,jack,farmer
5,ted,dentist


열로 합치기

In [194]:
df3 = pd.DataFrame(l3, columns = ['name','job'])
df4 = pd.DataFrame(l4, columns = ['age', 'country'])

In [195]:
df3

Unnamed: 0,name,job
0,john,teacher
1,nate,student
2,jack,fatmer


In [196]:
df4

Unnamed: 0,age,country
0,25,us
1,30,uk
2,45,korea


In [197]:
result = pd.concat([df1,df2], axis = 1, ignore_index = True)

In [198]:
result

Unnamed: 0,0,1,2,3
0,john,teacher,ed,dentist
1,nate,student,jack,farmer
2,fred,developer,ted,dentist


list 합치기

머신러닝 중 실제값과 예측값을 비교하는 경우가 많음

In [200]:
label = [1,2,3,4,5]
prediction = [1,2,2,4,4]

In [201]:
comparison = pd.DataFrame({'label' : label, 'prediction' : prediction})

In [202]:
comparison

Unnamed: 0,label,prediction
0,1,1
1,2,2
2,3,2
3,4,4
4,5,4
