In [1]:
import numpy as np
import pandas as pd

# Pandas Series

In [2]:
# A pandas Series is a one-dimensional (1D) labelled array. Mixing numbers
# and a string, as below, gives it dtype `object`.
person = pd.Series(
    data=[180, 80.5, 'male'],
    index=['height', 'weight', 'gender'],
)
person

height     180
weight    80.5
gender    male
dtype: object

In [3]:
# Positional access goes through .iloc. Plain `person[0]` on a string-labelled
# Series relies on a deprecated fallback that treats the integer as a position.
person.iloc[0]

180

In [4]:
# First two entries, selected by position.
person.iloc[:2]

height     180
weight    80.5
dtype: object

In [5]:
# Pick the first and third entries by position. A list of integers passed to
# plain `[]` is looked up as labels first, and this Series is labelled with
# strings, so .iloc is the unambiguous way to select by position.
person.iloc[[0, 2]]

height     180
gender    male
dtype: object

In [6]:
# Keep only the two numeric entries (height, weight), selected by position.
# .iloc avoids the deprecated integer-as-position fallback of plain `[]`.
person_stats = person.iloc[[0, 1]]
print(person_stats)
# A boolean mask keeps just the entries whose value exceeds 100.
person_stats[person_stats > 100]

height     180
weight    80.5
dtype: object


height    180
dtype: object

In [7]:
person['height'] = 170
person

height     170
weight    80.5
gender    male
dtype: object

In [8]:
# Series.get returns the supplied default (here NaN) when the label is absent.
print(person.get('height', np.nan))
print(person.get('age', np.nan))
# By contrast, person['age'] would raise a KeyError because 'age' is not in the index.

170
nan


In [9]:
# Same two entries in reverse order; each label stays attached to its value.
person_stats3 = person_stats.iloc[::-1]
person_stats3

weight    80.5
height     180
dtype: object

In [10]:
# Arithmetic between Series aligns on index labels, not on position: although
# person_stats3 is in reverse order, height is added to height and weight to weight.
person_stats + person_stats3

height    360
weight    161
dtype: object

### Exercises

In [11]:
# A Series describing a single car, labelled by attribute name.
car = pd.Series(
    data=[12345, 2015, 'Minimi'],
    index=['Mileage', 'Year', 'Model'],
)
print(car)

Mileage     12345
Year         2015
Model      Minimi
dtype: object


In [12]:
# First entry by position. .iloc is used because integer keys passed to plain
# `[]` on a string-labelled Series rely on a deprecated positional fallback.
car.iloc[0]

12345

In [13]:
# First and last entries by position (-1 counts from the end). .iloc makes the
# positional intent explicit instead of relying on the fallback of plain `[]`.
car.iloc[[0, -1]]

Mileage     12345
Model      Minimi
dtype: object

In [14]:
car['Year']

2015

In [15]:
print(car[['Mileage','Year']])

Mileage    12345
Year        2015
dtype: object


In [16]:
car['Mileage'] = 432435
print(car)

Mileage    432435
Year         2015
Model      Minimi
dtype: object


In [17]:
# Double the stored mileage. Note that re-running this cell doubles it again.
car['Mileage'] *= 2
print(car)

Mileage    864870
Year         2015
Model      Minimi
dtype: object


In [18]:
new_car_series = car[['Year','Mileage']]
print(new_car_series)

Year         2015
Mileage    864870
dtype: object


In [19]:
second_car = pd.Series({'Mileage':123345,'Year':2017,'Model':'Maximum'})
print(second_car)

Mileage     123345
Model      Maximum
Year          2017
dtype: object


In [20]:
add_up = car[['Mileage','Year']] + second_car[['Mileage','Year']]
print(add_up)

Mileage    988215
Year         4032
dtype: object


# Data Frames

In [21]:
# Build a small frame from row-wise records: one inner list per observation,
# with explicit row labels (index) and column names.
my_df = pd.DataFrame(
    data=[
        [19, 165, 'female'],
        [19, 177, 'male'],
        [23, 169, 'female'],
    ],
    index=['Observation1', 'Observation2', 'Observation3'],
    columns=['age', 'height', 'gender'],
)

my_df

Unnamed: 0,age,height,gender
Observation1,19,165,female
Observation2,19,177,male
Observation3,23,169,female


In [22]:
# Introspection: inspecting the DataFrame to see how it is structured,
# starting with its row labels.
my_df.index

Index(['Observation1', 'Observation2', 'Observation3'], dtype='object')

In [23]:
my_df.columns

Index(['age', 'height', 'gender'], dtype='object')

In [24]:
len(my_df)

3

In [25]:
my_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,3.0,20.333333,2.309401,19.0,19.0,19.0,21.0,23.0
height,3.0,170.333333,6.110101,165.0,167.0,169.0,173.0,177.0


In [26]:
# indexing

In [27]:
my_df.age

Observation1    19
Observation2    19
Observation3    23
Name: age, dtype: int64

In [28]:
my_df['age']

Observation1    19
Observation2    19
Observation3    23
Name: age, dtype: int64

In [29]:
my_df[['age','height']]

Unnamed: 0,age,height
Observation1,19,165
Observation2,19,177
Observation3,23,169


In [30]:
my_df.loc['Observation1']

age           19
height       165
gender    female
Name: Observation1, dtype: object

In [31]:
my_df.loc[['Observation1','Observation3'],['age','gender']]

Unnamed: 0,age,gender
Observation1,19,female
Observation3,23,female


In [32]:
my_df.iloc[1]

age         19
height     177
gender    male
Name: Observation2, dtype: object

In [33]:
my_df.iloc[0:2]

Unnamed: 0,age,height,gender
Observation1,19,165,female
Observation2,19,177,male


In [34]:
my_df.iloc[0:2, [0, 2]]

Unnamed: 0,age,gender
Observation1,19,female
Observation2,19,male


In [35]:
# Create a DataFrame from a dictionary: each key becomes a column name and
# each array supplies that column's values.

my_df2 = pd.DataFrame(
    {
        'age': np.array([19, 34, 6, 56, 34]),
        'height': np.array([123, 156, 134, 45, 186]),
        'gender': np.array(['female', 'male', 'female', 'male', 'female']),
    },
    index=['Observation {0}'.format(row_number) for row_number in range(5)],
)
my_df2

Unnamed: 0,age,gender,height
Observation 0,19,female,123
Observation 1,34,male,156
Observation 2,6,female,134
Observation 3,56,male,45
Observation 4,34,female,186


In [36]:
my_df2 = my_df2[['age','height','gender']]
my_df2

Unnamed: 0,age,height,gender
Observation 0,19,123,female
Observation 1,34,156,male
Observation 2,6,134,female
Observation 3,56,45,male
Observation 4,34,186,female


In [37]:
my_df2.head()

Unnamed: 0,age,height,gender
Observation 0,19,123,female
Observation 1,34,156,male
Observation 2,6,134,female
Observation 3,56,45,male
Observation 4,34,186,female


In [38]:
my_df2.tail()

Unnamed: 0,age,height,gender
Observation 0,19,123,female
Observation 1,34,156,male
Observation 2,6,134,female
Observation 3,56,45,male
Observation 4,34,186,female


In [39]:
my_df2.sort_values('age',ascending=True)

Unnamed: 0,age,height,gender
Observation 2,6,134,female
Observation 0,19,123,female
Observation 1,34,156,male
Observation 4,34,186,female
Observation 3,56,45,male


In [40]:
# Keep the age-sorted frame by rebinding the name instead of mutating in
# place: sort_values returns a new, sorted DataFrame.
my_df2 = my_df2.sort_values('age', ascending=True)
my_df2

Unnamed: 0,age,height,gender
Observation 2,6,134,female
Observation 0,19,123,female
Observation 1,34,156,male
Observation 4,34,186,female
Observation 3,56,45,male


In [41]:
# Draw three random rows. random_state fixes the seed so the same rows are
# returned on every run of the notebook.
my_df2.sample(3, random_state=42)

Unnamed: 0,age,height,gender
Observation 0,19,123,female
Observation 4,34,186,female
Observation 1,34,156,male


In [42]:
my_df2[my_df2.age > 50]

Unnamed: 0,age,height,gender
Observation 3,56,45,male


In [43]:
def gender_to_numeric(gender):
    """Encode a gender label as an integer: 0 for 'male', 1 for any other value."""
    return 0 if gender == 'male' else 1
    
my_df2['gender_num'] = my_df2['gender'].apply(gender_to_numeric)
my_df2

Unnamed: 0,age,height,gender,gender_num
Observation 2,6,134,female,1
Observation 0,19,123,female,1
Observation 1,34,156,male,0
Observation 4,34,186,female,1
Observation 3,56,45,male,0


In [44]:
# axis=1 drops a column rather than a row. With inplace=False the result is a
# new frame without 'gender_num'; my_df2 itself is left unchanged.
my_df2.drop('gender_num', axis=1, inplace=False)

Unnamed: 0,age,height,gender
Observation 2,6,134,female
Observation 0,19,123,female
Observation 1,34,156,male
Observation 4,34,186,female
Observation 3,56,45,male


### Exercises

In [45]:
# Five example cars built from a dictionary of columns. The column selection
# on the second assignment pins the column order explicitly.
car_df = pd.DataFrame(
    {
        'year': np.array([1999, 2043, 1888, 2004, 2005]),
        'mileage': np.array([123000, 156, 1340, 4500, 186000]),
        'is_automatic': [True, False, True, False, True],
    },
    index=['Car {0}'.format(car_number) for car_number in range(5)],
)
car_df = car_df[['year', 'mileage', 'is_automatic']]
car_df

Unnamed: 0,year,mileage,is_automatic
Car 0,1999,123000,True
Car 1,2043,156,False
Car 2,1888,1340,True
Car 3,2004,4500,False
Car 4,2005,186000,True


In [46]:
car_df['mileage']

Car 0    123000
Car 1       156
Car 2      1340
Car 3      4500
Car 4    186000
Name: mileage, dtype: int64

In [47]:
car_df[['mileage','year']]

Unnamed: 0,mileage,year
Car 0,123000,1999
Car 1,156,2043
Car 2,1340,1888
Car 3,4500,2004
Car 4,186000,2005


In [48]:
car_df.iloc[0]

year              1999
mileage         123000
is_automatic      True
Name: Car 0, dtype: object

In [49]:
car_df.loc['Car 0']

year              1999
mileage         123000
is_automatic      True
Name: Car 0, dtype: object

In [50]:
car_df.iloc[:3]

Unnamed: 0,year,mileage,is_automatic
Car 0,1999,123000,True
Car 1,2043,156,False
Car 2,1888,1340,True


In [51]:
car_df.iloc[[0,2],[1,2]]

Unnamed: 0,mileage,is_automatic
Car 0,123000,True
Car 2,1340,True


In [52]:
car_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
year,5.0,1987.8,58.503846,1888.0,1999.0,2004.0,2005.0,2043.0
mileage,5.0,62999.2,86461.803608,156.0,1340.0,4500.0,123000.0,186000.0


In [53]:
print(car_df.head())
print('\n')
print(car_df.tail())

       year  mileage  is_automatic
Car 0  1999   123000          True
Car 1  2043      156         False
Car 2  1888     1340          True
Car 3  2004     4500         False
Car 4  2005   186000          True


       year  mileage  is_automatic
Car 0  1999   123000          True
Car 1  2043      156         False
Car 2  1888     1340          True
Car 3  2004     4500         False
Car 4  2005   186000          True


In [54]:
# Draw three random cars. random_state fixes the seed so the sample is the
# same on every run of the notebook.
car_df.sample(3, random_state=42)

Unnamed: 0,year,mileage,is_automatic
Car 2,1888,1340,True
Car 0,1999,123000,True
Car 4,2005,186000,True


In [55]:
car_df[car_df['mileage'] > 30000]

Unnamed: 0,year,mileage,is_automatic
Car 0,1999,123000,True
Car 4,2005,186000,True


In [56]:
# Combine two conditions with & (element-wise AND). 'is_automatic' already
# holds booleans, so it serves as a mask directly without comparing to True.
car_df[(car_df['mileage'] > 30000) & car_df['is_automatic']]

Unnamed: 0,year,mileage,is_automatic
Car 0,1999,123000,True
Car 4,2005,186000,True


In [57]:
car_df.sort_values('mileage', axis=0,ascending=False)

Unnamed: 0,year,mileage,is_automatic
Car 4,2005,186000,True
Car 0,1999,123000,True
Car 3,2004,4500,False
Car 2,1888,1340,True
Car 1,2043,156,False


In [58]:
def bool_to_num(boolean):
    """Map a flag to an integer: 1 when it compares equal to True, otherwise 0."""
    # The equality test (rather than plain truthiness) is kept on purpose, so
    # truthy values that are not equal to True, such as 2 or 'yes', still give 0.
    return 1 if boolean == True else 0

car_df['is_automatic_num'] = car_df['is_automatic'].apply(bool_to_num)
car_df

Unnamed: 0,year,mileage,is_automatic,is_automatic_num
Car 0,1999,123000,True,1
Car 1,2043,156,False,0
Car 2,1888,1340,True,1
Car 3,2004,4500,False,0
Car 4,2005,186000,True,1


In [59]:
car_df.drop('is_automatic_num', axis=1, inplace=False)

Unnamed: 0,year,mileage,is_automatic
Car 0,1999,123000,True
Car 1,2043,156,False
Car 2,1888,1340,True
Car 3,2004,4500,False
Car 4,2005,186000,True


# Merging and joining data frames

In [60]:
# First data frame: subjects 1-5. subject_id values are stored as strings.
raw_data = {
    'subject_id': ['1', '2', '3', '4', '5'],
    'first_name': ['Alex', 'Amy', 'Allen', 'Alice', 'Ayoung'],
    'last_name': ['Anderson', 'Ackerman', 'Ali', 'Aoni', 'Atiches'],
}
# `columns` fixes the column order explicitly.
df_a = pd.DataFrame(
    raw_data,
    columns=['subject_id', 'first_name', 'last_name'],
)
df_a

Unnamed: 0,subject_id,first_name,last_name
0,1,Alex,Anderson
1,2,Amy,Ackerman
2,3,Allen,Ali
3,4,Alice,Aoni
4,5,Ayoung,Atiches


In [61]:
# Second data frame
raw_data = {
        'subject_id': ['4', '5', '6', '7', '8'],
        'first_name': ['Billy', 'Brian', 'Bran', 'Bryce', 'Betty'],
        'last_name': ['Bonder', 'Black', 'Balwner', 'Brice', 'Btisan']
}
df_b = pd.DataFrame(raw_data, columns=['subject_id', 'first_name', 'last_name'])
df_b

Unnamed: 0,subject_id,first_name,last_name
0,4,Billy,Bonder
1,5,Brian,Black
2,6,Bran,Balwner
3,7,Bryce,Brice
4,8,Betty,Btisan


In [62]:
# Stack the two frames vertically. concat keeps each frame's own row labels,
# so the labels 0-4 appear twice in the result; passing ignore_index=True
# would renumber the rows instead.
df_new = pd.concat([df_a, df_b])
df_new

Unnamed: 0,subject_id,first_name,last_name
0,1,Alex,Anderson
1,2,Amy,Ackerman
2,3,Allen,Ali
3,4,Alice,Aoni
4,5,Ayoung,Atiches
0,4,Billy,Bonder
1,5,Brian,Black
2,6,Bran,Balwner
3,7,Bryce,Brice
4,8,Betty,Btisan


In [63]:
# Only the name columns of df_b are passed in, so its rows have no subject_id
# and that column is filled with NaN for them in the combined frame.
pd.concat([df_a,df_b[['first_name','last_name']]])

Unnamed: 0,first_name,last_name,subject_id
0,Alex,Anderson,1.0
1,Amy,Ackerman,2.0
2,Allen,Ali,3.0
3,Alice,Aoni,4.0
4,Ayoung,Atiches,5.0
0,Billy,Bonder,
1,Brian,Black,
2,Bran,Balwner,
3,Bryce,Brice,
4,Betty,Btisan,


In [64]:
pd.concat([df_a,df_b], axis=1)

Unnamed: 0,subject_id,first_name,last_name,subject_id.1,first_name.1,last_name.1
0,1,Alex,Anderson,4,Billy,Bonder
1,2,Amy,Ackerman,5,Brian,Black
2,3,Allen,Ali,6,Bran,Balwner
3,4,Alice,Aoni,7,Bryce,Brice
4,5,Ayoung,Atiches,8,Betty,Btisan


In [65]:
pd.concat([df_a , df_b.iloc[0:3]], axis=1)

Unnamed: 0,subject_id,first_name,last_name,subject_id.1,first_name.1,last_name.1
0,1,Alex,Anderson,4.0,Billy,Bonder
1,2,Amy,Ackerman,5.0,Brian,Black
2,3,Allen,Ali,6.0,Bran,Balwner
3,4,Alice,Aoni,,,
4,5,Ayoung,Atiches,,,


In [66]:
raw_data = {
        'subject_id': ['1', '2', '3', '4', '5', '7', '8', '9', '10', '11'],
        'test_id': [51, 15, 15, 61, 16, 14, 15, 1, 61, 16]}
df_n = pd.DataFrame(raw_data, columns = ['subject_id','test_id'])
df_n

Unnamed: 0,subject_id,test_id
0,1,51
1,2,15
2,3,15
3,4,61
4,5,16
5,7,14
6,8,15
7,9,1
8,10,61
9,11,16


In [67]:
# joining == merging

In [69]:
df_new

Unnamed: 0,subject_id,first_name,last_name
0,1,Alex,Anderson
1,2,Amy,Ackerman
2,3,Allen,Ali
3,4,Alice,Aoni
4,5,Ayoung,Atiches
0,4,Billy,Bonder
1,5,Brian,Black
2,6,Bran,Balwner
3,7,Bryce,Brice
4,8,Betty,Btisan


In [70]:
# Join the people to their test ids on the shared subject_id column. merge
# defaults to an inner join, so only subject_ids present in both frames are kept.
pd.merge(df_new, df_n, on='subject_id')

Unnamed: 0,subject_id,first_name,last_name,test_id
0,1,Alex,Anderson,51
1,2,Amy,Ackerman,15
2,3,Allen,Ali,15
3,4,Alice,Aoni,61
4,4,Billy,Bonder,61
5,5,Ayoung,Atiches,16
6,5,Brian,Black,16
7,7,Bryce,Brice,14
8,8,Betty,Btisan,15


In [71]:
pd.merge(df_a,df_b,on='subject_id',how='inner')

Unnamed: 0,subject_id,first_name_x,last_name_x,first_name_y,last_name_y
0,4,Alice,Aoni,Billy,Bonder
1,5,Ayoung,Atiches,Brian,Black


In [72]:
pd.merge(df_a, df_b, on='subject_id', how='outer')

Unnamed: 0,subject_id,first_name_x,last_name_x,first_name_y,last_name_y
0,1,Alex,Anderson,,
1,2,Amy,Ackerman,,
2,3,Allen,Ali,,
3,4,Alice,Aoni,Billy,Bonder
4,5,Ayoung,Atiches,Brian,Black
5,6,,,Bran,Balwner
6,7,,,Bryce,Brice
7,8,,,Betty,Btisan


In [73]:
pd.merge(df_a, df_b, on='subject_id', how='left')

Unnamed: 0,subject_id,first_name_x,last_name_x,first_name_y,last_name_y
0,1,Alex,Anderson,,
1,2,Amy,Ackerman,,
2,3,Allen,Ali,,
3,4,Alice,Aoni,Billy,Bonder
4,5,Ayoung,Atiches,Brian,Black


In [74]:
pd.merge(df_a, df_b, on='subject_id', how='right')

Unnamed: 0,subject_id,first_name_x,last_name_x,first_name_y,last_name_y
0,4,Alice,Aoni,Billy,Bonder
1,5,Ayoung,Atiches,Brian,Black
2,6,,,Bran,Balwner
3,7,,,Bryce,Brice
4,8,,,Betty,Btisan


## Exercises for Merging Data

In [75]:
# First set of people and their names
raw_data = {
        'person_id': ['1', '2', '3', '4', '5'],
        'first_name': ['Alex', 'Amy', 'Allen', 'Alice', 'Ayoung'],
        'last_name': ['Anderson', 'Ackerman', 'Ali', 'Aoni', 'Atiches']
}
people_df = pd.DataFrame(raw_data)

In [76]:
# Second set of people and their names
raw_data = {
        'person_id': ['6', '7', '8', '9', '10'],
        'first_name': ['Billy', 'Brian', 'Bran', 'Bryce', 'Betty'],
        'last_name': ['Bonder', 'Black', 'Balwner', 'Brice', 'Btisan']
}
people_df_2 = pd.DataFrame(raw_data)

In [77]:
# Cars owned by people
raw_data = {
        'person_id': ['1', '2', '12', '6', '7', '11'],
        'car_model': ['Ford Focus', 'BMW M3', 'Lotus Elise',
                      'Toyota Camry', 'Kia Picanto', 'Honda Accord'],
        'car_id': ['1', '2', '3', '4', '5', '6']
}
car_owners_df = pd.DataFrame(raw_data)

In [86]:
all_people_df = pd.concat([people_df, people_df_2])
all_people_df

Unnamed: 0,first_name,last_name,person_id
0,Alex,Anderson,1
1,Amy,Ackerman,2
2,Allen,Ali,3
3,Alice,Aoni,4
4,Ayoung,Atiches,5
0,Billy,Bonder,6
1,Brian,Black,7
2,Bran,Balwner,8
3,Bryce,Brice,9
4,Betty,Btisan,10


In [93]:
# Create a data frame showing the car owned by each person. Only include people who own
# a car, and only cars for which we have the owner's name (inner join).
pd.merge(all_people_df, car_owners_df, on='person_id', how='inner')

Unnamed: 0,first_name,last_name,person_id,car_id,car_model
0,Alex,Anderson,1,1,Ford Focus
1,Amy,Ackerman,2,2,BMW M3
2,Billy,Bonder,6,4,Toyota Camry
3,Brian,Black,7,5,Kia Picanto


In [97]:
# Create a data frame showing the car owned by each person. This time, include people who
# don't own a car, but leave out any car for which we don't have the owner's name (left join).
pd.merge(all_people_df, car_owners_df, on='person_id', how='left')

Unnamed: 0,first_name,last_name,person_id,car_id,car_model
0,Alex,Anderson,1,1.0,Ford Focus
1,Amy,Ackerman,2,2.0,BMW M3
2,Allen,Ali,3,,
3,Alice,Aoni,4,,
4,Ayoung,Atiches,5,,
5,Billy,Bonder,6,4.0,Toyota Camry
6,Brian,Black,7,5.0,Kia Picanto
7,Bran,Balwner,8,,
8,Bryce,Brice,9,,
9,Betty,Btisan,10,,


In [98]:
# Create a data frame showing the car owned by each person. This time, don't include people who don't own 
# a car but do include any car for which we don't have the owner's name.
pd.merge(all_people_df, car_owners_df, on='person_id', how='right')

Unnamed: 0,first_name,last_name,person_id,car_id,car_model
0,Alex,Anderson,1,1,Ford Focus
1,Amy,Ackerman,2,2,BMW M3
2,Billy,Bonder,6,4,Toyota Camry
3,Brian,Black,7,5,Kia Picanto
4,,,12,3,Lotus Elise
5,,,11,6,Honda Accord


In [99]:
# Create a data frame showing the car owned by each person. This time, include people who don't own 
# a car and also include any car for which we don't have the owner's name.
pd.merge(all_people_df, car_owners_df, on='person_id', how='outer')

Unnamed: 0,first_name,last_name,person_id,car_id,car_model
0,Alex,Anderson,1,1.0,Ford Focus
1,Amy,Ackerman,2,2.0,BMW M3
2,Allen,Ali,3,,
3,Alice,Aoni,4,,
4,Ayoung,Atiches,5,,
5,Billy,Bonder,6,4.0,Toyota Camry
6,Brian,Black,7,5.0,Kia Picanto
7,Bran,Balwner,8,,
8,Bryce,Brice,9,,
9,Betty,Btisan,10,,
