# Data Analysis & Pandas
## Pandas is a python package built on top of numpy
### Advantage over numpy
1. Flexibility (e.g. attaching labels to data)
2. Element-wise broadcasting (e.g. groupings, pivots, etc.)


In [34]:
import pandas as pd
import numpy as np

print(pd.__version__)

1.2.4


## Pandas Objects


### Creating a series using list of elements


In [35]:
l = [1,1,2,3,5,8,13]

print(pd.Series(l))
print(type(l))


0     1
1     1
2     2
3     3
4     5
5     8
6    13
dtype: int64
<class 'list'>


### Creating a DataFrame from Python List


In [25]:
#create python list
data = [[1000, 'Steve', 86.29], [1001, 'Mathew', 91.63], [1002, 'Jose', 72.90], [1003, 'Patty', 69.23], [1004, 'Vin', 88.30]]
print(data)
print(type(data))

#create dataframe from python list
data_df = pd.DataFrame(data, columns= ['Regd. No', 'Name', 'Marks%]'], index =[1,2,3,4,5])
print(data_df)
print(type(data_df))

[[1000, 'Steve', 86.29], [1001, 'Mathew', 91.63], [1002, 'Jose', 72.9], [1003, 'Patty', 69.23], [1004, 'Vin', 88.3]]
<class 'list'>
   Regd. No    Name  Marks%]
1      1000   Steve    86.29
2      1001  Mathew    91.63
3      1002    Jose    72.90
4      1003   Patty    69.23
5      1004     Vin    88.30
<class 'pandas.core.frame.DataFrame'>


In [26]:
#create numpy array from python list
narray = np.array(data)
print(narray)
print(type(narray))


[['1000' 'Steve' '86.29']
 ['1001' 'Mathew' '91.63']
 ['1002' 'Jose' '72.9']
 ['1003' 'Patty' '69.23']
 ['1004' 'Vin' '88.3']]
<class 'numpy.ndarray'>


In [30]:
### Creating a DataFrame from dictionary

data_dict = {'Regd. No': [1000, 1001, 1002, 1003, 1004], 'Names': ['Steve', 'Mathew', 'Jose', 'Patty', 'Vin'], 'Marks%': [86.29,91.63,72.90,69.23,88.30]}

pd.DataFrame(data_dict)

Unnamed: 0,Regd. No,Names,Marks%
0,1000,Steve,86.29
1,1001,Mathew,91.63
2,1002,Jose,72.9
3,1003,Patty,69.23
4,1004,Vin,88.3


In [31]:
print(type(data_dict))

<class 'dict'>


# Diving deep into Pandas

In [37]:
# An advantage of pandas series - index value is produced; numpy array does not include index values
data = pd.Series([0.25,0.5,0.75,1.0])
data

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [38]:
# Pandas .values is like a numpy array
data.values

array([0.25, 0.5 , 0.75, 1.  ])

In [39]:
# Pandas .index shows the index
data.index

RangeIndex(start=0, stop=4, step=1)

In [40]:
data[1]

0.5

In [41]:
# splicing - when you index, start, stop (not included)
data[1:3]

1    0.50
2    0.75
dtype: float64

In [42]:
# Creating a series
# you can create any index you want - can be non-continuous
data = pd.Series([0.25,0.5,0.75,1.0], index=['a','b','c','d'])
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [43]:
data['b']


0.5

In [44]:
# Creating a series from a specialized dictionary
pop_dict = {'CA': 38332521, 'TX': 26448193, 'NY': 19651127, 'FL': 19552860, 'IL': 12882135}
population = pd.Series(pop_dict)
population

CA    38332521
TX    26448193
NY    19651127
FL    19552860
IL    12882135
dtype: int64

In [48]:
population.values


array([38332521, 26448193, 19651127, 19552860, 12882135])

In [49]:
population.index

Index(['CA', 'TX', 'NY', 'FL', 'IL'], dtype='object')

In [58]:
print('pop_dict is a', type(pop_dict))
print(pop_dict['CA']) #dictionary

print('population is a', type(population))
print(population['CA']) #series



pop_dict is a <class 'dict'>
38332521
population is a <class 'pandas.core.series.Series'>
38332521


In [65]:
print('population is a', type(population))
print(population['CA':'IL']) #series

print('pop_dict is a', type(pop_dict))
print("print(pop_dict['CA':'IL']) <--throws a TypeError: unhashable type: slice')")

population is a <class 'pandas.core.series.Series'>
CA    38332521
TX    26448193
NY    19651127
FL    19552860
IL    12882135
dtype: int64
pop_dict is a <class 'dict'>
print(pop_dict['CA':'IL']) <--throws a TypeError: unhashable type: slice')


In [183]:
# Pandas DataFrame <-- think of as a 2D array - rows and columns
area_dict = {'CA': 423967, 'TX': 695662, 'NY': 141297, 'FL': 170312, 'IL': 149995}

area = pd.Series(area_dict)
area

CA    423967
TX    695662
NY    141297
FL    170312
IL    149995
dtype: int64

In [184]:
states = pd.DataFrame({'population': population, 'area': area})
states

Unnamed: 0,population,area
CA,38332521,423967
TX,26448193,695662
NY,19651127,141297
FL,19552860,170312
IL,12882135,149995


In [70]:
# DataFrame has an index attribute
states.index

Index(['CA', 'TX', 'NY', 'FL', 'IL'], dtype='object')

In [71]:
# DataFrame has columns attribute whihc is an index object holding the column labels
states.columns

Index(['population', 'area'], dtype='object')

In [72]:
# Pandas Index Object
# Think of it as an immutable array

ind = pd.Index([2,3,5,7,11])
ind

Int64Index([2, 3, 5, 7, 11], dtype='int64')

In [73]:
ind[1]

3

In [75]:
ind[::2] # printing every second successive element - start = 0, stop: end, step = 2

Int64Index([2, 5, 11], dtype='int64')

In [76]:
print(ind.size, ind.shape, ind.ndim, ind.dtype)

5 (5,) 1 int64


In [78]:
print("ind[1] = 0 throws TypeError: Index does not support mutable operations")

ind[1] = 0 throws TypeError: Index does not support mutable operations


# Data Indexing and Selection
## Data Selection in Series

In [79]:
data = pd.Series([0.25,0.5,0.75,1.0], index=['a','b','c','d'])
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [80]:
data['e'] = 1.25 # appending
data

a    0.25
b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64

In [82]:
# slicing by explicit index
data['a':'c']

a    0.25
b    0.50
c    0.75
dtype: float64

In [83]:
# slicing by implicit integer index
data[0:2]

a    0.25
b    0.50
dtype: float64

In [85]:
# fancy indexing - extract specific index values
data[['a','e', 'b']]

a    0.25
e    1.25
b    0.50
dtype: float64

## Indexers: loc, iloc

In [170]:
data1 = pd.Series(['a', 'b', 'c', 'd', 'e'], index =[1,2,3,5,9])
data1

1    a
2    b
3    c
5    d
9    e
dtype: object

In [171]:
# loc attribute references the explicit index (data1 index = 1,3,5)
data1.loc[1]

'a'

In [172]:
data1[1]

'a'

In [173]:
data1.loc[1:3]

1    a
2    b
3    c
dtype: object

In [166]:
data1.values

array(['a', 'b', 'c', 'd', 'e'], dtype=object)

In [169]:
# iloc attribute references the implicit Python-style index - the location (0,1,2,etc.)
data1.iloc[1]

'b'

In [142]:
print('data1', '\n',data1, type(data1), '\n')

print('data1.loc[1]=', data1.loc[1], '--> value with index 1')
print('data1.iloc[1]=', data1.iloc[1], '--> value in the position 1')


data1 
 1    a
2    b
3    c
5    d
9    e
dtype: object <class 'pandas.core.series.Series'> 

data1.loc[1]= a --> value with index 1
data1.iloc[1]= b --> value in the position 1


In [145]:
print('data1', '\n',data1, type(data1), '\n')

print('data1.loc[1:3]= \n', data1.loc[1:3], '--> value with index 1 to 3', '\n')
print('data1.iloc[1:3]= \n', data1.iloc[1:3], '--> value in the position 1 through position 3 (not including 3)')

data1 
 1    a
2    b
3    c
5    d
9    e
dtype: object <class 'pandas.core.series.Series'> 

data1.loc[1:3]= 
 1    a
2    b
3    c
dtype: object --> value with index 1 to 3 

data1.iloc[1:3]= 
 2    b
3    c
dtype: object --> value in the position 1 through position 3 (not including 3)


## Data Selection in DataFrame

In [149]:
print(area)
print(type(area))

CA    423967
TX    695662
NY    141297
FL    170312
IL     14995
dtype: int64
<class 'pandas.core.series.Series'>


In [152]:
print(population)
print(type(population))

CA    38332521
TX    26448193
NY    19651127
FL    19552860
IL    12882135
dtype: int64
<class 'pandas.core.series.Series'>


In [186]:
pop_area = pd.DataFrame({'area': area, 'pop': population})
pop_area

Unnamed: 0,area,pop
CA,423967,38332521
TX,695662,26448193
NY,141297,19651127
FL,170312,19552860
IL,149995,12882135


In [187]:
pop_area['area'] #calling area column

CA    423967
TX    695662
NY    141297
FL    170312
IL    149995
Name: area, dtype: int64

In [188]:
pop_area.area #calling area column

CA    423967
TX    695662
NY    141297
FL    170312
IL    149995
Name: area, dtype: int64

In [189]:
pop_area['density'] = pop_area['pop']/pop_area['area']
pop_area


Unnamed: 0,area,pop,density
CA,423967,38332521,90.413926
TX,695662,26448193,38.01874
NY,141297,19651127,139.076746
FL,170312,19552860,114.806121
IL,149995,12882135,85.883763


In [190]:
pop_area['density']

CA     90.413926
TX     38.018740
NY    139.076746
FL    114.806121
IL     85.883763
Name: density, dtype: float64

In [191]:
pop_area.values

array([[4.23967000e+05, 3.83325210e+07, 9.04139261e+01],
       [6.95662000e+05, 2.64481930e+07, 3.80187404e+01],
       [1.41297000e+05, 1.96511270e+07, 1.39076746e+02],
       [1.70312000e+05, 1.95528600e+07, 1.14806121e+02],
       [1.49995000e+05, 1.28821350e+07, 8.58837628e+01]])

In [192]:
pop_area.iloc[:3,:2] #all row until 3, all the columns until 2

Unnamed: 0,area,pop
CA,423967,38332521
TX,695662,26448193
NY,141297,19651127


In [193]:
pop_area.loc[:'IL', :'pop']

Unnamed: 0,area,pop
CA,423967,38332521
TX,695662,26448193
NY,141297,19651127
FL,170312,19552860
IL,149995,12882135


In [194]:
pop_area[pop_area.density > 100] #print density > 100

Unnamed: 0,area,pop,density
NY,141297,19651127,139.076746
FL,170312,19552860,114.806121


# Handling Missing Data

## NaN and None in Pandas

In [226]:
# NaN and None are interchangeable in Pandas
import numpy as np
data1 = pd.Series ([1, np.nan, 2, None])
# can perform mathematical operations on np.nan
# cannot perform on None - think of it as an operand

In [228]:
print(data1[0])
print(data1[3])
print(data1[0]+data1[3])

1.0
nan
nan


In [199]:
#isnull(): boolean mask
data = pd.Series([1, np.nan, 'hello', None])
data.isnull()

0    False
1     True
2    False
3     True
dtype: bool

In [201]:
#notnull(): Opposite of isnull
data[data.notnull()]

0        1
2    hello
dtype: object

In [202]:
#Dropping null values - removes null values
data.dropna()

0        1
2    hello
dtype: object

In [203]:
data

0        1
1      NaN
2    hello
3     None
dtype: object

In [207]:
df = pd.DataFrame([[1,np.nan,2], [2,3,5],[np.nan,4,6]])
df

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [209]:
# We cannont drop single values from a DataFrame - only full rows or full columns
df.dropna() #default drops rows

Unnamed: 0,0,1,2
1,2.0,3.0,5


In [210]:
df.dropna(axis=0) #drops rows

Unnamed: 0,0,1,2
1,2.0,3.0,5


In [211]:
df.dropna(axis=1) #drops columnns

Unnamed: 0,2
0,2
1,5
2,6


In [213]:
df[3]=np.nan
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [214]:
df.dropna(axis='columns', how='all')

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [216]:
#Fill null values
data = pd.Series([1, np.nan, 2, None, 3], index=list('abcde'))

In [217]:
data.fillna(7) #replace null values with specified values

a    1.0
b    7.0
c    2.0
d    7.0
e    3.0
dtype: float64

In [218]:
# forward-fill
data.fillna(method='ffill') #replace null value with value that is above it

a    1.0
b    1.0
c    2.0
d    2.0
e    3.0
dtype: float64

In [220]:
# back-fill
print(data)
data.fillna(method='bfill') # replace null value with value that is below it

a    1.0
b    NaN
c    2.0
d    NaN
e    3.0
dtype: float64


a    1.0
b    2.0
c    2.0
d    3.0
e    3.0
dtype: float64

In [221]:
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [223]:
df.fillna(method='ffill', axis=1) # if there is no column before - remains null

Unnamed: 0,0,1,2,3
0,1.0,1.0,2.0,2.0
1,2.0,3.0,5.0,5.0
2,,4.0,6.0,6.0


In [224]:
df.fillna(method='bfill', axis=1) # if there is no column after - remains null

Unnamed: 0,0,1,2,3
0,1.0,2.0,2.0,
1,2.0,3.0,5.0,
2,4.0,4.0,6.0,


# Pandas String Operations

In [238]:
data = ['peter', 'Paul', 'MARY', 'gUIDO']
[s.capitalize() for s in data] # capitalize the first letter of each string

['Peter', 'Paul', 'Mary', 'Guido']

In [239]:
data_n = ['peter', 'Paul', None, 'MARY', 'gUIDO']
print(data_n)
print("[s.capitalize() for s in data_n] <-- throws AttributeError: 'NoneType' object has no attribute 'capitalize'")

['peter', 'Paul', None, 'MARY', 'gUIDO']
[s.capitalize() for s in data] # capitalize the first letter of each string <-- throws error 'NoneType' object has no attribute 'capitalize'


In [243]:
data_nan = ['peter', 'Paul', np.nan, 'MARY', 'gUIDO']
print(data_nan)
print("[s.capitalize() for s in data_nan] <-- throws AttributeError: 'float' object has no attribute 'capitlize'")

['peter', 'Paul', nan, 'MARY', 'gUIDO']
[s.capitalize() for s in data_nan] <-- throws AttributeError: 'float' object has no attribute 'capitlize'


In [285]:
nums = [2,2,3,4,5, np.nan]
nums_add = []
j=0
while j < len(nums):
    print(j)
    nums_add.append(nums[j] + 1)
    j = j+1
nums_add

0
1
2
3
4
5


[3, 3, 4, 5, 6, nan]

In [290]:
#nums = [2,2,3,4,5, None] 
#nums_add = []
#j=0
#while j < len(nums):
#    print(j)
#    nums_add.append(nums[j] + 1)
#    j = j+1
#nums_add

# Throws TypeError: unsupported operand type(s) for +: 'NoneType' and 'int'

In [280]:
nums.append(6)
nums

[2, 2, 3, 4, 5, 6, 6]

## Look at Python string methods
list.str.lower() returns each list item in lowercase
list.str.len() returns length of each list item
list.str.split() returns each item separated by a comma where there was a space

## Concat and Append
pd.concat([ser1, ser2]) where ser1 and ser 2 are lists <--merges in series

In [3]:
import pandas as pd
import numpy as np
names = pd.Series(['Jenny Kim', 'Sora Lim', 'Jenny Lee', 'Miae Kim'])


In [4]:
names.upper()

AttributeError: 'Series' object has no attribute 'upper'

# Exercise 1

Consider the following lists:

    lst1 = [1, 2, 3, 5, 8]
    lst2 = [8, 5, 3, 2, 1]

1. Create and display two individual Series objects s1 and s2 from the data available on each list.
2. Perform the following operations with the two series (element-wise):  
    A. Add s1 and s2 and store the result in a new variable s3_add  
    B. Subtract s2 from s1 and store the result in a new variable s3_sub  
    C. Multiply s1 and s2 and store the result in a new variable s3_mul  
    D. Divide s1 by s2 and store the result in a new variable s3_div

In [17]:
import numpy as np
import pandas as pd

lst1 = [1, 2, 3, 5, 8]
lst2 = [8, 5, 3, 2, 1]

# Ex 1:1
# 1. Create and display two individual Series objects s1 and s2 from the data available on each list.

s1 = pd.Series(lst1)
s2 = pd.Series(lst2)

print('s1 ', type(s1), '\n', s1.values)
print('s2 ', type(s2), '\n', s2.values)

# Ex 1:2:A
# Perform the following operations with the two series (element-wise):
# A. Add s1 and s2 and store the result in a new variable s3_add

s3_add = s1+s2

# B. Subtract s2 from s1 and store the result in a new variable s3_sub

s3_sub = s2-s1

# C. Multiply s1 and s2 and store the result in a new variable s3_mul

s3_mul = s1*s2

# D. Divide s1 by s2 and store the result in a new variable s3_div

s3_div = s1/s2

print('add \n', s3_add.values)
print('sub \n', s3_sub.values)
print('mul \n', s3_mul.values)
print('div\n', s3_div.values)


s1  <class 'pandas.core.series.Series'> 
 [1 2 3 5 8]
s2  <class 'pandas.core.series.Series'> 
 [8 5 3 2 1]
add 
 [9 7 6 7 9]
sub 
 [ 7  3  0 -3 -7]
mul 
 [ 8 10  9 10  8]
div
 [0.125 0.4   1.    2.5   8.   ]


# Exercise 2

Consider the following Series object:

    0    45000  
    1    37872  
    2    57923  
    3    68979  
    4    78934  
    5    69897  
    6    56701  
    Name: Amazon_Reviews, dtype: int64

1. Create and display the Amazon_Reviews Series.

2. Get the last three values from Amazon_Reviews using negative indexing.

In [26]:
import pandas as pd
import numpy as np

# Ex 2:1
# Create and display the Amazon_Reviews Series.
Amazon_Reviews = pd.Series([45000,37872,57923,68979,78934,69897,56701])
Amazon_Reviews

0    45000
1    37872
2    57923
3    68979
4    78934
5    69897
6    56701
dtype: int64

In [27]:
# Ex 2:2
# Get the last three values from Amazon_Reviews using negative indexing.

Amazon_Reviews.iloc[-3:]

4    78934
5    69897
6    56701
dtype: int64

# Exercise 3
Consider the following dictionary which is relating the area in sq units of some USA states:  

    area_dict = {'California': 423967, 'Texas': 695662, 'New York': 141297,
             'Florida': 170312, 'Illinois': 149995}  
1. Create a Series using the given dictionary
2. Extract areas for 'Texas', 'New York', and 'Florida' from the created series

In [31]:
import pandas as pd

#Consider the following dictionary which is relating the area in sq units of some USA states:  

area_dict = {'California': 423967, 'Texas': 695662, 'New York': 141297,'Florida': 170312, 'Illinois': 149995}  

# 1. Create a Series using the given dictionary

area_ser = pd.Series(area_dict)
print(area_ser)

# 2. Extract areas for 'Texas', 'New York', and 'Florida' from the created series

area_ser.loc['Texas':'Florida']

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
dtype: int64


Texas       695662
New York    141297
Florida     170312
dtype: int64

# Exercise 4

Consider below DPhi Bootcamp's information about different batches:  

    Total_Candidates = {'absolute_beginners': 785, 'beginners': 825, 'intermediat_advanced': 602} # this is true data  
    Active_Candidates = {'absolute_beginners': 500, 'beginners': 425, 'intermediat_advanced': 300}  # this is hypothetical data  
    
1. Create a Pandas DataFrame using above information (name your Dataframe as DPhi)
2. Get all the columns in DPhi.
3. Get the information of total candidates present in each batches using dictionary-style indexing.
4. Find the number of candidates for each batches who are not active and add this information to the dataframe DPhi.
5. Also, find the percent of candidates that are active in each batches and add this information to the DPhi dataframe (hint: $percent = (active / total)* 100$)
6. Get all the batches where percentage of active candidates are greater than 60%

In [45]:
import pandas as pd

# Consider below DPhi Bootcamp's information about different batches:  

Total_Candidates = pd.Series({'absolute_beginners': 785, 'beginners': 825, 'intermediat_advanced': 602}) # this is true data 
Active_Candidates = pd.Series({'absolute_beginners': 500, 'beginners': 425, 'intermediat_advanced': 300})  # this is hypothetical data  
    
# 1. Create a Pandas DataFrame using above information (name your Dataframe as DPhi)

DPhi = pd.DataFrame({'Total': Total_Candidates, 'Active': Active_Candidates})
print('DPhi data frame: \n', DPhi, '\n')

# 2. Get all the columns in DPhi.
print('DPhi columns: \n', DPhi.columns, '\n')

# 3. Get the information of total candidates present in each batches using dictionary-style indexing.

print('DPhi Total candidats: \n', DPhi['Total'],'\n')

# 4. Find the number of candidates for each batches who are not active and add this information to the dataframe DPhi.

DPhi['Not Active'] = DPhi['Total'] - DPhi['Active']
print('DPhi Not Active data frame: \n', DPhi, '\n')

# 5. Also, find the percent of candidates that are active in each batches and add this information to the DPhi dataframe (hint: $percent = (active / total)* 100$)

DPhi ['% active'] = (DPhi['Active']/DPhi['Total'])*100
print('DPhi % Active data frame: \n', DPhi, '\n')

# 6. Get all the batches where percentage of active candidates are greater than 60%

g60 = DPhi['% active']>60
print('DPhi % Active > 60: \n', g60, '\n')


DPhi data frame: 
                       Total  Active
absolute_beginners      785     500
beginners               825     425
intermediat_advanced    602     300 

DPhi columns: 
 Index(['Total', 'Active'], dtype='object') 

DPhi Total candidats: 
 absolute_beginners      785
beginners               825
intermediat_advanced    602
Name: Total, dtype: int64 

DPhi Not Active data frame: 
                       Total  Active  Not Active
absolute_beginners      785     500         285
beginners               825     425         400
intermediat_advanced    602     300         302 

DPhi % Active data frame: 
                       Total  Active  Not Active   % active
absolute_beginners      785     500         285  63.694268
beginners               825     425         400  51.515152
intermediat_advanced    602     300         302  49.833887 

DPhi % Active > 60: 
 absolute_beginners       True
beginners               False
intermediat_advanced    False
Name: % active, dtype: bool 



# Exercise 5

Consider the following lists:

    country = ['Netherland', 'Germany', 'Peru', 'Israel', 'Madagascar']
    year = [2002, 2002, 1957, 2007, 1967]
    population = [16122830.0, np.nan, 9146100.0, 6426679.0, 6334556.0]
    continent = ['Europe', 'europe', 'Americas', 'asia', 'Africa']

1. Create a Dataframe object which contains all the lists values as Series. The final DataFrame should be named as country_info, containing 4 columns and 5 rows.
2. Delete the rows which contains missing values
3. Capitalize all the continents in continent column.
4. Get the length of each country's names.

In [6]:
import pandas as pd
import numpy as np
    
country = pd.Series(['Netherland', 'Germany', 'Peru', 'Israel', 'Madagascar'])
year = pd.Series([2002, 2002, 1957, 2007, 1967])
population = pd.Series([16122830.0, np.nan, 9146100.0, 6426679.0, 6334556.0])
continent = pd.Series(['Europe', 'europe', 'Americas', 'asia', 'Africa'])

# 1. Create a Dataframe object which contains all the lists values as Series. The final DataFrame should be named as country_info, containing 4 columns and 5 rows.

country_info = pd.DataFrame({'country':country, 'year':year, 'pop': population, 'continent':continent})
#print('country_info data frame \n \n', country_info)
country_info

Unnamed: 0,country,year,pop,continent
0,Netherland,2002,16122830.0,Europe
1,Germany,2002,,europe
2,Peru,1957,9146100.0,Americas
3,Israel,2007,6426679.0,asia
4,Madagascar,1967,6334556.0,Africa


In [7]:
# 2. Delete the rows which contains missing values

drop = country_info.dropna()
#print('\n drop row with missing values \n \n', drop)
drop


Unnamed: 0,country,year,pop,continent
0,Netherland,2002,16122830.0,Europe
2,Peru,1957,9146100.0,Americas
3,Israel,2007,6426679.0,asia
4,Madagascar,1967,6334556.0,Africa


In [8]:
# 3. Capitalize all the continents in continent column.

#continent.str.capitalize() #capitalizes the continent series
country_info['continent'] = country_info['continent'].str.capitalize()
country_info


Unnamed: 0,country,year,pop,continent
0,Netherland,2002,16122830.0,Europe
1,Germany,2002,,Europe
2,Peru,1957,9146100.0,Americas
3,Israel,2007,6426679.0,Asia
4,Madagascar,1967,6334556.0,Africa


In [9]:
# 4. Get the length of each country's names.

print('\n length of country name \n \n', country.str.len())



 length of country name 
 
 0    10
1     7
2     4
3     6
4    10
dtype: int64


# Exercise 6
Consider the following lists:

country = ['Netherland', 'Germany', 'Peru', 'Israel', 'Madagascar']
gdp_per_cap = [33724.757780, 30035.801980, 4245.256698, 25523.277100, 1634.047282]
1. Create a Dataframe object which contains all the lists values as Series. The final DataFrame should be named as country_gdp, containing 2 columns and 5 rows.
2. Concatenate the two dataframes: country_info and country_gdp with axis=0 and name it concat_data
3. Check if there are any null values in concat_data
4. Find total numer of missing values in each column. hint: Use .isnull() and .sum() functions

In [12]:
import pandas as pd

# Create a Dataframe object which contains all the lists values as Series. The final DataFrame should be named as country_gdp, containing 2 columns and 5 rows.

country = pd.Series(['Netherland', 'Germany', 'Peru', 'Israel', 'Madagascar'])
gdp_per_cap = pd.Series([33724.757780, 30035.801980, 4245.256698, 25523.277100, 1634.047282])

country_gdp = pd.DataFrame({"country": country, "GDP": gdp_per_cap})
country_gdp

Unnamed: 0,country,GDP
0,Netherland,33724.75778
1,Germany,30035.80198
2,Peru,4245.256698
3,Israel,25523.2771
4,Madagascar,1634.047282


In [23]:
# 2. Concatenate the two dataframes: country_info and country_gdp with axis=0 and name it concat_data
concat_data = pd.concat([country_info, country_gdp], axis=0)
#, columns=['country', 'continent', 'year', 'pop', 'gdp'])
concat_data

Unnamed: 0,country,year,pop,continent,GDP
0,Netherland,2002.0,16122830.0,Europe,
1,Germany,2002.0,,Europe,
2,Peru,1957.0,9146100.0,Americas,
3,Israel,2007.0,6426679.0,Asia,
4,Madagascar,1967.0,6334556.0,Africa,
0,Netherland,,,,33724.75778
1,Germany,,,,30035.80198
2,Peru,,,,4245.256698
3,Israel,,,,25523.2771
4,Madagascar,,,,1634.047282


In [17]:
# 3. Check if there are any null values in concat_data
pd.isnull(concat_data)

Unnamed: 0,country,year,pop,continent,GDP
0,False,False,False,False,True
1,False,False,True,False,True
2,False,False,False,False,True
3,False,False,False,False,True
4,False,False,False,False,True
0,False,True,True,True,False
1,False,True,True,True,False
2,False,True,True,True,False
3,False,True,True,True,False
4,False,True,True,True,False


In [29]:
merge_outer = pd.merge(country_info, country_gdp, how="outer")
merge_outer

Unnamed: 0,country,year,pop,continent,GDP
0,Netherland,2002,16122830.0,Europe,33724.75778
1,Germany,2002,,Europe,30035.80198
2,Peru,1957,9146100.0,Americas,4245.256698
3,Israel,2007,6426679.0,Asia,25523.2771
4,Madagascar,1967,6334556.0,Africa,1634.047282


In [30]:
merge_inner = pd.merge(country_info, country_gdp, how="inner")
merge_inner

Unnamed: 0,country,year,pop,continent,GDP
0,Netherland,2002,16122830.0,Europe,33724.75778
1,Germany,2002,,Europe,30035.80198
2,Peru,1957,9146100.0,Americas,4245.256698
3,Israel,2007,6426679.0,Asia,25523.2771
4,Madagascar,1967,6334556.0,Africa,1634.047282


In [32]:
# 4. Find total numer of missing values in each column. hint: Use .isnull() and .sum() functions
null_data = pd.isnull(concat_data)
null_data

Unnamed: 0,country,year,pop,continent,GDP
0,False,False,False,False,True
1,False,False,True,False,True
2,False,False,False,False,True
3,False,False,False,False,True
4,False,False,False,False,True
0,False,True,True,True,False
1,False,True,True,True,False
2,False,True,True,True,False
3,False,True,True,True,False
4,False,True,True,True,False


In [33]:
# 4. Find total numer of missing values in each column. hint: Use .isnull() and .sum() functions
null_data.sum()

country      0
year         5
pop          6
continent    5
GDP          5
dtype: int64