# Concept 1.3: Pandas - So much more than a cute animal 

## - Introducing Pandas data structures: Series, DataFrames and Index objects.   

In [1]:
# Pandas is conventionally imported under the alias `pd`.
import pandas as pd

# A Series built from a plain list gets the default integer index 0..n-1.
days = pd.Series(data=['Monday', 'Tuesday', 'Wednesday'])
print(days)

0       Monday
1      Tuesday
2    Wednesday
dtype: object


In [4]:
# A Series can also wrap a NumPy array; the default integer index is assigned
# exactly as it is for a list.
import numpy as np

days_list = np.asarray(['Monday', 'Tuesday', 'Wednesday'])
numpy_days = pd.Series(data=days_list)
print(numpy_days)

0       Monday
1      Tuesday
2    Wednesday
dtype: object


In [5]:
# Passing `index` replaces the default 0..n-1 labels with custom string labels.
days = pd.Series(
    data=['Monday', 'Tuesday', 'Wednesday'],
    index=list('abc'),
)

In [7]:
# A dictionary also works as input: its keys become the index labels and its
# values become the data.
days1 = pd.Series(dict(a='Monday', b='Tuesday', c='Wednesday'))

In [8]:
# Display the Series that was created with the string index 'a', 'b', 'c'.
days 

a       Monday
b      Tuesday
c    Wednesday
dtype: object

In [9]:
# Display the Series that was created from the dictionary.
days1

a       Monday
b      Tuesday
c    Wednesday
dtype: object

In [11]:
# A Series can be accessed with the index labels it was given ('a', 'b', 'c').
# An integer key such as days[1] on a string-labelled Series relies on a
# positional fallback that pandas has deprecated; use days.iloc[1] when a
# position (rather than a label) is really meant.
days['b']

'Tuesday'

In [19]:
# Positional slice: the number left of ':' is the start position (included) and
# the number right of ':' is the stop position (excluded), so 0:2 returns the
# entries at positions 0 and 1 -- Monday and Tuesday.
days.iloc[0:2]

a     Monday
b    Tuesday
dtype: object

# DataFrame

## A DataFrame can be described as a table (2 dimensions) made up of several Series that share the same index.

In [20]:
# Calling the constructor with no arguments creates a DataFrame with no rows and no columns.
print(pd.DataFrame())  # prints the "Empty DataFrame" summary

Empty DataFrame
Columns: []
Index: []


In [22]:
# Column-oriented input for a DataFrame: each key is a column name and each
# value is the list of entries for that column (one entry per row).
df_dict = dict(
    Country=['Ghana', 'Kenya', 'Nigeria', 'Togo'],
    Capital=['Accra', 'Nairobi', 'Abuja', 'Lome'],
    Population=[10000, 8500, 35000, 12000],
    Age=[60, 70, 80, 75],
)

In [23]:
# Build the frame from the column dictionary; the explicit index replaces the
# default row labels with 2, 4, 6 and 8.
df = pd.DataFrame(
    data=df_dict,
    index=[2, 4, 6, 8],
)

In [24]:
# Row-oriented input for a DataFrame: each inner list is one row, with the
# values in the order Country, Capital, Population, Age.
df_list = [
    ['Ghana', 'Accra', 10000, 60],
    ['Kenya', 'Nairobi', 8500, 70],
    ['Nigeria', 'Abuja', 35000, 80],
    ['Togo', 'Lome', 12000, 75],
]

In [25]:
# A list of rows carries no column names, so they are supplied through
# `columns`; `index` sets the row labels to 2, 4, 6 and 8.
df1 = pd.DataFrame(
    data=df_list,
    index=[2, 4, 6, 8],
    columns=['Country', 'Capital', 'Population', 'Age'],
)

In [29]:
# Display the frame that was built from the list of rows.
df1

Unnamed: 0,Country,Capital,Population,Age
2,Ghana,Accra,10000,60
4,Kenya,Nairobi,8500,70
6,Nigeria,Abuja,35000,80
8,Togo,Lome,12000,75


In [32]:
# Select the row at integer position 3 (zero-based, i.e. the fourth row).
# Because the frame's index is [2, 4, 6, 8], this is the row labelled 8.
df.iloc[3]

Country        Togo
Capital        Lome
Population    12000
Age              75
Name: 8, dtype: object

In [33]:
# Select the row whose index label is 6 (label-based lookup, not a position).
df.loc[6] 

Country       Nigeria
Capital         Abuja
Population      35000
Age                80
Name: 6, dtype: object

In [34]:
# Selecting a single column by name returns it as a Series that keeps the frame's index.
df['Capital']

2      Accra
4    Nairobi
6      Abuja
8       Lome
Name: Capital, dtype: object

In [35]:
# .at reads a single cell addressed by row label (6) and column name ('Country').
df.at[6, 'Country']

'Nigeria'

In [37]:
# .iat reads a single cell addressed by integer position: row 2, column 0.
df.iat[2, 0]

'Nigeria'

## The describe function gives a summary of the numeric columns in a DataFrame, displaying the count, mean, standard deviation, minimum, quartiles (25th, 50th and 75th percentiles) and maximum values.


In [38]:
# Total of the 'Population' column.
df['Population'].sum()

65500

In [39]:
# Column-wise mean of the numeric columns only ('Population' and 'Age').
# numeric_only=True is required here: since pandas 2.0 the default is
# numeric_only=False, so df.mean() on a frame that also holds text columns
# ('Country', 'Capital') raises a TypeError instead of silently skipping them.
df.mean(numeric_only=True)

Population    16375.00
Age              71.25
dtype: float64

In [40]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Population,Age
count,4.0,4.0
mean,16375.0,71.25
std,12499.166639,8.539126
min,8500.0,60.0
25%,9625.0,67.5
50%,11000.0,72.5
75%,17750.0,76.25
max,35000.0,80.0


### Dealing with Missing Data

In [41]:
# Sample records with gaps: np.nan marks a missing value. There is exactly one
# missing entry in each of 'Name', 'Experience' and 'Height'.
df_dict2 = {
    'Name': ['James', 'Yemen', 'Caro', np.nan],
    'Profession': ['Researcher', 'Artist', 'Doctor', 'Writer'],
    'Experience': [12, np.nan, 10, 8],
    'Height': [np.nan, 175, 180, 150],
}

In [42]:
# Build the frame that contains the missing values; no index is given, so the
# rows receive the default integer labels.
new_df = pd.DataFrame(data=df_dict2)

In [45]:
# Boolean mask with the same shape as the frame: True wherever a cell holds a
# missing value, False everywhere else.
new_df.isna()

Unnamed: 0,Name,Profession,Experience,Height
0,False,False,False,True
1,False,False,True,False
2,False,False,False,False
3,True,False,False,False


In [46]:
# Count the missing values in each column: summing the boolean mask treats
# every True as 1.
new_df.isna().sum()

Name          1
Profession    0
Experience    1
Height        1
dtype: int64

In [47]:
# Return a new frame without any row that contains at least one missing value;
# new_df itself is left unchanged because the result is not assigned back.
new_df.dropna(axis=0, how='any')

Unnamed: 0,Name,Profession,Experience,Height
2,Caro,Doctor,10.0,180.0
