# Introduction
Welcome to the Pandas track. These hands-on exercises are targeted for someone who has worked with Pandas a little before. 
Each page has a list of `relevant resources` you can use if you get stumped. The top item in each list has been custom-made to help you with the exercises on that page.

The first step in most data analytics projects is reading the data file. In this section, you'll create `Series` and `DataFrame` objects, both by hand and by reading data files.



In [0]:
import numpy as np
import pandas as pd # This imports pandas


In [0]:
first_series = pd.Series(list('abcdef'))

In [4]:
print(first_series)

0    a
1    b
2    c
3    d
4    e
5    f
dtype: object


In [0]:
# Create a series from an array
np_country = np.array(['Luxembourg','Norway','Japan','Switzerland','US','Qatar','Iceland','Sweden','Singapore','Denmark'])

In [0]:
s_country = pd.Series(np_country) # Convert Numpy array to Series

In [9]:
print(s_country)

0     Luxembourg
1         Norway
2          Japan
3    Switzerland
4             US
5          Qatar
6        Iceland
7         Sweden
8      Singapore
9        Denmark
dtype: object


In [10]:
#Create a series from a scalar
scalar_series = pd.Series(5., index = ['a','b','c','d','e'])
scalar_series


a    5.0
b    5.0
c    5.0
d    5.0
e    5.0
dtype: float64

In [0]:
#Access elements in a series
# First Checkpoint
dict_country_gdp.loc['US']
dict_country_gdp.iloc[0]


In [0]:
#Vectorized operation in series
first_vector_series = pd.Series([1,2,3,4],index = ['a','b','c','d'])
second_vector_series = pd.Series([10,20,30,40],index = ['a','b','c','d'])


In [17]:
first_vector_series + second_vector_series

a    11
b    22
c    33
d    44
dtype: int64

In [19]:
second_vector_series = pd.Series([10,20,30,40], index = ['a','d','b','c'])
first_vector_series + second_vector_series

a    11
b    32
c    43
d    24
dtype: int64

In [22]:
# Replace few indexes with new ones in second vector series
second_vector_series = pd.Series([10,20,30,40], index = ['a','b','e','f'])
first_vector_series + second_vector_series

a    11.0
b    22.0
c     NaN
d     NaN
e     NaN
f     NaN
dtype: float64

In [0]:
# Create a dataframe

import pandas as pd
olympic_data_list = {"HostCity":["London","Beijing","Athens",'Sydney','Atlanta'],
                    'Year':[2012,2008,2004,2000,1996],
                    "No. of participating countries":[205,204,201,200,197]}

In [0]:
df_olympic_data = pd.DataFrame(olympic_data_list)

In [25]:
df_olympic_data

Unnamed: 0,HostCity,No. of participating countries,Year
0,London,205,2012
1,Beijing,204,2008
2,Athens,201,2004
3,Sydney,200,2000
4,Atlanta,197,1996


In [0]:
# Create a dataframe from a series of Dictionaries

olympic_data_dict = {'London':{2012:205}, 
                     'Beijing':{2008:204}}

In [33]:
df_olympic_data_dict = pd.DataFrame(olympic_data_dict)
df_olympic_data_dict

Unnamed: 0,Beijing,London
2008,204.0,
2012,,205.0


In [34]:
#View DataFrame
df_olympic_data.HostCity

0     London
1    Beijing
2     Athens
3     Sydney
4    Atlanta
Name: HostCity, dtype: object

In [35]:
#Describe DataFrame
df_olympic_data.describe

<bound method NDFrame.describe of   HostCity  No. of participating countries  Year
0   London                             205  2012
1  Beijing                             204  2008
2   Athens                             201  2004
3   Sydney                             200  2000
4  Atlanta                             197  1996>

In [0]:
# Create df from a dict of series

olympic_series_participation = pd.Series([205,204,201,200,197], index = [2012,2008,2004,2000,1996])
olympic_series_country = pd.Series(['London','Beijing','Athens','Sydney','Atlanta'], index = [2012,2008,2004,2000,1996])

In [0]:
import pandas as pd
df_olympic_series = pd.DataFrame({'No. of Participating Countries': olympic_series_participation,
                                   'Host Cities': olympic_series_country})

In [42]:
df_olympic_series

Unnamed: 0,Host Cities,No. of Participating Countries
2012,London,205
2008,Beijing,204
2004,Athens,201
2000,Sydney,200
1996,Atlanta,197


In [0]:
# Checkpoint # Please work this in your notebook!

# Create a dataframe from an ndarray
import numpy as np
np_array = np.array ([2012, 2008, 2004, 2006])
dict_ndarray = {'year':np_array}


In [0]:
df_ndarray = pd.DataFrame(dict_ndarray)

In [46]:
df_ndarray

Unnamed: 0,year
0,2012
1,2008
2,2004
3,2006


In [0]:
# Create a df from a df

df_from_df = pd.DataFrame(df_olympic_series)

In [48]:
df_from_df

Unnamed: 0,Host Cities,No. of Participating Countries
2012,London,205
2008,Beijing,204
2004,Athens,201
2000,Sydney,200
1996,Atlanta,197


In [0]:
# Handling Missing Values

first_series = pd.Series([1,2,3,4,5], index = ['a','b','c','d','e'])

second_series = pd.Series([10,20,30,40,50], index = ['c','e','f','g','h'])

sum_of_series = first_series + second_series

In [51]:
sum_of_series # Sometimes missing values can be 'NaN', '?', '0'. It depends on the dataset

a     NaN
b     NaN
c    13.0
d     NaN
e    25.0
f     NaN
g     NaN
h     NaN
dtype: float64

In [0]:
# Handling missing values with Functions
#dropna
dropna_s = sum_of_series.dropna() # Drop all rows with missing values

In [53]:
dropna_s

c    13.0
e    25.0
dtype: float64

In [0]:
# Fillna function
fillna_s = sum_of_series.fillna(0)

In [55]:
fillna_s

a     0.0
b     0.0
c    13.0
d     0.0
e    25.0
f     0.0
g     0.0
h     0.0
dtype: float64

In [0]:
fill_NaN_with_zeros_before_sum = first_series.add(second_series, fill_value =0)

In [57]:
fill_NaN_with_zeros_before_sum

a     1.0
b     2.0
c    13.0
d     4.0
e    25.0
f    30.0
g    40.0
h    50.0
dtype: float64