# <center>Introduction to Pandas</center>

In [1]:
import pandas as pd
import numpy as np


## 1. Data structures
### Series & DataFrame

## Series

In [2]:
# Build a Series from a plain Python list. No index is supplied, so pandas
# labels the values with the default integer positions 0..3.
series_ex = pd.Series(data=[6, 5, 4, 3])
series_ex

0    6
1    5
2    4
3    3
dtype: int64

In [4]:
# Confirm that the object created above is a pandas Series.
type(series_ex)

pandas.core.series.Series

In [5]:
# Arithmetic on a Series is applied element-wise: every value is doubled and
# the index is preserved. The result is a new Series; series_ex is unchanged.
series_ex * 2

0    12
1    10
2     8
3     6
dtype: int64

In [6]:
# Same values as before, but with explicit string labels instead of the
# default integer index.
series_ex_index = pd.Series(
    data=[6, 5, 4, 3],
    index=['a', 'b', 'c', 'd'],
)
series_ex_index

a    6
b    5
c    4
d    3
dtype: int64

In [7]:
# Look up a single value by its index label.
series_ex_index["a"]

6

In [8]:
# The last label, "d", maps to the last value in the Series.
series_ex_index["d"]

3

In [9]:
# Extract the underlying data as a NumPy array (the index labels are dropped).
# to_numpy() is the accessor the pandas documentation recommends over the
# older .values attribute; for this integer Series the result is the same.
series_ex_index.to_numpy()

array([6, 5, 4, 3], dtype=int64)

## DataFrame

In [10]:
# Three parallel lists: the entry at position i in each list belongs to the
# same club. They become the columns of the DataFrame built below.
teams = [
    'Real Madrid',
    'Roma',
    'Liverpool',
    'Bayern Munchen',
]
champions_league_cups = [12, 0, 5, 5]
established = [1902, 1927, 1892, 1900]



In [11]:
# Display the plain Python list of team names.
teams

['Real Madrid', 'Roma', 'Liverpool', 'Bayern Munchen']

In [12]:
# Display the plain Python list of cup counts (same order as `teams`).
champions_league_cups

[12, 0, 5, 5]

In [13]:
# Assemble the three lists into a DataFrame: each dict key becomes a column
# name and each list becomes that column's values. The index is set to 1..4
# instead of the default 0..3.
df = pd.DataFrame(
    {
        'teams': teams,
        'cups': champions_league_cups,
        'Year': established,
    },
    index=range(1, 5),
)
df

Unnamed: 0,teams,cups,Year
1,Real Madrid,12,1902
2,Roma,0,1927
3,Liverpool,5,1892
4,Bayern Munchen,5,1900


In [14]:
# Show only the first two rows.
df.head(2)

Unnamed: 0,teams,cups,Year
1,Real Madrid,12,1902
2,Roma,0,1927


In [15]:
# Show only the last two rows.
df.tail(2)

Unnamed: 0,teams,cups,Year
3,Liverpool,5,1892
4,Bayern Munchen,5,1900


In [16]:
# (number of rows, number of columns)
df.shape

(4, 3)

## 2. Read Data with Pandas

* CSV 

In [17]:
# First attempt: load the CSV with the default header handling. This rebinds
# `df`, replacing the small teams DataFrame from the previous section.
# The file has no header row, so pandas uses the first record as the column
# labels -- visible in the previews that follow.
df = pd.read_csv(
    'data/worldcitiespop.csv',
    encoding="ISO-8859-1",
    low_memory=False,
)

In [18]:
# (rows, columns) of the freshly loaded file. The first record was taken as
# the header, so it is not counted as a data row here.
df.shape

(2699353, 7)

In [19]:
# Preview the first seven rows. Note the column labels: they are really the
# values of the file's first record, not descriptive names.
df.head(7)

Unnamed: 0,ad,aixas,Aixàs,06,Unnamed: 4,42.4833333,1.4666667
0,ad,aixirivali,Aixirivali,6,,42.466667,1.5
1,ad,aixirivall,Aixirivall,6,,42.466667,1.5
2,ad,aixirvall,Aixirvall,6,,42.466667,1.5
3,ad,aixovall,Aixovall,6,,42.466667,1.483333
4,ad,andorra,Andorra,7,,42.5,1.516667
5,ad,andorra la vella,Andorra la Vella,7,20430.0,42.5,1.516667
6,ad,andorra-vieille,Andorra-Vieille,7,,42.5,1.516667


In [20]:
# Draw 12 random rows to get a feel for the data beyond the first few lines.
# A fixed random_state makes the draw repeatable, so re-running the notebook
# from a fresh kernel shows the same rows every time.
df.sample(12, random_state=42)

Unnamed: 0,ad,aixas,Aixàs,06,Unnamed: 4,42.4833333,1.4666667
2130778,ru,yegorye,Yegorye,25,,54.048611,35.578333
1460606,ml,tira,Tira,04,,13.633333,-3.85
1825160,pk,mudwala khurd,Mudwala Khurd,04,,31.635,74.074722
357723,cm,mboa,Mboa,04,,4.416667,14.266667
1153354,ir,reza kan,Reza Kan,31,,30.348611,53.896667
2657979,za,ntlavini,Ntlavini,05,,-30.716667,29.416667
320962,cf,batimbilika,Batimbilika,08,,4.95,22.966667
793321,fr,saint-roch,Saint-Roch,B8,,43.65,5.25
1525669,mx,el habillal,El Habillal,16,,18.0,-102.366667
1388796,ma,agdal oumerzgoum,Agdal Oumerzgoum,32,,29.916667,-9.616667


In [21]:
# Re-read the file, this time supplying the column labels explicitly. Because
# `names` is given, pandas no longer treats the first record as a header, so
# that record is kept as data.
df = pd.read_csv(
    'data/worldcitiespop.csv',
    names=[
        "Country", "City", "City_local", "Region",
        "Population", "Latitude", "Longitude",
    ],
    encoding="ISO-8859-1",
    low_memory=False,
)

In [22]:
# Preview the first five rows (the default for head()) with the new column names.
df.head()

Unnamed: 0,Country,City,City_local,Region,Population,Latitude,Longitude
0,ad,aixas,Aixàs,6,,42.483333,1.466667
1,ad,aixirivali,Aixirivali,6,,42.466667,1.5
2,ad,aixirivall,Aixirivall,6,,42.466667,1.5
3,ad,aixirvall,Aixirvall,6,,42.466667,1.5
4,ad,aixovall,Aixovall,6,,42.466667,1.483333


In [23]:
# List the column labels as a pandas Index.
df.columns

Index(['Country', 'City', 'City_local', 'Region', 'Population', 'Latitude',
       'Longitude'],
      dtype='object')

* HTML

In [24]:
# Web page whose HTML tables will be parsed (requires network access).
url = 'https://en.wikipedia.org/wiki/UEFA_Champions_League'

# read_html() returns a list of DataFrames, one per table it parses; [3] picks
# the fourth one. header=0 uses the table's first row as column labels and
# index_col=0 uses its first column as the row index.
# NOTE(review): the position depends on the live page layout, so this index
# may need updating if the article changes.
html_df = pd.read_html(url, header=0, index_col=0)[3]

In [25]:
# Display the table scraped from the page.
html_df

Unnamed: 0_level_0,Titles,Runners-up,Total
Nation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Spain,18,11,29
England,13,9,22
Italy,12,16,28
Germany[A],7,10,17
Netherlands,6,2,8
Portugal,4,5,9
France,1,5,6
Romania,1,1,2
Scotland,1,1,2
Serbia[B],1,1,2


## 3. Descriptive Statistics

In [26]:
# Back to the cities DataFrame: (rows, columns). There is one more row than in
# the first load, because the first record is now kept as data.
df.shape

(2699354, 7)

In [27]:
# Structural summary: index range, column names with their dtypes, and the
# memory footprint of the DataFrame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2699354 entries, 0 to 2699353
Data columns (total 7 columns):
 #   Column      Dtype  
---  ------      -----  
 0   Country     object 
 1   City        object 
 2   City_local  object 
 3   Region      object 
 4   Population  float64
 5   Latitude    float64
 6   Longitude   float64
dtypes: float64(3), object(4)
memory usage: 144.2+ MB


In [28]:
# Summary statistics for the numeric columns only. `count` ignores missing
# values, which is why Population has far fewer entries than the coordinates.
df.describe()

Unnamed: 0,Population,Latitude,Longitude
count,47004.0,2699354.0,2699354.0
mean,48730.66,28.37168,28.14618
std,308417.3,21.93838,62.47287
min,7.0,-54.93333,-179.9833
25%,3730.75,12.95535,2.383333
50%,10879.0,33.86667,26.88028
75%,28229.25,45.53056,69.63333
max,31480500.0,82.48333,180.0


In [29]:
# Smallest recorded population. Attribute access (df.Population) is shorthand
# for df["Population"]; it only works when the column name is a valid Python
# identifier that does not clash with an existing DataFrame attribute.
df.Population.min()

7.0

In [30]:
# Largest recorded population, using bracket access to select the column.
df["Population"].max()

31480498.0

In [None]:
# Which city has the highest population?
#
# idxmax() returns the index *label* of the largest non-missing value, which
# is exactly what label-based .loc expects. argmax() returns an integer
# *position* instead; passing that to .loc only gives the right row while the
# index happens to be the default 0..n-1 range, and silently breaks after any
# filtering, sorting or re-indexing.
df.loc[df['Population'].idxmax()]