# <h1><center>Introduction to Pandas</center></h1>

In [1]:
import pandas as pd


In [2]:
df = pd.read_csv('data/worldcitiespop.csv',
        names=["Country", "City", "City_local", "Region", "Population", "Latitude","Longitude"],
                 encoding = "ISO-8859-1", 
                 low_memory=False,
            )
df.head()

Unnamed: 0,Country,City,City_local,Region,Population,Latitude,Longitude
0,ad,aixas,Aixàs,6,,42.483333,1.466667
1,ad,aixirivali,Aixirivali,6,,42.466667,1.5
2,ad,aixirivall,Aixirivall,6,,42.466667,1.5
3,ad,aixirvall,Aixirvall,6,,42.466667,1.5
4,ad,aixovall,Aixovall,6,,42.466667,1.483333


In [3]:
df.shape

(2699354, 7)

## 1. Missing Values

In [4]:

df.isnull().any()

Country       False
City           True
City_local    False
Region         True
Population     True
Latitude      False
Longitude     False
dtype: bool

In [5]:
df.isnull().sum()

Country             0
City                5
City_local          0
Region              4
Population    2652350
Latitude            0
Longitude           0
dtype: int64

In [6]:
df.isnull().sum().sort_values(ascending=False)

Population    2652350
City                5
Region              4
Longitude           0
Latitude            0
City_local          0
Country             0
dtype: int64

## Handle Missing Data

In [7]:
df.shape

(2699354, 7)

In [8]:
# Drop the rows where at least one element is missing.
df_clean_rows = df.dropna()
df_clean_rows.head()

Unnamed: 0,Country,City,City_local,Region,Population,Latitude,Longitude
6,ad,andorra la vella,Andorra la Vella,7,20430.0,42.5,1.516667
20,ad,canillo,Canillo,2,3292.0,42.566667,1.6
32,ad,encamp,Encamp,3,11224.0,42.533333,1.583333
49,ad,la massana,La Massana,4,7211.0,42.55,1.516667
53,ad,les escaldes,Les Escaldes,8,15854.0,42.5,1.533333


In [9]:
df_clean_rows.shape

(47003, 7)

In [10]:
# Drop the columns where at least one element is missing.
df_clean_cols = df.dropna(axis="columns")
df_clean_cols.head()

Unnamed: 0,Country,City_local,Latitude,Longitude
0,ad,Aixàs,42.483333,1.466667
1,ad,Aixirivali,42.466667,1.5
2,ad,Aixirivall,42.466667,1.5
3,ad,Aixirvall,42.466667,1.5
4,ad,Aixovall,42.466667,1.483333


In [11]:
df_clean_cols.shape

(2699354, 4)

In [12]:
df.shape[0] - df_clean_cols.shape[0]

0

* Filling Missing Values

In [13]:
# Fill the missing population rows with the minimum population in the dataset
df_filled = df.copy()
df_filled['Population'] = df_filled['Population'].fillna(df_filled['Population'].min())

In [14]:
df_filled.isnull().any()

Country       False
City           True
City_local    False
Region         True
Population    False
Latitude      False
Longitude     False
dtype: bool

## 2. Selecting Subsets

In [15]:
# Columns
df_subset = df[['City', 'Population']]
df_subset.head(7)

Unnamed: 0,City,Population
0,aixas,
1,aixirivali,
2,aixirivall,
3,aixirvall,
4,aixovall,
5,andorra,
6,andorra la vella,20430.0


In [16]:
# ROws
pop_greater_10m = df[df['Population'] > 10000000]
pop_greater_10m

Unnamed: 0,Country,City,City_local,Region,Population,Latitude,Longitude
269851,br,sao paulo,São Paulo,27,10021437.0,-23.533333,-46.616667
437672,cn,shanghai,Shanghai,23,14608512.0,31.005,121.408611
1024396,in,bombay,Bombay,16,12692717.0,18.975,72.825833
1027128,in,delhi,Delhi,07,10928270.0,28.666667,77.216667
1041387,in,new delhi,New Delhi,07,10928270.0,28.6,77.2
1217910,jp,tokyo,Tokyo,40,31480498.0,35.685,139.751389
1285134,kr,seoul,Seoul,11,10323448.0,37.566389,126.999722
1759460,ph,manila,Manila,D9,10443877.0,14.604167,120.982222
1812035,pk,karachi,Karachi,05,11627378.0,24.866667,67.05
2048592,ru,moscow,Moscow,48,10381288.0,55.752222,37.615556


In [17]:
pop_10m_sorted = pop_greater_10m.sort_values(by='Population', ascending=False)

In [18]:
pop_10m_sorted[:3]

Unnamed: 0,Country,City,City_local,Region,Population,Latitude,Longitude
1217910,jp,tokyo,Tokyo,40,31480498.0,35.685,139.751389
437672,cn,shanghai,Shanghai,23,14608512.0,31.005,121.408611
1024396,in,bombay,Bombay,16,12692717.0,18.975,72.825833


## 3. Delete Columns

In [19]:
pop_greater_10m.drop(['Latitude', 'Longitude'], axis=1).head(2)

Unnamed: 0,Country,City,City_local,Region,Population
269851,br,sao paulo,São Paulo,27,10021437.0
437672,cn,shanghai,Shanghai,23,14608512.0


In [20]:
del pop_greater_10m['City_local']
pop_greater_10m.head(2)

Unnamed: 0,Country,City,Region,Population,Latitude,Longitude
269851,br,sao paulo,27,10021437.0,-23.533333,-46.616667
437672,cn,shanghai,23,14608512.0,31.005,121.408611


In [21]:

pop_greater_10m

Unnamed: 0,Country,City,Region,Population,Latitude,Longitude
269851,br,sao paulo,27,10021437.0,-23.533333,-46.616667
437672,cn,shanghai,23,14608512.0,31.005,121.408611
1024396,in,bombay,16,12692717.0,18.975,72.825833
1027128,in,delhi,07,10928270.0,28.666667,77.216667
1041387,in,new delhi,07,10928270.0,28.6,77.2
1217910,jp,tokyo,40,31480498.0,35.685,139.751389
1285134,kr,seoul,11,10323448.0,37.566389,126.999722
1759460,ph,manila,D9,10443877.0,14.604167,120.982222
1812035,pk,karachi,05,11627378.0,24.866667,67.05
2048592,ru,moscow,48,10381288.0,55.752222,37.615556


In [22]:
pop_greater_10m.drop(pop_greater_10m.index[3:8])


Unnamed: 0,Country,City,Region,Population,Latitude,Longitude
269851,br,sao paulo,27,10021437.0,-23.533333,-46.616667
437672,cn,shanghai,23,14608512.0,31.005,121.408611
1024396,in,bombay,16,12692717.0,18.975,72.825833
1812035,pk,karachi,5,11627378.0,24.866667,67.05
2048592,ru,moscow,48,10381288.0,55.752222,37.615556


## 4. Inserting Rows and Columns

In [23]:
pop_greater_10m.head()

Unnamed: 0,Country,City,Region,Population,Latitude,Longitude
269851,br,sao paulo,27,10021437.0,-23.533333,-46.616667
437672,cn,shanghai,23,14608512.0,31.005,121.408611
1024396,in,bombay,16,12692717.0,18.975,72.825833
1027128,in,delhi,7,10928270.0,28.666667,77.216667
1041387,in,new delhi,7,10928270.0,28.6,77.2


In [24]:
countries = ['Brazil', 'China', 'India', 'India', 'India', 'Japan', 'South Korea', 'Philippines', 'Pakistan', 'Rusia']
pop_greater_10m.loc[:, 'CountryName'] = countries

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [27]:
pop_greater_10m

Unnamed: 0,Country,City,Region,Population,Latitude,Longitude,CountryName
269851,br,sao paulo,27,10021437.0,-23.533333,-46.616667,Brazil
437672,cn,shanghai,23,14608512.0,31.005,121.408611,China
1024396,in,bombay,16,12692717.0,18.975,72.825833,India
1027128,in,delhi,07,10928270.0,28.666667,77.216667,India
1041387,in,new delhi,07,10928270.0,28.6,77.2,India
1217910,jp,tokyo,40,31480498.0,35.685,139.751389,Japan
1285134,kr,seoul,11,10323448.0,37.566389,126.999722,South Korea
1759460,ph,manila,D9,10443877.0,14.604167,120.982222,Philippines
1812035,pk,karachi,05,11627378.0,24.866667,67.05,Pakistan
2048592,ru,moscow,48,10381288.0,55.752222,37.615556,Rusia
