# Pandas

## Install requirements


In [1]:
# Use the explicit %pip magic so the install targets the running kernel's
# environment and does not depend on IPython's automagic being enabled.
%pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


## Import Pandas

In [2]:
import pandas as pd

## Read CSV of universities rankings

In [3]:
# Load the rankings table; the relative path is resolved against the current working directory.
ds = pd.read_csv(filepath_or_buffer='datasets/universites_rankings.csv')

## Show the first 5 rows of the DataFrame

In [4]:
# Preview the top of the frame (5 rows is also the default).
ds.head(n=5)

Unnamed: 0,Year,Rank,Name,Point,City,Country
0,2018,1,Harvard University,97.7,Cambridge,United States
1,2018,2,University of Cambridge,94.6,Cambridge,United Kingdom
2,2018,2,University of Oxford,94.6,Oxford,United Kingdom
3,2018,4,Massachusetts Institute of Technology (MIT),92.5,Cambridge,United States
4,2018,5,Johns Hopkins University,92.1,Baltimore,United States


## Show the last 5 rows of the DataFrame

In [5]:
# Preview the bottom of the frame (5 rows is also the default).
ds.tail(n=5)

Unnamed: 0,Year,Rank,Name,Point,City,Country
5245,2022,346,National Cheng Kung University (NCKU),60.7,Tainan City,Taiwan
5246,2022,346,University of New Mexico,60.7,Albuquerque,United States
5247,2022,348,Universitas Indonesia,60.6,Depok,Indonesia
5248,2022,349,Aga Khan University,60.5,Karachi,Pakistan
5249,2022,349,Northeastern University,60.5,Boston,United States


## Create your own dataset

In [6]:
# Build a small demo frame from one record per person (id, name, profession)
# rather than from one list per column.
cdf = pd.DataFrame.from_records(
    [
        (1, 'Haroon Mahmood', 'Software Engineer'),
        (2, 'Madiha Hussain', 'Future Doctor'),
        (3, 'Ali', 'Wordpress Developer'),
        (4, 'Noor', 'SQA Engineer'),
        (5, 'Alex', 'Tiktoker'),
    ],
    columns=['id', 'name', 'profession'],
)

In [7]:
# Display the hand-built frame (it has only 5 rows, so this shows all of it).
cdf.head(n=5)

Unnamed: 0,id,name,profession
0,1,Haroon Mahmood,Software Engineer
1,2,Madiha Hussain,Future Doctor
2,3,Ali,Wordpress Developer
3,4,Noor,SQA Engineer
4,5,Alex,Tiktoker


## Describe the dataset: count, mean, std, etc.

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
ds.describe()

Unnamed: 0,Year,Rank,Point
count,5250.0,5250.0,5030.0
mean,2020.666667,174.803429,70.84004
std,1.247338,100.807265,8.128413
min,2018.0,1.0,57.5
25%,2020.0,88.0,64.5
50%,2021.0,175.0,69.1
75%,2022.0,262.0,76.0
max,2022.0,350.0,99.0


In [9]:
# Same summary for a single column; bracket access also works for column
# names that are not valid Python identifiers.
ds['Year'].describe()

count    5250.000000
mean     2020.666667
std         1.247338
min      2018.000000
25%      2020.000000
50%      2021.000000
75%      2022.000000
max      2022.000000
Name: Year, dtype: float64

## Find the unique values of a specific column

In [10]:
# Distinct values of the Year column, in order of first appearance.
ds['Year'].unique()

array([2018, 2019, 2020, 2021, 2022])

In [11]:
# dtype of a single column.
ds['Year'].dtype

dtype('int64')

In [12]:
# dtype of every column at once; the text columns are reported as `object`.
ds.dtypes

Year         int64
Rank         int64
Name        object
Point      float64
City        object
Country     object
dtype: object

## Find missing / null values in the DataFrame

In [13]:
# Number of missing values per column (`isnull` is an alias of `isna`).
ds.isna().sum()

Year         0
Rank         0
Name         0
Point      220
City         0
Country      1
dtype: int64

### Drop columns that contain null values

In [77]:
# Preview what dropping every column that contains a null would leave behind,
# WITHOUT modifying `ds`. Dropping in place here would permanently remove the
# Point and Country columns, which the following cells still display and use,
# so the notebook would not reproduce its outputs when run top to bottom.
ds.dropna(axis='columns').isnull().sum()

Year    0
Rank    0
Name    0
City    0
dtype: int64

### Drop rows that contain null values

In [14]:
# Keep only the rows that have no missing value in any column, then confirm
# that no nulls remain. The original index labels are kept (no reset).
ds = ds.dropna(axis='index')
ds.isna().sum()

Year       0
Rank       0
Name       0
Point      0
City       0
Country    0
dtype: int64

In [15]:
# Re-check the first rows after the cleaning step.
ds.head(n=5)

Unnamed: 0,Year,Rank,Name,Point,City,Country
0,2018,1,Harvard University,97.7,Cambridge,United States
1,2018,2,University of Cambridge,94.6,Cambridge,United Kingdom
2,2018,2,University of Oxford,94.6,Oxford,United Kingdom
3,2018,4,Massachusetts Institute of Technology (MIT),92.5,Cambridge,United States
4,2018,5,Johns Hopkins University,92.1,Baltimore,United States


In [16]:
# (number of rows, number of columns) of the frame at this point in the notebook.
ds.shape

(5029, 6)

### Fill null values with a placeholder

In [17]:
# Fill missing values with a placeholder that matches each column's type:
# Point is numeric and Country holds text, so a blanket 0 would put an integer
# into a text column. Columns not listed in the mapping are left untouched.
# Rebinding `ds` (instead of inplace=True) keeps the cell safe to re-run.
ds = ds.fillna({'Point': 0, 'Country': 'Unknown'})

In [18]:
# Column labels of the frame, returned as a pandas Index.
ds.columns

Index(['Year', 'Rank', 'Name', 'Point', 'City', 'Country'], dtype='object')

## Pandas Indexing

In [20]:
# A list of column names (double brackets) returns a DataFrame, not a Series.
ds.head(n=5)[['Year']]

Unnamed: 0,Year
0,2018
1,2018
2,2018
3,2018
4,2018


In [21]:
# Select several columns by label: all rows, only the listed columns.
ds.loc[:, ['Year', 'Rank', 'Name']]

Unnamed: 0,Year,Rank,Name
0,2018,1,Harvard University
1,2018,2,University of Cambridge
2,2018,2,University of Oxford
3,2018,4,Massachusetts Institute of Technology (MIT)
4,2018,5,Johns Hopkins University
...,...,...,...
5245,2022,346,National Cheng Kung University (NCKU)
5246,2022,346,University of New Mexico
5247,2022,348,Universitas Indonesia
5248,2022,349,Aga Khan University


In [24]:
# iloc slices by position, not by index label: every row from position 555 to
# the end, all columns. Position and label can differ once rows have been dropped.
ds.iloc[555:, :]

Unnamed: 0,Year,Rank,Name,Point,City,Country
579,2019,229,University of Naples - Federico II,64.5,Naples,Italy
580,2019,231,Tel Aviv University,64.4,Tel Aviv,Israel
581,2019,231,University at Buffalo SUNY,64.4,Buffalo,United States
582,2019,233,The University of Warwick,64.3,Coventry,United Kingdom
583,2019,234,Universiti Sains Malaysia (USM),64.1,Gelugor,Malaysia
...,...,...,...,...,...,...
5245,2022,346,National Cheng Kung University (NCKU),60.7,Tainan City,Taiwan
5246,2022,346,University of New Mexico,60.7,Albuquerque,United States
5247,2022,348,Universitas Indonesia,60.6,Depok,Indonesia
5248,2022,349,Aga Khan University,60.5,Karachi,Pakistan


In [27]:
# Non-null count per column for the rows whose City is exactly 'Lahore'.
ds[ds['City'].eq('Lahore')].count()

Year       0
Rank       0
Name       0
Point      0
City       0
Country    0
dtype: int64