## `Pandas`
`Pandas` is a powerful, open-source Python library used for:

- Data manipulation

- Data analysis

- Cleaning and transforming structured data (like CSV, Excel, SQL, etc.)

`Pandas` is built on top of `NumPy` and is widely used in data science, AI, and machine learning.

In [1]:
import pandas as pd
import numpy as np

### `Series:`
- A 1-dimensional labeled array (like a single column in Excel)

- Can hold any data type (int, float, string, etc.)

#### `Series from List`

In [3]:
# Build a Series from a plain Python list of strings.
# No index is passed, so the items are labelled by position (0, 1, 2, ...).
names = [
    'Ali', 'John', 'Bob',
    'Max', 'Jak', 'Alex',
]
print(names)
pd.Series(data=names)

['Ali', 'John', 'Bob', 'Max', 'Jak', 'Alex']


0     Ali
1    John
2     Bob
3     Max
4     Jak
5    Alex
dtype: object

In [4]:
# Same idea with integers: the resulting Series gets an integer dtype.
salary = [
    700, 650, 1000,
    1200, 300, 500,
]
pd.Series(data=salary)

0     700
1     650
2    1000
3    1200
4     300
5     500
dtype: int64

In [5]:
# Custom index labels: pair every salary with the employee's name
# instead of the default 0..n-1 positions, and give the Series a name.
names = ['Ali', 'John', 'Bob', 'Max', 'Jak', 'Alex']
salary = [700, 650, 1000, 1200, 300, 500]

emp_salary = pd.Series(
    data=salary,
    index=names,
    name='Salary of all Employes',
)
emp_salary

Ali      700
John     650
Bob     1000
Max     1200
Jak      300
Alex     500
Name: Salary of all Employes, dtype: int64

#### `Series from dict`

In [6]:
# Employee name -> salary mapping; used below to build a labelled Series.
salary = dict(
    Alex=2300,
    Ben=2000,
    Charlie=1200,
    David=1500,
    John=3000,
)
salary

{'Alex': 2300, 'Ben': 2000, 'Charlie': 1200, 'David': 1500, 'John': 3000}

In [14]:
# Dict keys become the index labels and dict values become the data.
salary_series = pd.Series(data=salary, name="Employee Salaries")
salary_series

Alex       2300
Ben        2000
Charlie    1200
David      1500
John       3000
Name: Employee Salaries, dtype: int64

#### `Series Attributes`

In [15]:
# Show the Series again; the attribute cells below all inspect this object.
salary_series

Alex       2300
Ben        2000
Charlie    1200
David      1500
John       3000
Name: Employee Salaries, dtype: int64

In [16]:
# size: number of items in the Series
salary_series.size


5

In [None]:
# dtype: data type of the values stored in the Series
salary_series.dtype

dtype('int64')

In [None]:
# name: the label given to the Series when it was created
salary_series.name

'Employee Salaries'

In [None]:
# is_unique: True only when no value occurs more than once.
# The second Series repeats 20 and 30, so it reports False.
values_with_repeats = pd.Series([10, 20, 30, 40, 20, 50, 30])
for candidate in (salary_series, values_with_repeats):
    print(candidate.is_unique)

True
False


In [None]:
# index: the labels of the Series (here, the keys of the dict it was built from)
salary_series.index

Index(['Alex', 'Ben', 'Charlie', 'David', 'John'], dtype='object')

In [30]:
# Without an explicit index, pandas assigns a RangeIndex (0, 1, 2, ...).
# The Series gets its own name so `names` stays a plain list instead of
# being rebound to an object of a different type.
names = ['Ali', 'John', 'Bob', 'Max', 'Jak', 'Alex']
names_series = pd.Series(names)
names_series.index

RangeIndex(start=0, stop=6, step=1)

In [33]:
# values: the underlying data as a NumPy array, without the index labels
salary_series.values

array([2300, 2000, 1200, 1500, 3000], dtype=int64)

#### `Series Using read_csv`

In [3]:
# one column
# read_csv returns a DataFrame even when the file has a single column.
# The path uses a forward slash so it resolves on Windows, macOS and Linux
# (a backslash is only a path separator on Windows).
subs = pd.read_csv('data/subs.csv')
type(subs)

pandas.core.frame.DataFrame

In [None]:
# squeeze() collapses the one-column DataFrame into a Series
subs = subs.squeeze()
type(subs)

pandas.core.series.Series

In [22]:
# Use the match_no column as the index; forward slash keeps the path portable
# across operating systems (a backslash only separates paths on Windows).
runs = pd.read_csv('data/kohli_ipl.csv', index_col = 'match_no')

In [24]:
# Collapse the single remaining column into a Series indexed by match_no
runs = runs.squeeze()
runs

match_no
1       1
2      23
3      13
4      12
5       1
       ..
211     0
212    20
213    73
214    25
215     7
Name: runs, Length: 215, dtype: int64

In [12]:
# Read the movie table with the title as index, then collapse the one
# remaining column into a Series in the same expression.
# - Forward slash keeps the path portable (a backslash only works on Windows).
# - squeeze("columns") squeezes the column axis only, so the result is still
#   a Series even if the file ever contains just one row.
movies = pd.read_csv('data/bollywood.csv', index_col = 'movie').squeeze("columns")
movies

movie
Uri: The Surgical Strike                   Vicky Kaushal
Battalion 609                                Vicky Ahuja
The Accidental Prime Minister (film)         Anupam Kher
Why Cheat India                            Emraan Hashmi
Evening Shadows                         Mona Ambegaonkar
                                              ...       
Hum Tumhare Hain Sanam                    Shah Rukh Khan
Aankhen (2002 film)                     Amitabh Bachchan
Saathiya (film)                             Vivek Oberoi
Company (film)                                Ajay Devgn
Awara Paagal Deewana                        Akshay Kumar
Name: lead, Length: 1500, dtype: object

#### `Series Methods`

In [None]:
# head & tail
# head() with no argument: first 5 items
subs.head() #type: ignore

0    48
1    57
2    40
3    43
4    44
Name: Subscribers gained, dtype: int64

In [None]:
# head(n): first n items (here the first 10)
subs.head(10) # type: ignore

0    48
1    57
2    40
3    43
4    44
5    46
6    33
7    40
8    44
9    74
Name: Subscribers gained, dtype: int64

In [None]:
# tail() with no argument: last 5 items
subs.tail() #type:ignore

360    231
361    226
362    155
363    144
364    172
Name: Subscribers gained, dtype: int64

In [16]:
# tail(n): last n items (here the last 10)
subs.tail(10) #type:ignore

355    149
356    156
357    177
358    210
359    209
360    231
361    226
362    155
363    144
364    172
Name: Subscribers gained, dtype: int64

In [None]:
# sample: one randomly chosen item.
# NOTE: no random_state is passed, so re-running this cell shows a different row.
movies.sample() #type: ignore

movie
Rough Book    Tannishtha Chatterjee
Name: lead, dtype: object

In [17]:
# sample(n): n randomly chosen items (the selection changes on every run)
movies.sample(10) #type: ignore

movie
Maan Gaye Mughal-e-Azam      Mallika Sherawat
Phata Poster Nikhla Hero        Shahid Kapoor
Kaalo                       Aditya Srivastava
The Last Lear                Amitabh Bachchan
Band Baaja Baaraat              Ranveer Singh
Chillar Party                    Aarav Khanna
Udta Punjab                     Shahid Kapoor
Waarrior Savitri             Niharica Raizada
Ek Se Bure Do                           Anita
Pihu                              Rahul Bagga
Name: lead, dtype: object

In [20]:
# value_counts: how many times each distinct value occurs in the data,
# listed from most frequent to least frequent
movies.value_counts()

lead
Akshay Kumar        48
Amitabh Bachchan    45
Ajay Devgn          38
Salman Khan         31
Sanjay Dutt         26
                    ..
Diganth              1
Parveen Kaur         1
Seema Azmi           1
Akanksha Puri        1
Edwin Fernandes      1
Name: count, Length: 566, dtype: int64

In [None]:
# sort_values --> temporary change: returns a new Series sorted by value
# (ascending) and leaves `runs` itself untouched, since inplace is not set
runs.sort_values()

match_no
87       0
211      0
207      0
206      0
91       0
      ... 
164    100
120    100
123    108
126    109
128    113
Name: runs, Length: 215, dtype: int64

In [31]:
# method chaining: sort descending -> keep the first row -> pull out its scalar value
runs.sort_values(ascending = False).head(1).values[0] # gives the highest run value

113

In [32]:
# `runs` is still in its original match order: the sort_values() calls above returned new objects
runs

match_no
1       1
2      23
3      13
4      12
5       1
       ..
211     0
212    20
213    73
214    25
215     7
Name: runs, Length: 215, dtype: int64

In [34]:
runs = runs.copy()

In [None]:
runs.sort_values(inplace = True) #permanant changes

In [37]:
runs

match_no
87       0
211      0
207      0
206      0
91       0
      ... 
164    100
120    100
123    108
126    109
128    113
Name: runs, Length: 215, dtype: int64

In [38]:
# `movies` in file order, for comparison with the index-sorted view in the next cell
movies

movie
Uri: The Surgical Strike                   Vicky Kaushal
Battalion 609                                Vicky Ahuja
The Accidental Prime Minister (film)         Anupam Kher
Why Cheat India                            Emraan Hashmi
Evening Shadows                         Mona Ambegaonkar
                                              ...       
Hum Tumhare Hain Sanam                    Shah Rukh Khan
Aankhen (2002 film)                     Amitabh Bachchan
Saathiya (film)                             Vivek Oberoi
Company (film)                                Ajay Devgn
Awara Paagal Deewana                        Akshay Kumar
Name: lead, Length: 1500, dtype: object

In [None]:
# sort_index: order the Series by its index labels (movie titles) rather than by its values
movies.sort_index()

movie
1920 (film)                   Rajniesh Duggall
1920: London                     Sharman Joshi
1920: The Evil Returns             Vicky Ahuja
1971 (2007 film)                Manoj Bajpayee
2 States (2014 film)              Arjun Kapoor
                                   ...        
Zindagi 50-50                      Veena Malik
Zindagi Na Milegi Dobara        Hrithik Roshan
Zindagi Tere Naam           Mithun Chakraborty
Zokkomon                       Darsheel Safary
Zor Lagaa Ke...Haiya!            Meghan Jadhav
Name: lead, Length: 1500, dtype: object

#### `Series Maths Methods`

In [43]:
# count: number of non-null values in the Series
runs.count()

215

In [45]:
# sum (sum of all the elements)
# product (product of all the elements)
# NOTE(review): the product prints as 0 although the smallest value in `subs`
# is 33, so no factor is zero. The true product is far too large for int64 and
# the result has wrapped around; treat the 0 as an overflow artifact, not a real value.
print(subs.sum())
print(subs.product())

49510
0


In [56]:
# mean --> median --> mode --> std --> var
# Each statistic is computed and printed in turn, in the order listed above.
# mode() is taken on `movies` (most frequent item) and returns a Series,
# because several values can tie for most frequent.
statistics = (
    subs.mean,
    subs.median,
    movies.mode,  # most frequent item
    subs.std,
    subs.var,
)
for compute in statistics:
    print(compute())

135.64383561643837
123.0
0    Akshay Kumar
Name: lead, dtype: object
62.67502303725269
3928.1585127201556


In [58]:
# min/max: smallest and largest value, printed one per line
for extreme in (subs.min, subs.max):
    print(extreme())

33
396


In [69]:
# describe: count, mean, std, min, quartiles and max in a single call
runs.describe()

count    215.000000
mean      30.855814
std       26.229801
min        0.000000
25%        9.000000
50%       24.000000
75%       48.000000
max      113.000000
Name: runs, dtype: float64

#### `Series Indexing & Slicing`

In [93]:
# The Series used in the indexing examples below: index = movie title, value = lead actor
movies

movie
Uri: The Surgical Strike                   Vicky Kaushal
Battalion 609                                Vicky Ahuja
The Accidental Prime Minister (film)         Anupam Kher
Why Cheat India                            Emraan Hashmi
Evening Shadows                         Mona Ambegaonkar
                                              ...       
Hum Tumhare Hain Sanam                    Shah Rukh Khan
Aankhen (2002 film)                     Amitabh Bachchan
Saathiya (film)                             Vivek Oberoi
Company (film)                                Ajay Devgn
Awara Paagal Deewana                        Akshay Kumar
Name: lead, Length: 1500, dtype: object

In [None]:
# [] with an index label: looks the item up by its label, not by its position
movies['1920 (film)']

'Rajniesh Duggall'

In [None]:
# [] also accepts a list of labels and then returns a Series with just those entries
movies[['1920 (film)', '1920: London']]

movie
1920 (film)     Rajniesh Duggall
1920: London       Sharman Joshi
Name: lead, dtype: object

In [None]:
# .loc[] (strictly label based): a single label returns the stored value itself
movies.loc['Zokkomon']

'Darsheel Safary'

In [95]:
# .loc[] with a list of labels: the result follows the order of the list, not the order of the Series
movies.loc[['Zokkomon', 'Zindagi 50-50', 'Battalion 609']]

movie
Zokkomon         Darsheel Safary
Zindagi 50-50        Veena Malik
Battalion 609        Vicky Ahuja
Name: lead, dtype: object

In [None]:
# .loc[] supports boolean masks: keep only the entries whose value is greater than 300
subs.loc[subs > 300]

168    306
233    301
330    396
331    312
Name: Subscribers gained, dtype: int64

In [108]:
# slicing using .loc[]: a step of 2 keeps every second entry, starting with the first
runs.loc[::2]

match_no
1       1
3      13
5       1
7      34
9      21
       ..
207     0
209    58
211     0
213    73
215     7
Name: runs, Length: 108, dtype: int64

In [109]:
# label slice: unlike positional slicing, BOTH endpoints are included (200 through 205)
runs.loc[200:205]

match_no
200    41
201    12
202     5
203    48
204     1
205    12
Name: runs, dtype: int64

In [110]:
# .iloc[] (position based indexing)
# single integer: positions start at 0, so 5 is the sixth item

movies.iloc[5]


'Geetika Vidya Ohlyan'

In [113]:
# list of positions: returns the items at exactly those positions, in the given order
movies.iloc[[23,45,67,100]]

movie
Notebook (2019 film)        Zaheer Iqbal
Chicken Curry Law       Natalia Janoszek
Jhalki                       Boman Irani
October (2018 film)         Varun Dhawan
Name: lead, dtype: object

In [114]:
# positional slicing: the stop position is excluded, so 30:37 yields positions 30..36 (7 items)
movies.iloc[30:37]

movie
Student of the Year 2          Tiger Shroff
PM Narendra Modi                Boman Irani
De De Pyaar De                   Ajay Devgn
India's Most Wanted (film)     Arjun Kapoor
Yeh Hai India                  Gavie Chahal
Khamoshi (2019 film)            Prabhu Deva
Kabir Singh                   Shahid Kapoor
Name: lead, dtype: object

In [119]:
# boolean masks: .iloc takes a plain boolean array, so the mask is built
# from .values (a NumPy array) rather than from the labelled Series
movies.iloc[movies.values == 'Vicky Kaushal']

movie
Uri: The Surgical Strike    Vicky Kaushal
Love per Square Foot        Vicky Kaushal
Name: lead, dtype: object

In [120]:
# .at[]
# fastest accessor for a single-item lookup by label
movies.at["Love per Square Foot"]

'Vicky Kaushal'

In [121]:
# .iat[]
# fastest accessor for a single-item lookup by position (3 is the fourth item)
movies.iat[3]


'Emraan Hashmi'