In [2]:
import numpy as np
import pandas as pd

### Creating DataFrame

In [3]:
# Build a DataFrame from a list of lists.
# Every inner list supplies one row of values; the `columns` argument
# provides the column labels. The row index defaults to 0, 1, 2, ...
student_data = [
    [100, 80, 10],
    [90, 70, 7],
    [120, 100, 14],
    [80, 50, 2],
]

pd.DataFrame(data=student_data, columns=['iq', 'marks', 'package'])

Unnamed: 0,iq,marks,package
0,100,80,10
1,90,70,7
2,120,100,14
3,80,50,2


In [4]:
# using dicts
# pd.DataFrame(dict) - keys become the column labels, values the column data.
student_dict = {
    'name':['nitish','ankit','rupesh','rishabh','amit','ankita'],
    'iq':[100,90,120,80,0,0],
    'marks':[80,70,100,50,0,0],
    'package':[10,7,14,2,0,0]
}

# set_index('name') returns a new frame whose row labels are the student names
# (instead of the default 0..n-1 index). Chaining it onto the constructor builds
# `students` in a single expression, with no in-place mutation of an intermediate
# frame, so the cell gives the same result every time it is run.
students = pd.DataFrame(student_dict).set_index('name')
students

Unnamed: 0_level_0,iq,marks,package
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
nitish,100,80,10
ankit,90,70,7
rupesh,120,100,14
rishabh,80,50,2
amit,0,0,0
ankita,0,0,0


In [5]:
# using read_csv
# pd.read_csv() - Reads a comma-separated values (CSV) file into a DataFrame.
# Column names are taken from the first row and a dtype is inferred per column.
# 'movies.csv' is a relative path, so it is resolved against the kernel's working directory.
# NOTE(review): the preview below shows a literal \N in `runtime` (row 1626) and
# info() further down reports `runtime` as object rather than a numeric dtype.
# Passing na_values=r'\N' would let it parse as numeric -- confirm later cells
# do not rely on the string form before changing this.
movies = pd.read_csv('movies.csv')
movies

Unnamed: 0,title_x,imdb_id,poster_path,wiki_link,title_y,original_title,is_adult,year_of_release,runtime,genres,imdb_rating,imdb_votes,story,summary,tagline,actors,wins_nominations,release_date
0,Uri: The Surgical Strike,tt8291224,https://upload.wikimedia.org/wikipedia/en/thum...,https://en.wikipedia.org/wiki/Uri:_The_Surgica...,Uri: The Surgical Strike,Uri: The Surgical Strike,0,2019,138,Action|Drama|War,8.4,35112,Divided over five chapters the film chronicle...,Indian army special forces execute a covert op...,,Vicky Kaushal|Paresh Rawal|Mohit Raina|Yami Ga...,4 wins,11 January 2019 (USA)
1,Battalion 609,tt9472208,,https://en.wikipedia.org/wiki/Battalion_609,Battalion 609,Battalion 609,0,2019,131,War,4.1,73,The story revolves around a cricket match betw...,The story of Battalion 609 revolves around a c...,,Vicky Ahuja|Shoaib Ibrahim|Shrikant Kamat|Elen...,,11 January 2019 (India)
2,The Accidental Prime Minister (film),tt6986710,https://upload.wikimedia.org/wikipedia/en/thum...,https://en.wikipedia.org/wiki/The_Accidental_P...,The Accidental Prime Minister,The Accidental Prime Minister,0,2019,112,Biography|Drama,6.1,5549,Based on the memoir by Indian policy analyst S...,Explores Manmohan Singh's tenure as the Prime ...,,Anupam Kher|Akshaye Khanna|Aahana Kumra|Atul S...,,11 January 2019 (USA)
3,Why Cheat India,tt8108208,https://upload.wikimedia.org/wikipedia/en/thum...,https://en.wikipedia.org/wiki/Why_Cheat_India,Why Cheat India,Why Cheat India,0,2019,121,Crime|Drama,6.0,1891,The movie focuses on existing malpractices in ...,The movie focuses on existing malpractices in ...,,Emraan Hashmi|Shreya Dhanwanthary|Snighdadeep ...,,18 January 2019 (USA)
4,Evening Shadows,tt6028796,,https://en.wikipedia.org/wiki/Evening_Shadows,Evening Shadows,Evening Shadows,0,2018,102,Drama,7.3,280,While gay rights and marriage equality has bee...,Under the 'Evening Shadows' truth often plays...,,Mona Ambegaonkar|Ananth Narayan Mahadevan|Deva...,17 wins & 1 nomination,11 January 2019 (India)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1624,Tera Mera Saath Rahen,tt0301250,https://upload.wikimedia.org/wikipedia/en/2/2b...,https://en.wikipedia.org/wiki/Tera_Mera_Saath_...,Tera Mera Saath Rahen,Tera Mera Saath Rahen,0,2001,148,Drama,4.9,278,Raj Dixit lives with his younger brother Rahu...,A man is torn between his handicapped brother ...,,Ajay Devgn|Sonali Bendre|Namrata Shirodkar|Pre...,,7 November 2001 (India)
1625,Yeh Zindagi Ka Safar,tt0298607,https://upload.wikimedia.org/wikipedia/en/thum...,https://en.wikipedia.org/wiki/Yeh_Zindagi_Ka_S...,Yeh Zindagi Ka Safar,Yeh Zindagi Ka Safar,0,2001,146,Drama,3.0,133,Hindi pop-star Sarina Devan lives a wealthy ...,A singer finds out she was adopted when the ed...,,Ameesha Patel|Jimmy Sheirgill|Nafisa Ali|Gulsh...,,16 November 2001 (India)
1626,Sabse Bada Sukh,tt0069204,,https://en.wikipedia.org/wiki/Sabse_Bada_Sukh,Sabse Bada Sukh,Sabse Bada Sukh,0,2018,\N,Comedy|Drama,6.1,13,Village born Lalloo re-locates to Bombay and ...,Village born Lalloo re-locates to Bombay and ...,,Vijay Arora|Asrani|Rajni Bala|Kumud Damle|Utpa...,,
1627,Daaka,tt10833860,https://upload.wikimedia.org/wikipedia/en/thum...,https://en.wikipedia.org/wiki/Daaka,Daaka,Daaka,0,2019,136,Action,7.4,38,Shinda tries robbing a bank so he can be wealt...,Shinda tries robbing a bank so he can be wealt...,,Gippy Grewal|Zareen Khan|,,1 November 2019 (USA)


In [6]:
# pd.read_csv() - Load the IPL match records from 'ipl-matches.csv' into `ipl`.
# The bare `ipl` on the last line makes the notebook render the frame as the cell output.
ipl = pd.read_csv('ipl-matches.csv')
ipl

Unnamed: 0,ID,City,Date,Season,MatchNumber,Team1,Team2,Venue,TossWinner,TossDecision,SuperOver,WinningTeam,WonBy,Margin,method,Player_of_Match,Team1Players,Team2Players,Umpire1,Umpire2
0,1312200,Ahmedabad,2022-05-29,2022,Final,Rajasthan Royals,Gujarat Titans,"Narendra Modi Stadium, Ahmedabad",Rajasthan Royals,bat,N,Gujarat Titans,Wickets,7.0,,HH Pandya,"['YBK Jaiswal', 'JC Buttler', 'SV Samson', 'D ...","['WP Saha', 'Shubman Gill', 'MS Wade', 'HH Pan...",CB Gaffaney,Nitin Menon
1,1312199,Ahmedabad,2022-05-27,2022,Qualifier 2,Royal Challengers Bangalore,Rajasthan Royals,"Narendra Modi Stadium, Ahmedabad",Rajasthan Royals,field,N,Rajasthan Royals,Wickets,7.0,,JC Buttler,"['V Kohli', 'F du Plessis', 'RM Patidar', 'GJ ...","['YBK Jaiswal', 'JC Buttler', 'SV Samson', 'D ...",CB Gaffaney,Nitin Menon
2,1312198,Kolkata,2022-05-25,2022,Eliminator,Royal Challengers Bangalore,Lucknow Super Giants,"Eden Gardens, Kolkata",Lucknow Super Giants,field,N,Royal Challengers Bangalore,Runs,14.0,,RM Patidar,"['V Kohli', 'F du Plessis', 'RM Patidar', 'GJ ...","['Q de Kock', 'KL Rahul', 'M Vohra', 'DJ Hooda...",J Madanagopal,MA Gough
3,1312197,Kolkata,2022-05-24,2022,Qualifier 1,Rajasthan Royals,Gujarat Titans,"Eden Gardens, Kolkata",Gujarat Titans,field,N,Gujarat Titans,Wickets,7.0,,DA Miller,"['YBK Jaiswal', 'JC Buttler', 'SV Samson', 'D ...","['WP Saha', 'Shubman Gill', 'MS Wade', 'HH Pan...",BNJ Oxenford,VK Sharma
4,1304116,Mumbai,2022-05-22,2022,70,Sunrisers Hyderabad,Punjab Kings,"Wankhede Stadium, Mumbai",Sunrisers Hyderabad,bat,N,Punjab Kings,Wickets,5.0,,Harpreet Brar,"['PK Garg', 'Abhishek Sharma', 'RA Tripathi', ...","['JM Bairstow', 'S Dhawan', 'M Shahrukh Khan',...",AK Chaudhary,NA Patwardhan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
945,335986,Kolkata,2008-04-20,2007/08,4,Kolkata Knight Riders,Deccan Chargers,Eden Gardens,Deccan Chargers,bat,N,Kolkata Knight Riders,Wickets,5.0,,DJ Hussey,"['WP Saha', 'BB McCullum', 'RT Ponting', 'SC G...","['AC Gilchrist', 'Y Venugopal Rao', 'VVS Laxma...",BF Bowden,K Hariharan
946,335985,Mumbai,2008-04-20,2007/08,5,Mumbai Indians,Royal Challengers Bangalore,Wankhede Stadium,Mumbai Indians,bat,N,Royal Challengers Bangalore,Wickets,5.0,,MV Boucher,"['L Ronchi', 'ST Jayasuriya', 'DJ Thornely', '...","['S Chanderpaul', 'R Dravid', 'LRPL Taylor', '...",SJ Davis,DJ Harper
947,335984,Delhi,2008-04-19,2007/08,3,Delhi Daredevils,Rajasthan Royals,Feroz Shah Kotla,Rajasthan Royals,bat,N,Delhi Daredevils,Wickets,9.0,,MF Maharoof,"['G Gambhir', 'V Sehwag', 'S Dhawan', 'MK Tiwa...","['T Kohli', 'YK Pathan', 'SR Watson', 'M Kaif'...",Aleem Dar,GA Pratapkumar
948,335983,Chandigarh,2008-04-19,2007/08,2,Kings XI Punjab,Chennai Super Kings,"Punjab Cricket Association Stadium, Mohali",Chennai Super Kings,bat,N,Chennai Super Kings,Runs,33.0,,MEK Hussey,"['K Goel', 'JR Hopes', 'KC Sangakkara', 'Yuvra...","['PA Patel', 'ML Hayden', 'MEK Hussey', 'MS Dh...",MR Benson,SL Shastri


### DataFrame Attributes and Methods

In [7]:
# shape - Tuple (rows, columns) giving the DataFrame's dimensions.
# Useful for understanding the size of your dataset.
# A notebook cell echoes only its last expression, so the output below is
# ipl.shape; the value of movies.shape is computed and then discarded.
movies.shape  # (number_of_rows, number_of_columns) -- not displayed
ipl.shape

(950, 20)

In [8]:
# dtypes - Series mapping each column name to its dtype (int64, float64, object, ...).
# Helps identify columns that need type conversion before analysis.
# Only the last expression is echoed, so the listing below is ipl.dtypes;
# movies.dtypes is evaluated but not displayed.
movies.dtypes  # not displayed
ipl.dtypes

ID                   int64
City                object
Date                object
Season              object
MatchNumber         object
Team1               object
Team2               object
Venue               object
TossWinner          object
TossDecision        object
SuperOver           object
WinningTeam         object
WonBy               object
Margin             float64
method              object
Player_of_Match     object
Team1Players        object
Team2Players        object
Umpire1             object
Umpire2             object
dtype: object

In [9]:
# index - The row labels of the DataFrame.
# A frame read without an explicit index gets a RangeIndex (0, 1, 2, ...);
# custom labels are possible too (see `students`, indexed by name).
# Only the last expression is echoed, so the output below is ipl.index.
movies.index  # not displayed
ipl.index

RangeIndex(start=0, stop=950, step=1)

In [10]:
# columns - Index object holding the column labels of the DataFrame.
# Three frames are queried, but a cell echoes only its last expression,
# so the output below is students.columns ('name' is the index, not a column).
movies.columns  # not displayed
ipl.columns  # not displayed
students.columns

Index(['iq', 'marks', 'package'], dtype='object')

In [11]:
# values - The DataFrame's data as a NumPy array.
# Index and column labels are stripped away, leaving just the raw cell values.
# Only the last expression is echoed, so the array below is ipl.values;
# students.values is evaluated but not displayed.
students.values  # not displayed
ipl.values

array([[1312200, 'Ahmedabad', '2022-05-29', ...,
        "['WP Saha', 'Shubman Gill', 'MS Wade', 'HH Pandya', 'DA Miller', 'R Tewatia', 'Rashid Khan', 'R Sai Kishore', 'LH Ferguson', 'Yash Dayal', 'Mohammed Shami']",
        'CB Gaffaney', 'Nitin Menon'],
       [1312199, 'Ahmedabad', '2022-05-27', ...,
        "['YBK Jaiswal', 'JC Buttler', 'SV Samson', 'D Padikkal', 'SO Hetmyer', 'R Parag', 'R Ashwin', 'TA Boult', 'YS Chahal', 'M Prasidh Krishna', 'OC McCoy']",
        'CB Gaffaney', 'Nitin Menon'],
       [1312198, 'Kolkata', '2022-05-25', ...,
        "['Q de Kock', 'KL Rahul', 'M Vohra', 'DJ Hooda', 'MP Stoinis', 'E Lewis', 'KH Pandya', 'PVD Chameera', 'Mohsin Khan', 'Avesh Khan', 'Ravi Bishnoi']",
        'J Madanagopal', 'MA Gough'],
       ...,
       [335984, 'Delhi', '2008-04-19', ...,
        "['T Kohli', 'YK Pathan', 'SR Watson', 'M Kaif', 'DS Lehmann', 'RA Jadeja', 'M Rawat', 'D Salunkhe', 'SK Warne', 'SK Trivedi', 'MM Patel']",
        'Aleem Dar', 'GA Pratapkumar'],
    

In [12]:
# head(n) - First n rows of the DataFrame (n defaults to 5).
# A quick way to preview the structure and the first few records
# without rendering the whole frame.
movies.head(2)  # first 2 rows

Unnamed: 0,title_x,imdb_id,poster_path,wiki_link,title_y,original_title,is_adult,year_of_release,runtime,genres,imdb_rating,imdb_votes,story,summary,tagline,actors,wins_nominations,release_date
0,Uri: The Surgical Strike,tt8291224,https://upload.wikimedia.org/wikipedia/en/thum...,https://en.wikipedia.org/wiki/Uri:_The_Surgica...,Uri: The Surgical Strike,Uri: The Surgical Strike,0,2019,138,Action|Drama|War,8.4,35112,Divided over five chapters the film chronicle...,Indian army special forces execute a covert op...,,Vicky Kaushal|Paresh Rawal|Mohit Raina|Yami Ga...,4 wins,11 January 2019 (USA)
1,Battalion 609,tt9472208,,https://en.wikipedia.org/wiki/Battalion_609,Battalion 609,Battalion 609,0,2019,131,War,4.1,73,The story revolves around a cricket match betw...,The story of Battalion 609 revolves around a c...,,Vicky Ahuja|Shoaib Ibrahim|Shrikant Kamat|Elen...,,11 January 2019 (India)


In [13]:
# tail(n) - Last n rows of the DataFrame (n defaults to 5).
# Handy for checking how the dataset ends; here the last index label is 949.
ipl.tail(2)  # last 2 rows

Unnamed: 0,ID,City,Date,Season,MatchNumber,Team1,Team2,Venue,TossWinner,TossDecision,SuperOver,WinningTeam,WonBy,Margin,method,Player_of_Match,Team1Players,Team2Players,Umpire1,Umpire2
948,335983,Chandigarh,2008-04-19,2007/08,2,Kings XI Punjab,Chennai Super Kings,"Punjab Cricket Association Stadium, Mohali",Chennai Super Kings,bat,N,Chennai Super Kings,Runs,33.0,,MEK Hussey,"['K Goel', 'JR Hopes', 'KC Sangakkara', 'Yuvra...","['PA Patel', 'ML Hayden', 'MEK Hussey', 'MS Dh...",MR Benson,SL Shastri
949,335982,Bangalore,2008-04-18,2007/08,1,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,N,Kolkata Knight Riders,Runs,140.0,,BB McCullum,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D...",Asad Rauf,RE Koertzen


In [14]:
# sample(n) - n rows drawn at random from the DataFrame.
# Useful for eyeballing a random subset instead of only the first/last rows.
# No random_state is passed, so the rows shown differ on every run;
# pass e.g. random_state=0 if a reproducible sample is needed.
ipl.sample(5)  # 5 randomly selected rows

Unnamed: 0,ID,City,Date,Season,MatchNumber,Team1,Team2,Venue,TossWinner,TossDecision,SuperOver,WinningTeam,WonBy,Margin,method,Player_of_Match,Team1Players,Team2Players,Umpire1,Umpire2
428,980909,Kolkata,2016-04-13,2016,5,Kolkata Knight Riders,Mumbai Indians,Eden Gardens,Mumbai Indians,field,N,Mumbai Indians,Wickets,6.0,,RG Sharma,"['RV Uthappa', 'G Gambhir', 'MK Pandey', 'AD R...","['RG Sharma', 'PA Patel', 'HH Pandya', 'MJ McC...",Nitin Menon,S Ravi
894,336038,Mumbai,2008-05-30,2007/08,Semi Final,Delhi Daredevils,Rajasthan Royals,Wankhede Stadium,Delhi Daredevils,field,N,Rajasthan Royals,Runs,105.0,,SR Watson,"['G Gambhir', 'V Sehwag', 'S Dhawan', 'MK Tiwa...","['GC Smith', 'SA Asnodkar', 'Sohail Tanvir', '...",BF Bowden,RE Koertzen
519,733995,Mumbai,2014-05-10,2014,33,Mumbai Indians,Chennai Super Kings,Wankhede Stadium,Chennai Super Kings,field,N,Chennai Super Kings,Wickets,4.0,,DR Smith,"['LMP Simmons', 'CM Gautam', 'AT Rayudu', 'RG ...","['DR Smith', 'BB McCullum', 'SK Raina', 'F du ...",HDPK Dharmasena,VA Kulkarni
635,548374,Dharamsala,2012-05-19,2012,69,Kings XI Punjab,Delhi Daredevils,Himachal Pradesh Cricket Association Stadium,Delhi Daredevils,field,N,Delhi Daredevils,Wickets,6.0,,UT Yadav,"['AC Gilchrist', 'Mandeep Singh', 'PC Valthaty...","['UBT Chand', 'DA Warner', 'Y Venugopal Rao', ...",BF Bowden,VA Kulkarni
793,419147,Nagpur,2010-04-10,2009/10,42,Deccan Chargers,Chennai Super Kings,"Vidarbha Cricket Association Stadium, Jamtha",Chennai Super Kings,bat,N,Deccan Chargers,Wickets,6.0,,RJ Harris,"['MD Mishra', 'AC Gilchrist', 'TL Suman', 'RG ...","['M Vijay', 'ML Hayden', 'SK Raina', 'MS Dhoni...",HDPK Dharmasena,SJA Taufel


In [15]:
# info() - Prints a concise summary of the DataFrame:
# - index range and number of columns
# - per column: name, non-null count and dtype
# - memory usage
# The non-null counts reveal missing data at a glance
# (e.g. `tagline` has only 557 non-null values out of 1629 rows).
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1629 entries, 0 to 1628
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   title_x           1629 non-null   object 
 1   imdb_id           1629 non-null   object 
 2   poster_path       1526 non-null   object 
 3   wiki_link         1629 non-null   object 
 4   title_y           1629 non-null   object 
 5   original_title    1629 non-null   object 
 6   is_adult          1629 non-null   int64  
 7   year_of_release   1629 non-null   int64  
 8   runtime           1629 non-null   object 
 9   genres            1629 non-null   object 
 10  imdb_rating       1629 non-null   float64
 11  imdb_votes        1629 non-null   int64  
 12  story             1609 non-null   object 
 13  summary           1629 non-null   object 
 14  tagline           557 non-null    object 
 15  actors            1624 non-null   object 
 16  wins_nominations  707 non-null    object 


In [16]:
# info() - Same summary for the IPL dataset (950 rows, 20 columns).
# Columns with fewer than 950 non-null values contain missing data,
# e.g. `City` (899) and `method` (19).
ipl.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 950 entries, 0 to 949
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ID               950 non-null    int64  
 1   City             899 non-null    object 
 2   Date             950 non-null    object 
 3   Season           950 non-null    object 
 4   MatchNumber      950 non-null    object 
 5   Team1            950 non-null    object 
 6   Team2            950 non-null    object 
 7   Venue            950 non-null    object 
 8   TossWinner       950 non-null    object 
 9   TossDecision     950 non-null    object 
 10  SuperOver        946 non-null    object 
 11  WinningTeam      946 non-null    object 
 12  WonBy            950 non-null    object 
 13  Margin           932 non-null    float64
 14  method           19 non-null     object 
 15  Player_of_Match  946 non-null    object 
 16  Team1Players     950 non-null    object 
 17  Team2Players    

In [17]:
# describe() - Descriptive statistics for the numeric columns:
# count, mean, std, min, 25%, 50%, 75%, max.
# Only int64/float64 columns are summarised, so `runtime` (stored as object,
# see info() above) does not appear in the result.
movies.describe()

Unnamed: 0,is_adult,year_of_release,imdb_rating,imdb_votes
count,1629.0,1629.0,1629.0,1629.0
mean,0.0,2010.263966,5.557459,5384.263352
std,0.0,5.381542,1.567609,14552.103231
min,0.0,2001.0,0.0,0.0
25%,0.0,2005.0,4.4,233.0
50%,0.0,2011.0,5.6,1000.0
75%,0.0,2015.0,6.8,4287.0
max,0.0,2019.0,9.4,310481.0


In [18]:
# describe() - Statistical summary of the numeric columns in the IPL dataset.
# Only `ID` and `Margin` are numeric. `ID` is presumably a match identifier,
# so its mean/std carry no real meaning -- `Margin` is the column of interest.
ipl.describe()

Unnamed: 0,ID,Margin
count,950.0,932.0
mean,830485.2,17.056867
std,337567.8,21.633109
min,335982.0,1.0
25%,501261.2,6.0
50%,829738.0,8.0
75%,1175372.0,19.0
max,1312200.0,146.0


In [19]:
# isnull() - Boolean DataFrame, True wherever a value is missing (NaN).
# sum() - Adds up the True values column by column.
# Together they give the number of missing values per column; the counts agree
# with info() above (e.g. tagline: 1629 rows - 557 non-null = 1072 missing).
movies.isnull().sum()

title_x                0
imdb_id                0
poster_path          103
wiki_link              0
title_y                0
original_title         0
is_adult               0
year_of_release        0
runtime                0
genres                 0
imdb_rating            0
imdb_votes             0
story                 20
summary                0
tagline             1072
actors                 5
wins_nominations     922
release_date         107
dtype: int64

In [20]:
# duplicated() - Boolean Series, True for every row that repeats an earlier row.
# sum() - Counts the True values.
# The result is the total number of duplicate rows in the DataFrame.
movies.duplicated().sum()

np.int64(0)

In [21]:
# duplicated().sum() - Number of duplicate rows in `students`.
# Rows are compared on their column values only (the name index is ignored):
# 'amit' and 'ankita' both hold 0/0/0, so the later one counts as a duplicate.
students.duplicated().sum()

np.int64(1)

In [22]:
# Show `students` before the rename so the original column labels
# (iq, marks, package) can be compared with the renamed frame.
students

Unnamed: 0_level_0,iq,marks,package
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
nitish,100,80,10
ankit,90,70,7
rupesh,120,100,14
rishabh,80,50,2
amit,0,0,0
ankita,0,0,0


In [23]:
# rename() - Relabels columns through an {old_label: new_label} mapping.
# columns=... - The mapping applies to column labels (not to the index).
# rename() returns a new DataFrame; binding the result back to `students`
# replaces the inplace=True form with plain reassignment and avoids mutating
# a frame in place. Labels that are absent from the frame are ignored by
# default, so re-running this cell after the rename is harmless.
students = students.rename(columns={'marks':'percent','package':'lpa'})

In [63]:
# Show `students` after the rename: 'marks' is now 'percent' and 'package' is now 'lpa'.
students

Unnamed: 0_level_0,iq,percent,lpa
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
nitish,100,80,10
ankit,90,70,7
rupesh,120,100,14
rishabh,80,50,2
amit,0,0,0
ankita,0,0,0


### Math Methods

In [24]:
# sum() - Adds up values along an axis.
# axis=0 - Collapse the rows: one total per column (iq, percent, lpa).
# axis=1 would collapse the columns instead: one total per row (per student).
students.sum(axis=0)

iq         390
percent    300
lpa         33
dtype: int64

In [25]:
# mean() - Averages values along an axis.
# axis=1 - Collapse the columns: one mean per row, i.e. per student,
# taken across iq, percent and lpa (e.g. nitish: (100 + 80 + 10) / 3).
students.mean(axis=1)

name
nitish     63.333333
ankit      55.666667
rupesh     78.000000
rishabh    44.000000
amit        0.000000
ankita      0.000000
dtype: float64

In [64]:
# var()  - Variance of each numeric column: how far values spread around the mean
#          (a higher variance means more scattered data).
# mean() - Average of each numeric column.
# Only the last expression of a cell is echoed, so the output below is
# students.mean(); the result of students.var() is computed and then discarded.
students.var()  # not displayed
students.mean()

iq         65.0
percent    50.0
lpa         5.5
dtype: float64

### Selecting cols from a DataFrame

In [27]:
# Single column selection with bracket notation: frame['column_label'].
# The result is a pandas Series (1-dimensional, labelled by the frame's index).
movies['title_x']

0                   Uri: The Surgical Strike
1                              Battalion 609
2       The Accidental Prime Minister (film)
3                            Why Cheat India
4                            Evening Shadows
                        ...                 
1624                   Tera Mera Saath Rahen
1625                    Yeh Zindagi Ka Safar
1626                         Sabse Bada Sukh
1627                                   Daaka
1628                                Humsafar
Name: title_x, Length: 1629, dtype: object

In [66]:
# 'lead actor' is not one of the 18 columns read from movies.csv, so selecting it
# directly raises KeyError on a fresh kernel. Derive it here via assign(), which
# returns a new frame and leaves `movies` itself untouched.
# NOTE(review): assumes the lead actor is the first name in the '|'-separated
# `actors` string -- confirm against the cell that originally created this column.
movies.assign(**{'lead actor': movies['actors'].str.split('|').str[0]})[
    ['title_x', 'year_of_release', 'lead actor']
]

Unnamed: 0,title_x,year_of_release,lead actor
11,Gully Boy,2019,Ranveer Singh
34,Yeh Hai India,2017,Gavie Chahal
37,Article 15 (film),2019,Ayushmann Khurrana
87,Aiyaary,2018,Sidharth Malhotra
96,Raid (2018 film),2018,Ajay Devgn
...,...,...,...
1600,Kasoor,2001,Divya Dutta
1601,Maya (2001 film),2001,Anant Nag
1607,Nayak (2001 Hindi film),2001,Anil Kapoor
1621,Tum Bin,2001,Priyanshu Chatterjee


In [28]:
# Single column selection - the `Venue` column of the IPL frame, returned as a Series.
ipl['Venue']

0                Narendra Modi Stadium, Ahmedabad
1                Narendra Modi Stadium, Ahmedabad
2                           Eden Gardens, Kolkata
3                           Eden Gardens, Kolkata
4                        Wankhede Stadium, Mumbai
                          ...                    
945                                  Eden Gardens
946                              Wankhede Stadium
947                              Feroz Shah Kotla
948    Punjab Cricket Association Stadium, Mohali
949                         M Chinnaswamy Stadium
Name: Venue, Length: 950, dtype: object

In [29]:
# Multiple column selection: index the frame with a list of column labels.
# The result is a DataFrame holding just those columns, in the order listed
# (which need not match their order in the original frame).
# Double brackets [[...]] - the outer pair indexes, the inner pair is the list.
movies[['year_of_release','actors','title_x']]

Unnamed: 0,year_of_release,actors,title_x
0,2019,Vicky Kaushal|Paresh Rawal|Mohit Raina|Yami Ga...,Uri: The Surgical Strike
1,2019,Vicky Ahuja|Shoaib Ibrahim|Shrikant Kamat|Elen...,Battalion 609
2,2019,Anupam Kher|Akshaye Khanna|Aahana Kumra|Atul S...,The Accidental Prime Minister (film)
3,2019,Emraan Hashmi|Shreya Dhanwanthary|Snighdadeep ...,Why Cheat India
4,2018,Mona Ambegaonkar|Ananth Narayan Mahadevan|Deva...,Evening Shadows
...,...,...,...
1624,2001,Ajay Devgn|Sonali Bendre|Namrata Shirodkar|Pre...,Tera Mera Saath Rahen
1625,2001,Ameesha Patel|Jimmy Sheirgill|Nafisa Ali|Gulsh...,Yeh Zindagi Ka Safar
1626,2018,Vijay Arora|Asrani|Rajni Bala|Kumud Damle|Utpa...,Sabse Bada Sukh
1627,2019,Gippy Grewal|Zareen Khan|,Daaka


In [30]:
# Multiple column selection - both teams and the winner of every match, as a DataFrame.
ipl[['Team1','Team2','WinningTeam']]

Unnamed: 0,Team1,Team2,WinningTeam
0,Rajasthan Royals,Gujarat Titans,Gujarat Titans
1,Royal Challengers Bangalore,Rajasthan Royals,Rajasthan Royals
2,Royal Challengers Bangalore,Lucknow Super Giants,Royal Challengers Bangalore
3,Rajasthan Royals,Gujarat Titans,Gujarat Titans
4,Sunrisers Hyderabad,Punjab Kings,Punjab Kings
...,...,...,...
945,Kolkata Knight Riders,Deccan Chargers,Kolkata Knight Riders
946,Mumbai Indians,Royal Challengers Bangalore,Royal Challengers Bangalore
947,Delhi Daredevils,Rajasthan Royals,Delhi Daredevils
948,Kings XI Punjab,Chennai Super Kings,Chennai Super Kings


### Selecting rows from a DataFrame

- **iloc** - selects rows by integer position (0-based; the end of a slice is excluded)
- **loc** - selects rows by index label (the end of a slice is included)

In [31]:
# iloc - Integer (position-based) indexing.
# iloc[5] - The row at position 5, i.e. the 6th row (positions start at 0).
# A single position returns a Series whose index is the frame's column labels.
movies.iloc[5]

title_x                                                   Soni (film)
imdb_id                                                     tt6078866
poster_path         https://upload.wikimedia.org/wikipedia/en/thum...
wiki_link                   https://en.wikipedia.org/wiki/Soni_(film)
title_y                                                          Soni
original_title                                                   Soni
is_adult                                                            0
year_of_release                                                  2018
runtime                                                            97
genres                                                          Drama
imdb_rating                                                       7.2
imdb_votes                                                       1595
story               Soni  a young policewoman in Delhi  and her su...
summary             While fighting crimes against women in Delhi  ...
tagline             

In [32]:
# iloc with a slice - [:5] runs from the start up to, but not including, position 5.
# Returns a DataFrame with the first 5 rows (positions 0, 1, 2, 3, 4).
movies.iloc[:5]

Unnamed: 0,title_x,imdb_id,poster_path,wiki_link,title_y,original_title,is_adult,year_of_release,runtime,genres,imdb_rating,imdb_votes,story,summary,tagline,actors,wins_nominations,release_date
0,Uri: The Surgical Strike,tt8291224,https://upload.wikimedia.org/wikipedia/en/thum...,https://en.wikipedia.org/wiki/Uri:_The_Surgica...,Uri: The Surgical Strike,Uri: The Surgical Strike,0,2019,138,Action|Drama|War,8.4,35112,Divided over five chapters the film chronicle...,Indian army special forces execute a covert op...,,Vicky Kaushal|Paresh Rawal|Mohit Raina|Yami Ga...,4 wins,11 January 2019 (USA)
1,Battalion 609,tt9472208,,https://en.wikipedia.org/wiki/Battalion_609,Battalion 609,Battalion 609,0,2019,131,War,4.1,73,The story revolves around a cricket match betw...,The story of Battalion 609 revolves around a c...,,Vicky Ahuja|Shoaib Ibrahim|Shrikant Kamat|Elen...,,11 January 2019 (India)
2,The Accidental Prime Minister (film),tt6986710,https://upload.wikimedia.org/wikipedia/en/thum...,https://en.wikipedia.org/wiki/The_Accidental_P...,The Accidental Prime Minister,The Accidental Prime Minister,0,2019,112,Biography|Drama,6.1,5549,Based on the memoir by Indian policy analyst S...,Explores Manmohan Singh's tenure as the Prime ...,,Anupam Kher|Akshaye Khanna|Aahana Kumra|Atul S...,,11 January 2019 (USA)
3,Why Cheat India,tt8108208,https://upload.wikimedia.org/wikipedia/en/thum...,https://en.wikipedia.org/wiki/Why_Cheat_India,Why Cheat India,Why Cheat India,0,2019,121,Crime|Drama,6.0,1891,The movie focuses on existing malpractices in ...,The movie focuses on existing malpractices in ...,,Emraan Hashmi|Shreya Dhanwanthary|Snighdadeep ...,,18 January 2019 (USA)
4,Evening Shadows,tt6028796,,https://en.wikipedia.org/wiki/Evening_Shadows,Evening Shadows,Evening Shadows,0,2018,102,Drama,7.3,280,While gay rights and marriage equality has bee...,Under the 'Evening Shadows' truth often plays...,,Mona Ambegaonkar|Ananth Narayan Mahadevan|Deva...,17 wins & 1 nomination,11 January 2019 (India)


In [33]:
# Fancy indexing with iloc - pass a list of positions.
# iloc[[0,4,5]] - The rows at positions 0, 4 and 5, in that order.
# The positions do not need to be consecutive.
movies.iloc[[0,4,5]]

Unnamed: 0,title_x,imdb_id,poster_path,wiki_link,title_y,original_title,is_adult,year_of_release,runtime,genres,imdb_rating,imdb_votes,story,summary,tagline,actors,wins_nominations,release_date
0,Uri: The Surgical Strike,tt8291224,https://upload.wikimedia.org/wikipedia/en/thum...,https://en.wikipedia.org/wiki/Uri:_The_Surgica...,Uri: The Surgical Strike,Uri: The Surgical Strike,0,2019,138,Action|Drama|War,8.4,35112,Divided over five chapters the film chronicle...,Indian army special forces execute a covert op...,,Vicky Kaushal|Paresh Rawal|Mohit Raina|Yami Ga...,4 wins,11 January 2019 (USA)
4,Evening Shadows,tt6028796,,https://en.wikipedia.org/wiki/Evening_Shadows,Evening Shadows,Evening Shadows,0,2018,102,Drama,7.3,280,While gay rights and marriage equality has bee...,Under the 'Evening Shadows' truth often plays...,,Mona Ambegaonkar|Ananth Narayan Mahadevan|Deva...,17 wins & 1 nomination,11 January 2019 (India)
5,Soni (film),tt6078866,https://upload.wikimedia.org/wikipedia/en/thum...,https://en.wikipedia.org/wiki/Soni_(film),Soni,Soni,0,2018,97,Drama,7.2,1595,Soni a young policewoman in Delhi and her su...,While fighting crimes against women in Delhi ...,,Geetika Vidya Ohlyan|Saloni Batra|Vikas Shukla...,3 wins & 5 nominations,18 January 2019 (USA)


In [34]:
# Show `students` again: its index holds the student names,
# which are the row labels used by the loc examples.
students

Unnamed: 0_level_0,iq,percent,lpa
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
nitish,100,80,10
ankit,90,70,7
rupesh,120,100,14
rishabh,80,50,2
amit,0,0,0
ankita,0,0,0


In [35]:
# loc - Label-based indexing: rows are addressed by index label, not by position.
# loc['nitish'] - The row whose index label is 'nitish'.
# A single label returns a Series with that student's column values.
students.loc['nitish']

iq         100
percent     80
lpa         10
Name: nitish, dtype: int64

In [36]:
# loc with a label slice and a step - ['nitish':'rishabh':2]
# The range runs from 'nitish' to 'rishabh' and, unlike iloc, the end label is
# part of the range. The step of 2 then keeps every 2nd row of that range, so
# the result is 'nitish' and 'rupesh' ('ankit' and 'rishabh' are skipped).
students.loc['nitish':'rishabh':2]

Unnamed: 0_level_0,iq,percent,lpa
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
nitish,100,80,10
rupesh,120,100,14


In [37]:
# loc with a list of labels - fancy indexing by name.
# Picks the rows with exactly these index labels, in the order given;
# the labels do not need to be adjacent in the frame.
students.loc[['nitish','ankita','rupesh']]

Unnamed: 0_level_0,iq,percent,lpa
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
nitish,100,80,10
ankita,0,0,0
rupesh,120,100,14


In [38]:
# iloc with a list of positions - fancy indexing by integer position.
# Positions 0, 3, 4 are 'nitish', 'rishabh', 'amit' (see the output below).
# NOTE(review): these are not the rows picked by loc[['nitish','ankita','rupesh']];
# reproducing that selection by position would need iloc[[0,5,2]].
students.iloc[[0,3,4]]

Unnamed: 0_level_0,iq,percent,lpa
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
nitish,100,80,10
rishabh,80,50,2
amit,0,0,0


### Selecting both rows and cols

In [39]:
# iloc for rows and columns at once - [row_positions, column_positions].
# [0:3, 0:3] - Rows at positions 0-2 and columns at positions 0-2
# (the slice end, 3, is excluded on both axes).
# The result is a DataFrame containing just that block.
movies.iloc[0:3,0:3]

Unnamed: 0,title_x,imdb_id,poster_path
0,Uri: The Surgical Strike,tt8291224,https://upload.wikimedia.org/wikipedia/en/thum...
1,Battalion 609,tt9472208,
2,The Accidental Prime Minister (film),tt6986710,https://upload.wikimedia.org/wikipedia/en/thum...


In [40]:
# loc for rows and columns at once, using labels - [row_labels, column_labels].
# [0:2, 'title_x':'poster_path'] - Row labels 0 through 2 and the columns from
# 'title_x' through 'poster_path'.
# Label slices include their end point on both axes, so this returns the same
# 3x3 block as iloc[0:3, 0:3].
movies.loc[0:2,'title_x':'poster_path']

Unnamed: 0,title_x,imdb_id,poster_path
0,Uri: The Surgical Strike,tt8291224,https://upload.wikimedia.org/wikipedia/en/thum...
1,Battalion 609,tt9472208,
2,The Accidental Prime Minister (film),tt6986710,https://upload.wikimedia.org/wikipedia/en/thum...


### Filtering a DataFrame

In [41]:
# head(2) - Quick look at the first two IPL rows, to recall the column names
# and typical values used in the filtering examples.
ipl.head(2)

Unnamed: 0,ID,City,Date,Season,MatchNumber,Team1,Team2,Venue,TossWinner,TossDecision,SuperOver,WinningTeam,WonBy,Margin,method,Player_of_Match,Team1Players,Team2Players,Umpire1,Umpire2
0,1312200,Ahmedabad,2022-05-29,2022,Final,Rajasthan Royals,Gujarat Titans,"Narendra Modi Stadium, Ahmedabad",Rajasthan Royals,bat,N,Gujarat Titans,Wickets,7.0,,HH Pandya,"['YBK Jaiswal', 'JC Buttler', 'SV Samson', 'D ...","['WP Saha', 'Shubman Gill', 'MS Wade', 'HH Pan...",CB Gaffaney,Nitin Menon
1,1312199,Ahmedabad,2022-05-27,2022,Qualifier 2,Royal Challengers Bangalore,Rajasthan Royals,"Narendra Modi Stadium, Ahmedabad",Rajasthan Royals,field,N,Rajasthan Royals,Wickets,7.0,,JC Buttler,"['V Kohli', 'F du Plessis', 'RM Patidar', 'GJ ...","['YBK Jaiswal', 'JC Buttler', 'SV Samson', 'D ...",CB Gaffaney,Nitin Menon


In [42]:
# Boolean filtering: comparing a column to a value yields a True/False Series (a "mask");
# indexing the DataFrame with that mask keeps only the rows marked True.

# Method 1: one step per line - build the mask, filter the rows, pick the columns
# (a cell renders only its LAST expression, so this intermediate result is not displayed)
mask = ipl['MatchNumber'].eq('Final')
new_df = ipl[mask]
new_df[['Season','WinningTeam']]

# Method 2: the same selection in a single .loc call (rows by mask, columns by name)
ipl.loc[ipl['MatchNumber'].eq('Final'), ['Season','WinningTeam']]

Unnamed: 0,Season,WinningTeam
0,2022,Gujarat Titans
74,2021,Chennai Super Kings
134,2020/21,Mumbai Indians
194,2019,Mumbai Indians
254,2018,Chennai Super Kings
314,2017,Mumbai Indians
373,2016,Sunrisers Hyderabad
433,2015,Mumbai Indians
492,2014,Kolkata Knight Riders
552,2013,Mumbai Indians


In [70]:
# Display the IPL DataFrame; the notebook abbreviates the rendering to the first and last rows
ipl

Unnamed: 0,ID,City,Date,Season,MatchNumber,Team1,Team2,Venue,TossWinner,TossDecision,SuperOver,WinningTeam,WonBy,Margin,method,Player_of_Match,Team1Players,Team2Players,Umpire1,Umpire2
0,1312200,Ahmedabad,2022-05-29,2022,Final,Rajasthan Royals,Gujarat Titans,"Narendra Modi Stadium, Ahmedabad",Rajasthan Royals,bat,N,Gujarat Titans,Wickets,7.0,,HH Pandya,"['YBK Jaiswal', 'JC Buttler', 'SV Samson', 'D ...","['WP Saha', 'Shubman Gill', 'MS Wade', 'HH Pan...",CB Gaffaney,Nitin Menon
1,1312199,Ahmedabad,2022-05-27,2022,Qualifier 2,Royal Challengers Bangalore,Rajasthan Royals,"Narendra Modi Stadium, Ahmedabad",Rajasthan Royals,field,N,Rajasthan Royals,Wickets,7.0,,JC Buttler,"['V Kohli', 'F du Plessis', 'RM Patidar', 'GJ ...","['YBK Jaiswal', 'JC Buttler', 'SV Samson', 'D ...",CB Gaffaney,Nitin Menon
2,1312198,Kolkata,2022-05-25,2022,Eliminator,Royal Challengers Bangalore,Lucknow Super Giants,"Eden Gardens, Kolkata",Lucknow Super Giants,field,N,Royal Challengers Bangalore,Runs,14.0,,RM Patidar,"['V Kohli', 'F du Plessis', 'RM Patidar', 'GJ ...","['Q de Kock', 'KL Rahul', 'M Vohra', 'DJ Hooda...",J Madanagopal,MA Gough
3,1312197,Kolkata,2022-05-24,2022,Qualifier 1,Rajasthan Royals,Gujarat Titans,"Eden Gardens, Kolkata",Gujarat Titans,field,N,Gujarat Titans,Wickets,7.0,,DA Miller,"['YBK Jaiswal', 'JC Buttler', 'SV Samson', 'D ...","['WP Saha', 'Shubman Gill', 'MS Wade', 'HH Pan...",BNJ Oxenford,VK Sharma
4,1304116,Mumbai,2022-05-22,2022,70,Sunrisers Hyderabad,Punjab Kings,"Wankhede Stadium, Mumbai",Sunrisers Hyderabad,bat,N,Punjab Kings,Wickets,5.0,,Harpreet Brar,"['PK Garg', 'Abhishek Sharma', 'RA Tripathi', ...","['JM Bairstow', 'S Dhawan', 'M Shahrukh Khan',...",AK Chaudhary,NA Patwardhan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
945,335986,Kolkata,2008-04-20,2007/08,4,Kolkata Knight Riders,Deccan Chargers,Eden Gardens,Deccan Chargers,bat,N,Kolkata Knight Riders,Wickets,5.0,,DJ Hussey,"['WP Saha', 'BB McCullum', 'RT Ponting', 'SC G...","['AC Gilchrist', 'Y Venugopal Rao', 'VVS Laxma...",BF Bowden,K Hariharan
946,335985,Mumbai,2008-04-20,2007/08,5,Mumbai Indians,Royal Challengers Bangalore,Wankhede Stadium,Mumbai Indians,bat,N,Royal Challengers Bangalore,Wickets,5.0,,MV Boucher,"['L Ronchi', 'ST Jayasuriya', 'DJ Thornely', '...","['S Chanderpaul', 'R Dravid', 'LRPL Taylor', '...",SJ Davis,DJ Harper
947,335984,Delhi,2008-04-19,2007/08,3,Delhi Daredevils,Rajasthan Royals,Feroz Shah Kotla,Rajasthan Royals,bat,N,Delhi Daredevils,Wickets,9.0,,MF Maharoof,"['G Gambhir', 'V Sehwag', 'S Dhawan', 'MK Tiwa...","['T Kohli', 'YK Pathan', 'SR Watson', 'M Kaif'...",Aleem Dar,GA Pratapkumar
948,335983,Chandigarh,2008-04-19,2007/08,2,Kings XI Punjab,Chennai Super Kings,"Punjab Cricket Association Stadium, Mohali",Chennai Super Kings,bat,N,Chennai Super Kings,Runs,33.0,,MEK Hussey,"['K Goel', 'JR Hopes', 'KC Sangakkara', 'Yuvra...","['PA Patel', 'ML Hayden', 'MEK Hussey', 'MS Dh...",MR Benson,SL Shastri


In [74]:
# Season, champion and venue of every IPL final
final_winner = ipl['MatchNumber'].eq('Final')  # True on rows whose MatchNumber is 'Final'
final = ipl.loc[final_winner]
final.loc[:, ['Season', 'WinningTeam', 'Venue']]


Unnamed: 0,Season,WinningTeam,Venue
0,2022,Gujarat Titans,"Narendra Modi Stadium, Ahmedabad"
74,2021,Chennai Super Kings,Dubai International Cricket Stadium
134,2020/21,Mumbai Indians,Dubai International Cricket Stadium
194,2019,Mumbai Indians,Rajiv Gandhi International Stadium
254,2018,Chennai Super Kings,Wankhede Stadium
314,2017,Mumbai Indians,"Rajiv Gandhi International Stadium, Uppal"
373,2016,Sunrisers Hyderabad,M Chinnaswamy Stadium
433,2015,Mumbai Indians,Eden Gardens
492,2014,Kolkata Knight Riders,M Chinnaswamy Stadium
552,2013,Mumbai Indians,Eden Gardens


In [75]:
# Season and winner of each final, selected in one .loc call (rows by mask, columns by name)
ipl.loc[ipl['MatchNumber'].eq('Final'), ['Season','WinningTeam']]

Unnamed: 0,Season,WinningTeam
0,2022,Gujarat Titans
74,2021,Chennai Super Kings
134,2020/21,Mumbai Indians
194,2019,Mumbai Indians
254,2018,Chennai Super Kings
314,2017,Mumbai Indians
373,2016,Sunrisers Hyderabad
433,2015,Mumbai Indians
492,2014,Kolkata Knight Riders
552,2013,Mumbai Indians


In [43]:
# Count the matches that were decided by a super over:
# keep only the rows where SuperOver is 'Y', then len() gives how many rows remain
len(ipl[ipl['SuperOver'].eq('Y')])

14

In [77]:
# How many matches had a super over finish?
# .shape is (rows, columns); the first entry is the number of matching matches
ipl.loc[ipl['SuperOver'].eq('Y')].shape

(14, 20)

In [44]:
# Combining two conditions with & (element-wise AND):
# a row is kept only if City is 'Kolkata' AND WinningTeam is 'Chennai Super Kings'
# .eq() is the method form of ==, so no extra parentheses are needed around each condition
len(ipl[ipl['City'].eq('Kolkata') & ipl['WinningTeam'].eq('Chennai Super Kings')])

5

In [80]:
# How many matches has CSK won in Kolkata?
# & combines the two boolean Series element-wise; each == comparison must be
# parenthesised because & binds more tightly than ==
ipl.loc[(ipl['City'] == 'Kolkata') & (ipl['WinningTeam'] == 'Chennai Super Kings')].shape[0]

5

In [45]:
# Share of matches (in %) won by the side that also won the toss:
# rows where TossWinner equals WinningTeam, divided by all rows, times 100
len(ipl[ipl['TossWinner'].eq(ipl['WinningTeam'])]) / len(ipl) * 100

51.473684210526315

In [83]:
# Percentage of matches in which the toss winner is also the match winner
# numerator: rows where the two columns agree; denominator: all rows
percentage = ipl.loc[ipl['TossWinner'] == ipl['WinningTeam']].shape[0] / ipl.shape[0] * 100
percentage






51.473684210526315

## Detailed Breakdown: Toss Winner = Match Winner Percentage

This code calculates what percentage of IPL matches are won by the team that wins the toss. Let's break it down step by step:

### Step 1: Boolean Comparison
```python
ipl['TossWinner'] == ipl['WinningTeam']
```
- Compares the `TossWinner` and `WinningTeam` columns row by row (element-wise)
- Returns a boolean Series (True/False for each match)
- True = toss winner won the match, False = toss winner lost the match

### Step 2: Boolean Filtering
```python
ipl[ipl['TossWinner'] == ipl['WinningTeam']]
```
- Uses the boolean Series as a filter/mask
- Returns only rows where the condition is True
- This gives us a DataFrame with only matches where toss winner = match winner

### Step 3: Count Matching Rows
```python
ipl[ipl['TossWinner'] == ipl['WinningTeam']].shape[0]
```
- `.shape[0]` gets the number of rows in the filtered DataFrame
- This counts how many matches were won by the toss winner

### Step 4: Total Matches
```python
ipl.shape[0]
```
- Gets total number of matches in the dataset

### Step 5: Calculate Percentage
```python
(matches_won_by_toss_winner / total_matches) * 100
```
- Divides count by total and multiplies by 100 for percentage
- Final result shows what % of matches are won by toss winners

### Example Visualization:
If out of 1000 matches, 520 were won by toss winners:
- Numerator: 520 (matches won by toss winner)
- Denominator: 1000 (total matches)
- Percentage: (520/1000) * 100 = 52%

In [84]:
# Walk through the toss-winner calculation one step at a time, using the real data

print("Step 1: Boolean comparison (first 10 results)")
boolean_mask = ipl['TossWinner'].eq(ipl['WinningTeam'])
print(boolean_mask.head(n=10))
print(f"\nData type: {type(boolean_mask)}")

print("\nStep 2: Count of True values (matches won by toss winner)")
# True counts as 1 and False as 0, so summing the mask counts the True rows
matches_won_by_toss_winner = boolean_mask.sum()
print(f"Matches won by toss winner: {matches_won_by_toss_winner}")

print("\nStep 3: Total number of matches")
total_matches = ipl.shape[0]
print(f"Total matches: {total_matches}")

print("\nStep 4: Calculate percentage")
percentage = matches_won_by_toss_winner / total_matches * 100
print(f"Percentage: {percentage:.2f}%")

print(f"\nConclusion: In {percentage:.1f}% of IPL matches, the team that wins the toss also wins the match.")

Step 1: Boolean comparison (first 10 results)
0    False
1     True
2    False
3     True
4    False
5     True
6    False
7    False
8     True
9    False
dtype: bool

Data type: <class 'pandas.core.series.Series'>

Step 2: Count of True values (matches won by toss winner)
Matches won by toss winner: 489

Step 3: Total number of matches
Total matches: 950

Step 4: Calculate percentage
Percentage: 51.47%

Conclusion: In 51.5% of IPL matches, the team that wins the toss also wins the match.


In [85]:
# Three equivalent ways to compute the "toss winner also won the match" percentage.
# Every value printed here is computed inside this cell, so the cell gives the same
# result on a fresh kernel and regardless of which other cells have been run.

# Method 1: value_counts() tallies the True / False entries of the comparison
print("Method 1: Using value_counts()")
toss_match_comparison = (ipl['TossWinner'] == ipl['WinningTeam']).value_counts()
print(toss_match_comparison)
percentage_alt1 = (toss_match_comparison[True] / toss_match_comparison.sum()) * 100
print(f"Percentage (Method 1): {percentage_alt1:.2f}%")

print("\nMethod 2: Using mean() on boolean series")
# True counts as 1 and False as 0, so the mean is the proportion of True values
percentage_alt2 = (ipl['TossWinner'] == ipl['WinningTeam']).mean() * 100
print(f"Percentage (Method 2): {percentage_alt2:.2f}%")

print("\nMethod 3: Using query() method")
# query() evaluates the expression against the column names and keeps the matching rows
matches_toss_winner_wins = ipl.query('TossWinner == WinningTeam').shape[0]
percentage_alt3 = (matches_toss_winner_wins / len(ipl)) * 100
print(f"Percentage (Method 3): {percentage_alt3:.2f}%")

# Summary: report the figure computed above (Method 2) rather than a variable from another cell
print("\n🏏 Key Insight: The toss advantage is minimal!")
print(f"📊 At {percentage_alt2:.1f}%, winning the toss provides only a slight advantage")
print("📈 This suggests that team skill matters more than toss luck in IPL!")

Method 1: Using value_counts()
True     489
False    461
Name: count, dtype: int64
Percentage (Method 1): 51.47%

Method 2: Using mean() on boolean series
Percentage (Method 2): 51.47%

Method 3: Using query() method
Percentage (Method 3): 51.47%

🏏 Key Insight: The toss advantage is minimal!
📊 At 51.5%, winning the toss provides only a slight advantage
📈 This suggests that team skill matters more than toss luck in IPL!


In [46]:
# Two numeric conditions combined with &: rating above 8.5 AND more than 10000 votes
# Counts the highly rated movies that also received a substantial number of votes
len(movies[movies['imdb_rating'].gt(8.5) & movies['imdb_votes'].gt(10000)])

0

In [None]:
# Number of movies rated above 8 that also received more than 10000 votes
movies.loc[(movies['imdb_rating'] > 8) & (movies['imdb_votes'] > 10000)].shape[0]


23

In [47]:
# Mixing a text condition with a numeric one
# mask1: True where the genres text contains the substring 'Action'
#        (splitting on '|' and testing list membership would match whole genre names instead)
# mask2: True where the IMDB rating is above 7.5
mask1 = movies['genres'].str.contains('Action')
mask2 = movies['imdb_rating'].gt(7.5)

# Keep only the rows that satisfy both masks
movies.loc[mask1 & mask2]

Unnamed: 0,title_x,imdb_id,poster_path,wiki_link,title_y,original_title,is_adult,year_of_release,runtime,genres,imdb_rating,imdb_votes,story,summary,tagline,actors,wins_nominations,release_date
0,Uri: The Surgical Strike,tt8291224,https://upload.wikimedia.org/wikipedia/en/thum...,https://en.wikipedia.org/wiki/Uri:_The_Surgica...,Uri: The Surgical Strike,Uri: The Surgical Strike,0,2019,138,Action|Drama|War,8.4,35112,Divided over five chapters the film chronicle...,Indian army special forces execute a covert op...,,Vicky Kaushal|Paresh Rawal|Mohit Raina|Yami Ga...,4 wins,11 January 2019 (USA)
41,Family of Thakurganj,tt8897986,https://upload.wikimedia.org/wikipedia/en/9/99...,https://en.wikipedia.org/wiki/Family_of_Thakur...,Family of Thakurganj,Family of Thakurganj,0,2019,127,Action|Drama,9.4,895,The film is based on small town of North India...,The film is based on small town of North India...,,Jimmy Sheirgill|Mahie Gill|Nandish Singh|Prana...,,19 July 2019 (India)
84,Mukkabaaz,tt7180544,https://upload.wikimedia.org/wikipedia/en/thum...,https://en.wikipedia.org/wiki/Mukkabaaz,The Brawler,Mukkabaaz,0,2017,154,Action|Drama|Sport,8.1,5434,A boxer (Shravan) belonging to upper cast tra...,A boxer struggles to make his mark in the boxi...,,Viineet Kumar|Jimmy Sheirgill|Zoya Hussain|Rav...,3 wins & 6 nominations,12 January 2018 (USA)
106,Raazi,tt7098658,https://upload.wikimedia.org/wikipedia/en/thum...,https://en.wikipedia.org/wiki/Raazi,Raazi,Raazi,0,2018,138,Action|Drama|Thriller,7.8,20289,Hidayat Khan is the son of an Indian freedom f...,A Kashmiri woman agrees to marry a Pakistani a...,An incredible true story,Alia Bhatt|Vicky Kaushal|Rajit Kapoor|Shishir ...,21 wins & 26 nominations,11 May 2018 (USA)
110,Parmanu: The Story of Pokhran,tt6826438,https://upload.wikimedia.org/wikipedia/en/thum...,https://en.wikipedia.org/wiki/Parmanu:_The_Sto...,Parmanu: The Story of Pokhran,Parmanu: The Story of Pokhran,0,2018,129,Action|Drama|History,7.7,18292,Captain Ashwat Raina's efforts to turn India i...,Ashwat Raina and his teammates arrive in Pokhr...,1998| India: one secret operation| six Indians...,John Abraham|Boman Irani|Diana Penty|Anuja Sat...,,25 May 2018 (USA)
112,Bhavesh Joshi Superhero,tt6129302,https://upload.wikimedia.org/wikipedia/en/thum...,https://en.wikipedia.org/wiki/Bhavesh_Joshi_Su...,Bhavesh Joshi Superhero,Bhavesh Joshi Superhero,0,2018,154,Action|Drama,7.6,4928,Bhavesh Joshi Superhero is an action film abou...,The origin story of Bhavesh Joshi an Indian s...,This year| justice will have a new name.,Harshvardhan Kapoor|Priyanshu Painyuli|Ashish ...,2 nominations,1 June 2018 (USA)
169,The Ghazi Attack,tt6299040,https://upload.wikimedia.org/wikipedia/en/thum...,https://en.wikipedia.org/wiki/The_Ghazi_Attack...,The Ghazi Attack,The Ghazi Attack,0,2017,116,Action|Thriller|War,7.6,10332,In 1971 amid rising tensions between India an...,A Pakistani submarine Ghazi plans to secretly...,The war you did not know about,Rana Daggubati|Kay Kay Menon|Atul Kulkarni|Om ...,1 win & 7 nominations,17 February 2017 (USA)
219,Raag Desh (film),tt6080746,https://upload.wikimedia.org/wikipedia/en/thum...,https://en.wikipedia.org/wiki/Raagdesh,Raag Desh,Raag Desh,0,2017,135,Action|Drama|History,8.3,341,A period film based on the historic 1945 India...,A period film based on the historic 1945 India...,,Kunal Kapoor|Amit Sadh|Mohit Marwah|Kenneth De...,,28 July 2017 (India)
258,Irudhi Suttru,tt5310090,https://upload.wikimedia.org/wikipedia/en/f/fe...,https://en.wikipedia.org/wiki/Saala_Khadoos,Saala Khadoos,Saala Khadoos,0,2016,109,Action|Drama|Sport,7.6,10507,An under-fire boxing coach Prabhu is transfer...,The story of a former boxer who quits boxing f...,,Madhavan|Ritika Singh|Mumtaz Sorcar|Nassar|Rad...,9 wins & 2 nominations,29 January 2016 (USA)
280,Laal Rang,tt5600714,,https://en.wikipedia.org/wiki/Laal_Rang,Laal Rang,Laal Rang,0,2016,147,Action|Crime|Drama,8.0,3741,The friendship of two men is tested when thing...,The friendship of two men is tested when thing...,Every job good or bad| must be done with honesty.,Randeep Hooda|Akshay Oberoi|Rajniesh Duggall|P...,,22 April 2016 (India)


In [90]:
# Action movies rated above 7.5
# Split the pipe-separated genres into one list per movie, then test whole-name membership
mask1 = movies['genres'].str.split('|').apply(lambda genre_list: 'Action' in genre_list)
mask2 = movies['imdb_rating'].gt(7.5)
movies.loc[mask1 & mask2]

Unnamed: 0,title_x,imdb_id,poster_path,wiki_link,title_y,original_title,is_adult,year_of_release,runtime,genres,imdb_rating,imdb_votes,story,summary,tagline,actors,wins_nominations,release_date,Country,lead actor
106,Raazi,tt7098658,https://upload.wikimedia.org/wikipedia/en/thum...,https://en.wikipedia.org/wiki/Raazi,Raazi,Raazi,0,2018,138,Action|Drama|Thriller,7.8,20289,Hidayat Khan is the son of an Indian freedom f...,A Kashmiri woman agrees to marry a Pakistani a...,An incredible true story,Alia Bhatt|Vicky Kaushal|Rajit Kapoor|Shishir ...,21 wins & 26 nominations,11 May 2018 (USA),India,Alia Bhatt
112,Bhavesh Joshi Superhero,tt6129302,https://upload.wikimedia.org/wikipedia/en/thum...,https://en.wikipedia.org/wiki/Bhavesh_Joshi_Su...,Bhavesh Joshi Superhero,Bhavesh Joshi Superhero,0,2018,154,Action|Drama,7.6,4928,Bhavesh Joshi Superhero is an action film abou...,The origin story of Bhavesh Joshi an Indian s...,This year| justice will have a new name.,Harshvardhan Kapoor|Priyanshu Painyuli|Ashish ...,2 nominations,1 June 2018 (USA),India,Harshvardhan Kapoor
169,The Ghazi Attack,tt6299040,https://upload.wikimedia.org/wikipedia/en/thum...,https://en.wikipedia.org/wiki/The_Ghazi_Attack...,The Ghazi Attack,The Ghazi Attack,0,2017,116,Action|Thriller|War,7.6,10332,In 1971 amid rising tensions between India an...,A Pakistani submarine Ghazi plans to secretly...,The war you did not know about,Rana Daggubati|Kay Kay Menon|Atul Kulkarni|Om ...,1 win & 7 nominations,17 February 2017 (USA),India,Rana Daggubati
354,Dangal (film),tt5074352,https://upload.wikimedia.org/wikipedia/en/thum...,https://en.wikipedia.org/wiki/Dangal_(film),Dangal,Dangal,0,2016,161,Action|Biography|Drama,8.4,131338,Biopic of Mahavir Singh Phogat who taught wre...,Former wrestler Mahavir Singh Phogat and his t...,You think our girls are any lesser than boys?,Aamir Khan|Fatima Sana Shaikh|Sanya Malhotra|S...,23 wins & 4 nominations,21 December 2016 (USA),India,Aamir Khan
365,Baby (2015 Hindi film),tt3848892,https://upload.wikimedia.org/wikipedia/en/thum...,https://en.wikipedia.org/wiki/Baby_(2015_Hindi...,Baby,Baby,0,2015,159,Action|Thriller,8.0,49426,The country is perpetually under threat from t...,An elite counter-intelligence unit learns of a...,History Is Made By Those Who Give A Damn!,Akshay Kumar|Danny Denzongpa|Rana Daggubati|Ta...,1 win,23 January 2015 (India),India,Akshay Kumar
449,Titli (2014 film),tt3019620,https://upload.wikimedia.org/wikipedia/en/thum...,https://en.wikipedia.org/wiki/Titli_(2014_film),Titli,Titli,0,2014,116,Action|Drama|Thriller,7.6,3677,In the badlands of Delhi's dystopic underbelly...,A Hindi feature film set in the lower depths o...,Daring| Desireable| Dangerous,Nawazuddin Siddiqui|Niharika Singh|Anil George...,4 wins & 5 nominations,20 June 2014 (USA),India,Nawazuddin Siddiqui
1039,1971 (2007 film),tt0983990,https://upload.wikimedia.org/wikipedia/en/thum...,https://en.wikipedia.org/wiki/1971_(2007_film),1971,1971,0,2007,160,Action|Drama|War,7.9,1121,Based on true facts the film revolves around ...,Based on true facts the film revolves around ...,Honor the heroes.......,Manoj Bajpayee|Ravi Kishan|Deepak Dobriyal|,1 win,9 March 2007 (India),India,Manoj Bajpayee
1058,Black Friday (2007 film),tt0400234,https://upload.wikimedia.org/wikipedia/en/5/58...,https://en.wikipedia.org/wiki/Black_Friday_(20...,Black Friday,Black Friday,0,2004,143,Action|Crime|Drama,8.5,16761,A dramatic presentation of the bomb blasts tha...,Black Friday is a film about the investigation...,The story of the Bombay bomb blasts,Kay Kay Menon|Pavan Malhotra|Aditya Srivastava...,3 nominations,9 February 2007 (India),India,Kay Kay Menon
1293,Sarkar (2005 film),tt0432047,https://upload.wikimedia.org/wikipedia/en/thum...,https://en.wikipedia.org/wiki/Sarkar_(2005_film),Sarkar,Sarkar,0,2005,124,Action|Crime|Drama,7.6,14694,Meet Subhash Nagre - a wealthy and influential...,The authority of a man who runs a parallel go...,'There are no Rights and Wrongs. Only Power' -...,Amitabh Bachchan|Abhishek Bachchan|Kay Kay Men...,2 wins & 10 nominations,1 July 2005 (India),India,Amitabh Bachchan
1361,Lakshya (film),tt0323013,https://upload.wikimedia.org/wikipedia/en/thum...,https://en.wikipedia.org/wiki/Lakshya_(film),Lakshya,Lakshya,0,2004,186,Action|Drama|Romance,7.9,18777,Karan is a lazy good-for-nothing who lives on ...,An aimless jobless irresponsible grown man j...,It took him 24 years and 18000 feet to find hi...,Hrithik Roshan|Preity Zinta|Amitabh Bachchan|O...,4 wins & 10 nominations,18 June 2004 (USA),India,Hrithik Roshan


## Detailed Breakdown: Action Movies with Rating > 7.5

This code filters movies to find Action movies with high ratings (> 7.5). Let's break it down step by step:

### Understanding the Data Structure
The `genres` column contains multiple genres separated by pipe (`|`) symbols:
- Example: "Action|Adventure|Sci-Fi" or "Drama|Romance"

### Step-by-Step Analysis:

#### Step 1: String Splitting and Genre Detection
```python
mask1 = movies['genres'].str.split('|').apply(lambda x: 'Action' in x)
```

**What happens here:**
1. `str.split('|')` - Splits each genre string into a list
   - "Action|Adventure|Sci-Fi" → ['Action', 'Adventure', 'Sci-Fi']
   - "Drama|Romance" → ['Drama', 'Romance']

2. `apply(lambda x: 'Action' in x)` - Checks if 'Action' exists in the list
   - ['Action', 'Adventure', 'Sci-Fi'] → True (Action found)
   - ['Drama', 'Romance'] → False (Action not found)

3. **Result**: Boolean Series (True/False for each movie)

#### Step 2: Rating Filter
```python
mask2 = movies['imdb_rating'] > 7.5
```
- Simple numerical comparison
- Returns True for movies with rating > 7.5, False otherwise

#### Step 3: Combining Conditions
```python
movies[mask1 & mask2]
```
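- `&` performs an element-wise logical AND on the two boolean Series (Python's `and` keyword cannot be used here because it does not work element-wise)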
- Both conditions must be True:
  - Movie must be an Action movie (mask1 = True)
  - Movie must have rating > 7.5 (mask2 = True)
- Returns DataFrame with only movies meeting both criteria

In [91]:
# Walk through the Action / rating filter stage by stage on the real data

print("Step 1: Understanding the genres column structure")
print("Sample genres data:")
print(movies['genres'].head())
print(f"\nGenres data type: {movies['genres'].dtype}")

print("\n" + "="*60)
print("Step 2: String splitting demonstration")
# Use the first three movies as a small sample
sample_genres = movies['genres'].head(n=3)
print("Original genres:")
# "Row N" is the position inside the sample, not the DataFrame index label
for pos, raw_genres in enumerate(sample_genres):
    print(f"Row {pos}: {raw_genres}")

print("\nAfter splitting:")
split_genres = sample_genres.str.split('|')
for pos, genre_list in enumerate(split_genres):
    print(f"Row {pos}: {genre_list}")

print("\n" + "="*60)
print("Step 3: Action detection demonstration")
action_check = split_genres.apply(lambda genre_list: 'Action' in genre_list)
print("Action movie check:")
sample_rows = zip(sample_genres, split_genres, action_check)
for pos, (raw_genres, genre_list, has_action) in enumerate(sample_rows):
    print(f"Row {pos}: '{raw_genres}' → {genre_list} → Action present: {has_action}")

print("\n" + "="*60)
print("Step 4: Complete filtering process")
# Build the two masks on the full table
mask1 = movies['genres'].str.split('|').apply(lambda genre_list: 'Action' in genre_list)
mask2 = movies['imdb_rating'].gt(7.5)

print(f"Total movies: {movies.shape[0]}")
print(f"Action movies: {mask1.sum()}")
print(f"Movies with rating > 7.5: {mask2.sum()}")
print(f"Action movies with rating > 7.5: {(mask1 & mask2).sum()}")

print("\n" + "="*60)
print("Step 5: Final result preview")
action_high_rated = movies.loc[mask1 & mask2]
print(f"Found {action_high_rated.shape[0]} Action movies with rating > 7.5")
if action_high_rated.shape[0] > 0:
    print("\nSample results:")
    print(action_high_rated[['title_x', 'genres', 'imdb_rating']].head())

Step 1: Understanding the genres column structure
Sample genres data:
11               Drama|Music
34    Action|Adventure|Drama
37               Crime|Drama
87           Action|Thriller
96        Action|Crime|Drama
Name: genres, dtype: object

Genres data type: object

Step 2: String splitting demonstration
Original genres:
Row 0: Drama|Music
Row 1: Action|Adventure|Drama
Row 2: Crime|Drama

After splitting:
Row 0: ['Drama', 'Music']
Row 1: ['Action', 'Adventure', 'Drama']
Row 2: ['Crime', 'Drama']

Step 3: Action detection demonstration
Action movie check:
Row 0: 'Drama|Music' → ['Drama', 'Music'] → Action present: False
Row 1: 'Action|Adventure|Drama' → ['Action', 'Adventure', 'Drama'] → Action present: True
Row 2: 'Crime|Drama' → ['Crime', 'Drama'] → Action present: False

Step 4: Complete filtering process
Total movies: 298
Action movies: 103
Movies with rating > 7.5: 62
Action movies with rating > 7.5: 12

Step 5: Final result preview
Found 12 Action movies with rating > 7.5

Samp

In [92]:
# Three ways to build the "is an Action movie" mask, each combined with rating > 7.5

print("🎬 ALTERNATIVE METHODS FOR GENRE FILTERING")
print("="*60)

# Method 1: substring search on the raw genres text (less precise)
print("Method 1: Using str.contains() - LESS PRECISE")
# na=False makes rows with missing genres count as "no match" instead of NaN
mask1_alt = movies['genres'].str.contains('Action', na=False)
mask2 = movies['imdb_rating'] > 7.5
result1 = movies[mask1_alt & mask2]
print(f"Results using str.contains(): {len(result1)} movies")
# str.contains() is case-sensitive by default, so the risk is a longer genre name
# that embeds 'Action' with the same capitalisation
print("⚠️  Warning: This is a substring match, so it would also match any genre name that merely contains 'Action' (e.g. 'Live-Action')")

print("\nMethod 2: Using str.split() + apply() - MORE PRECISE (Original)")
# Splitting on '|' gives one list per movie; `in` then compares whole genre names
mask1_orig = movies['genres'].str.split('|').apply(lambda x: 'Action' in x)
result2 = movies[mask1_orig & mask2]
print(f"Results using split + apply: {len(result2)} movies")
print("✅ This ensures exact genre matching")

print("\nMethod 3: Using str.split() + explode() + isin() - ADVANCED")
# explode() produces one row per (movie, genre) pair while keeping the original index,
# so the index labels of the 'Action' rows identify the Action movies
movies_exploded = movies.assign(genre_split=movies['genres'].str.split('|')).explode('genre_split')
action_movies_ids = movies_exploded[movies_exploded['genre_split'] == 'Action'].index.unique()
mask1_explode = movies.index.isin(action_movies_ids)
result3 = movies[mask1_explode & mask2]
print(f"Results using explode method: {len(result3)} movies")

print("\n" + "="*60)
print("🎯 KEY INSIGHTS:")
print(f"📊 Out of {len(movies)} total movies:")
# .sum() counts the True values of a mask; .mean() gives their proportion
print(f"   • {mask1_orig.sum()} are Action movies ({mask1_orig.mean()*100:.1f}%)")
print(f"   • {mask2.sum()} have rating > 7.5 ({mask2.mean()*100:.1f}%)")
print(f"   • {(mask1_orig & mask2).sum()} are high-rated Action movies ({(mask1_orig & mask2).mean()*100:.1f}%)")
print(f"   • Success rate: {(mask1_orig & mask2).sum() / mask1_orig.sum() * 100:.1f}% of Action movies have rating > 7.5")

print("\n🔍 Why use str.split() + apply()?")
print("   • Handles multi-genre movies correctly")
print("   • Avoids false positives from partial matches")
print("   • Works with any delimiter (|, comma, semicolon, etc.)")
print("   • More reliable for categorical data analysis")

🎬 ALTERNATIVE METHODS FOR GENRE FILTERING
Method 1: Using str.contains() - LESS PRECISE
Results using str.contains(): 12 movies

Method 2: Using str.split() + apply() - MORE PRECISE (Original)
Results using split + apply: 12 movies
✅ This ensures exact genre matching

Method 3: Using str.split() + explode() + isin() - ADVANCED
Results using explode method: 12 movies

🎯 KEY INSIGHTS:
📊 Out of 298 total movies:
   • 103 are Action movies (34.6%)
   • 62 have rating > 7.5 (20.8%)
   • 12 are high-rated Action movies (4.0%)
   • Success rate: 11.7% of Action movies have rating > 7.5

🔍 Why use str.split() + apply()?
   • Handles multi-genre movies correctly
   • Avoids false positives from partial matches
   • Works with any delimiter (|, comma, semicolon, etc.)
   • More reliable for categorical data analysis


In [93]:
# Practical examples and common use cases
from collections import Counter

print("🎯 PRACTICAL APPLICATIONS OF GENRE FILTERING")
print("="*60)

# Example 1: require two genres at the same time
print("Example 1: Movies that are BOTH Action AND Drama")
action_drama_mask = movies['genres'].str.split('|').apply(
    lambda genre_list: {'Action', 'Drama'}.issubset(genre_list)
)
action_drama_movies = movies[action_drama_mask & movies['imdb_rating'].gt(7.0)]
print(f"Found {len(action_drama_movies)} Action-Drama movies with rating > 7.0")
if len(action_drama_movies):
    print("Top examples:")
    print(action_drama_movies[['title_x', 'genres', 'imdb_rating']].head(3).to_string())

print("\n" + "-"*60)

# Example 2: require one genre while ruling out another
print("Example 2: Action movies that are NOT Horror")
action_not_horror = movies['genres'].str.split('|').apply(
    lambda genre_list: 'Action' in genre_list and 'Horror' not in genre_list
)
result = movies[action_not_horror & movies['imdb_rating'].gt(7.5)]
print(f"Found {len(result)} Action movies (non-Horror) with rating > 7.5")

print("\n" + "-"*60)

# Example 3: how often each genre occurs across all movies
print("Example 3: Genre popularity analysis")
# Flatten every pipe-separated genres string (missing values skipped) into one long list
all_genres = [
    genre
    for genre_string in movies['genres'].dropna()
    for genre in genre_string.split('|')
]
genre_counts = Counter(all_genres)
print("Top 5 most common genres:")
for genre, count in genre_counts.most_common(5):
    print(f"   {genre}: {count} movies")

print("\n" + "-"*60)

# Example 4: the same kind of filter wrapped in a reusable function
print("Example 4: Reusable filtering function")

def filter_movies_by_genre_and_rating(df, target_genre, min_rating=7.0, exclude_genres=None):
    """
    Filter movies by genre and rating with optional exclusions

    A row is kept when `target_genre` is one of the pipe-separated entries of
    its 'genres' value (whole-genre match, not substring), its 'imdb_rating'
    is at least `min_rating`, and none of `exclude_genres` is among its genres.
    Rows whose 'genres' value is missing are never kept.

    Parameters:
    df: DataFrame with movies data ('genres' and 'imdb_rating' columns are read)
    target_genre: Genre to include (e.g., 'Action')
    min_rating: Minimum IMDB rating, inclusive (default: 7.0)
    exclude_genres: Genre name or list of genres to exclude (optional)

    Returns: Filtered DataFrame (a row subset of df)
    """
    # A bare string would otherwise be iterated character by character and the
    # exclusion silently ignored, so treat it as a one-element list.
    if isinstance(exclude_genres, str):
        exclude_genres = [exclude_genres]
    excluded = set(exclude_genres) if exclude_genres else set()

    # Split once and reuse. Missing values become '' -> [''] so the membership
    # test below is simply False instead of raising TypeError on NaN/None.
    genre_lists = df['genres'].fillna('').str.split('|')

    # Include target genre and reject any row carrying an excluded genre
    genre_mask = genre_lists.apply(
        lambda tags: target_genre in tags and excluded.isdisjoint(tags)
    ).astype(bool)

    # Rating filter (>= so that min_rating itself qualifies)
    return df[genre_mask & (df['imdb_rating'] >= min_rating)]

# Try the helper out: comedies rated at least 7.5 that are not also tagged Horror
comedy_movies = filter_movies_by_genre_and_rating(
    movies, target_genre='Comedy', min_rating=7.5, exclude_genres=['Horror'],
)
print(f"Comedy movies (rating ≥ 7.5, no Horror): {len(comedy_movies)}")

# Closing summary of the techniques used in this cell, printed one per line
print("\n🔧 Technical Notes:")
for note in (
    "• lambda functions enable complex conditions",
    "• & operator requires parentheses: (condition1) & (condition2)",
    "• ~ operator negates conditions (NOT operator)",
    "• apply() works element-wise on Series/DataFrame",
    "• Always handle NaN values with dropna() or na=False parameter",
):
    print(note)

🎯 PRACTICAL APPLICATIONS OF GENRE FILTERING
Example 1: Movies that are BOTH Action AND Drama
Found 25 Action-Drama movies with rating > 7.0
Top examples:
                     title_x                 genres  imdb_rating
96          Raid (2018 film)     Action|Crime|Drama          7.4
106                    Raazi  Action|Drama|Thriller          7.8
112  Bhavesh Joshi Superhero           Action|Drama          7.6

------------------------------------------------------------
Example 2: Action movies that are NOT Horror
Found 12 Action movies (non-Horror) with rating > 7.5

------------------------------------------------------------
Example 3: Genre popularity analysis
Top 5 most common genres:
   Drama: 224 movies
   Action: 103 movies
   Comedy: 83 movies
   Romance: 69 movies
   Crime: 58 movies

------------------------------------------------------------
Example 4: Reusable filtering function
Comedy movies (rating ≥ 7.5, no Horror): 13

🔧 Technical Notes:
• lambda functions enable com

In [48]:
# Exercise: write a function that returns the head-to-head record of two teams
# It should keep only the matches in which the two given teams appear as Team1 and Team2
# (in either order) and then count the wins for each team

### Adding new cols

In [49]:
# Assigning a scalar to a column label that does not exist yet creates the column
# and broadcasts the value to every row - here each movie gets Country == 'India'
# Handy for tagging a whole frame with a constant piece of metadata
movies['Country'] = 'India'
movies.head(5)

Unnamed: 0,title_x,imdb_id,poster_path,wiki_link,title_y,original_title,is_adult,year_of_release,runtime,genres,imdb_rating,imdb_votes,story,summary,tagline,actors,wins_nominations,release_date,Country
0,Uri: The Surgical Strike,tt8291224,https://upload.wikimedia.org/wikipedia/en/thum...,https://en.wikipedia.org/wiki/Uri:_The_Surgica...,Uri: The Surgical Strike,Uri: The Surgical Strike,0,2019,138,Action|Drama|War,8.4,35112,Divided over five chapters the film chronicle...,Indian army special forces execute a covert op...,,Vicky Kaushal|Paresh Rawal|Mohit Raina|Yami Ga...,4 wins,11 January 2019 (USA),India
1,Battalion 609,tt9472208,,https://en.wikipedia.org/wiki/Battalion_609,Battalion 609,Battalion 609,0,2019,131,War,4.1,73,The story revolves around a cricket match betw...,The story of Battalion 609 revolves around a c...,,Vicky Ahuja|Shoaib Ibrahim|Shrikant Kamat|Elen...,,11 January 2019 (India),India
2,The Accidental Prime Minister (film),tt6986710,https://upload.wikimedia.org/wikipedia/en/thum...,https://en.wikipedia.org/wiki/The_Accidental_P...,The Accidental Prime Minister,The Accidental Prime Minister,0,2019,112,Biography|Drama,6.1,5549,Based on the memoir by Indian policy analyst S...,Explores Manmohan Singh's tenure as the Prime ...,,Anupam Kher|Akshaye Khanna|Aahana Kumra|Atul S...,,11 January 2019 (USA),India
3,Why Cheat India,tt8108208,https://upload.wikimedia.org/wikipedia/en/thum...,https://en.wikipedia.org/wiki/Why_Cheat_India,Why Cheat India,Why Cheat India,0,2019,121,Crime|Drama,6.0,1891,The movie focuses on existing malpractices in ...,The movie focuses on existing malpractices in ...,,Emraan Hashmi|Shreya Dhanwanthary|Snighdadeep ...,,18 January 2019 (USA),India
4,Evening Shadows,tt6028796,,https://en.wikipedia.org/wiki/Evening_Shadows,Evening Shadows,Evening Shadows,0,2018,102,Drama,7.3,280,While gay rights and marriage equality has bee...,Under the 'Evening Shadows' truth often plays...,,Mona Ambegaonkar|Ananth Narayan Mahadevan|Deva...,17 wins & 1 nomination,11 January 2019 (India),India


In [50]:
# dropna() with axis=0 / how='any' (spelled out here, these are the defaults)
# deletes every row that has a NaN in at least one column
# inplace=True mutates `movies` itself and returns None
# NOTE(review): a row is dropped if ANY column is missing (e.g. an empty tagline),
# which can discard many more rows than the lead-actor step below needs;
# dropna(subset=['actors']) would be narrower - confirm the intent
movies.dropna(axis=0, how='any', inplace=True)

In [51]:
# Creating a new column from existing data using string operations
# str.split('|') - Splits the pipe-separated actor string into a list
# .str[0]        - Vectorised pick of the first list element (the lead actor);
#                  unlike apply(lambda x: x[0]) it yields NaN for a missing
#                  'actors' value instead of raising TypeError
# Chain of operations: string → list → first element
movies['lead actor'] = movies['actors'].str.split('|').str[0]
movies.head()

Unnamed: 0,title_x,imdb_id,poster_path,wiki_link,title_y,original_title,is_adult,year_of_release,runtime,genres,imdb_rating,imdb_votes,story,summary,tagline,actors,wins_nominations,release_date,Country,lead actor
11,Gully Boy,tt2395469,https://upload.wikimedia.org/wikipedia/en/thum...,https://en.wikipedia.org/wiki/Gully_Boy,Gully Boy,Gully Boy,0,2019,153,Drama|Music,8.2,22440,"Gully Boy is a film about a 22-year-old boy ""M...",A coming-of-age story based on the lives of st...,Apna Time Aayega!,Ranveer Singh|Alia Bhatt|Siddhant Chaturvedi|V...,6 wins & 3 nominations,14 February 2019 (USA),India,Ranveer Singh
34,Yeh Hai India,tt5525846,https://upload.wikimedia.org/wikipedia/en/thum...,https://en.wikipedia.org/wiki/Yeh_Hai_India,Yeh Hai India,Yeh Hai India,0,2017,128,Action|Adventure|Drama,5.7,169,Yeh Hai India follows the story of a 25 years...,Yeh Hai India follows the story of a 25 years...,A Film for Every Indian,Gavie Chahal|Mohan Agashe|Mohan Joshi|Lom Harsh|,2 wins & 1 nomination,24 May 2019 (India),India,Gavie Chahal
37,Article 15 (film),tt10324144,https://upload.wikimedia.org/wikipedia/en/thum...,https://en.wikipedia.org/wiki/Article_15_(film),Article 15,Article 15,0,2019,130,Crime|Drama,8.3,13417,In the rural heartlands of India an upright p...,In the rural heartlands of India an upright p...,Farq Bahut Kar Liya| Ab Farq Laayenge.,Ayushmann Khurrana|Nassar|Manoj Pahwa|Kumud Mi...,1 win,28 June 2019 (USA),India,Ayushmann Khurrana
87,Aiyaary,tt6774212,https://upload.wikimedia.org/wikipedia/en/thum...,https://en.wikipedia.org/wiki/Aiyaary,Aiyaary,Aiyaary,0,2018,157,Action|Thriller,5.2,3538,General Gurinder Singh comes with a proposal t...,After finding out about an illegal arms deal ...,The Ultimate Trickery,Sidharth Malhotra|Manoj Bajpayee|Rakul Preet S...,1 nomination,16 February 2018 (USA),India,Sidharth Malhotra
96,Raid (2018 film),tt7363076,https://upload.wikimedia.org/wikipedia/en/thum...,https://en.wikipedia.org/wiki/Raid_(2018_film),Raid,Raid,0,2018,122,Action|Crime|Drama,7.4,13159,Set in the 80s in Uttar Pradesh India Raid i...,A fearless income tax officer raids the mansio...,Heroes don't always come in uniform,Ajay Devgn|Saurabh Shukla|Ileana D'Cruz|Amit S...,2 wins & 3 nominations,16 March 2018 (India),India,Ajay Devgn


In [52]:
# info() - Prints the index range plus each column's non-null count and dtype
# Run here to check the frame after 'Country' and 'lead actor' were added and
# the rows with missing values were dropped
movies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 298 entries, 11 to 1623
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   title_x           298 non-null    object 
 1   imdb_id           298 non-null    object 
 2   poster_path       298 non-null    object 
 3   wiki_link         298 non-null    object 
 4   title_y           298 non-null    object 
 5   original_title    298 non-null    object 
 6   is_adult          298 non-null    int64  
 7   year_of_release   298 non-null    int64  
 8   runtime           298 non-null    object 
 9   genres            298 non-null    object 
 10  imdb_rating       298 non-null    float64
 11  imdb_votes        298 non-null    int64  
 12  story             298 non-null    object 
 13  summary           298 non-null    object 
 14  tagline           298 non-null    object 
 15  actors            298 non-null    object 
 16  wins_nominations  298 non-null    object 
 17  

### Important DataFrame Functions

In [53]:
# info() - Baseline look at the ipl frame before any type conversion:
# note each column's dtype and non-null count so the later info() calls
# (after astype) can be compared against this one
ipl.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 950 entries, 0 to 949
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ID               950 non-null    int64  
 1   City             899 non-null    object 
 2   Date             950 non-null    object 
 3   Season           950 non-null    object 
 4   MatchNumber      950 non-null    object 
 5   Team1            950 non-null    object 
 6   Team2            950 non-null    object 
 7   Venue            950 non-null    object 
 8   TossWinner       950 non-null    object 
 9   TossDecision     950 non-null    object 
 10  SuperOver        946 non-null    object 
 11  WinningTeam      946 non-null    object 
 12  WonBy            950 non-null    object 
 13  Margin           932 non-null    float64
 14  method           19 non-null     object 
 15  Player_of_Match  946 non-null    object 
 16  Team1Players     950 non-null    object 
 17  Team2Players    

In [54]:
# astype() returns the column converted to the requested dtype; assigning it back
# replaces the original int64 'ID' column with a 32-bit integer version
# A narrower integer type halves the bytes stored per value, which adds up on large datasets
ipl['ID'] = ipl['ID'].astype(np.int32)

In [55]:
# info() - Verifying the conversion: 'ID' should now be listed as int32
# Compare the reported memory usage with the previous info() call
ipl.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 950 entries, 0 to 949
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ID               950 non-null    int32  
 1   City             899 non-null    object 
 2   Date             950 non-null    object 
 3   Season           950 non-null    object 
 4   MatchNumber      950 non-null    object 
 5   Team1            950 non-null    object 
 6   Team2            950 non-null    object 
 7   Venue            950 non-null    object 
 8   TossWinner       950 non-null    object 
 9   TossDecision     950 non-null    object 
 10  SuperOver        946 non-null    object 
 11  WinningTeam      946 non-null    object 
 12  WonBy            950 non-null    object 
 13  Margin           932 non-null    float64
 14  method           19 non-null     object 
 15  Player_of_Match  946 non-null    object 
 16  Team1Players     950 non-null    object 
 17  Team2Players    

In [56]:
# 'category' dtype suits string columns that repeat a small set of values (team names, etc.):
# each distinct value is stored once and the rows hold compact integer codes
# Both team columns get the same treatment, so convert them in a loop
# ipl['Season'] = ipl['Season'].astype('category')  # Commented out
for team_col in ('Team1', 'Team2'):
    ipl[team_col] = ipl[team_col].astype('category')

In [57]:
# info() - Checking the result of the category conversion:
# 'Team1' and 'Team2' should now be listed with dtype category
# NOTE(review): for object columns info() reports a shallow memory estimate;
# ipl.info(memory_usage='deep') gives exact figures - use it if the numbers matter
ipl.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 950 entries, 0 to 949
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   ID               950 non-null    int32   
 1   City             899 non-null    object  
 2   Date             950 non-null    object  
 3   Season           950 non-null    object  
 4   MatchNumber      950 non-null    object  
 5   Team1            950 non-null    category
 6   Team2            950 non-null    category
 7   Venue            950 non-null    object  
 8   TossWinner       950 non-null    object  
 9   TossDecision     950 non-null    object  
 10  SuperOver        946 non-null    object  
 11  WinningTeam      946 non-null    object  
 12  WonBy            950 non-null    object  
 13  Margin           932 non-null    float64 
 14  method           19 non-null     object  
 15  Player_of_Match  946 non-null    object  
 16  Team1Players     950 non-null    object  
 1

In [58]:
# value_counts() - Counts frequency of unique values in a Series
# Returns Series with unique values as index and their counts as values
# Automatically sorts by count in descending order
# Example: ipl['WinningTeam'].value_counts() would show wins per team

In [59]:
# value_counts() application - Finding the player with the most Player of the Match awards
# First filter for finals/qualifiers with isin(), then count the POTM winners
# Example: ipl[ipl['MatchNumber'].isin(['Final','Qualifier 1','Qualifier 2'])]['Player_of_Match'].value_counts()
# (check ipl['MatchNumber'].unique() for the exact playoff labels used in the data)
# Exercise: find which player has won the most POTM awards in finals and qualifiers

In [60]:
# value_counts() for plotting - Analyzing toss decision patterns
# ipl['TossDecision'].value_counts() shows frequency of 'bat' vs 'field' decisions
# Can be used with plotting libraries for visualization
# Toss decision plot

In [61]:
# value_counts() for team analysis - Counting matches played by each team
# Need to combine Team1 and Team2 columns to get total matches per team
# pd.concat([ipl['Team1'], ipl['Team2']]).value_counts() would show total matches
# how many matches each team has played

In [62]:
# sort_values() - Sorts DataFrame by one or more columns
# Parameters:
# - ascending=True/False - Sort order (default: True)
# - na_position='first'/'last' - Where to place NaN values (default: 'last')
# - inplace=True/False - Modify original DataFrame (default: False)
# - Multiple columns: by=['col1', 'col2'] for multi-level sorting
# sort_values -> ascending -> na_position -> inplace -> multiple cols