In [1]:
# Build a pandas DataFrame from a dictionary of lists:
# each key becomes a column label and each list holds that column's values.
import pandas as pd

data = dict(
    Name=["Ram", "Shyam", "Utsav"],
    Age=[20, 22, 21],
    City=["Kathmandu", "Butwal", "Kathmandu"],
)
df = pd.DataFrame(data)
print(df)

    Name  Age       City
0    Ram   20  Kathmandu
1  Shyam   22     Butwal
2  Utsav   21  Kathmandu


In [None]:
# Load the IMDB CSV file into a DataFrame and show its first 5 rows.
import pandas as pd

csv_path = "../datasets/IMDB_processed_data.csv"
df = pd.read_csv(csv_path)
df.head(5)

Unnamed: 0,Rank,Title,Release,Runtime,Rated,Ratings
0,1,The Shawshank Redemption,1994,2h 22m,R,9.3
1,2,The Godfather,1972,2h 55m,R,9.2
2,3,The Dark Knight,2008,2h 32m,PG-13,9.0
3,4,The Godfather Part II,1974,3h 22m,R,9.0
4,5,12 Angry Men,1957,1h 36m,Approved,9.0


In [9]:
# Get the number of rows and columns of a DataFrame.
n_rows, n_cols = df.shape  # shape is a (rows, columns) tuple
(n_rows, n_cols)

(250, 6)

In [24]:
# Display all column names and their data types.

print(df.columns)  # the column labels
print()
print(df.dtypes)   # the dtype of every column
print()
# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly; wrapping it in print() would add a stray "None" line.
df.info()

Index(['Rank', 'Title', 'Release', 'Runtime', 'Rated', 'Ratings'], dtype='object')

Rank         int64
Title       object
Release      int64
Runtime     object
Rated       object
Ratings    float64
dtype: object

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Rank     250 non-null    int64  
 1   Title    250 non-null    object 
 2   Release  250 non-null    int64  
 3   Runtime  250 non-null    object 
 4   Rated    250 non-null    object 
 5   Ratings  250 non-null    float64
dtypes: float64(1), int64(2), object(3)
memory usage: 11.8+ KB
None


In [28]:
# Select a single column ('Title') and multiple columns ('Title', 'Ratings') from a DataFrame.

title_column = df["Title"]                    # one label in single brackets -> pandas Series
title_and_ratings = df[["Title", "Ratings"]]  # a list of labels -> DataFrame
print(title_column)
print(title_and_ratings)

0       The Shawshank Redemption
1                  The Godfather
2                The Dark Knight
3          The Godfather Part II
4                   12 Angry Men
                 ...            
245                Amores Perros
246                     The Help
247                      Rebecca
248    A Silent Voice: The Movie
249                    Andhadhun
Name: Title, Length: 250, dtype: object
                         Title  Ratings
0     The Shawshank Redemption      9.3
1                The Godfather      9.2
2              The Dark Knight      9.0
3        The Godfather Part II      9.0
4                 12 Angry Men      9.0
..                         ...      ...
245              Amores Perros      8.0
246                   The Help      8.1
247                    Rebecca      8.1
248  A Silent Voice: The Movie      8.1
249                  Andhadhun      8.2

[250 rows x 2 columns]


In [30]:
# Select the first 3 rows using iloc (and the same rows using loc for comparison).
rows_by_position = df.iloc[:3]  # iloc: integer positions, slice end is excluded
rows_by_label = df.loc[0:2]     # loc: index labels, slice end is included;
                                # with labels a, b, c, d the equivalent is df.loc['a':'c']
print(rows_by_position)
print(rows_by_label)

   Rank                     Title  Release Runtime  Rated  Ratings
0     1  The Shawshank Redemption     1994  2h 22m      R      9.3
1     2             The Godfather     1972  2h 55m      R      9.2
2     3           The Dark Knight     2008  2h 32m  PG-13      9.0
   Rank                     Title  Release Runtime  Rated  Ratings
0     1  The Shawshank Redemption     1994  2h 22m      R      9.3
1     2             The Godfather     1972  2h 55m      R      9.2
2     3           The Dark Knight     2008  2h 32m  PG-13      9.0


In [None]:
# Count the missing values in each column of the DataFrame.
missing_per_column = df.isna().sum()  # isna() performs the same check as isnull()
print(missing_per_column)

Rank       0
Title      0
Release    0
Runtime    0
Rated      0
Ratings    0
dtype: int64


In [None]:
# Replace all NaN values with "Unknown".
# The data set has no missing values at this point, so nothing is actually replaced here.
df_filled = df.fillna(value="Unknown")

In [43]:
# Inject one missing value into 'Release' so there is something for fillna to replace.
# 'Release' is an int64 column, which cannot hold NaN. Cast it to float64 explicitly
# first: relying on the implicit upcast during assignment is deprecated in
# pandas >= 2.1 and is planned to raise an error.
df["Release"] = df["Release"].astype("float64")
df.loc[2, "Release"] = None  # .loc takes the column label; the positional form is df.iloc[2, 2]
print(df.head())

   Rank                     Title  Release Runtime     Rated  Ratings
0     1  The Shawshank Redemption   1994.0  2h 22m         R      9.3
1     2             The Godfather   1972.0  2h 55m         R      9.2
2     3           The Dark Knight      NaN  2h 32m     PG-13      9.0
3     4     The Godfather Part II   1974.0  3h 22m         R      9.0
4     5              12 Angry Men   1957.0  1h 36m  Approved      9.0


In [45]:
# Replace every missing value with "Unknown" and preview the first rows of the result.
df_filled = df.fillna(value="Unknown")
filled_preview = df_filled.head()
print(filled_preview)

   Rank                     Title  Release Runtime     Rated  Ratings
0     1  The Shawshank Redemption   1994.0  2h 22m         R      9.3
1     2             The Godfather   1972.0  2h 55m         R      9.2
2     3           The Dark Knight  Unknown  2h 32m     PG-13      9.0
3     4     The Godfather Part II   1974.0  3h 22m         R      9.0
4     5              12 Angry Men   1957.0  1h 36m  Approved      9.0


In [52]:
# Sort the rows by 'Release' in descending order and show the first rows.
df_sort_by_release = df.sort_values("Release", ascending=False)
df_sort_by_release.head()

Unnamed: 0,Rank,Title,Release,Runtime,Rated,Ratings
53,54,Dune: Part Two,2024.0,2h 46m,PG-13,8.5
207,208,Maharaja,2024.0,2h 21m,Not Rated,8.4
159,160,The Wild Robot,2024.0,1h 42m,PG,8.2
43,44,Spider-Man: Across the Spider-Verse,2023.0,2h 20m,PG,8.5
65,66,12th Fail,2023.0,2h 27m,Not Rated,8.8


In [55]:
# Mean of 'Ratings' for each 'Rated' category.
df_pivot_table = df.pivot_table(
    values="Ratings",
    index="Rated",
    aggfunc="mean",
)
print(df_pivot_table)

            Ratings
Rated              
Approved   8.272727
G          8.255556
NC-17      8.300000
Not Rated  8.282759
PG         8.292308
PG-13      8.377143
Passed     8.180000
R          8.325743
