In [4]:
# What Is Pandas?

In [5]:
# Pandas is one of the most popular data science libraries in Python. Easy to use,
# it is built on top of NumPy and shares many functions and properties.

# With Pandas, you can read and extract data from files, transform and analyze it,
# calculate statistics and correlations, and much more!

# To start using pandas, we need to import it first:

In [6]:
import pandas as pd

In [7]:
# pd is a common short name used when importing the library.

In [8]:
# Pandas is derived from the term "panel data",
# an econometrics term for data sets that include observations over multiple time periods for the same individuals.

In [9]:
# Series & DataFrames:

In [10]:
# The two primary components of pandas are the Series and the DataFrame.

# A Series is essentially a column, and a DataFrame is a two-dimensional table made up of a collection of Series.

# For example, the following DataFrame is made of two Series, ages and heights

<img src="https://api.sololearn.com/DownloadFile?id=4554"/>

In [11]:
# You can think of a Series as a one-dimensional array, while a DataFrame is a two-dimensional array (a table with rows and columns).

In [12]:
# DataFrames

In [13]:
# Before working with real data, let's first create a DataFrame manually to explore its functions.
# The easiest way to create a DataFrame is using a dictionary:

In [14]:
# Column name -> list of values; each keyword becomes one DataFrame column.
data = dict(
    ages=[14, 18, 24, 42],
    height=[165, 180, 176, 184],
)

In [15]:
# Each key is a column, while the value is an array representing the data for that column.

# Now, we can pass this dictionary to the DataFrame constructor:

In [16]:
# Build the DataFrame from the dictionary: one column per key.
# (No trailing semicolon: an assignment produces no cell output to suppress.)
df = pd.DataFrame(data)

In [17]:
# Run the code to see the resulting DataFrame.

In [18]:
print(df)

   ages  height
0    14     165
1    18     180
2    24     176
3    42     184


In [19]:
# The DataFrame automatically creates a numeric index for each row.
# We can specify a custom index when creating the DataFrame:

In [20]:
df = pd.DataFrame(data, index=[1, 2, 3, 4])

In [21]:
print(df)

   ages  height
1    14     165
2    18     180
3    24     176
4    42     184


In [22]:
# Now we can access a row using its index label and the loc[] indexer:

In [23]:
print(df.loc[2])

ages       18
height    180
Name: 2, dtype: int64


In [24]:
# This will output the row that corresponds to the index "2".

In [25]:
# Note that loc uses square brackets, not parentheses, to specify the index.

In [26]:
# Indexing: 

In [27]:
# We can select a single column by specifying its name in square brackets:

In [28]:
print(df["ages"])

1    14
2    18
3    24
4    42
Name: ages, dtype: int64


In [29]:
# The result is a Series object.

In [30]:
# If we want to select multiple columns, we can specify a list of column names:

In [31]:
print(df[["ages", "height"]])

   ages  height
1    14     165
2    18     180
3    24     176
4    42     184


In [32]:
# This time, the result is a DataFrame, as it includes multiple columns.

In [33]:
# This is useful when we need to select only some of the columns from the dataset.

In [34]:
# Slicing: 

In [35]:
# Pandas uses the iloc indexer to select data based on its integer position (not its index label).
# It works the same way indexing lists does in Python.

In [36]:
# third row:

In [37]:
print(df.iloc[2])

ages       24
height    176
Name: 3, dtype: int64


In [38]:
# first three rows:

In [39]:
print(df.iloc[:3])

   ages  height
1    14     165
2    18     180
3    24     176


In [40]:
print(df.iloc[1:3])

   ages  height
2    18     180
3    24     176


In [41]:
# iloc follows the same rules as slicing does with Python lists.

In [42]:
# Conditions: 

In [43]:
# We can also select the data based on a condition.
# For example, let's select all rows where age is greater than 18 and height is greater than 180:

In [44]:
# Each comparison yields a boolean Series; & combines them element-wise.
# The parentheses are required because & binds more tightly than > in Python.
print(df[(df["ages"] > 18) & (df["height"] > 180)])

   ages  height
4    42     184


In [45]:
# Similarly, the | (or) operator can be used to combine conditions.

In [46]:
# Reading Data: 

In [47]:
# It is quite common for data to come in a file format. One of the most popular formats is the CSV (comma-separated values).
# Pandas supports reading data from a CSV file directly into a DataFrame.

# For our examples, we will use a small CSV file, 'prac.csv', that contains sample student records (name, roll number, education, city, gender, date, month and number of cases).

# The read_csv() function reads the data of a CSV file into a DataFrame:

In [48]:
# Load the CSV into a DataFrame; this rebinds df, replacing the manually built one.
# NOTE(review): absolute, machine-specific Windows path — this cell only runs where
# prac.csv exists at exactly this location; consider a path relative to the notebook.
df = pd.read_csv("C:/Users/User/Downloads/prac.csv")

In [49]:
print(df)

    Name:  Roll No:         Education:          City:  Gender:       Date:  \
0    Mark      1876         SCC-Part 1         Karachi    Male  16-12-2021   
1     Ace      1877         SSC-Part 2       Hyderabad    Male  16-12-2021   
2   Myers      1878           BA-Part2          Lahore    Male  16-12-2021   
3   Joker      1879           BA-Part1       Islamabad    Male  16-12-2021   
4    Umer      1880         University          Punjab    Male  16-12-2021   
5    Emma      1881                 CA           Dubai  Female  16-12-2021   
6    Liam      1882            Bachlar       Hong Kong    Male  16-12-2021   
7   James      1883    Web Development  FaisalabadMale     NaN  16-12-2021   
8  Elijah      1884  Android Developer       Nazamabad    Male  16-12-2021   

     Month:  Cases:  
0      June       2  
1  February       3  
2     March       4  
3     April      23  
4       May      42  
5      June      32  
6      July       3  
7  February       3  
8      June      12  

In [50]:
# We need to provide the file path to the read_csv() function.

In [51]:
# Pandas also supports reading from JSON files, as well as SQL databases.

In [52]:
# Once we have the data in a DataFrame, we can start exploring it.
# We can get the first rows of the data using the head() function of the DataFrame:

In [53]:
print(df.head())

   Name:  Roll No:  Education:     City:  Gender:       Date:    Month:  \
0   Mark      1876  SCC-Part 1    Karachi    Male  16-12-2021      June   
1    Ace      1877  SSC-Part 2  Hyderabad    Male  16-12-2021  February   
2  Myers      1878    BA-Part2     Lahore    Male  16-12-2021     March   
3  Joker      1879    BA-Part1  Islamabad    Male  16-12-2021     April   
4   Umer      1880  University     Punjab    Male  16-12-2021       May   

   Cases:  
0       2  
1       3  
2       4  
3      23  
4      42  


In [54]:
# By default it returns the first 5 rows. You can instruct it to return the number
# of rows you would like as an argument (for example, df.head(10) will return the first 10 rows).

In [55]:
print(df.head(2))

  Name:  Roll No:  Education:     City:  Gender:       Date:    Month:  Cases:
0  Mark      1876  SCC-Part 1    Karachi    Male  16-12-2021      June       2
1   Ace      1877  SSC-Part 2  Hyderabad    Male  16-12-2021  February       3


In [56]:
# We can see that our DataFrame contains the name, roll number, education, city, gender, date, month and number of cases for each record.

In [57]:
# Similarly, you can get the last rows using the tail() function. For example:

In [58]:
print(df.tail())

    Name:  Roll No:         Education:          City:  Gender:       Date:  \
4    Umer      1880         University          Punjab    Male  16-12-2021   
5    Emma      1881                 CA           Dubai  Female  16-12-2021   
6    Liam      1882            Bachlar       Hong Kong    Male  16-12-2021   
7   James      1883    Web Development  FaisalabadMale     NaN  16-12-2021   
8  Elijah      1884  Android Developer       Nazamabad    Male  16-12-2021   

     Month:  Cases:  
4       May      42  
5      June      32  
6      July       3  
7  February       3  
8      June      12  


In [59]:
print(df.tail(2))

    Name:  Roll No:         Education:          City:  Gender:       Date:  \
7   James      1883    Web Development  FaisalabadMale     NaN  16-12-2021   
8  Elijah      1884  Android Developer       Nazamabad    Male  16-12-2021   

     Month:  Cases:  
7  February       3  
8      June      12  


In [60]:
# The info() function is used to get essential information about your dataset,
# such as number of rows, columns, data types, etc:

In [61]:
# info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() would emit an extra "None" line after the report.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Name:       9 non-null      object
 1   Roll No:    9 non-null      int64 
 2   Education:  9 non-null      object
 3   City:       9 non-null      object
 4   Gender:     8 non-null      object
 5   Date:       9 non-null      object
 6   Month:      9 non-null      object
 7   Cases:      9 non-null      int64 
dtypes: int64(2), object(6)
memory usage: 704.0+ bytes
None


In [62]:
# Run the code to see the result!

In [63]:
# From the result, we can see that our dataset contains 9 rows and 8 columns: Name:, Roll No:, Education:, City:, Gender:, Date:, Month: and Cases:.

# We also see that Pandas has added an auto generated index.
# We can set our own index column by using the set_index() function:

In [64]:
df.set_index("Roll No:", inplace=True)

In [65]:
print(df)

           Name:         Education:          City:  Gender:       Date:  \
Roll No:                                                                  
1876        Mark         SCC-Part 1         Karachi    Male  16-12-2021   
1877         Ace         SSC-Part 2       Hyderabad    Male  16-12-2021   
1878       Myers           BA-Part2          Lahore    Male  16-12-2021   
1879       Joker           BA-Part1       Islamabad    Male  16-12-2021   
1880        Umer         University          Punjab    Male  16-12-2021   
1881        Emma                 CA           Dubai  Female  16-12-2021   
1882        Liam            Bachlar       Hong Kong    Male  16-12-2021   
1883       James    Web Development  FaisalabadMale     NaN  16-12-2021   
1884      Elijah  Android Developer       Nazamabad    Male  16-12-2021   

            Month:  Cases:  
Roll No:                    
1876          June       2  
1877      February       3  
1878         March       4  
1879         April      23  


In [66]:
# The Roll No column is a good choice for our index, as there is one row for each Roll No.

In [67]:
# The inplace=True argument specifies that the change will be applied to our DataFrame,
# without the need to assign it to a new DataFrame variable.

In [68]:
# Dropping a Column: 

In [69]:
# We do not need the Name: column for our analysis,
# so we can remove it from our DataFrame:

In [70]:
# Remove Columns Using Column Name.

In [71]:
df.drop("Name:", axis=1, inplace=True)

In [72]:
print(df)

                 Education:          City:  Gender:       Date:    Month:  \
Roll No:                                                                    
1876             SCC-Part 1         Karachi    Male  16-12-2021      June   
1877             SSC-Part 2       Hyderabad    Male  16-12-2021  February   
1878               BA-Part2          Lahore    Male  16-12-2021     March   
1879               BA-Part1       Islamabad    Male  16-12-2021     April   
1880             University          Punjab    Male  16-12-2021       May   
1881                     CA           Dubai  Female  16-12-2021      June   
1882                Bachlar       Hong Kong    Male  16-12-2021      July   
1883        Web Development  FaisalabadMale     NaN  16-12-2021  February   
1884      Android Developer       Nazamabad    Male  16-12-2021      June   

          Cases:  
Roll No:          
1876           2  
1877           3  
1878           4  
1879          23  
1880          42  
1881          32  


In [73]:
# drop() deletes rows and columns.
# axis=1 specifies that we want to drop a column.
# axis=0 will drop a row.

In [74]:
# Remove Rows Using Index Value

In [75]:
df.drop(1878, axis=0, inplace=True)

In [76]:
print(df)

                 Education:          City:  Gender:       Date:    Month:  \
Roll No:                                                                    
1876             SCC-Part 1         Karachi    Male  16-12-2021      June   
1877             SSC-Part 2       Hyderabad    Male  16-12-2021  February   
1879               BA-Part1       Islamabad    Male  16-12-2021     April   
1880             University          Punjab    Male  16-12-2021       May   
1881                     CA           Dubai  Female  16-12-2021      June   
1882                Bachlar       Hong Kong    Male  16-12-2021      July   
1883        Web Development  FaisalabadMale     NaN  16-12-2021  February   
1884      Android Developer       Nazamabad    Male  16-12-2021      June   

          Cases:  
Roll No:          
1876           2  
1877           3  
1879          23  
1880          42  
1881          32  
1882           3  
1883           3  
1884          12  


In [77]:
# Now our dataset is much cleaner: we have a Roll No: index and only the columns we need.

In [78]:
# Creating Columns: 

In [79]:
# Pandas allows us to create our own columns.

In [80]:
# For example, we can add a month column based on the date column:

In [99]:
# Parse the "Date:" strings (day-month-year, e.g. 16-12-2021) as datetimes and
# store only the month name in a new "month" column.
df['month'] = pd.to_datetime(df["Date:"], format="%d-%m-%Y").dt.month_name()

In [None]:
# We do this by converting the date column to datetime and extracting the month name from it,
# assigning the value to our new month column.

In [82]:
# Our date is in DD-MM-YYYY format (for example, 16-12-2021), which is why we need to specify the format argument.

In [83]:
# Summary Statistics:

In [84]:
# Now that our dataset is clean and set up, we are ready to look into some stats!
# The describe() function returns the summary statistics for all the numeric columns:

In [85]:
print(df.describe())

          Cases:
count   8.000000
mean   15.000000
std    15.547163
min     2.000000
25%     3.000000
50%     7.500000
75%    25.250000
max    42.000000


In [86]:
# This function will show main statistics for the numeric columns, such as std, mean, min, max values, etc.

In [87]:
# We can also get the summary stats for a single column, for example:

In [88]:
df['Date:'].describe()

count              8
unique             1
top       16-12-2021
freq               8
Name: Date:, dtype: object

In [89]:
# Grouping:

In [90]:
df['Month:'].value_counts()

June        3
February    2
May         1
April       1
July        1
Name: Month:, dtype: int64

In [91]:
# We can see that, for example, June has 3 records and February has 2, while each of the other months has only one.

In [92]:
# value_counts() returns how many times a value appears in the dataset, also called the frequency of the values.

In [93]:
# Now we can calculate data insights!
# For example, let's determine the total number of cases in each month.
# To do this, we need to group our data by the month column and then calculate the sum of the cases column for each month:

In [94]:
df.groupby('Month:')['Cases:'].sum()

Month:
April       23
February     6
July         3
June        46
May         42
Name: Cases:, dtype: int64

In [95]:
# The groupby() function is used to group our dataset by the given column.

# We can also calculate the total number of cases in the entire dataset:

In [96]:
df['Cases:'].sum()

120

In [97]:
# We can see that the dataset contains 120 cases in total.

In [98]:
# Similarly, we can use min(), max(), mean(), etc. to find the corresponding values for each group.

In [437]:
#                                  😍 Congratulations Course Has Been Completed 😍