# Pandas

In [1]:
import pandas as pd  # `pd` is the alias every later cell uses to reach pandas
print(pd.__version__)  # report which pandas version is installed

1.3.0


In [2]:
# Column-oriented data: each key becomes a column name and each list holds
# that column's values.
data = dict(
    ages=[14, 18, 24, 42],
    heights=[165, 180, 176, 184],
)

# Build a DataFrame from the dictionary; with no index given, the rows are
# labelled 0, 1, 2, ...
df = pd.DataFrame(data=data)
print(type(df), df, sep='\n')

<class 'pandas.core.frame.DataFrame'>
   ages  heights
0    14      165
1    18      180
2    24      176
3    42      184


In [3]:
# Column data keyed by column name.
data = dict(
    ages=[14, 18, 24, 42],
    heights=[165, 180, 176, 184],
)

# Label the rows with names instead of the default 0..n-1 integers by passing
# one label per row through `index`.
df = pd.DataFrame(
    data=data,
    index=['James', 'Bob', 'Amy', 'Dave'],
)
print(df)

       ages  heights
James    14      165
Bob      18      180
Amy      24      176
Dave     42      184


In [4]:
# Access a row by its index label with the .loc[] indexer. Note that .loc is
# used with square brackets; it is an indexer, not a function call.
# The selected row comes back as a Series whose name is the row label.
print(df.loc['Bob'])

ages        18
heights    180
Name: Bob, dtype: int64


## Indexing 

In [5]:
# Selecting one column with a single pair of brackets yields a pandas Series.
x = df['ages']
print(type(x), x, sep='\n')

<class 'pandas.core.series.Series'>
James    14
Bob      18
Amy      24
Dave     42
Name: ages, dtype: int64


In [6]:
# Passing a list of column names (hence the double brackets) yields a
# DataFrame; the same form works for any number of columns.
y = df[['ages', 'heights']]
print(type(y), y, sep='\n')

<class 'pandas.core.frame.DataFrame'>
       ages  heights
James    14      165
Bob      18      180
Amy      24      176
Dave     42      184


## Slicing

In [7]:
# Position-based row selection with the .iloc[] indexer (square brackets, not
# a function call).
data = dict(
    ages=[14, 18, 24, 42],
    heights=[165, 180, 176, 184],
)
df = pd.DataFrame(data=data, index=['James', 'Bob', 'Amy', 'Dave'])

print(df.iloc[2])    # a single position -> the third row, returned as a Series
print(df.iloc[0:3])  # positions 0, 1 and 2; the stop position is excluded
print(df.iloc[1:3])  # positions 1 and 2 -> the second and third rows

ages        24
heights    176
Name: Amy, dtype: int64
       ages  heights
James    14      165
Bob      18      180
Amy      24      176
     ages  heights
Bob    18      180
Amy    24      176


In [8]:
# Boolean-mask selection: keep the rows where age > 18 AND height > 160.
# Each comparison needs its own parentheses because `&` binds tighter than `>`.
print(df.loc[(df['ages'] > 18) & (df['heights'] > 160)])

      ages  heights
Amy     24      176
Dave    42      184


In [9]:
# Boolean-mask selection: keep the rows where age > 18 OR height > 160.
# Every height in this frame exceeds 160, so all four rows are selected.
print(df.loc[(df['ages'] > 18) | (df['heights'] > 160)])

       ages  heights
James    14      165
Bob      18      180
Amy      24      176
Dave     42      184


# Reading Data

In [37]:
# Read the CSV file into a DataFrame. The path is relative, so the file is
# looked up in the current working directory. This rebinds `df`, replacing
# the small example frame used in the earlier cells.
df = pd.read_csv('covid_19_india.csv')

In [38]:
# head() with no argument returns the first five rows.
df.head()

Unnamed: 0,Sno,Date,Time,State/UnionTerritory,ConfirmedIndianNational,ConfirmedForeignNational,Cured,Deaths,Confirmed
0,1,2020-01-30,6:00 PM,Kerala,1,0,0,0,1
1,2,2020-01-31,6:00 PM,Kerala,1,0,0,0,1
2,3,2020-02-01,6:00 PM,Kerala,2,0,0,0,2
3,4,2020-02-02,6:00 PM,Kerala,3,0,0,0,3
4,5,2020-02-03,6:00 PM,Kerala,3,0,0,0,3


In [39]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns; the text columns are left out of the result.
df.describe()

Unnamed: 0,Sno,Cured,Deaths,Confirmed
count,17786.0,17786.0,17786.0,17786.0
mean,8893.5,267997.8,3910.019454,290378.9
std,5134.520279,592802.6,10532.815815,634766.5
min,1.0,0.0,0.0,0.0
25%,4447.25,3263.5,30.0,4209.75
50%,8893.5,32932.5,556.0,38157.5
75%,13339.75,268604.8,3507.75,292118.8
max,17786.0,6094896.0,132948.0,6310194.0


In [40]:
# tail(n) returns the last n rows; here, the last 10.
df.tail(10)

Unnamed: 0,Sno,Date,Time,State/UnionTerritory,ConfirmedIndianNational,ConfirmedForeignNational,Cured,Deaths,Confirmed
17776,17777,2021-08-02,8:00 AM,Puducherry,-,-,118228,1795,121005
17777,17778,2021-08-02,8:00 AM,Punjab,-,-,582332,16294,599130
17778,17779,2021-08-02,8:00 AM,Rajasthan,-,-,944484,8954,953688
17779,17780,2021-08-02,8:00 AM,Sikkim,-,-,22955,345,26754
17780,17781,2021-08-02,8:00 AM,Tamil Nadu,-,-,2506961,34102,2561587
17781,17782,2021-08-02,8:00 AM,Telangana,-,-,632728,3805,645406
17782,17783,2021-08-02,8:00 AM,Tripura,-,-,74875,755,78722
17783,17784,2021-08-02,8:00 AM,Uttarakhand,-,-,334190,7362,342161
17784,17785,2021-08-02,8:00 AM,Uttar Pradesh,-,-,1685049,22763,1708476
17785,17786,2021-08-02,8:00 AM,West Bengal,-,-,1499597,18149,1528720


In [41]:
# Prints a structural summary of the frame: index range, column names,
# non-null counts, dtypes and approximate memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17786 entries, 0 to 17785
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Sno                       17786 non-null  int64 
 1   Date                      17786 non-null  object
 2   Time                      17786 non-null  object
 3   State/UnionTerritory      17786 non-null  object
 4   ConfirmedIndianNational   17786 non-null  object
 5   ConfirmedForeignNational  17786 non-null  object
 6   Cured                     17786 non-null  int64 
 7   Deaths                    17786 non-null  int64 
 8   Confirmed                 17786 non-null  int64 
dtypes: int64(4), object(5)
memory usage: 1.2+ MB


In [42]:
# set_index() returns a NEW DataFrame that uses the 'Date' column as its
# index; that result is stored in `y`.
y = df.set_index('Date')
# `df` itself is unchanged because the call above was not made in place:
# 'Date' is still an ordinary column here.
df.head()

Unnamed: 0,Sno,Date,Time,State/UnionTerritory,ConfirmedIndianNational,ConfirmedForeignNational,Cured,Deaths,Confirmed
0,1,2020-01-30,6:00 PM,Kerala,1,0,0,0,1
1,2,2020-01-31,6:00 PM,Kerala,1,0,0,0,1
2,3,2020-02-01,6:00 PM,Kerala,2,0,0,0,2
3,4,2020-02-02,6:00 PM,Kerala,3,0,0,0,3
4,5,2020-02-03,6:00 PM,Kerala,3,0,0,0,3


In [43]:
# First five rows of `y`: 'Date' now appears as the index instead of a column.
y.head()

Unnamed: 0_level_0,Sno,Time,State/UnionTerritory,ConfirmedIndianNational,ConfirmedForeignNational,Cured,Deaths,Confirmed
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-01-30,1,6:00 PM,Kerala,1,0,0,0,1
2020-01-31,2,6:00 PM,Kerala,1,0,0,0,1
2020-02-01,3,6:00 PM,Kerala,2,0,0,0,2
2020-02-02,4,6:00 PM,Kerala,3,0,0,0,3
2020-02-03,5,6:00 PM,Kerala,3,0,0,0,3


In [36]:
# Demonstrate `inplace=True` on a copy so that `df` keeps 'Date' as a regular
# column. An in-place set_index on `df` itself would remove the 'Date' column
# for every later cell and would raise a KeyError if this cell were run twice.
df_indexed = df.copy()
df_indexed.set_index('Date', inplace=True) # sets date column as our index
# With inplace=True the change is applied to the DataFrame the method is called on
# (and the call returns None), so there is no result to assign to a new variable.
df_indexed.head()

Unnamed: 0_level_0,Sno,Time,State/UnionTerritory,ConfirmedIndianNational,ConfirmedForeignNational,Cured,Deaths,Confirmed
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-01-30,1,6:00 PM,Kerala,1,0,0,0,1
2020-01-31,2,6:00 PM,Kerala,1,0,0,0,1
2020-02-01,3,6:00 PM,Kerala,2,0,0,0,2
2020-02-02,4,6:00 PM,Kerala,3,0,0,0,3
2020-02-03,5,6:00 PM,Kerala,3,0,0,0,3


## Dropping a Column/Row
### `drop()` deletes rows or columns.
### `axis=1` specifies that we want to drop a column.
### `axis=0` (the default) specifies that we want to drop a row.


In [44]:
# Remove the 'Sno' column from `df` itself. axis='columns' is the named
# spelling of axis=1, and inplace=True modifies `df` rather than returning a
# new frame.
df.drop(labels='Sno', axis='columns', inplace=True)
df.head()

Unnamed: 0,Date,Time,State/UnionTerritory,ConfirmedIndianNational,ConfirmedForeignNational,Cured,Deaths,Confirmed
0,2020-01-30,6:00 PM,Kerala,1,0,0,0,1
1,2020-01-31,6:00 PM,Kerala,1,0,0,0,1
2,2020-02-01,6:00 PM,Kerala,2,0,0,0,2
3,2020-02-02,6:00 PM,Kerala,3,0,0,0,3
4,2020-02-03,6:00 PM,Kerala,3,0,0,0,3


# Creating Columns

In [50]:
# Load the ca-covid CSV and discard its 'state' column.
df = pd.read_csv("https://www.sololearn.com/uploads/ca-covid.csv").drop(columns='state')
print(df.head(), '\n')

# Derive a 'month' column: parse the day.month.two-digit-year strings in
# 'date', keep only the month's name, then make 'date' the index.
df = df.assign(
    month=pd.to_datetime(df['date'], format="%d.%m.%y").dt.month_name()
).set_index('date')
print(df.head())

       date  cases  deaths
0  25.01.20      1       0
1  26.01.20      1       0
2  27.01.20      0       0
3  28.01.20      0       0
4  29.01.20      0       0 

          cases  deaths    month
date                            
25.01.20      1       0  January
26.01.20      1       0  January
27.01.20      0       0  January
28.01.20      0       0  January
29.01.20      0       0  January


In [51]:
# Count how many rows fall in each month; the result is ordered from the most
# frequent value to the least frequent.
df['month'].value_counts()

March        31
May          31
July         31
August       31
October      31
December     31
April        30
June         30
September    30
November     30
February     29
January       7
Name: month, dtype: int64

In [52]:
# groupby('month') splits the rows into one group per month; ['cases'].sum()
# then totals the 'cases' column within each group.
# The group keys come back in sorted (alphabetical) order, not calendar order.
print(df.groupby('month')['cases'].sum())

month
April          41887
August        210268
December     1070577
February          25
January            3
July          270120
June          119039
March           8555
May            62644
November      301944
October       114123
September     108584
Name: cases, dtype: int64


In [53]:
# Total number of cases across all rows.
df['cases'].sum()
# The same pattern works for other aggregations: mean(), max(), min().

2307769

# Final Project