Pandas is built on top of two core Python libraries—matplotlib for data visualization and NumPy for mathematical operations. Pandas acts as a wrapper over these libraries, allowing you to access many of matplotlib's and NumPy's methods with less code.

Series

In [32]:
import pandas as pd
print(pd.__version__)

1.3.4


In [33]:
a = pd.Series([1,2,3,4],index = ['a','b','c','d'])
a

a    1
b    2
c    3
d    4
dtype: int64

Series handle 1D arrays

In [34]:
a.values

array([1, 2, 3, 4])

In [35]:
type(a.values)

numpy.ndarray

In [36]:
type(a)

pandas.core.series.Series

In [37]:
a.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [38]:
a['a']

1

In [39]:
a['b']

2

In [40]:
a['a':'c']  # using explicit index final one is also included

a    1
b    2
c    3
dtype: int64

In [41]:
gra_dicts = {'A' : 4 , 'B' : 3.5 ,'C' : 3,'D' :2.5 }
grads = pd.Series(gra_dicts)

In [42]:
grads

A    4.0
B    3.5
C    3.0
D    2.5
dtype: float64

In [43]:
grads.values

array([4. , 3.5, 3. , 2.5])

In [44]:
marks = {'A' : 90, 'B' : 80 ,'C' : 70,'D' :60 }
mar = pd.Series(marks)

In [45]:
mar

A    90
B    80
C    70
D    60
dtype: int64

In [46]:
mar.values

array([90, 80, 70, 60])

In [47]:
mar['A']

90

In [48]:
mar[0:2]   #using implicit indices final one is  not counted

A    90
B    80
dtype: int64

DATAFRAMES

In [49]:
d = pd.DataFrame({ 'marks' : mar,'grades' : grads})

In [50]:
d

Unnamed: 0,marks,grades
A,90,4.0
B,80,3.5
C,70,3.0
D,60,2.5


In [51]:
 d.T

Unnamed: 0,A,B,C,D
marks,90.0,80.0,70.0,60.0
grades,4.0,3.5,3.0,2.5


In [52]:
d.values

array([[90. ,  4. ],
       [80. ,  3.5],
       [70. ,  3. ],
       [60. ,  2.5]])

Indexing

In [53]:
d.values[0,0]

90.0

In [54]:
d.columns

Index(['marks', 'grades'], dtype='object')

In [55]:
d.index

Index(['A', 'B', 'C', 'D'], dtype='object')

In [56]:
d.head() 

Unnamed: 0,marks,grades
A,90,4.0
B,80,3.5
C,70,3.0
D,60,2.5


In [57]:
d.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, A to D
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   marks   4 non-null      int64  
 1   grades  4 non-null      float64
dtypes: float64(1), int64(1)
memory usage: 268.0+ bytes


In [58]:
d.shape

(4, 2)

In [59]:
d.describe()

Unnamed: 0,marks,grades
count,4.0,4.0
mean,75.0,3.25
std,12.909944,0.645497
min,60.0,2.5
25%,67.5,2.875
50%,75.0,3.25
75%,82.5,3.625
max,90.0,4.0


In [60]:
d['scaled_marks'] = d['marks']/100

In [61]:
d

Unnamed: 0,marks,grades,scaled_marks
A,90,4.0,0.9
B,80,3.5,0.8
C,70,3.0,0.7
D,60,2.5,0.6


In [62]:
del d['scaled_marks']

In [63]:
d

Unnamed: 0,marks,grades
A,90,4.0
B,80,3.5
C,70,3.0
D,60,2.5


In [64]:
g = d[d['marks'] > 70]

In [65]:
g

Unnamed: 0,marks,grades
A,90,4.0
B,80,3.5


In [66]:
 da = pd.DataFrame([{ 'a' : 0,'b' : 1}, {'b': 2, 'c' : 4}])

In [67]:
da

Unnamed: 0,a,b,c
0,0.0,1,
1,,2,4.0


In [68]:
da.fillna(1)  #fill not available with 1

Unnamed: 0,a,b,c
0,0.0,1,1.0
1,1.0,2,4.0


Confusion b/w implicit and explicit index

In [69]:
data = pd.Series(['a','b','c'] , index =[1,3,5])

In [70]:
data

1    a
3    b
5    c
dtype: object

In [71]:
data[1]

'a'

In [72]:
data[1:3]

3    b
5    c
dtype: object

In [73]:
data.loc[1:3]  #explicit indices 

1    a
3    b
dtype: object

In [74]:
data.iloc[1:3]  #implicit indices

3    b
5    c
dtype: object

In [75]:
d

Unnamed: 0,marks,grades
A,90,4.0
B,80,3.5
C,70,3.0
D,60,2.5


In [76]:
d.iloc[1]

marks     80.0
grades     3.5
Name: B, dtype: float64

In [77]:
d.iloc[1:]

Unnamed: 0,marks,grades
B,80,3.5
C,70,3.0
D,60,2.5


In [78]:
d.iloc[:, 0]

A    90
B    80
C    70
D    60
Name: marks, dtype: int64

In [79]:
d.iloc[2,:]

marks     70.0
grades     3.0
Name: C, dtype: float64

In [101]:
d.iloc[::-1,::-1]

Unnamed: 0,name,grades,marks
D,Dale,2.5,60
C,Casper,3.0,70
B,Barnes,3.5,80
A,Maron,4.0,90


### SUBSETTING AND SORTING

In [102]:
d['name'] = ["Maron","Barnes","Casper","Dale"]
d

Unnamed: 0,marks,grades,name
A,90,4.0,Maron
B,80,3.5,Barnes
C,70,3.0,Casper
D,60,2.5,Dale


In [82]:
d_sort = d.sort_values("marks")

# Print the top few rows
print(d_sort.head())

   marks  grades    name
D     60     2.5    Dale
C     70     3.0  Casper
B     80     3.5  Barnes
A     90     4.0   Maron


In [83]:
d_sort = d.sort_values("marks",ascending = False)

print(d_sort)

   marks  grades    name
A     90     4.0   Maron
B     80     3.5  Barnes
C     70     3.0  Casper
D     60     2.5    Dale


In [84]:
d_Sort = d.sort_values(["marks", "name"], ascending=[True, False]) #marks and sorted and then for same marks , the names are sorted

print(d_Sort)

   marks  grades    name
D     60     2.5    Dale
C     70     3.0  Casper
B     80     3.5  Barnes
A     90     4.0   Maron


In [85]:
d_marks = d["marks"]
d_marks

A    90
B    80
C    70
D    60
Name: marks, dtype: int64

In [86]:
d_m_n = d[["marks", "name"]]

# Print the head of the result
print(d_m_n)

   marks    name
A     90   Maron
B     80  Barnes
C     70  Casper
D     60    Dale


In [87]:
d_g75 = d[d["marks"] > 75]

# See the result
print(d_g75)

   marks  grades    name
A     90     4.0   Maron
B     80     3.5  Barnes


In [88]:
d_a = d[d["name"] == "Aaron"]
d_a

Unnamed: 0,marks,grades,name


In [89]:
d_a = d[   (d["marks"] > 60) & ((d["name"] == "Aaron" ) | (d["name"] == "Casper")) ]
d_a

Unnamed: 0,marks,grades,name
C,70,3.0,Casper


In [90]:
liste = ["Casper", "Aaron", "Nevada", "Utah"]

# Filter for rows 
names = d["name"].isin(liste)
namess = d[d["name"].isin(liste)]

# See the result
print(names)
print(namess)

A    False
B    False
C     True
D    False
Name: name, dtype: bool
   marks  grades    name
C     70     3.0  Casper


#### SUMMARY STATISTICS

In [91]:
d

Unnamed: 0,marks,grades,name
A,90,4.0,Maron
B,80,3.5,Barnes
C,70,3.0,Casper
D,60,2.5,Dale


In [92]:
d.mean()

  d.mean()


marks     75.00
grades     3.25
dtype: float64

In [93]:
d["marks"] .mean()

75.0

In [94]:
d["grades"].median()

3.25

In [95]:
d["grades"].max()

4.0

In [96]:
d["grades"].quantile()

3.25

In [97]:
# A custom  function
import numpy as np
def iqr(column):
    return column.mean()**2
    
# Print IQR of the temperature_c column
print(d[["grades","marks"]].agg(iqr,np.median))

ValueError: No axis named <function median at 0x7fcd4bf809d0> for object type DataFrame

In [None]:
d = d.sort_values("marks")

d["cum"] = d["marks"].cumsum()

d["cmax"] = d["marks"].cummax()
print(d["cmax"])
print(d["cum"])

_______CSV_______ FILES ______

In [104]:
from sklearn.impute import SimpleImputer

In [103]:
df = pd.read_csv('E:/covid/covid_19_data.csv

SyntaxError: EOL while scanning string literal (2143366239.py, line 1)