## About Numpy

In [4]:
import numpy as np

In [5]:
arr1 = np.array([1, 2, 3, 6, 8])
print(arr1)
print(type(arr1))

print(arr1.shape)

arr1 = arr1.reshape((1, 5))
print(arr1.shape)

[1 2 3 6 8]
<class 'numpy.ndarray'>
(5,)
(1, 5)


In [6]:
arr2 = np.arange(0, 8, 2).reshape(1, 4)
print(arr2)

[[0 2 4 6]]


In [7]:
np.ones((3, 5))

array([[1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.]])

In [8]:
np.eye(4)

array([[1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.]])

In [9]:
print(arr1.ndim)
print(arr2.ndim)

2
2


In [10]:
print(arr1.dtype)

int64


In [11]:
print(arr1.itemsize)
print(arr1)

8
[[1 2 3 6 8]]


In [12]:
# Numpy vectorized operations

arr1 = np.array([1, 4, 6, 8])
arr2 = np.array([3, 6, 8, 2])

print(arr1 + arr2)
print(arr1 - arr2)
print(arr1 * arr2)
print(arr1 / arr2)

[ 4 10 14 10]
[-2 -2 -2  6]
[ 3 24 48 16]
[0.33333333 0.66666667 0.75       4.        ]


In [13]:
# Some universal functions

arr = np.array([1, 2, 4, 8])

print(np.sqrt(arr))
print(np.exp(arr))
print(np.sin(arr))
print(np.log(arr))

[1.         1.41421356 2.         2.82842712]
[2.71828183e+00 7.38905610e+00 5.45981500e+01 2.98095799e+03]
[ 0.84147098  0.90929743 -0.7568025   0.98935825]
[0.         0.69314718 1.38629436 2.07944154]


In [14]:
arr = np.array([[1, 2, 3, 4],[5, 6, 7, 8],[9, 10, 11, 12]])

# print(arr)

print(arr[0:2, 1:3]) # Perform slicing

[[2 3]
 [6 7]]


In [15]:
print(arr)

arr[1:] = -20

print(arr)

[[ 1  2  3  4]
 [ 5  6  7  8]
 [ 9 10 11 12]]
[[  1   2   3   4]
 [-20 -20 -20 -20]
 [-20 -20 -20 -20]]


In [16]:
data = np.array([1, 2, 3, 4, 5])

mean = np.mean(data)
# print(mean)

std_dev = np.std(data)
# print(std_dev)

normalized_data = (data - mean) / std_dev
print(normalized_data)

varience = np.var(data)
print(varience)


[-1.41421356 -0.70710678  0.          0.70710678  1.41421356]
2.0


In [17]:
# Logical operations

data = np.arange(1, 11)
print(data)

print(data[(data>5) & (data<=8)])

[ 1  2  3  4  5  6  7  8  9 10]
[6 7 8]


## About Pandas

In [18]:
import pandas as pd

In [19]:
data = [1, 2, 3, 4, 5]

# Creating series from 1-D array.
series = pd.Series(data)

print(data)
print(series)
print(type(series))

[1, 2, 3, 4, 5]
0    1
1    2
2    3
3    4
4    5
dtype: int64
<class 'pandas.core.series.Series'>


In [20]:
# Creating series from dictionary

data = {'a': 1, 'b': 2, 'c': 3}
series = pd.Series(data)

print(series)     


a    1
b    2
c    3
dtype: int64


In [21]:
data = [29, 47, 16]
index = ['aa', 'bb', 'cc']

series = pd.Series(data, index=index)

print(series)

aa    29
bb    47
cc    16
dtype: int64


In [22]:
# About dataframe

## Creating a dataframe through dictionary
import pandas as pd

data = {
      'Name' : ['Keshav', 'Mayank', "Raju"],
      'Age' : [39, 28, 63],
      'City' : ['Kanpur', 'Lucknow', 'Goa']
}

dataframe = pd.DataFrame(data)
dataframe


Unnamed: 0,Name,Age,City
0,Keshav,39,Kanpur
1,Mayank,28,Lucknow
2,Raju,63,Goa


In [23]:
## Creating a dataframe through list of dictionaries

data = [
      {'Name': 'Krish', 'Age': 32, "City": 'Banglore '},
      {'Name': 'Krish', 'Age': 32, "City": 'Banglore '},
      {'Name': 'Krish', 'Age': 32, "City": 'Banglore '},
      {'Name': 'Krish', 'Age': 32, "City": 'Banglore '}
]

df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age,City
0,Krish,32,Banglore
1,Krish,32,Banglore
2,Krish,32,Banglore
3,Krish,32,Banglore


In [24]:
df = pd.read_csv('diabetes.csv')

# df.head(7)
df.tail(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.34,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1
767,1,93,70,31,0,30.4,0.315,23,0


In [25]:
# How to access data from dataframe.

df = pd.DataFrame(data)
print(df)

# df['Name'] # It is actually a series (It prints entire column)

df.loc[0]

df.iloc[1][2]

    Name  Age       City
0  Krish   32  Banglore 
1  Krish   32  Banglore 
2  Krish   32  Banglore 
3  Krish   32  Banglore 


  df.iloc[1][2]


'Banglore '

In [26]:
df

print(df.at[2, 'City'])

Banglore 


In [27]:
print(df.iat[2, 2])

Banglore 


In [28]:
df

# Performing data manipulation with dataframe.

# Adding a column
df['Salary'] = [5000, 2894, 29324, 32824]
print(df)

# Removing a column
df.drop('Salary', axis=1, inplace=True)
print(df)

    Name  Age       City  Salary
0  Krish   32  Banglore     5000
1  Krish   32  Banglore     2894
2  Krish   32  Banglore    29324
3  Krish   32  Banglore    32824
    Name  Age       City
0  Krish   32  Banglore 
1  Krish   32  Banglore 
2  Krish   32  Banglore 
3  Krish   32  Banglore 


In [29]:
# Update the value of whole column

print(df)

df['Age'] = df['Age'] + 1
print(df)

    Name  Age       City
0  Krish   32  Banglore 
1  Krish   32  Banglore 
2  Krish   32  Banglore 
3  Krish   32  Banglore 
    Name  Age       City
0  Krish   33  Banglore 
1  Krish   33  Banglore 
2  Krish   33  Banglore 
3  Krish   33  Banglore 


In [30]:
# How to drop the whole row.

# print(df)

# df.drop(1, axis=0, inplace=True)

# print(df)

print(df.describe())

        Age
count   4.0
mean   33.0
std     0.0
min    33.0
25%    33.0
50%    33.0
75%    33.0
max    33.0


## Data manipulation and analysis with pandas.

In [31]:
import pandas as pd

In [32]:
df = pd.read_csv('data.csv')

# df.describe()
df.dtypes

Date         object
Category     object
Value       float64
Product      object
Sales       float64
Region       object
dtype: object

In [33]:
# Handling missing values

df.isnull().any(axis=0)

Date        False
Category    False
Value        True
Product     False
Sales        True
Region      False
dtype: bool

In [34]:
df.isnull().sum()

Date        0
Category    0
Value       3
Product     0
Sales       4
Region      0
dtype: int64

In [35]:
# Filling missing values with 0.

print(df.isnull().any())

df_filled = df.fillna(0)

df_filled.isnull().any()

Date        False
Category    False
Value        True
Product     False
Sales        True
Region      False
dtype: bool


Date        False
Category    False
Value       False
Product     False
Sales       False
Region      False
dtype: bool

In [36]:
df

df.isnull().any()

df['Value_updated'] = df['Value'].fillna(df['Value'].mean())
df

Unnamed: 0,Date,Category,Value,Product,Sales,Region,Value_updated
0,2023-01-01,A,28.0,Product1,754.0,East,28.0
1,2023-01-02,B,39.0,Product3,110.0,North,39.0
2,2023-01-03,C,32.0,Product2,398.0,East,32.0
3,2023-01-04,B,8.0,Product1,522.0,East,8.0
4,2023-01-05,B,26.0,Product3,869.0,North,26.0
5,2023-01-06,B,54.0,Product3,192.0,West,54.0
6,2023-01-07,A,16.0,Product1,936.0,East,16.0
7,2023-01-08,C,89.0,Product1,488.0,West,89.0
8,2023-01-09,C,37.0,Product3,772.0,West,37.0
9,2023-01-10,A,22.0,Product2,834.0,West,22.0


In [37]:
# How to rename columns in dataframe.Age

df = df.rename(columns={'Sales': 'Sales_Data'})
df.head(10)

Unnamed: 0,Date,Category,Value,Product,Sales_Data,Region,Value_updated
0,2023-01-01,A,28.0,Product1,754.0,East,28.0
1,2023-01-02,B,39.0,Product3,110.0,North,39.0
2,2023-01-03,C,32.0,Product2,398.0,East,32.0
3,2023-01-04,B,8.0,Product1,522.0,East,8.0
4,2023-01-05,B,26.0,Product3,869.0,North,26.0
5,2023-01-06,B,54.0,Product3,192.0,West,54.0
6,2023-01-07,A,16.0,Product1,936.0,East,16.0
7,2023-01-08,C,89.0,Product1,488.0,West,89.0
8,2023-01-09,C,37.0,Product3,772.0,West,37.0
9,2023-01-10,A,22.0,Product2,834.0,West,22.0


In [38]:
# How to change the datatypes in df.

df['Value_New'] = df['Value'].fillna(df['Value'].mean()).astype(int)
df

Unnamed: 0,Date,Category,Value,Product,Sales_Data,Region,Value_updated,Value_New
0,2023-01-01,A,28.0,Product1,754.0,East,28.0,28
1,2023-01-02,B,39.0,Product3,110.0,North,39.0,39
2,2023-01-03,C,32.0,Product2,398.0,East,32.0,32
3,2023-01-04,B,8.0,Product1,522.0,East,8.0,8
4,2023-01-05,B,26.0,Product3,869.0,North,26.0,26
5,2023-01-06,B,54.0,Product3,192.0,West,54.0,54
6,2023-01-07,A,16.0,Product1,936.0,East,16.0,16
7,2023-01-08,C,89.0,Product1,488.0,West,89.0,89
8,2023-01-09,C,37.0,Product3,772.0,West,37.0,37
9,2023-01-10,A,22.0,Product2,834.0,West,22.0,22


In [39]:
# How to do modification in the data of the column.

df.head(6)

df['Updated_Value'] = df['Value'].apply(lambda x: x*2)
df.head(6)

Unnamed: 0,Date,Category,Value,Product,Sales_Data,Region,Value_updated,Value_New,Updated_Value
0,2023-01-01,A,28.0,Product1,754.0,East,28.0,28,56.0
1,2023-01-02,B,39.0,Product3,110.0,North,39.0,39,78.0
2,2023-01-03,C,32.0,Product2,398.0,East,32.0,32,64.0
3,2023-01-04,B,8.0,Product1,522.0,East,8.0,8,16.0
4,2023-01-05,B,26.0,Product3,869.0,North,26.0,26,52.0
5,2023-01-06,B,54.0,Product3,192.0,West,54.0,54,108.0


In [42]:
# How to use groupby.

df.groupby(['Product', 'Region'])['Value'].mean()

Product   Region
Product1  East      41.714286
          North      4.500000
          South     50.000000
          West      82.000000
Product2  East      28.000000
          North     63.500000
          South     60.333333
          West      53.500000
Product3  East      50.500000
          North     40.600000
          South     71.666667
          West      62.166667
Name: Value, dtype: float64

In [44]:
# aggregrate multiple functions

grouped_agg = df.groupby(['Region'])['Value'].agg(['mean', 'sum', 'count'])

grouped_agg

Unnamed: 0_level_0,mean,sum,count
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
East,42.307692,550.0,13
North,37.666667,339.0,9
South,62.0,496.0,8
West,61.588235,1047.0,17


In [50]:
# Merging and Joining dataframes.

df1 = pd.DataFrame({'Key': ['A', 'B', 'C'], 'Value1': [1, 2, 3]})
df2 = pd.DataFrame({'Key': ['A', 'B', 'D'], 'Value1': [4, 5, 6]})


In [51]:
pd.merge(df1, df2, on='Key', how="inner")

Unnamed: 0,Key,Value1_x,Value1_y
0,A,1,4
1,B,2,5


In [52]:

pd.merge(df1, df2, on='Key', how="outer")

Unnamed: 0,Key,Value1_x,Value1_y
0,A,1.0,4.0
1,B,2.0,5.0
2,C,3.0,
3,D,,6.0


In [54]:

pd.merge(df1, df2, on='Key', how="left")

Unnamed: 0,Key,Value1_x,Value1_y
0,A,1,4.0
1,B,2,5.0
2,C,3,


In [55]:

pd.merge(df1, df2, on='Key', how="right")

Unnamed: 0,Key,Value1_x,Value1_y
0,A,1.0,4
1,B,2.0,5
2,D,,6


## Reading data from different sources.



In [58]:
import pandas as pd

from io import StringIO
Data = '{"employee_name": "James", "email": "james@gmail.com", "job_profile": [{"title1":"Team Lead", "title2":"Sr. Developer"}]}'
df=pd.read_json(StringIO(Data))

df

Unnamed: 0,employee_name,email,job_profile
0,James,james@gmail.com,"{'title1': 'Team Lead', 'title2': 'Sr. Develop..."


In [60]:
# df=pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data",header=None)

In [88]:
url="https://www.fdic.gov/resources/resolutions/bank-failures/failed-bank-list/"

df=pd.read_html(url)

df[0]

Unnamed: 0,Bank Name,City,State,Cert,Acquiring Institution,Closing Date,Fund Sort ascending
0,Pulaski Savings Bank,Chicago,Illinois,28611,Millennium Bank,"January 17, 2025",10548
1,The First National Bank of Lindsay,Lindsay,Oklahoma,4134,"First Bank & Trust Co., Duncan, OK","October 18, 2024",10547
2,Republic First Bank dba Republic Bank,Philadelphia,Pennsylvania,27332,"Fulton Bank, National Association","April 26, 2024",10546
3,Citizens Bank,Sac City,Iowa,8758,Iowa Trust & Savings Bank,"November 3, 2023",10545
4,Heartland Tri-State Bank,Elkhart,Kansas,25851,"Dream First Bank, N.A.","July 28, 2023",10544
5,First Republic Bank,San Francisco,California,59017,"JPMorgan Chase Bank, N.A.","May 1, 2023",10543
6,Signature Bank,New York,New York,57053,"Flagstar Bank, N.A.","March 12, 2023",10540
7,Silicon Valley Bank,Santa Clara,California,24735,First Citizens Bank & Trust Company,"March 10, 2023",10539
8,Almena State Bank,Almena,Kansas,15426,Equity Bank,"October 23, 2020",10538
9,First City Bank of Florida,Fort Walton Beach,Florida,16748,"United Fidelity Bank, fsb","October 16, 2020",10537


In [71]:
!pip install lxml
!pip install html5lib
!pip install beautifulsoup4

Collecting beautifulsoup4
  Downloading beautifulsoup4-4.13.3-py3-none-any.whl.metadata (3.8 kB)
Collecting soupsieve>1.2 (from beautifulsoup4)
  Downloading soupsieve-2.6-py3-none-any.whl.metadata (4.6 kB)
Collecting typing-extensions>=4.0.0 (from beautifulsoup4)
  Downloading typing_extensions-4.13.2-py3-none-any.whl.metadata (3.0 kB)
Downloading beautifulsoup4-4.13.3-py3-none-any.whl (186 kB)
Downloading soupsieve-2.6-py3-none-any.whl (36 kB)
Downloading typing_extensions-4.13.2-py3-none-any.whl (45 kB)
Installing collected packages: typing-extensions, soupsieve, beautifulsoup4
Successfully installed beautifulsoup4-4.13.3 soupsieve-2.6 typing-extensions-4.13.2


In [66]:
df[0]

Unnamed: 0,Bank Name,City,State,Cert,Acquiring Institution,Closing Date,Fund Sort ascending
0,Pulaski Savings Bank,Chicago,Illinois,28611,Millennium Bank,"January 17, 2025",10548
1,The First National Bank of Lindsay,Lindsay,Oklahoma,4134,"First Bank & Trust Co., Duncan, OK","October 18, 2024",10547
2,Republic First Bank dba Republic Bank,Philadelphia,Pennsylvania,27332,"Fulton Bank, National Association","April 26, 2024",10546
3,Citizens Bank,Sac City,Iowa,8758,Iowa Trust & Savings Bank,"November 3, 2023",10545
4,Heartland Tri-State Bank,Elkhart,Kansas,25851,"Dream First Bank, N.A.","July 28, 2023",10544
5,First Republic Bank,San Francisco,California,59017,"JPMorgan Chase Bank, N.A.","May 1, 2023",10543
6,Signature Bank,New York,New York,57053,"Flagstar Bank, N.A.","March 12, 2023",10540
7,Silicon Valley Bank,Santa Clara,California,24735,First Citizens Bank & Trust Company,"March 10, 2023",10539
8,Almena State Bank,Almena,Kansas,15426,Equity Bank,"October 23, 2020",10538
9,First City Bank of Florida,Fort Walton Beach,Florida,16748,"United Fidelity Bank, fsb","October 16, 2020",10537


In [79]:
url = "https://www.mospi.gov.in/download-reports"

df=pd.read_html(url)

df=df[0]
df

Unnamed: 0,Sr. No.,Subject,Report number,Round
0,1,Key Employment Unemployment Indicators PLFS 20...,,
1,2,Women & Men in India 2024: Selected Indicators...,,
2,3,Time Use in India 2024( 18.69 MB ),,
3,4,Energy Statistics India 2025,,
4,5,NSSTA's Report on Statistical Training needs A...,,
5,6,Diamond Jubilee Publications on evolution of H...,,
6,7,Diamond Jubilee Publications on evolution Ente...,,
7,8,EnviStats India - Glossary( 1.12 MB ),,
8,9,"Sarvekshana, 117th issue, September, 2024( 9.6...",117.0,
9,10,Know Your Ministry- Module for Ministry of Sta...,,


In [87]:
df

df.isnull().any(axis=1)

0    True
1    True
2    True
3    True
4    True
5    True
6    True
7    True
8    True
9    True
dtype: bool