<p style="font-size:30px;">Numpy and Pandas Fundamentals</p>
<p style="font-size:20px;">Introduction to manipulating data with NumPy and pandas</p>
<p style="font-size:15px;">We will go through:</p>
<p style="font-size:15px;">Basic array operations, indexing &amp; slicing, broadcasting, DataFrames, reading and writing data, and missing data</p>

In [43]:
# import numpy 
import numpy as np 

In [44]:


# --- Arrays built from Python lists ---
a = np.array([1, 2, 3])            # 1D array
b = np.array([[1, 2], [3, 4]])     # 2D array

# --- Built-in constructors ---
zeros = np.zeros((3, 4))           # shape = 3x4 of zeros
ones  = np.ones((2, 3))            # shape = 2x3 of ones
rng   = np.arange(0, 10, 2)        # [0, 2, 4, 6, 8] (stop value is excluded)
lin   = np.linspace(0, 1, 5)       # 5 evenly spaced points from 0 to 1
eye   = np.eye(3)                  # 3x3 identity matrix

# --- Random arrays ---
# Draw from a seeded Generator so the values are identical on every
# Restart & Run All instead of changing with each execution.
SEED = 42
random_gen = np.random.default_rng(SEED)
rand  = random_gen.random((3, 3))       # uniform on [0, 1)
norm  = random_gen.standard_normal(4)   # standard normal

# Print every array with one blank line between consecutive arrays.
# (`rng` is built above for illustration but is not printed.)
print(a, b, zeros, ones, lin, eye, rand, norm, sep="\n\n")


[1 2 3]

[[1 2]
 [3 4]]

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]

[[1. 1. 1.]
 [1. 1. 1.]]

[0.   0.25 0.5  0.75 1.  ]

[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]

[[0.31001341 0.01938281 0.82995861]
 [0.61839077 0.55492905 0.59526623]
 [0.26227209 0.69890269 0.87257205]]

[-0.44737573 -1.33752235  0.1239463   0.18818693]


In [45]:
# Basic array operations
# Arithmetic between arrays works element by element. Only the value of the
# final expression in the cell is rendered as its output.

x = np.array([1, 2, 3])
y = np.array([4, 5, 6])

np.add(x, y)         # elementwise sum      => array([5, 7, 9])
2 * x                # scalar multiple      => array([2, 4, 6])
np.multiply(x, y)    # elementwise product  => array([4, 10, 18])
np.dot(x, y)         # dot product          => 32
np.sqrt(x)           # elementwise root     => array([1., 1.414, 1.732])
b.sum(axis=0)        # column sums of b, the 2D array created in an earlier cell



array([4, 6])

In [46]:
# Indexing and slicing on a 4x4 grid holding the integers 0..15

M = np.arange(0, 16).reshape((4, 4))
M[1, 2]             # one element: row 1, column 2
M[0, :]             # first row
M[:, 1]             # second column
M[1:3, 2:]          # submatrix: rows 1-2, columns 2-3
M[M > 5]            # boolean mask keeps the entries greater than 5
idx = [0, 2]
M[idx]              # fancy indexing selects rows 0 and 2


array([[ 0,  1,  2,  3],
       [ 8,  9, 10, 11]])

In [47]:
# Broadcasting: arrays with compatible shapes combine without explicit loops
A = np.ones(shape=(3, 4))
v = np.array([1, 2, 3, 4])                  # shape (4,)
A + v          # (3, 4) + (4,): v is added to each row of A

w = np.array([1, 2, 3]).reshape(-1, 1)      # shape (3, 1)
A + w          # (3, 4) + (3, 1): w is added to each column of A


array([[2., 2., 2., 2.],
       [3., 3., 3., 3.],
       [4., 4., 4., 4.]])

In [48]:
# PANDAS 
import pandas as pd

# Series: a one-dimensional array whose values carry labels
s = pd.Series(data=[10, 20, 30], index=list('abc'))

# DataFrame: a two-dimensional table. Each dict key becomes a column and
# `index` supplies the row labels.
df = pd.DataFrame(
    data={
        'name': ['Alice', 'Bob', 'Charlie'],
        'age': [25, 30, 35],
        'score': [85.5, 92.0, 88.0],
    },
    index=[101, 102, 103],
)

df.age              # attribute-style shorthand for a column
df['age']           # the same column via bracket access (a Series)
df.loc[102]         # row selected by its index label
df.iloc[0]          # row selected by integer position (the cell's output)



name     Alice
age         25
score     85.5
Name: 101, dtype: object

In [49]:
# Display the whole DataFrame; as the last expression of the cell it is rendered as a table

df

Unnamed: 0,name,age,score
101,Alice,25,85.5
102,Bob,30,92.0
103,Charlie,35,88.0


In [50]:
# Reading data from a CSV file 

In [51]:
import pandas as pd
import numpy as np

# Load the CSV into a DataFrame. `parse_dates` converts the 'date' column
# to datetime values while reading; the other columns keep inferred dtypes.
# read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrap
# is needed.
df = pd.read_csv('sample.csv', parse_dates=['date'])

# Preview the first 10 rows
df.head(10)


Unnamed: 0,date,store,product,units,revenue
0,2025-01-01,A,Widget,13,823.63
1,2025-01-01,A,Gadget,1,64.25
2,2025-01-01,B,Widget,4,345.01
3,2025-01-01,B,Gadget,19,938.27
4,2025-01-02,A,Widget,7,105.73
5,2025-01-02,A,Gadget,13,578.63
6,2025-01-02,B,Widget,7,581.67
7,2025-01-02,B,Gadget,15,916.86
8,2025-01-03,A,Widget,6,511.48
9,2025-01-03,A,Gadget,9,160.57


In [52]:
# Show the final three rows of the frame
df.tail(n=3)

Unnamed: 0,date,store,product,units,revenue
37,2025-01-10,A,Gadget,10,954.61
38,2025-01-10,B,Widget,6,67.33
39,2025-01-10,B,Gadget,18,1537.47


In [53]:
# Print a concise summary of the frame: index range, column names,
# non-null counts, dtypes and memory usage
df.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   date     40 non-null     datetime64[ns]
 1   store    40 non-null     object        
 2   product  40 non-null     object        
 3   units    40 non-null     int64         
 4   revenue  40 non-null     float64       
dtypes: datetime64[ns](1), float64(1), int64(1), object(2)
memory usage: 1.7+ KB


In [54]:
# Summary statistics (count, mean, min, quartiles, max, std) for the
# datetime and numeric columns
df.describe() 

Unnamed: 0,date,units,revenue
count,40,40.0,40.0
mean,2025-01-05 12:00:00,9.925,597.2665
min,2025-01-01 00:00:00,1.0,38.39
25%,2025-01-03 00:00:00,4.75,197.6875
50%,2025-01-05 12:00:00,9.5,498.725
75%,2025-01-08 00:00:00,15.25,942.355
max,2025-01-10 00:00:00,19.0,1708.87
std,,5.980256,464.902885


In [55]:
# Pull one column out of the frame; the result is a Series
units = df.loc[:, 'units']
units



0     13
1      1
2      4
3     19
4      7
5     13
6      7
7     15
8      6
9      9
10    17
11    16
12    19
13    18
14    15
15     2
16     1
17     4
18    19
19     3
20     5
21     7
22    18
23    10
24     2
25    10
26     7
27    15
28     1
29    13
30    12
31     5
32     4
33     9
34     4
35    16
36    17
37    10
38     6
39    18
Name: units, dtype: int64

In [56]:
# Select a single row by integer position (position 3 is the fourth row)
row_3 = df.iloc[3, :]
row_3

date       2025-01-01 00:00:00
store                        B
product                 Gadget
units                       19
revenue                 938.27
Name: 3, dtype: object

In [57]:
# Positional slice: rows 0-2 and columns 0-3 (end positions are exclusive)
first_three = df.iloc[0:3, 0:4]
first_three


Unnamed: 0,date,store,product,units
0,2025-01-01,A,Widget,13
1,2025-01-01,A,Gadget,1
2,2025-01-01,B,Widget,4


In [58]:
# Count the missing values in each column
print(df.isna().sum())

date       0
store      0
product    0
units      0
revenue    0
dtype: int64


In [None]:
# THANK YOU 