# Importing pandas

In [4]:
import pandas as pd

# Opening a local csv file


In [5]:
df = pd.read_csv('Netflix_stock_data.csv')

In [6]:
df

Unnamed: 0,Date,Close,High,Low,Open,Volume
0,2002-05-23,1.196429,1.242857,1.145714,1.156429,104790000
1,2002-05-24,1.210000,1.225000,1.197143,1.214286,11104800
2,2002-05-28,1.157143,1.232143,1.157143,1.213571,6609400
3,2002-05-29,1.103571,1.164286,1.085714,1.164286,6757800
4,2002-05-30,1.071429,1.107857,1.071429,1.107857,10154200
...,...,...,...,...,...,...
5805,2025-06-18,1222.290039,1242.000000,1220.500000,1229.989990,2281000
5806,2025-06-20,1231.410034,1248.500000,1224.349976,1234.449951,5348200
5807,2025-06-23,1253.540039,1254.839966,1215.010010,1238.030029,2667300
5808,2025-06-24,1279.109985,1282.569946,1255.000000,1260.550049,2663300


# Opening a csv file from an URL

In [7]:
import requests
from io import StringIO

url = "https://people.sc.fsu.edu/~jburkardt/data/csv/airtravel.csv"

req = requests.get(url)


# Convert text response to a file-like object
data = StringIO(req.text)
df = pd.read_csv(data)
df

Unnamed: 0,Month,"""1958""","""1959""","""1960"""
0,JAN,340,360,417
1,FEB,318,342,391
2,MAR,362,406,419
3,APR,348,396,461
4,MAY,363,420,472
5,JUN,435,472,535
6,JUL,491,548,622
7,AUG,505,559,606
8,SEP,404,463,508
9,OCT,359,407,461


# Sep Parameter
# sep stands for separator.
# It tells Python what symbol is used to separate values (columns) in a text file like CSV or TSV.

In [8]:
pd.read_csv('hw_200.tsv',sep='\t')

Unnamed: 0,Index,Height,Weight
0,1,65.78,112.99
1,2,71.52,136.49
2,3,69.4,153.03
3,4,68.22,142.34
4,5,67.79,144.3


# Index_col parameter
# index_col tells pandas which column to use as the row index (the row labels) when loading a file like a CSV or TSV.

In [9]:
pd.read_csv('employees.csv',index_col='EID')

Unnamed: 0_level_0,Name,Department,Salary
EID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
101,Amit,HR,35000
102,Priya,IT,45000
103,Rahul,Finance,40000
104,Sneha,Marketing,38000
105,Karan,Sales,42000


# Header Parameter
# The header parameter tells pandas which row contains the column names when reading a CSV or TSV file.

In [10]:
pd.read_csv('Netflix_stock_data.csv',header=1,names=['Date','Close','High','Low','Open','Volume'])

Unnamed: 0,Date,Close,High,Low,Open,Volume
0,2002-05-24,1.210000,1.225000,1.197143,1.214286,11104800
1,2002-05-28,1.157143,1.232143,1.157143,1.213571,6609400
2,2002-05-29,1.103571,1.164286,1.085714,1.164286,6757800
3,2002-05-30,1.071429,1.107857,1.071429,1.107857,10154200
4,2002-05-31,1.076429,1.078571,1.071429,1.078571,8464400
...,...,...,...,...,...,...
5804,2025-06-18,1222.290039,1242.000000,1220.500000,1229.989990,2281000
5805,2025-06-20,1231.410034,1248.500000,1224.349976,1234.449951,5348200
5806,2025-06-23,1253.540039,1254.839966,1215.010010,1238.030029,2667300
5807,2025-06-24,1279.109985,1282.569946,1255.000000,1260.550049,2663300


# usecols parameter
# The usecols parameter lets you select only specific columns to load from a CSV or TSV file.

In [11]:
pd.read_csv('Netflix_stock_data.csv',usecols=['Date'])

Unnamed: 0,Date
0,2002-05-23
1,2002-05-24
2,2002-05-28
3,2002-05-29
4,2002-05-30
...,...
5805,2025-06-18
5806,2025-06-20
5807,2025-06-23
5808,2025-06-24


# Squeeze parameters  , it is a series object

# Skiprows/nrows Parameter
# skiprows tells pandas to skip certain rows at the top of your file before reading the actual data
# The nrows parameter tells pandas to read only a specific number of rows from the file.



In [12]:
pd.read_csv('Netflix_stock_data.csv',skiprows=[0,1])

Unnamed: 0,2002-05-24,1.2100000381469727,1.225000023841858,1.1971429586410522,1.214285969734192,11104800
0,2002-05-28,1.157143,1.232143,1.157143,1.213571,6609400
1,2002-05-29,1.103571,1.164286,1.085714,1.164286,6757800
2,2002-05-30,1.071429,1.107857,1.071429,1.107857,10154200
3,2002-05-31,1.076429,1.078571,1.071429,1.078571,8464400
4,2002-06-03,1.128571,1.149286,1.076429,1.080000,3151400
...,...,...,...,...,...,...
5803,2025-06-18,1222.290039,1242.000000,1220.500000,1229.989990,2281000
5804,2025-06-20,1231.410034,1248.500000,1224.349976,1234.449951,5348200
5805,2025-06-23,1253.540039,1254.839966,1215.010010,1238.030029,2667300
5806,2025-06-24,1279.109985,1282.569946,1255.000000,1260.550049,2663300


In [13]:
pd.read_csv('Netflix_stock_data.csv',nrows=100)

Unnamed: 0,Date,Close,High,Low,Open,Volume
0,2002-05-23,1.196429,1.242857,1.145714,1.156429,104790000
1,2002-05-24,1.210000,1.225000,1.197143,1.214286,11104800
2,2002-05-28,1.157143,1.232143,1.157143,1.213571,6609400
3,2002-05-29,1.103571,1.164286,1.085714,1.164286,6757800
4,2002-05-30,1.071429,1.107857,1.071429,1.107857,10154200
...,...,...,...,...,...,...
95,2002-10-08,0.405000,0.427857,0.390714,0.410714,4531800
96,2002-10-09,0.372857,0.410714,0.347143,0.410000,5982200
97,2002-10-10,0.506429,0.525000,0.346429,0.377857,13577200
98,2002-10-11,0.470714,0.528571,0.446429,0.508571,13281800


# Encoding parameter
# The encoding parameter tells pandas how to read special characters (like ₹, é, ü, ç, etc.) from a file correctly.

In [14]:
pd.read_csv('Netflix_stock_data.csv',encoding='utf-8')

Unnamed: 0,Date,Close,High,Low,Open,Volume
0,2002-05-23,1.196429,1.242857,1.145714,1.156429,104790000
1,2002-05-24,1.210000,1.225000,1.197143,1.214286,11104800
2,2002-05-28,1.157143,1.232143,1.157143,1.213571,6609400
3,2002-05-29,1.103571,1.164286,1.085714,1.164286,6757800
4,2002-05-30,1.071429,1.107857,1.071429,1.107857,10154200
...,...,...,...,...,...,...
5805,2025-06-18,1222.290039,1242.000000,1220.500000,1229.989990,2281000
5806,2025-06-20,1231.410034,1248.500000,1224.349976,1234.449951,5348200
5807,2025-06-23,1253.540039,1254.839966,1215.010010,1238.030029,2667300
5808,2025-06-24,1279.109985,1282.569946,1255.000000,1260.550049,2663300


# Skip bad lines
# skip_bad_lines=True tells pandas to ignore and skip rows that cause errors while reading a CSV file.

In [15]:
pd.read_csv("messy_employees.csv", on_bad_lines='skip')


Unnamed: 0,EID,Name,Department,Salary
0,101,Amit,HR,40000.0
1,102,Neha,IT,45000.0
2,103,Ravi,Finance,42000.0
3,104,Sneha,Marketing,
4,105,Karan,Sales,41000.0


# dtypes parameter
# Override behavior means changing or replacing the default action of something.

In [30]:
pd.read_csv('Netflix_stock_data.csv',dtype={'Volume':str}).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5810 entries, 0 to 5809
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Date    5810 non-null   object 
 1   Close   5810 non-null   float64
 2   High    5810 non-null   float64
 3   Low     5810 non-null   float64
 4   Open    5810 non-null   float64
 5   Volume  5810 non-null   object 
dtypes: float64(4), object(2)
memory usage: 272.5+ KB


# Handling Dates

In [39]:
pd.read_csv('Netflix_stock_data.csv',parse_dates=['Date']).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5810 entries, 0 to 5809
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Date    5810 non-null   datetime64[ns]
 1   Close   5810 non-null   float64       
 2   High    5810 non-null   float64       
 3   Low     5810 non-null   float64       
 4   Open    5810 non-null   float64       
 5   Volume  5810 non-null   int64         
dtypes: datetime64[ns](1), float64(4), int64(1)
memory usage: 272.5 KB


# *Convertors

# converters is used in pandas.read_csv() to custom-process values in a column while reading the file.
# like if there is a ipl records and teams has 1 row which name is Royal and we want to make it RCB



# *na_values parameter

# Treat these as NaN (Not a Number) or empty cells.



# *Loading huge dataset in chunks

# When reading large CSV files, chunksize lets you load the data in smaller parts (chunks) instead of all at once.

In [64]:
dfs = pd.read_csv('Netflix_stock_data.csv',chunksize=2000)

for chunk in dfs:
    print(chunk.shape)


(2000, 6)
(2000, 6)
(1810, 6)
