# Pandas Tips: `read_csv()`

In [2]:
import pandas as pd

Check your pandas version to ensure similar behavior. 

_Version 2.1.1 was released in September 2023._

In [3]:
# Display the installed pandas version string so readers can compare it with their own.
pd.__version__

'2.1.1'

## Basics

### Reading from a local file

In [20]:
# IPython shell escape: list the working directory to confirm customers.csv is present.
# NOTE(review): `ls` is a Unix command; presumably unavailable in a plain Windows shell.
!ls

customers.csv  dataset_gen.py read_csv.ipynb


In [4]:
# Baseline load: parse the local CSV with every read_csv option left at its default.
df = pd.read_csv(
    'customers.csv',
)

In [5]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,ID,First Name,Last Name,Phone,LTV
0,4576,Sophia,Walker,602-310-9331,$496.87
1,9488,Evelyn,Harris,171-363-6978,"$38,424.41"
2,7265,Lucas,Foster,139-658-8905,"$14,413.64"
3,8555,Elijah,Jones,538-241-9868,"$13,440.49"
4,7023,Mason,Taylor,937-595-5837,"$1,777.37"


### Reading from a URL

In [9]:
# read_csv also accepts a URL; the address is split across adjacent string
# literals (joined at compile time) only to keep the line short.
df_url = pd.read_csv(
    'https://raw.githubusercontent.com/kimfetti/Videos/master/'
    'Pandas_Tips/data/customers.csv'
)

In [10]:
# Preview the frame loaded from the URL to compare it with the local load.
df_url.head()

Unnamed: 0,ID,First Name,Last Name,Phone,LTV
0,4576,Sophia,Walker,602-310-9331,$496.87
1,9488,Evelyn,Harris,171-363-6978,"$38,424.41"
2,7265,Lucas,Foster,139-658-8905,"$14,413.64"
3,8555,Elijah,Jones,538-241-9868,"$13,440.49"
4,7023,Mason,Taylor,937-595-5837,"$1,777.37"


## $\star$ Level Up! $\star$

### Index by a column: `index_col`

In [11]:
# Re-read the file, promoting the ID column to the row index
# in place of the default integer index.
df = pd.read_csv('customers.csv', index_col='ID')

In [12]:
# Preview the first rows; ID should now appear as the index rather than a data column.
df.head()

Unnamed: 0_level_0,First Name,Last Name,Phone,LTV
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4576,Sophia,Walker,602-310-9331,$496.87
9488,Evelyn,Harris,171-363-6978,"$38,424.41"
7265,Lucas,Foster,139-658-8905,"$14,413.64"
8555,Elijah,Jones,538-241-9868,"$13,440.49"
7023,Mason,Taylor,937-595-5837,"$1,777.37"


### Specify missing value characters: `na_values`

In [13]:
# Summarize dtypes and non-null counts per column to see where values are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, 4576 to 9062
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   First Name  1000 non-null   object
 1   Last Name   1000 non-null   object
 2   Phone       968 non-null    object
 3   LTV         986 non-null    object
dtypes: object(4)
memory usage: 39.1+ KB


In [14]:
# Tally each distinct Phone value; a repeated placeholder would rise to the top.
df['Phone'].value_counts()

Phone
?               31
602-310-9331     1
411-437-8722     1
563-866-1233     1
406-143-1943     1
                ..
384-871-5619     1
474-968-5565     1
244-703-2661     1
209-882-7213     1
412-368-2550     1
Name: count, Length: 938, dtype: int64

In [15]:
# Tally each distinct LTV value to check for the same kind of placeholder.
df['LTV'].value_counts()

LTV
?              12
$496.87         1
$4,031.13       1
$2,158.26       1
$7,342.30       1
               ..
$901.23         1
$8,279.17       1
$907.24         1
$1,326.67       1
$14,686.60      1
Name: count, Length: 975, dtype: int64

In [32]:
# Re-read the file, this time telling the parser which extra token means "missing".
df = pd.read_csv(
    'customers.csv',
    index_col='ID',  # use the ID column as the row index
    na_values='?',   # parse '?' entries as missing values (NaN)
)

In [33]:
# Preview the first rows of the frame re-read with na_values.
df.head()

Unnamed: 0_level_0,First Name,Last Name,Phone,LTV
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4576,Sophia,Walker,602-310-9331,$496.87
9488,Evelyn,Harris,171-363-6978,"$38,424.41"
7265,Lucas,Foster,139-658-8905,"$14,413.64"
8555,Elijah,Jones,538-241-9868,"$13,440.49"
7023,Mason,Taylor,937-595-5837,"$1,777.37"


In [34]:
# Re-check non-null counts: entries that were '?' should no longer be counted as present.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, 4576 to 9062
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   First Name  1000 non-null   object
 1   Last Name   1000 non-null   object
 2   Phone       937 non-null    object
 3   LTV         974 non-null    object
dtypes: object(4)
memory usage: 39.1+ KB


In [19]:
# Tally Phone values again; value_counts skips missing entries by default,
# so a placeholder parsed as NaN no longer shows up here.
df['Phone'].value_counts()

Phone
602-310-9331    1
681-285-1089    1
563-866-1233    1
406-143-1943    1
519-110-2218    1
               ..
384-871-5619    1
474-968-5565    1
244-703-2661    1
209-882-7213    1
412-368-2550    1
Name: count, Length: 937, dtype: int64

### Limit the number of rows: `nrows`

In [27]:
# (rows, columns) of the frame as currently loaded: the baseline before limiting rows.
df.shape

(1000, 4)

In [28]:
# Show the last rows to see where the loaded data ends.
df.tail()

Unnamed: 0_level_0,First Name,Last Name,Phone,LTV
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
7099,David,Harris,809-428-6034,"$12,799.04"
6743,Henry,Reyes,999-363-7716,"$11,043.94"
3513,Olivia,Sanders,,"$10,206.00"
4307,Emma,Cox,748-415-4613,"$9,839.57"
9062,Henry,Brown,412-368-2550,"$14,686.60"


In [29]:
# Re-read the file but stop early: handy for sampling the top of a large file.
df = pd.read_csv(
    'customers.csv',
    index_col='ID',  # use the ID column as the row index
    na_values='?',   # parse '?' entries as missing values (NaN)
    nrows=100,       # read only the first 100 data rows
)

In [30]:
# Confirm the row count after reading with nrows.
df.shape

(100, 4)

In [31]:
# Show the last rows of the truncated frame; they differ from the full file's final rows.
df.tail()

Unnamed: 0_level_0,First Name,Last Name,Phone,LTV
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2229,Emma,Simmons,613-834-3513,"$4,538.32"
5039,Ava,Brown,170-576-6081,$422.52
2263,Victoria,Smith,971-297-2895,"$5,829.51"
9232,Lucas,Perry,534-290-4258,"$7,243.49"
8922,Joseph,Mitchell,,"$3,721.42"
