## Reading CSV files

In [2]:
import pandas as pd

# Load the first sample dataset. With no extra arguments pandas uses the first
# line of the file as the column headers.
df = pd.read_csv("datasets/data_01.csv")

In [3]:
# A bare expression on the last line of a cell is rendered as the cell output.
df

Unnamed: 0,first name,last name,experience,score
0,Alex,Smith,Python,92
1,James,Jones,C++,97
2,Mary,Williams,Python,98
3,Patricia,Brown,Java,87
4,Jennifer,Taylor,HTML,88
5,Emily,Davies,Android,90
6,Michael,Wilson,Python,91
7,Jessica,Evans,C++,95
8,John,Thomas,Python,90
9,Thomas,Roberts,Java,89


In [4]:
# data_02.csv has no header row. Read with the defaults, pandas promotes the
# first record to column names -- this cell demonstrates that problem.
df2 = pd.read_csv("datasets/data_02.csv")
df2

Unnamed: 0,Alex,Smith,Python,92
0,James,Jones,C++,97
1,Mary,Williams,Python,98
2,Patricia,Brown,Java,87
3,Jennifer,Taylor,HTML,88
4,Emily,Davies,Android,90
5,Michael,Wilson,Python,91
6,Jessica,Evans,C++,95
7,John,Thomas,Python,90
8,Thomas,Roberts,Java,89
9,Christopher,Martin,Python,83


If the CSV file does not contain a header row, pandas has to be told so explicitly:

In [5]:
# header=None tells pandas not to treat the first line as the header; the
# columns are then labelled with integers instead.
df2 = pd.read_csv("datasets/data_02.csv", header=None)
df2

Unnamed: 0,0,1,2,3
0,Alex,Smith,Python,92
1,James,Jones,C++,97
2,Mary,Williams,Python,98
3,Patricia,Brown,Java,87
4,Jennifer,Taylor,HTML,88
5,Emily,Davies,Android,90
6,Michael,Wilson,Python,91
7,Jessica,Evans,C++,95
8,John,Thomas,Python,90
9,Thomas,Roberts,Java,89


In [6]:
# Provide our own header names for the header-less file via `names`.
df2 = pd.read_csv(
    "datasets/data_02.csv",
    header=None,
    names=["fname", "lname", "course", "score"],
)
df2

Unnamed: 0,fname,lname,course,score
0,Alex,Smith,Python,92
1,James,Jones,C++,97
2,Mary,Williams,Python,98
3,Patricia,Brown,Java,87
4,Jennifer,Taylor,HTML,88
5,Emily,Davies,Android,90
6,Michael,Wilson,Python,91
7,Jessica,Evans,C++,95
8,John,Thomas,Python,90
9,Thomas,Roberts,Java,89


In [7]:
# Replace the header names of a dataset that already has a header row.
# header=0 marks the first row of the file as the existing header, and `names`
# supplies its replacement. This is the documented way to override existing
# names; skiprows=1 only works by discarding whatever happens to be on the
# first physical line.
df = pd.read_csv(
    "datasets/data_01.csv",
    header=0,
    names=["a", "b", "c", "d"],
)
df

Unnamed: 0,a,b,c,d
0,Alex,Smith,Python,92
1,James,Jones,C++,97
2,Mary,Williams,Python,98
3,Patricia,Brown,Java,87
4,Jennifer,Taylor,HTML,88
5,Emily,Davies,Android,90
6,Michael,Wilson,Python,91
7,Jessica,Evans,C++,95
8,John,Thomas,Python,90
9,Thomas,Roberts,Java,89


In [8]:
# Read with the default comma separator to show what happens when the file
# actually uses a different delimiter.
df3 = pd.read_csv("datasets/data_03.csv")
df3

Unnamed: 0,first name| last name| experience| score
0,Alex | Smith| Python | 92
1,James | Jones| C++ | 97
2,Mary | Williams| Python | 98
3,Patricia | Brown| Java | 87
4,Jennifer | Taylor| HTML | 88
5,Emily | Davies| Android | 90
6,Michael | Wilson| Python | 91
7,Jessica | Evans| C++ | 95
8,John | Thomas| Python | 90
9,Thomas | Roberts| Java | 89


In [9]:
# The DataFrame above has only one column because the file's separator
# character is '|', not ','.
# Check the number of columns (second entry of the frame's shape).
df3.shape[1]

1

In [10]:
# To fix the single-column result, pass the separator character explicitly.
# The fields in this file are padded with spaces around '|'
# (e.g. "Alex | Smith| Python | 92"), so a plain sep='|' would keep that
# padding inside the column names and string values (' last name', 'Alex ').
# A regular-expression separator absorbs the spaces together with the pipe;
# regex separators are only supported by the Python parsing engine, so it is
# requested explicitly to avoid the fallback warning.
df3 = pd.read_csv("datasets/data_03.csv", sep=r" *\| *", engine="python")
df3

Unnamed: 0,first name,last name,experience,score
0,Alex,Smith,Python,92
1,James,Jones,C++,97
2,Mary,Williams,Python,98
3,Patricia,Brown,Java,87
4,Jennifer,Taylor,HTML,88
5,Emily,Davies,Android,90
6,Michael,Wilson,Python,91
7,Jessica,Evans,C++,95
8,John,Thomas,Python,90
9,Thomas,Roberts,Java,89


In [12]:
# pandas takes the first line as the header and every following line as a
# record. Comment lines get no special treatment by default, so the
# description lines end up in the frame -- this cell demonstrates that problem.
df4 = pd.read_csv("datasets/data_04.csv")
df4

Unnamed: 0,# this is the dataset description
0,# these lines are not part of dataset and
1,# sould be ignored when reading the dataset
2,first name\t last name\t experience\t score
3,Alex \t Smith\t Python \t 92
4,James \t Jones\t C++ \t 97
5,Mary \t Williams\t Python \t 98
6,Patricia \t Brown\t Java \t 87
7,Jennifer \t Taylor\t HTML \t 88
8,Emily \t Davies\t Android \t 90
9,Michael \t Wilson\t Python \t 91


In [13]:
# To fix the problem above, pass comment='#' so the description lines are
# ignored, and set the separator to a tab.
# The fields are padded with spaces around the tab (e.g. "Alex \t Smith"), so
# a plain sep='\t' would keep that padding in the column names and values.
# The regular expression absorbs only spaces (not other tabs, so empty fields
# stay intact); regex separators need the Python parsing engine.
df4 = pd.read_csv(
    "datasets/data_04.csv",
    comment="#",
    sep=r" *\t *",
    engine="python",
)
df4

Unnamed: 0,first name,last name,experience,score
0,Alex,Smith,Python,92
1,James,Jones,C++,97
2,Mary,Williams,Python,98
3,Patricia,Brown,Java,87
4,Jennifer,Taylor,HTML,88
5,Emily,Davies,Android,90
6,Michael,Wilson,Python,91
7,Jessica,Evans,C++,95
8,John,Thomas,Python,90
9,Thomas,Roberts,Java,89


In [14]:
# pandas reads the first three lines and assumes there is only one column,
# until it reaches line four, where the record is separated by commas -- hence
# a ParserError.
# The error itself is what this cell demonstrates, so it is caught and printed
# rather than left uncaught, which would stop a "Restart & Run All" here.
try:
    df5 = pd.read_csv("datasets/data_05.csv")
except pd.errors.ParserError as err:
    print("ParserError:", err)

ParserError: Error tokenizing data. C error: Expected 1 fields in line 4, saw 4


In [15]:
# The first three lines are a description but carry no comment character, so
# the `comment` option cannot filter them out. Skip them by position instead
# with skiprows=3.

df5 = pd.read_csv("datasets/data_05.csv", skiprows=3)
df5

Unnamed: 0,first name,last name,experience,score
0,Alex,Smith,Python,92
1,James,Jones,C++,97
2,Mary,Williams,Python,98
3,Patricia,Brown,Java,87
4,Jennifer,Taylor,HTML,88
5,Emily,Davies,Android,90
6,Michael,Wilson,Python,91
7,Jessica,Evans,C++,95
8,John,Thomas,Python,90
9,Thomas,Roberts,Java,89
