## Loading Data into a DataFrame
The biggest challenge that face people trying to become data scientist is getting data to use. That's why we will use Kaggle (https://www.kaggle.com/) to access free data. In most cases data we will be analyzing will be a tabular data, meaning stored as text in table structure with multiple rows and each row contains information about something and should have the same number of cells. tabular data can be comma separated value file format or a CSV file containing data in a table structure.

#### Loading a Tabular Data from a CSV file into a DataFrame

1. Read data as they are in the CSV file

In [9]:
import pandas as pd;
import os;

In [10]:
# show dataframe
df = pd.read_csv('../../titanic.csv')
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
3,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
4,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
891,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
892,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
893,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
894,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [11]:
# To get just the first 10 rows
df = pd.read_csv('../../titanic.csv')
df.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
3,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
4,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
5,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
6,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
7,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
8,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
9,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q


2. Adjusting the Loading Parameters

- Columns or Header

In [12]:
# If the CSV file first row is the header, just add header=0 to confirm and tell dataFrame to use first row as the header
df = pd.read_csv('../../titanic.csv', header=0)
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
3,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
4,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
891,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
892,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
893,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
894,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [13]:
# If the CSV file doen't include a header row, you can add header=None to tell dataFrame to create header with values starting from 0 and stepping by 1
df = pd.read_csv('../../titanic.csv', header=None)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
2,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
3,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
4,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C
...,...,...,...,...,...,...,...,...,...,...,...,...
892,887,0,2,"Montvila, Rev. Juozas",male,27,0,0,211536,13,,S
893,888,1,1,"Graham, Miss. Margaret Edith",female,19,0,0,112053,30,B42,S
894,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
895,890,1,1,"Behr, Mr. Karl Howell",male,26,0,0,111369,30,C148,C


In [14]:
# To create header row and assign to it as names
col_names = ['PassengerId','Survived','Pclass,Name','Sex','Age','SibSp','Parch','Ticket','Fare','Cabin','Embarked']
df = pd.read_csv('../../titanic.csv', header=None, names=col_names)
df

Unnamed: 0,PassengerId,Survived,"Pclass,Name",Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C
...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,"Montvila, Rev. Juozas",male,27,0,0,211536,13,,S
888,1,1,"Graham, Miss. Margaret Edith",female,19,0,0,112053,30,B42,S
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
890,1,1,"Behr, Mr. Karl Howell",male,26,0,0,111369,30,C148,C


In [15]:
# To overwrite the existing header row and assign to it new header
col_names = ['col-1','col-2','col-3','col-4','col-5','col-6','col-7','col-8','col-9','col-10','col-11']
df = pd.read_csv('../../titanic.csv', header=0, names=col_names)
df

Unnamed: 0,col-1,col-2,col-3,col-4,col-5,col-6,col-7,col-8,col-9,col-10,col-11
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [16]:
# Reduce or specify colums that you want to use, usecols=[0, 2, 3, ...]
df = pd.read_csv('../../titanic.csv', header=0, usecols=[0, 1, 2, 3, 4])
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex
0,1,0,3,"Braund, Mr. Owen Harris",male
1,1,0,3,"Braund, Mr. Owen Harris",male
2,1,0,3,"Braund, Mr. Owen Harris",male
3,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female
4,3,1,3,"Heikkinen, Miss. Laina",female
...,...,...,...,...,...
891,887,0,2,"Montvila, Rev. Juozas",male
892,888,1,1,"Graham, Miss. Margaret Edith",female
893,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female
894,890,1,1,"Behr, Mr. Karl Howell",male


- Columns Data Types

Most of time the datatype from csv file are not 100% accurate. So you may need to change or modify them. Survived column for example is showing as int64 datatype. which not true because it has a binary value, 0 or 1 but the system is reading as an integer. Another thing is object datatype is not just a regular object but strings as well are identified as objects. But if a column that is suppose to be an integer or boolean datatype is identify as object, we need to manually modify it. More info or docs about loading data input, ouput and other parameters can be accesses here (https://pandas.pydata.org/docs/reference/io.html)

In [17]:
# To get datatype for each column
df = pd.read_csv('../../titanic.csv')
df.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [18]:
# Overwrite datatype by passing dtype = {}, a dictionary with a key value, key being the columns name and value being the datatype your giving it.
df = pd.read_csv('../../titanic.csv', header=0, dtype={'Survived': 'bool'})
# df (If you run this, now the column Survived will have a true false value)
df.dtypes


PassengerId      int64
Survived          bool
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [19]:
# 
df = pd.read_csv('../../titanic.csv', header=0, dtype={'Survived': 'bool'})
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,False,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,1,False,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,1,False,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
3,2,True,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
4,3,True,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
891,887,False,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
892,888,True,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
893,889,False,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
894,890,True,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


- Rows or Index(es)

In [20]:
# To get the list of index
df = pd.read_csv('../../titanic.csv')
df.index

RangeIndex(start=0, stop=896, step=1)

In [21]:
# To use a specific column as index, add index_col=X
df = pd.read_csv('../../titanic.csv', header=0, index_col= 3)
df

Unnamed: 0_level_0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
"Braund, Mr. Owen Harris",1,0,3,male,22.0,1,0,A/5 21171,7.2500,,S
"Braund, Mr. Owen Harris",1,0,3,male,22.0,1,0,A/5 21171,7.2500,,S
"Braund, Mr. Owen Harris",1,0,3,male,22.0,1,0,A/5 21171,7.2500,,S
"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",2,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C
"Heikkinen, Miss. Laina",3,1,3,female,26.0,0,0,STON/O2. 3101282,7.9250,,S
...,...,...,...,...,...,...,...,...,...,...,...
"Montvila, Rev. Juozas",887,0,2,male,27.0,0,0,211536,13.0000,,S
"Graham, Miss. Margaret Edith",888,1,1,female,19.0,0,0,112053,30.0000,B42,S
"Johnston, Miss. Catherine Helen ""Carrie""",889,0,3,female,,1,2,W./C. 6607,23.4500,,S
"Behr, Mr. Karl Howell",890,1,1,male,26.0,0,0,111369,30.0000,C148,C
