## CSV Reader

In [1]:
import csv

In [2]:
# Read people.csv with an explicit delimiter and quote character and print
# every record after the header line.
with open('./people.csv', newline='') as csvfile:
  persons = csv.reader(csvfile, delimiter=',', quotechar='"')
  # Discard the header line; the None default keeps an empty file from
  # raising StopIteration.
  next(persons, None)
  for person in persons:
    print(person)

['Xavier Smith', '40', 'Male', 'xavier@lupo.ai']
['Irene Doe', '22', 'Female', 'irene@lupo.ai']
['Mary John', '50', 'Female', 'mary@lupo.ai']


In [3]:
# List the names of the dialects currently registered with the csv module.
csv.list_dialects()

['excel', 'excel-tab', 'unix']

In [4]:
# Same read as before, but the parsing rules come from the named 'excel'
# dialect instead of individual delimiter/quotechar arguments.
with open('./people.csv', newline='') as csvfile:
  persons = csv.reader(csvfile, dialect='excel')
  # Discard the header line; the None default keeps an empty file from
  # raising StopIteration.
  next(persons, None)
  for person in persons:
    print(person)

['Xavier Smith', '40', 'Male', 'xavier@lupo.ai']
['Irene Doe', '22', 'Female', 'irene@lupo.ai']
['Mary John', '50', 'Female', 'mary@lupo.ai']


## Numpy

In [5]:
import numpy as np

In [6]:
# Build a one-dimensional array, then show its type, element count, shape and
# contents, one per line.
sample_array = np.array([0, 0, 7])
print(
  type(sample_array),
  sample_array.size,
  sample_array.shape,
  sample_array,
  sep='\n',
)

<class 'numpy.ndarray'>
3
(3,)
[0 0 7]


In [7]:
# Build a two-dimensional array from nested lists, then show its type,
# element count, shape and contents, one per line.
nested_array = np.array([
  [1, 2, 3],
  [4, 5, 6],
  [7, 8, 9],
])
print(
  type(nested_array),
  nested_array.size,
  nested_array.shape,
  nested_array,
  sep='\n',
)

<class 'numpy.ndarray'>
9
(3, 3)
[[1 2 3]
 [4 5 6]
 [7 8 9]]


In [8]:
# A 2x3 array filled with 0.0 (float is the default dtype).
np.zeros(shape=(2, 3))

array([[0., 0., 0.],
       [0., 0., 0.]])

In [9]:
# A 2x3 array filled with 1.0 (float is the default dtype).
np.ones(shape=(2, 3))

array([[1., 1., 1.],
       [1., 1., 1.]])

In [10]:
# A 2x3 array allocated WITHOUT initializing its entries: the displayed values
# are whatever was already in memory, so they must not be relied upon.
np.empty(shape=(2, 3))

array([[1., 1., 1.],
       [1., 1., 1.]])

In [11]:
# Collapse the 3x3 array into a one-dimensional array of its nine elements.
nested_array.flatten()

array([1, 2, 3, 4, 5, 6, 7, 8, 9])

In [12]:
# A single argument is the exclusive stop value: the integers 0 through 9.
np.arange(10)

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [13]:
# Two arguments are start (inclusive) and stop (exclusive): 1 through 10.
np.arange(1, 11)

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [14]:
# The third argument is the step; a fractional step yields a float array.
np.arange(0, 1, 0.2)

array([0. , 0.2, 0.4, 0.6, 0.8])

### Working with files

In [15]:
# Load a numeric text file with loadtxt's defaults (whitespace-separated
# fields, float values) and display the resulting array.
badges_saved_np = np.loadtxt('badges-five-numpy.txt')
badges_saved_np

array([[1., 1., 3.],
       [2., 2., 3.],
       [3., 4., 3.],
       [4., 5., 3.],
       [5., 8., 3.]])

In [16]:
# Fails when delimiter is missing: each comma-separated line is read as a
# single field, which cannot be converted to a float.
try:
  np.loadtxt('badges-five.txt')
except ValueError as e:
  # Catch only the conversion error being demonstrated; any other failure
  # (for example a missing file) should surface instead of being printed.
  print(e)

could not convert string '1,1,3' to float64 at row 0, column 1.


In [17]:
# Passing delimiter=',' lets the comma-separated file parse correctly.
np.loadtxt('badges-five.txt', delimiter=',')

array([[1., 1., 3.],
       [2., 2., 3.],
       [3., 4., 3.],
       [4., 5., 3.],
       [5., 8., 3.]])

In [18]:
# Fails when header is present and not excluded: the column names in the
# first line cannot be converted to floats.
try:
  np.loadtxt('badges-five-header.txt', delimiter=',')
except ValueError as e:
  # Catch only the conversion error being demonstrated; any other failure
  # (for example a missing file) should surface instead of being printed.
  print(e)

could not convert string 'Id' to float64 at row 0, column 1.


In [19]:
# skiprows=1 discards the header line before the values are parsed.
np.loadtxt('badges-five-header.txt', delimiter=',', skiprows=1)

array([[1., 1., 3.],
       [2., 2., 3.],
       [3., 4., 3.],
       [4., 5., 3.],
       [5., 8., 3.]])

In [20]:
# Data conversion is supported: dtype=str loads every field as a string
# instead of converting it to a float.
np.loadtxt('badges-five-header.txt', delimiter=',', skiprows=1, dtype=str)

array([['1', '1', '3'],
       ['2', '2', '3'],
       ['3', '4', '3'],
       ['4', '5', '3'],
       ['5', '8', '3']], dtype='<U1')

In [21]:
# Column selection is supported: usecols=(0, 2) keeps only the first and
# third columns of each row.
np.loadtxt('badges-five-header.txt', delimiter=',', skiprows=1, dtype=str, usecols=(0, 2))

array([['1', '3'],
       ['2', '3'],
       ['3', '3'],
       ['4', '3'],
       ['5', '3']], dtype='<U1')

In [22]:
# Custom converters can be implemented using simple functions:

def rest_one(value):
  """Parse ``value`` as an integer and return that integer minus one."""
  parsed = int(value)
  return parsed - 1

def add_thousand(value):
  """Parse ``value`` as an integer and return that integer plus 1000."""
  parsed = int(value)
  return parsed + 1000

# converters maps a column index to the function applied to that column's raw
# value: column 0 goes through rest_one, column 1 through add_thousand, and
# column 2 is parsed directly as an int.
np.loadtxt('badges-five-header.txt', delimiter=',', skiprows=1, dtype=int, converters={0: rest_one, 1: add_thousand})

array([[   0, 1001,    3],
       [   1, 1002,    3],
       [   2, 1004,    3],
       [   3, 1005,    3],
       [   4, 1008,    3]])

In [23]:
# Fails when values are missing: loadtxt cannot convert an empty field to a
# float.
try:
  np.loadtxt('badges-five-missing-value.txt', delimiter=',', skiprows=1)
except ValueError as e:
  # Catch only the conversion error being demonstrated; any other failure
  # (for example a missing file) should surface instead of being printed.
  print(e)

could not convert string '' to float64 at row 3, column 3.


In [24]:
# Load missing values as 'nan'. The header line is not skipped here, so it
# also shows up as a first row of nan.
np.genfromtxt('badges-five-missing-value.txt', delimiter=',')

array([[nan, nan, nan],
       [ 1.,  1.,  3.],
       [ 2.,  2.,  3.],
       [ 3.,  4.,  3.],
       [ 4.,  5., nan],
       [ 5.,  8.,  3.]])

In [25]:
# We can skip the header. skip_header is the number of leading lines to drop,
# so pass the count 1 rather than a boolean.
np.genfromtxt('badges-five-missing-value.txt', delimiter=',', skip_header=1)

array([[ 1.,  1.,  3.],
       [ 2.,  2.,  3.],
       [ 3.,  4.,  3.],
       [ 4.,  5., nan],
       [ 5.,  8.,  3.]])

In [26]:
# We can specify how to fill the missing value: filling_values=0 replaces the
# nan that an empty field would otherwise produce. skip_header is the number
# of leading lines to drop, so pass the count 1 rather than a boolean.
np.genfromtxt('badges-five-missing-value.txt', delimiter=',', skip_header=1, filling_values=0)

array([[1., 1., 3.],
       [2., 2., 3.],
       [3., 4., 3.],
       [4., 5., 0.],
       [5., 8., 3.]])

In [27]:
# Opening a csv file with mixed data: dtype=str loads every field, including
# the header line, as a string.
np.genfromtxt('people.csv', delimiter=',', dtype=str)

array([['Name', 'Age', 'Gender', 'Email'],
       ['Xavier Smith', '40', 'Male', 'xavier@lupo.ai'],
       ['Irene Doe', '22', 'Female', 'irene@lupo.ai'],
       ['Mary John', '50', 'Female', 'mary@lupo.ai']], dtype='<U14')

## Pandas

In [28]:
import pandas as pd

In [29]:
# Load the CSV with read_csv's defaults and confirm the result is a DataFrame.
# NOTE: with the defaults the first line of posts-100.csv is used as the
# column labels (see the printed frame), so that record is not among the rows.
posts_csv = pd.read_csv('posts-100.csv')
print(type(posts_csv))

<class 'pandas.core.frame.DataFrame'>


In [30]:
# Print the whole frame as plain text; pandas abbreviates the middle rows
# with '..' and wraps wide frames across several column groups.
print(posts_csv)

      5  1  2014-05-13T23:58:30.457   9     448  2014-05-14T00:36:31.077  \
0     7  1  2014-05-14T00:11:06.457   4   388.0  2014-05-16T13:45:00.237   
1     9  2  2014-05-14T00:36:31.077   5     NaN  2014-05-14T00:36:31.077   
2    10  2  2014-05-14T00:53:43.273  12     NaN  2014-05-14T00:53:43.273   
3    14  1  2014-05-14T01:25:59.677  21  1243.0  2014-06-20T17:36:05.023   
4    15  1  2014-05-14T01:41:23.110   2   543.0  2014-05-14T01:41:23.110   
..  ... ..                      ...  ..     ...                      ...   
94  120  2  2014-05-17T18:15:11.937   5     NaN  2014-05-17T18:15:11.937   
95  121  2  2014-05-17T18:53:30.123  17     NaN  2014-05-17T18:53:30.123   
96  122  2  2014-05-17T20:56:15.577  10     NaN  2014-05-17T20:56:15.577   
97  123  5  2014-05-17T21:10:41.990   0     NaN  2014-05-20T13:50:21.763   
98  124  4  2014-05-17T21:10:41.990   0     NaN  2014-05-20T13:50:19.543   

   How can I do simple machine learning without hard-coding behavior?  \
0   What open-

In [31]:
# head() returns the first five rows by default.
print(posts_csv.head())

    5  1  2014-05-13T23:58:30.457   9     448  2014-05-14T00:36:31.077  \
0   7  1  2014-05-14T00:11:06.457   4   388.0  2014-05-16T13:45:00.237   
1   9  2  2014-05-14T00:36:31.077   5     NaN  2014-05-14T00:36:31.077   
2  10  2  2014-05-14T00:53:43.273  12     NaN  2014-05-14T00:53:43.273   
3  14  1  2014-05-14T01:25:59.677  21  1243.0  2014-06-20T17:36:05.023   
4  15  1  2014-05-14T01:41:23.110   2   543.0  2014-05-14T01:41:23.110   

  How can I do simple machine learning without hard-coding behavior?  \
0  What open-source books (or other materials) pr...                   
1                                                NaN                   
2                                                NaN                   
3           Is Data Science the Same as Data Mining?                   
4  What are the advantages and disadvantages of S...                   

           <machine-learning>  1.1  1.2  1.3  2014-05-14T14:40:25.950  
0    <education><open-source>  3.0    4  1.0  2014

In [32]:
# head(3) limits the preview to the first three rows.
print(posts_csv.head(3))

    5  1  2014-05-13T23:58:30.457   9    448  2014-05-14T00:36:31.077  \
0   7  1  2014-05-14T00:11:06.457   4  388.0  2014-05-16T13:45:00.237   
1   9  2  2014-05-14T00:36:31.077   5    NaN  2014-05-14T00:36:31.077   
2  10  2  2014-05-14T00:53:43.273  12    NaN  2014-05-14T00:53:43.273   

  How can I do simple machine learning without hard-coding behavior?  \
0  What open-source books (or other materials) pr...                   
1                                                NaN                   
2                                                NaN                   

         <machine-learning>  1.1  1.2  1.3  2014-05-14T14:40:25.950  
0  <education><open-source>  3.0    4  1.0  2014-05-14T08:40:54.950  
1                       NaN  NaN    0  NaN                      NaN  
2                       NaN  NaN    1  NaN                      NaN  


In [33]:
# The data of a DataFrame can be obtained as a NumPy array. to_numpy() is the
# accessor pandas recommends over the older .values attribute.
print(type(posts_csv.to_numpy()))

<class 'numpy.ndarray'>


In [34]:
# Print the frame's data as a NumPy array (column labels and index are not
# part of it). to_numpy() is the accessor pandas recommends over .values.
print(posts_csv.to_numpy())

[[7 1 '2014-05-14T00:11:06.457' ... 4 1.0 '2014-05-14T08:40:54.950']
 [9 2 '2014-05-14T00:36:31.077' ... 0 nan nan]
 [10 2 '2014-05-14T00:53:43.273' ... 1 nan nan]
 ...
 [122 2 '2014-05-17T20:56:15.577' ... 0 nan nan]
 [123 5 '2014-05-17T21:10:41.990' ... 0 nan nan]
 [124 4 '2014-05-17T21:10:41.990' ... 0 nan nan]]


### Working with rows

In [35]:
# nrows=3 reads only the first three data rows (those after the line that is
# used as the header).
pd.read_csv('posts-100.csv', nrows=3)

Unnamed: 0,5,1,2014-05-13T23:58:30.457,9,448,2014-05-14T00:36:31.077,How can I do simple machine learning without hard-coding behavior?,<machine-learning>,1.1,1.2,1.3,2014-05-14T14:40:25.950
0,7,1,2014-05-14T00:11:06.457,4,388.0,2014-05-16T13:45:00.237,What open-source books (or other materials) pr...,<education><open-source>,3.0,4,1.0,2014-05-14T08:40:54.950
1,9,2,2014-05-14T00:36:31.077,5,,2014-05-14T00:36:31.077,,,,0,,
2,10,2,2014-05-14T00:53:43.273,12,,2014-05-14T00:53:43.273,,,,1,,


In [36]:
# skiprows=3 drops the first three lines of the file; the next line is then
# used as the header and nrows=3 rows are read after it.
pd.read_csv('posts-100.csv', nrows=3, skiprows=3)

Unnamed: 0,10,2,2014-05-14T00:53:43.273,12,Unnamed: 4,2014-05-14T00:53:43.273.1,Unnamed: 6,Unnamed: 7,Unnamed: 8,1,Unnamed: 10,Unnamed: 11
0,14,1,2014-05-14T01:25:59.677,21,1243,2014-06-20T17:36:05.023,Is Data Science the Same as Data Mining?,<data-mining><definitions>,4,1,4.0,
1,15,1,2014-05-14T01:41:23.110,2,543,2014-05-14T01:41:23.110,What are the advantages and disadvantages of S...,<databases>,0,1,,2014-05-14T07:41:49.437
2,16,1,2014-05-14T01:57:56.880,18,322,2014-05-17T16:24:14.523,Use liblinear on big data for semantic analysis,<machine-learning><bigdata><libsvm>,2,0,,


In [37]:
# A callable skiprows is evaluated on each line index and the line is skipped
# when the result is truthy; x % 2 is truthy for odd indices, so only the
# even-numbered lines of the file are kept.
pd.read_csv('posts-100.csv', skiprows=lambda x: x % 2).head()

Unnamed: 0,5,1,2014-05-13T23:58:30.457,9,448,2014-05-14T00:36:31.077,How can I do simple machine learning without hard-coding behavior?,<machine-learning>,1.1,1.2,1.3,2014-05-14T14:40:25.950
0,9,2,2014-05-14T00:36:31.077,5,,2014-05-14T00:36:31.077,,,,0,,
1,14,1,2014-05-14T01:25:59.677,21,1243.0,2014-06-20T17:36:05.023,Is Data Science the Same as Data Mining?,<data-mining><definitions>,4.0,1,4.0,
2,16,1,2014-05-14T01:57:56.880,18,322.0,2014-05-17T16:24:14.523,Use liblinear on big data for semantic analysis,<machine-learning><bigdata><libsvm>,2.0,0,,
3,18,4,2014-05-14T02:49:14.580,0,,2014-05-14T02:49:14.580,,,,0,,
4,20,1,2014-05-14T05:37:46.780,17,311.0,2017-08-29T11:26:37.137,the data on our relational DBMS is getting big...,<nosql><relational-dbms>,5.0,1,1.0,


### Working with columns

In [38]:
# usecols=[0, 8] loads only the columns at positions 0 and 8.
pd.read_csv('posts-100.csv', usecols=[0, 8]).head()

Unnamed: 0,5,1.1
0,7,3.0
1,9,
2,10,
3,14,4.0
4,15,0.0


In [39]:
# With the defaults, the first line of the file supplies the column labels.
pd.read_csv('posts-100.csv').columns

Index(['5', '1', '2014-05-13T23:58:30.457', '9', '448',
       '2014-05-14T00:36:31.077',
       'How can I do simple machine learning without hard-coding behavior?',
       '<machine-learning>', '1.1', '1.2', '1.3', '2014-05-14T14:40:25.950'],
      dtype='object')

In [40]:
# header=None declares that the file has no header line, so the columns get
# integer labels instead.
pd.read_csv('posts-100.csv', header=None).columns

Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], dtype='int64')

In [41]:
# With header=None the first line of the file is kept as data (row 0).
pd.read_csv('posts-100.csv', header=None).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,5,1,2014-05-13T23:58:30.457,9,448.0,2014-05-14T00:36:31.077,How can I do simple machine learning without h...,<machine-learning>,1.0,1,1.0,2014-05-14T14:40:25.950
1,7,1,2014-05-14T00:11:06.457,4,388.0,2014-05-16T13:45:00.237,What open-source books (or other materials) pr...,<education><open-source>,3.0,4,1.0,2014-05-14T08:40:54.950
2,9,2,2014-05-14T00:36:31.077,5,,2014-05-14T00:36:31.077,,,,0,,
3,10,2,2014-05-14T00:53:43.273,12,,2014-05-14T00:53:43.273,,,,1,,
4,14,1,2014-05-14T01:25:59.677,21,1243.0,2014-06-20T17:36:05.023,Is Data Science the Same as Data Mining?,<data-mining><definitions>,4.0,1,4.0,


In [42]:
# Load without a header line and, in the same expression, prefix the integer
# column labels with 'Col' (0 -> 'Col0', 1 -> 'Col1', ...).
data = (
  pd.read_csv('posts-100.csv', header=None)
  .add_prefix('Col')
)
data.head()

Unnamed: 0,Col0,Col1,Col2,Col3,Col4,Col5,Col6,Col7,Col8,Col9,Col10,Col11
0,5,1,2014-05-13T23:58:30.457,9,448.0,2014-05-14T00:36:31.077,How can I do simple machine learning without h...,<machine-learning>,1.0,1,1.0,2014-05-14T14:40:25.950
1,7,1,2014-05-14T00:11:06.457,4,388.0,2014-05-16T13:45:00.237,What open-source books (or other materials) pr...,<education><open-source>,3.0,4,1.0,2014-05-14T08:40:54.950
2,9,2,2014-05-14T00:36:31.077,5,,2014-05-14T00:36:31.077,,,,0,,
3,10,2,2014-05-14T00:53:43.273,12,,2014-05-14T00:53:43.273,,,,1,,
4,14,1,2014-05-14T01:25:59.677,21,1243.0,2014-06-20T17:36:05.023,Is Data Science the Same as Data Mining?,<data-mining><definitions>,4.0,1,4.0,


In [43]:
# Column labels to apply to posts-100.csv, one per column, in file order.
headers = [
  "Id", "PostTypeId", "CreationDate", "Score",
  "ViewCount", "LastActivityDate", "Title", "Tags",
  "AnswerCount", "CommentCount", "FavoriteCount", "ClosedDate",
]
# names= supplies the column labels, so the first line of the file is kept
# as data.
pd.read_csv('posts-100.csv', names=headers).head()

Unnamed: 0,Id,PostTypeId,CreationDate,Score,ViewCount,LastActivityDate,Title,Tags,AnswerCount,CommentCount,FavoriteCount,ClosedDate
0,5,1,2014-05-13T23:58:30.457,9,448.0,2014-05-14T00:36:31.077,How can I do simple machine learning without h...,<machine-learning>,1.0,1,1.0,2014-05-14T14:40:25.950
1,7,1,2014-05-14T00:11:06.457,4,388.0,2014-05-16T13:45:00.237,What open-source books (or other materials) pr...,<education><open-source>,3.0,4,1.0,2014-05-14T08:40:54.950
2,9,2,2014-05-14T00:36:31.077,5,,2014-05-14T00:36:31.077,,,,0,,
3,10,2,2014-05-14T00:53:43.273,12,,2014-05-14T00:53:43.273,,,,1,,
4,14,1,2014-05-14T01:25:59.677,21,1243.0,2014-06-20T17:36:05.023,Is Data Science the Same as Data Mining?,<data-mining><definitions>,4.0,1,4.0,


In [44]:
# This variant of the file starts with a header line, so read_csv's defaults
# pick up the real column names.
posts_header = pd.read_csv('posts-100-header.csv')
posts_header.columns

Index(['Id', 'PostTypeId', 'CreationDate', 'Score', 'ViewCount',
       'LastActivityDate', 'Title', 'Tags', 'AnswerCount', 'CommentCount',
       'FavoriteCount', 'ClosedDate'],
      dtype='object')

In [45]:
# Selecting one column by its label returns a Series.
posts_header['AnswerCount'].head()

0    1.0
1    3.0
2    NaN
3    NaN
4    4.0
Name: AnswerCount, dtype: float64

In [46]:
# Selecting several columns with a list of labels returns a DataFrame.
posts_header.loc[:, ['Id', 'PostTypeId']].head()

Unnamed: 0,Id,PostTypeId
0,5,1
1,7,1
2,9,2
3,10,2
4,14,1


In [47]:
# Preview the first five rows of the frame that was loaded with its header.
posts_header.head()

Unnamed: 0,Id,PostTypeId,CreationDate,Score,ViewCount,LastActivityDate,Title,Tags,AnswerCount,CommentCount,FavoriteCount,ClosedDate
0,5,1,2014-05-13T23:58:30.457,9,448.0,2014-05-14T00:36:31.077,How can I do simple machine learning without h...,<machine-learning>,1.0,1,1.0,2014-05-14T14:40:25.950
1,7,1,2014-05-14T00:11:06.457,4,388.0,2014-05-16T13:45:00.237,What open-source books (or other materials) pr...,<education><open-source>,3.0,4,1.0,2014-05-14T08:40:54.950
2,9,2,2014-05-14T00:36:31.077,5,,2014-05-14T00:36:31.077,,,,0,,
3,10,2,2014-05-14T00:53:43.273,12,,2014-05-14T00:53:43.273,,,,1,,
4,14,1,2014-05-14T01:25:59.677,21,1243.0,2014-06-20T17:36:05.023,Is Data Science the Same as Data Mining?,<data-mining><definitions>,4.0,1,4.0,


In [48]:
# Headers can be removed: skiprows=1 drops the header line. Note that the
# next line (the first record) is then used as the column labels, as the
# output shows.
pd.read_csv('posts-100-header.csv', skiprows=1).head()

Unnamed: 0,5,1,2014-05-13T23:58:30.457,9,448,2014-05-14T00:36:31.077,How can I do simple machine learning without hard-coding behavior?,<machine-learning>,1.1,1.2,1.3,2014-05-14T14:40:25.950
0,7,1,2014-05-14T00:11:06.457,4,388.0,2014-05-16T13:45:00.237,What open-source books (or other materials) pr...,<education><open-source>,3.0,4,1.0,2014-05-14T08:40:54.950
1,9,2,2014-05-14T00:36:31.077,5,,2014-05-14T00:36:31.077,,,,0,,
2,10,2,2014-05-14T00:53:43.273,12,,2014-05-14T00:53:43.273,,,,1,,
3,14,1,2014-05-14T01:25:59.677,21,1243.0,2014-06-20T17:36:05.023,Is Data Science the Same as Data Mining?,<data-mining><definitions>,4.0,1,4.0,
4,15,1,2014-05-14T01:41:23.110,2,543.0,2014-05-14T01:41:23.110,What are the advantages and disadvantages of S...,<databases>,0.0,1,,2014-05-14T07:41:49.437


In [49]:
# usecols by position also works on a file with a header: columns 0, 1, 2
# and 7 are kept, with their names.
pd.read_csv('posts-100-header.csv', usecols=[0, 1, 2, 7]).head()

Unnamed: 0,Id,PostTypeId,CreationDate,Tags
0,5,1,2014-05-13T23:58:30.457,<machine-learning>
1,7,1,2014-05-14T00:11:06.457,<education><open-source>
2,9,2,2014-05-14T00:36:31.077,
3,10,2,2014-05-14T00:53:43.273,
4,14,1,2014-05-14T01:25:59.677,<data-mining><definitions>


### Manipulating data types

In [50]:
# Inspect the dtypes pandas inferred for the selected columns.
pd.read_csv('posts-100-header.csv', usecols=[0, 1, 2, 7]).dtypes

Id               int64
PostTypeId       int64
CreationDate    object
Tags            object
dtype: object

In [51]:
# dtype= overrides inference per column: PostTypeId is loaded as str, which
# is reported as 'object'.
pd.read_csv('posts-100-header.csv', usecols=[0, 1, 2, 7], dtype={'PostTypeId': str}).dtypes

Id               int64
PostTypeId      object
CreationDate    object
Tags            object
dtype: object

In [52]:
# The same column can be loaded as float instead of the inferred int64.
pd.read_csv('posts-100-header.csv', usecols=[0, 1, 2, 7], dtype={'PostTypeId': float}).dtypes

Id                int64
PostTypeId      float64
CreationDate     object
Tags             object
dtype: object

In [53]:
import re

# Tags arrive as a single string such as '<education><open-source>'. Match
# any non-empty run of characters enclosed in angle brackets, so tags that
# contain characters like '+', '#' or '.' are kept and an empty '<>' is not
# reported as a tag.
_TAG_PATTERN = re.compile(r'<[^<>]+>')

# Maps a column name to the callable applied to each raw value of that
# column; here every Tags cell becomes a list of '<tag>' strings.
tag_to_list_converter = {'Tags': lambda x: _TAG_PATTERN.findall(x)}
# Apply the converter to the Tags column while reading, so each cell holds a
# list of tags instead of one string.
pd.read_csv('posts-100-header.csv', usecols=[0, 1, 2, 7], converters=tag_to_list_converter).head()

Unnamed: 0,Id,PostTypeId,CreationDate,Tags
0,5,1,2014-05-13T23:58:30.457,[<machine-learning>]
1,7,1,2014-05-14T00:11:06.457,"[<education>, <open-source>]"
2,9,2,2014-05-14T00:36:31.077,[]
3,10,2,2014-05-14T00:53:43.273,[]
4,14,1,2014-05-14T01:25:59.677,"[<data-mining>, <definitions>]"


In [54]:
# parse_dates converts the CreationDate column to timestamps while reading;
# checking the first cell's type confirms the conversion.
posts_date = pd.read_csv(
  'posts-100-header.csv',
  usecols=[0, 1, 2, 7],
  parse_dates=['CreationDate'],
)
print(type(posts_date.loc[0, 'CreationDate']))
posts_date.head()

<class 'pandas._libs.tslibs.timestamps.Timestamp'>


Unnamed: 0,Id,PostTypeId,CreationDate,Tags
0,5,1,2014-05-13 23:58:30.457,<machine-learning>
1,7,1,2014-05-14 00:11:06.457,<education><open-source>
2,9,2,2014-05-14 00:36:31.077,
3,10,2,2014-05-14 00:53:43.273,
4,14,1,2014-05-14 01:25:59.677,<data-mining><definitions>


In [55]:
# Load only Id and the score/count columns, selected by position.
pd.read_csv('posts-100-header.csv', usecols=[0, 3, 4, 8, 9, 10]).head()

Unnamed: 0,Id,Score,ViewCount,AnswerCount,CommentCount,FavoriteCount
0,5,9,448.0,1.0,1,1.0
1,7,4,388.0,3.0,4,1.0
2,9,5,,,0,
3,10,12,,,1,
4,14,21,1243.0,4.0,1,4.0


In [56]:
# na_filter=False turns off pandas' detection of missing-value markers while
# parsing.
pd.read_csv('posts-100-header.csv', usecols=[0, 3, 4, 8, 9, 10], na_filter=False).head()

Unnamed: 0,Id,Score,ViewCount,AnswerCount,CommentCount,FavoriteCount
0,5,9,448.0,1.0,1,1.0
1,7,4,388.0,3.0,4,1.0
2,9,5,,,0,
3,10,12,,,1,
4,14,21,1243.0,4.0,1,4.0


In [57]:
# na_filter=True is the default: missing-value markers are detected while
# parsing.
pd.read_csv('posts-100-header.csv', usecols=[0, 3, 4, 8, 9, 10], na_filter=True).head()

Unnamed: 0,Id,Score,ViewCount,AnswerCount,CommentCount,FavoriteCount
0,5,9,448.0,1.0,1,1.0
1,7,4,388.0,3.0,4,1.0
2,9,5,,,0,
3,10,12,,,1,
4,14,21,1243.0,4.0,1,4.0


### Tabular files

In [58]:
# Reading a tab-separated file with the default comma separator fails: the
# parser sees a different number of fields on different lines.
try:
  pd.read_csv('posts-100.tsv', header=None)
except pd.errors.ParserError as e:
  # Catch only the tokenizing error being demonstrated; any other failure
  # (for example a missing file) should surface instead of being printed.
  print(e)

Error tokenizing data. C error: Expected 1 fields in line 11, saw 2



In [59]:
# Passing a tab as sep tells read_csv that the fields are tab-separated.
pd.read_csv('posts-100.tsv', header=None, sep='\t').head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,5,1,2014-05-13T23:58:30.457,9,448.0,2014-05-14T00:36:31.077,How can I do simple machine learning without h...,<machine-learning>,1.0,1,1.0,2014-05-14T14:40:25.950
1,7,1,2014-05-14T00:11:06.457,4,388.0,2014-05-16T13:45:00.237,What open-source books (or other materials) pr...,<education><open-source>,3.0,4,1.0,2014-05-14T08:40:54.950
2,9,2,2014-05-14T00:36:31.077,5,,2014-05-14T00:36:31.077,,,,0,,
3,10,2,2014-05-14T00:53:43.273,12,,2014-05-14T00:53:43.273,,,,1,,
4,14,1,2014-05-14T01:25:59.677,21,1243.0,2014-06-20T17:36:05.023,Is Data Science the Same as Data Mining?,<data-mining><definitions>,4.0,1,4.0,


In [60]:
# read_table uses a tab separator by default, so no sep argument is needed.
pd.read_table('posts-100.tsv', header=None).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,5,1,2014-05-13T23:58:30.457,9,448.0,2014-05-14T00:36:31.077,How can I do simple machine learning without h...,<machine-learning>,1.0,1,1.0,2014-05-14T14:40:25.950
1,7,1,2014-05-14T00:11:06.457,4,388.0,2014-05-16T13:45:00.237,What open-source books (or other materials) pr...,<education><open-source>,3.0,4,1.0,2014-05-14T08:40:54.950
2,9,2,2014-05-14T00:36:31.077,5,,2014-05-14T00:36:31.077,,,,0,,
3,10,2,2014-05-14T00:53:43.273,12,,2014-05-14T00:53:43.273,,,,1,,
4,14,1,2014-05-14T01:25:59.677,21,1243.0,2014-06-20T17:36:05.023,Is Data Science the Same as Data Mining?,<data-mining><definitions>,4.0,1,4.0,


### Remote files

In [61]:
# read_csv also accepts a URL: the file is fetched over the network and then
# parsed like a local one.
pd.read_csv('https://raw.githubusercontent.com/xmorera/sample-data/main/posts-100.csv', header=None).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,5,1,2014-05-13T23:58:30.457,9,448.0,2014-05-14T00:36:31.077,How can I do simple machine learning without h...,<machine-learning>,1.0,1,1.0,2014-05-14T14:40:25.950
1,7,1,2014-05-14T00:11:06.457,4,388.0,2014-05-16T13:45:00.237,What open-source books (or other materials) pr...,<education><open-source>,3.0,4,1.0,2014-05-14T08:40:54.950
2,9,2,2014-05-14T00:36:31.077,5,,2014-05-14T00:36:31.077,,,,0,,
3,10,2,2014-05-14T00:53:43.273,12,,2014-05-14T00:53:43.273,,,,1,,
4,14,1,2014-05-14T01:25:59.677,21,1243.0,2014-06-20T17:36:05.023,Is Data Science the Same as Data Mining?,<data-mining><definitions>,4.0,1,4.0,
