# Pandas data import
The ultimate guide to importing data from various data sources into a pandas DataFrame.

In [14]:
# import libraries
import pandas as pd 
import numpy as np
from google.cloud import bigquery

ModuleNotFoundError: No module named 'google.api_core'

In [3]:
# Locations of the sample data sets used by the cells below.

# Local files, given as relative paths.
csv_current_dir = "train.csv"
csv_another_dir = "./Dataset/train.csv"
excel_another_dir = "./Dataset/excel_data.xlsx"
feather_another_dir = "./Dataset/feather_data.feather"
textfile_another_dir = "./Dataset/textFile.txt"

# Remote CSV file, addressed by URL.
csv_from_url = "https://raw.githubusercontent.com/cs109/2014_data/master/countries.csv"

In [4]:
# BigQuery SQL text, kept as a plain string so it can be passed to the
# BigQuery client in a later cell.
# It selects three columns (weight_pounds, is_male, mother_age) from
# publicdata.samples.natality, keeps rows with year > 2000 and caps the
# result at 5 rows.
query = """
SELECT
  weight_pounds,
  is_male,
  mother_age
FROM
  publicdata.samples.natality
WHERE year > 2000
LIMIT 5
"""

In [5]:
# Load the CSV file that sits next to this notebook and preview its first rows.
df_import_current_dir = pd.read_csv(filepath_or_buffer=csv_current_dir)
df_import_current_dir.head(n=5)

Unnamed: 0,X1,X2,X3,TARGET
0,0.991066,1.462883,0.034695,10.543255
1,0.925121,-0.335997,0.261957,11.612457
2,0.50171,0.794581,0.445746,9.252304
3,0.933814,-0.580261,0.337145,12.352551
4,0.129216,-0.281915,0.036227,1.568255


In [6]:
# Load a CSV file stored in a sub-directory (path given relative to the
# notebook) and preview its first rows.
df_import_another_dir = pd.read_csv(filepath_or_buffer=csv_another_dir)
df_import_another_dir.head(n=5)

Unnamed: 0,X1,X2,X3,TARGET
0,0.991066,1.462883,0.034695,10.543255
1,0.925121,-0.335997,0.261957,11.612457
2,0.50171,0.794581,0.445746,9.252304
3,0.933814,-0.580261,0.337145,12.352551
4,0.129216,-0.281915,0.036227,1.568255


In [7]:
# pd.read_csv also accepts a URL: the remote CSV is loaded straight into a
# DataFrame, then previewed.
df_import_url = pd.read_csv(filepath_or_buffer=csv_from_url)
df_import_url.head(n=5)

Unnamed: 0,Country,Region
0,Algeria,AFRICA
1,Angola,AFRICA
2,Benin,AFRICA
3,Botswana,AFRICA
4,Burkina,AFRICA


In [8]:
# Build a DataFrame from the text currently on the system clipboard.
# Copy the table shown above before running this cell: the result depends
# entirely on what the clipboard holds at run time.
df_clipboard = pd.read_clipboard()
df_clipboard.head(n=5)

Unnamed: 0,C:\Users\rahul\.conda\envs\medium


In [9]:
# Load the Excel workbook into a DataFrame and preview its first rows.
# An Excel engine must be installed for this to work: xlrd 2.0 and newer
# only reads legacy .xls files, so .xlsx input needs the openpyxl package.
df_read_excel = pd.read_excel(io=excel_another_dir)
df_read_excel.head(n=5)

Unnamed: 0,A,B
0,1,2
1,2,3
2,3,4
3,4,5
4,5,6


In [10]:
# Feather round trip, part 1: generate demo data and export it.
# Feather is a binary file format that is typically much faster to read and
# write than CSV, and it can be used from both R and Python.
# This cell writes the file; the following cell reads it back.

# Seeded generator so that Restart & Run All reproduces the same values
# (an unseeded np.random.randn call yields different data on every run,
# which makes the displayed output impossible to reproduce).
rng = np.random.default_rng(42)
arr = rng.standard_normal((200, 4))  # 200 rows x 4 columns = 800 values
columnnames = ['col1', 'col2', 'col3', 'col4']
df = pd.DataFrame(arr, columns=columnnames)

# Export the frame to the Dataset folder in feather format.
df.to_feather(feather_another_dir)

In [11]:
# Read the feather file written by the previous cell back into a DataFrame
# and preview its first rows.
df_feather = pd.read_feather(path=feather_another_dir)
df_feather.head(n=5)

Unnamed: 0,col1,col2,col3,col4
0,-0.187477,0.291897,2.165954,-1.096965
1,1.523016,-0.462393,-0.031911,-0.072436
2,-1.669419,-1.315843,-1.77077,-1.379006
3,-1.079632,0.072103,0.62335,-0.196495
4,-0.839816,0.539958,0.323279,1.138621


In [12]:
# Parse a fixed-width text file into a DataFrame: skip the first three lines
# and the last two, and cut every remaining line into three fields that are
# 8, 12 and 19 characters wide.
df_textfile = pd.read_fwf(
    textfile_another_dir,
    widths=[8, 12, 19],
    skiprows=[0, 1, 2],
    skipfooter=2,
)
df_textfile

Unnamed: 0,name,title,salary
0,Rahul,Side kick,-100
1,Ironman,Rich Guy,1000000000000000000
2,Captain,Respect Guy,100000000000000
3,Hulk,Smash bro,1000000000000000


In [14]:
# Run the SQL held in `query` through a BigQuery client and convert the
# result to a pandas DataFrame, then preview its first rows.
df_gcp = (
    bigquery.Client()
    .query(query)
    .to_dataframe()
)
df_gcp.head(n=5)

ValueError: The pyarrow library is not installed, please install pyarrow to use the to_arrow() function.