### Step1: 导入必要的库

In [1]:
import sys
import pandas as pd #数据分析
import numpy as np # 矩阵计算

## 数据读取

- 文件格式: 大文件 chunk方式读取
- 数据库：sqlite MySQL MongoDB 等
- 数据采样

### Q007: 读取数据时指定列的数据类型

默认情况pandas 会推断数据的每列类型。

In [37]:
# 将所有字段都设置为 字符串，拒绝 pandas 的类型推断
data = pd.read_csv('../input/titanic.csv', dtype= str)

In [38]:
# 查看各个列的类型
data.dtypes

PassengerId    object
Survived       object
Pclass         object
Name           object
Sex            object
Age            object
SibSp          object
Parch          object
Ticket         object
Fare           object
Cabin          object
Embarked       object
dtype: object

In [42]:
print(sys.getsizeof(data)/1024,"KB")

638.421875 KB


In [40]:
sys.getsizeof??

也可以通过 字典dict 去指定各列的类型。 

In [43]:
dtype = {'Pclass':int,'Fare':float,'Survived':str}

In [44]:
data = pd.read_csv('../input/titanic.csv', dtype= dtypes)

In [45]:
# 查看各个列的类型
data.dtypes

PassengerId      int64
Survived        object
Pclass           int32
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [46]:
print(sys.getsizeof(data)/1024,"KB")

362.044921875 KB


### Q008: 超大数据文件读取 

对于超大文件的处理往往受限于内存，我们没办法一次读取。通常采取循环遍历的方式， 依次读取数据的固定行，直至读完。

In [8]:
# 设置每次读取的行数
chunksize = 10

In [29]:
data_iter = pd.read_csv('../input/titanic.csv', chunksize =chunksize , error_bad_lines = False)

In [11]:
data_iter

<pandas.io.parsers.TextFileReader at 0x28341855f60>

设置迭代参数 loop\ chunksize\ chunks \count 将每块数据存在list中

In [30]:
chunks = []
for df in data_iter:
    # 处理df
    chunks.append(df)

In [13]:
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


循环数据读取数据,(捕捉迭代错误)

合并chunks数据

In [31]:
data = pd.concat(chunks,ignore_index = True)

In [32]:
data.shape

(891, 12)


**以上两种都是节省 pandas 的数据内存占用的常用方式。除此以外，还可以通过 `ussecols =  需要保留列`参数在读取时，只选取需要的列，从源头降低内存占用**

## 2.2. 数据库操作

sqlite 是一款轻型的嵌入式数据库。无需安装数据库软件，可直接使用。 **初学者，用它练习 SQL 是一个不错的选择。** 

In [2]:
import sqlite3

In [68]:
# 数据库名称复制为特定的名称 :memory:，这样就会在 RAM 中创建一个数据库
# conn = sqlite3.connect(':memory:')
conn = sqlite3.connect('../input/test.db')
c = conn.cursor()

### pandas方式

In [3]:
# pandas 写入
data.to_sql('titanic', conn)

In [65]:
# pandas 读取 
# 注： SQL 中不区分大小写
df = pd.read_sql_query("SELECT * FROM TITANIC", conn)
df = pd.read_sql_query("SELECT PassengerId,Survived,Pclass FROM TITANIC WHERE PassengerId >30 ", conn)

df 

Unnamed: 0,PassengerId,Survived,Pclass
0,31,0,1
1,32,1,1
2,33,1,3
3,34,0,2
4,35,0,1
...,...,...,...
856,887,0,2
857,888,1,1
858,889,0,3
859,890,1,1


## SQL 进阶 - 增删改查

### 2.2.1 sqlite 创建表

In [50]:
c.execute('''CREATE TABLE IF NOT EXISTS COMPANY
       (ID INT PRIMARY KEY     NOT NULL,
       NAME           TEXT    NOT NULL,
       AGE            INT     NOT NULL,
       ADDRESS        CHAR(50),
       SALARY         REAL);''')
print("Table created successfully");
# conn.commit()
# conn.close()

Table created successfully


### 2.2.2 插入数据

In [51]:
#insert 操作
c.execute("INSERT INTO COMPANY (ID,NAME,AGE,ADDRESS,SALARY) \
      VALUES (1, 'Paul', 32, 'California', 20000.00 )");

c.execute("INSERT INTO COMPANY (ID,NAME,AGE,ADDRESS,SALARY) \
      VALUES (2, 'Allen', 25, 'Texas', 15000.00 )");

c.execute("INSERT INTO COMPANY (ID,NAME,AGE,ADDRESS,SALARY) \
      VALUES (3, 'Teddy', 23, 'Norway', 20000.00 )");

c.execute("INSERT INTO COMPANY (ID,NAME,AGE,ADDRESS,SALARY) \
      VALUES (4, 'Mark', 25, 'Rich-Mond ', 65000.00 )");

# conn.commit()
print("Records created successfully");
# conn.close()

Records created successfully



### 2.2.3  sqlite 数据读取

In [52]:
#select 操作
cursor = c.execute("SELECT id, name, address, salary  from COMPANY")
for row in cursor:
   print("ID = ", row[0])
   print("NAME = ", row[1])
   print("ADDRESS = ", row[2])
   print("SALARY = ", row[3], "\n")

print("Operation done successfully");
# conn.close()

ID =  1
NAME =  Paul
ADDRESS =  California
SALARY =  20000.0 

ID =  2
NAME =  Allen
ADDRESS =  Texas
SALARY =  15000.0 

ID =  3
NAME =  Teddy
ADDRESS =  Norway
SALARY =  20000.0 

ID =  4
NAME =  Mark
ADDRESS =  Rich-Mond 
SALARY =  65000.0 

Operation done successfully


### 2.2.4 修改数据

In [56]:
c.execute("UPDATE COMPANY set SALARY = 25000.00 where ID=1")
conn.commit()
print("Total number of rows updated :", conn.total_changes)

cursor = conn.execute("SELECT id, name, address, salary  from COMPANY")
for row in cursor:
   print("ID = ", row[0])
   print("NAME = ", row[1])
   print("ADDRESS = ", row[2])
   print("SALARY = ", row[3], "\n")

print("Operation done successfully");

Total number of rows updated : 896
ID =  1
NAME =  Paul
ADDRESS =  California
SALARY =  25000.0 

ID =  2
NAME =  Allen
ADDRESS =  Texas
SALARY =  15000.0 

ID =  3
NAME =  Teddy
ADDRESS =  Norway
SALARY =  20000.0 

ID =  4
NAME =  Mark
ADDRESS =  Rich-Mond 
SALARY =  65000.0 

Operation done successfully


### 2.2.5 删除数据 

In [57]:
c.execute("DELETE from COMPANY where ID=2;")

print("Total number of rows updated :", conn.total_changes)

cursor = conn.execute("SELECT id, name, address, salary  from COMPANY")
for row in cursor:
   print("ID = ", row[0])
   print("NAME = ", row[1])
   print("ADDRESS = ", row[2])
   print("SALARY = ", row[3], "\n")

print("Operation done successfully");

Total number of rows updated : 897
ID =  1
NAME =  Paul
ADDRESS =  California
SALARY =  25000.0 

ID =  3
NAME =  Teddy
ADDRESS =  Norway
SALARY =  20000.0 

ID =  4
NAME =  Mark
ADDRESS =  Rich-Mond 
SALARY =  65000.0 

Operation done successfully


In [58]:
from sqlalchemy import create_engine

In [59]:
# engine = create_engine('sqlite:///:memory:')
engine = create_engine('sqlite:///test.db')

data.to_sql('data_train',engine,chunksize =1000)


#### 简单的用法

In [60]:
from sqlalchemy import create_engine, MetaData  

In [None]:
engine = create_engine('sqlite:///test.sqlite')  
# metadata = MetaData(bind=engine, reflect=True)  

In [None]:
# 创建表  
engine.execute('CREATE TABLE IF NOT EXISTS user(name VARCHAR, password VARCHAR)')  

In [None]:
# 插入数据  
engine.execute('INSERT INTO user(name, password) VALUES(?, ?)', [('tom', '123'), ('john', '321')])  

In [62]:
# 查询  
for row in engine.execute('SELECT * FROM user'):  
    print(row.name, row['name'], row[0]  )

tom tom tom
john john john
tom tom tom
john john john
