# Hadoop client


# 创建连接


In [35]:
import hdfs
from pprint import pprint

In [36]:
# WebHDFS client for the local NameNode, acting as the "hadoop" user.
hdfs_client = hdfs.InsecureClient(
    "http://localhost:9870",
    user="hadoop",
)

# 目录/列表

## 获取目录操作权限
默认没有操作目录的权限，需先在 hadoop shell 中执行如下命令：
`hadoop fs -chmod 777 /`

In [37]:
# Show the root directory three ways: bare names, names with their status
# records, and the status record of the directory itself.
root_dir = "/"
print(hdfs_client.list(root_dir))
pprint(hdfs_client.list(root_dir, status=True))
pprint(hdfs_client.status(root_dir))

[]
[]
{'accessTime': 0,
 'blockSize': 0,
 'childrenNum': 0,
 'fileId': 16385,
 'group': 'supergroup',
 'length': 0,
 'modificationTime': 1701683192308,
 'owner': 'hadoop',
 'pathSuffix': '',
 'permission': '755',
 'replication': 0,
 'snapshotEnabled': True,
 'storagePolicy': 0,
 'type': 'DIRECTORY'}


## 创建目录

In [38]:
# Create the nested directories in one call. The permission is written as a
# string of octal digits so it cannot be mistaken for the decimal integer 777
# (or "corrected" to 0o777, which would be sent as 511).
hdfs_client.makedirs("/a/b/c", permission="777")

# List each level to confirm that every intermediate directory was created.
for parent_dir in ("/", "/a", "/a/b"):
    print(hdfs_client.list(parent_dir))


['a']
['b']
['c']


## 查看目录详情
查看当前目录下有多少个子目录、多少文件等等。

In [39]:
# Content summary of the root directory (directory/file counts, sizes, quotas).
root_summary = hdfs_client.content("/", strict=True)
print(root_summary)

{'directoryCount': 4, 'ecPolicy': '', 'fileCount': 0, 'length': 0, 'quota': 9223372036854775807, 'snapshotDirectoryCount': 0, 'snapshotFileCount': 0, 'snapshotLength': 0, 'snapshotSpaceConsumed': 0, 'spaceConsumed': 0, 'spaceQuota': -1, 'typeQuota': {}}


## 遍历目录


In [40]:
# Each item yielded by walk() is a (path, directory names, file names) tuple.
for dir_path, dir_names, file_names in hdfs_client.walk("/"):
    print((dir_path, dir_names, file_names))

('/', ['a'], [])
('/a', ['b'], [])
('/a/b', ['c'], [])
('/a/b/c', [], [])


# 读/写/删除

## 普通写

### Attention

* 需要能够通过网络访问 namenode 和 datanode（需要配置 hosts 文件，把 datanode 的域名解析到 127.0.0.1）。
  如果网络不通，会报错 `ConnectionError: HTTPConnectionPool(host='e149fa31e917', port=9864): Max retries exceeded with url`
* 从这里也能看出：namenode 仅仅负责路由，只会把 datanode 的地址返回给我们，由我们自己继续请求 datanode，所以必须拥有 datanode 的访问权限。

In [41]:
import json

# Serialize the payload first so the write block only has to stream it out.
write_data = json.dumps({"key": "value"})

# overwrite=True keeps this cell re-runnable when /test.json is still present
# from an earlier run; encoding="utf-8" makes the str -> bytes conversion
# explicit instead of leaving it to the HTTP layer.
with hdfs_client.write("/test.json", encoding="utf-8", overwrite=True) as writer:
    writer.write(write_data)

print(hdfs_client.list("/"))

# Reading without an encoding returns the raw bytes that were stored.
with hdfs_client.read("/test.json") as reader:
    features = reader.read()
    print(features)


['a', 'test.json']
b'{"key": "value"}'


### append

In [42]:
# Append the same UTF-8 encoded sentence twice to the end of the existing file.
appended_chunk = "Winter is coming again".encode("utf-8")
with hdfs_client.write("/test.json", append=True) as writer:
    for _ in range(2):
        writer.write(appended_chunk)

# Read the whole file back and decode it to show original content + appends.
with hdfs_client.read("/test.json") as reader:
    print(reader.read().decode("utf-8"))

{"key": "value"}Winter is coming againWinter is coming again


### overwrite

In [43]:
# Replace the file's content entirely with a single UTF-8 encoded sentence.
with hdfs_client.write("/test.json", overwrite=True) as writer:
    writer.write("Winter is coming again".encode("utf-8"))

# Read the file back; only the newly written sentence should remain.
with hdfs_client.read("/test.json") as reader:
    overwritten_text = reader.read().decode("utf-8")
print(overwritten_text)

Winter is coming again


## 删除

In [44]:
# Remove the file. Only errors reported by the HDFS client are caught and
# printed; anything else (e.g. a programming error) is left to surface.
try:
    hdfs_client.delete("/test.json")
except hdfs.HdfsError as e:
    print(e)
print(hdfs_client.list("/"))

['a']


# 上传/下载

In [45]:
# Upload the local test.json into the HDFS root, then check that it is listed.
upload_target_dir = "/"
hdfs_client.upload(hdfs_path=upload_target_dir, local_path="./test.json")
print("test.json" in hdfs_client.list(upload_target_dir))  # True

True


In [51]:
# Fetch /test.json from HDFS into a local file. overwrite=True lets this cell
# be re-run without tripping over the copy left by a previous run.
hdfs_client.download(hdfs_path="/test.json", local_path="./remote_json", overwrite=True)

# Read through a context manager so the local file handle is always closed.
with open("./remote_json", "r", encoding="utf-8") as local_file:
    print(local_file.read())

{
    "upload":"success"
}


# 文件权限

In [None]:
# Set the owner
# hdfs_client.set_owner
# Set the permissions
# hdfs_client.set_permission
# Set the replication factor
# hdfs_client.set_replication
# Set the times
# hdfs_client.set_times