-
Notifications
You must be signed in to change notification settings - Fork 0
/
findduplicatefiles.go
135 lines (114 loc) · 3.21 KB
/
findduplicatefiles.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
package findduplicatefiles
import (
"bufio"
"crypto/sha256"
"encoding/hex"
"flag"
"fmt"
"os"
"path/filepath"
)
// main parses the command-line flags, validates the target directory, and
// prints every group of duplicate files found beneath it.
func main() {
	var dir string
	var chunk int
	flag.StringVar(&dir, "directory", "", "The directory to search for duplicate files.")
	flag.IntVar(&chunk, "chunk", 1, "Size of initial hash to check. 1 indicates the full file hash. 2 is half etc.")
	flag.Parse()
	if dir == "" {
		// A missing directory is a usage error: report it on stderr and exit
		// non-zero (the original exited 0, hiding the failure from callers).
		fmt.Fprintln(os.Stderr, "No given directory to search.")
		os.Exit(1)
	}
	duplicates := FindDuplicateFiles(dir, chunk)
	fmt.Printf("Found the following duplicate files in directory %s :\n", dir)
	fmt.Println(duplicates)
}
// FindDuplicateFiles returns duplicates in the given directory, using chunk to determine the initial hash size. 1 is full file, 2 is half etc.
// First groups files by size, eliminating unique files. Then, finds duplicates by hash.
// Use a larger chunk (> 1) for directories of large files.
func FindDuplicateFiles(dir string, chunk int) [][]string {
	// Each inner slice is one group of files with identical content,
	// e.g. [[dup1, dup2], [dupA1, dupA2]].
	groups := make([][]string, 0)
	for size, paths := range findDuplicatesBySize(dir) {
		// Zero-byte files are trivially identical; no hashing required.
		if size == 0 {
			if len(paths) > 1 {
				groups = append(groups, paths)
			}
			continue
		}
		if dupes := findDuplicatesByHash(paths, chunk, int(size)); dupes != nil {
			groups = append(groups, dupes...)
		}
	}
	return groups
}
// findDuplicatesBySize walks dir and buckets every non-directory entry by its
// byte size; only files of equal size can possibly be duplicates.
func findDuplicatesBySize(dir string) map[int64][]string {
	// map[size] -> paths of all files with that size
	sizeBuckets := make(map[int64][]string)
	walker := func(path string, info os.FileInfo, err error) error {
		check(err)
		if info.IsDir() {
			return nil
		}
		sizeBuckets[info.Size()] = append(sizeBuckets[info.Size()], path)
		return nil
	}
	check(filepath.Walk(dir, walker))
	return sizeBuckets
}
// findDuplicatesByHash partitions files (each exactly fileSize bytes long)
// into groups of identical content. It first hashes only the leading
// fileSize/fileChunk bytes to cheaply rule out differing files, then
// confirms remaining candidates with a full-file hash.
func findDuplicatesByHash(files []string, fileChunk int, fileSize int) [][]string {
	hashes := make(map[string][]string)
	duplicates := make([][]string, 0)
	seekAt := fileSize / fileChunk
	for _, f := range files {
		fileHash := generateHash(f, seekAt)
		hashes[fileHash] = append(hashes[fileHash], f)
	}
	fullFileHashes := make(map[string][]string)
	// fileChunk 1 means we've already checked the full file hash
	if fileChunk == 1 {
		fullFileHashes = hashes
	} else {
		for _, group := range hashes {
			// A file whose chunk hash is unique cannot have a duplicate —
			// identical files always share identical leading bytes — so skip
			// the expensive full-file hash for singleton groups.
			if len(group) < 2 {
				continue
			}
			for _, f := range group {
				// Setting chunk to fileSize means we
				// generate a hash for the full file
				fullHash := generateHash(f, fileSize)
				fullFileHashes[fullHash] = append(fullFileHashes[fullHash], f)
			}
		}
	}
	for _, files := range fullFileHashes {
		if len(files) > 1 {
			duplicates = append(duplicates, files)
		}
	}
	return duplicates
}
// generateHash returns the hex-encoded SHA-256 digest of the first chunk
// bytes of the file at fp. Callers pass chunk values no larger than the
// file's size, so filling the buffer is always expected to succeed.
func generateHash(fp string, chunk int) string {
	f, err := os.Open(fp)
	check(err)
	defer f.Close()
	// Generate hash for chunk only
	b := make([]byte, chunk)
	r := bufio.NewReader(f)
	// io.ReadFull keeps reading until b is full. A bare r.Read could legally
	// return fewer than chunk bytes, which would hash trailing zero padding
	// instead of real file content and miscompare files.
	_, err = io.ReadFull(r, b)
	check(err)
	hasher := sha256.New()
	_, err = hasher.Write(b)
	check(err)
	checksum := hasher.Sum(nil)
	hexDigest := hex.EncodeToString(checksum)
	return hexDigest
}
// check panics if e is non-nil; this program treats any I/O failure as fatal.
func check(e error) {
	if e == nil {
		return
	}
	panic(e)
}