-
Notifications
You must be signed in to change notification settings - Fork 4
/
scraper.ts
219 lines (195 loc) · 6.55 KB
/
scraper.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
import { isAddress } from "@ethersproject/address";
import { chunk } from "../ui/array-utils";
import { ensGraphFetch } from "./fetch-ens-graph";
// Regex matches for addresses and ENS names
// Matches "0x" followed by word characters; trimmed to 42 chars downstream
const addressRegex: RegExp = /(0x[a-zA-Z0-9])\w+/;
// Matches "<name>.eth" — dot is escaped (the previous `.` matched any
// character, so e.g. "fooXeth" was wrongly treated as an ENS name);
// case-insensitive to catch ".ETH" etc.
const ENSRegex: RegExp = /([^ ]+\.(eth))/i;
// One address/ENS candidate extracted from a tweet
type AddressInfo = { addr: string; tweet: string; ens?: string };
/**
 * Minimal in-memory log collector: mirrors each entry to the console,
 * stores it, and notifies an optional listener with the full history.
 */
class Logger {
  // Accumulated [level, message] pairs, oldest first
  logs: [string, string][] = [];
  // Optional subscriber invoked with the full log array after each entry
  // (was `any`; typed so bad assignments are caught at compile time)
  listener?: (logs: [string, string][]) => void;
  /**
   * Record an informational message.
   * @param {string} info message to log
   */
  info(info: string) {
    console.info(info);
    this.logs.push(["info", info]);
    if (this.listener) {
      this.listener(this.logs);
    }
  }
  /**
   * Record an error message.
   * @param {string} error message to log
   */
  error(error: string) {
    console.error(error);
    this.logs.push(["error", error]);
    if (this.listener) {
      this.listener(this.logs);
    }
  }
}
/**
 * Scrapes a Twitter thread for Ethereum addresses and ENS names.
 *
 * Pipeline: collectTweets -> cleanTweetsForAddresses -> convertENS.
 * NOTE(review): RPC-based ENS resolution (the commented-out provider)
 * was replaced by the ENS subgraph via ensGraphFetch.
 */
export default class Scraper {
  // Tweet conversation ID to scrape
  conversationID: string;
  // Twitter 2.0 bearer token (may be empty when the proxy needs no auth)
  twitterBearer: string;
  // Number of tokens to distribute per address
  numTokens: number;
  // Collects progress/error messages (mirrored to the console)
  logger: Logger;
  // Collected tweets from Twitter API
  tweets: { id: string; text: string }[] = [];
  // Cleaned addresses from tweets
  addresses: AddressInfo[] = [];
  /**
   * Setup scraper
   * @param {string} conversationID to scrape
   * @param {string} twitterBearer 2.0 token
   * @param {number} numTokens to distribute per address
   */
  constructor(
    conversationID: string,
    twitterBearer: string,
    numTokens: number,
  ) {
    this.conversationID = conversationID;
    this.twitterBearer = twitterBearer;
    this.numTokens = numTokens;
    this.logger = new Logger();
  }
  /**
   * Generates endpoint to query for tweets from a thread
   * @param {string?} nextToken if paginating tweets
   * @returns {string} endpoint url
   */
  generateEndpoint(nextToken?: string): string {
    const baseEndpoint: string =
      "https://dark-resonance.isiain.workers.dev/2/tweets/search/recent?query=conversation_id:" +
      // Append conversation ID
      this.conversationID +
      // Collect max allowed results
      "&max_results=100";
    // If paginating, append next_token to endpoint
    return nextToken ? `${baseEndpoint}&next_token=${nextToken}` : baseEndpoint;
  }
  /**
   * Recursively collect tweets from a thread (max. 100 per run)
   * @param {string?} nextSearchToken optional pagination token
   */
  async collectTweets(nextSearchToken?: string): Promise<void> {
    // Only send an auth header when a bearer token was provided
    const headers: Record<string, string> = {};
    if (this.twitterBearer) {
      headers["authorization"] = `Bearer ${this.twitterBearer}`;
    }
    // generateEndpoint is synchronous — the previous `await` on it was a no-op
    const resp = await fetch(this.generateEndpoint(nextSearchToken), {
      headers,
    });
    const data = await resp.json();
    // The API omits `data` (and `meta.next_token`) on zero-result pages —
    // guard so an empty page doesn't throw on the spread below
    const tweets: { id: string; text: string }[] = data.data ?? [];
    this.tweets.push(...tweets);
    this.logger.info(`Collected ${tweets.length} tweets`);
    const nextToken: string | undefined = data.meta?.next_token;
    // If pagination token exists: collect next page of tweets
    if (nextToken) {
      await this.collectTweets(nextToken);
    }
  }
  /**
   * Cleans individual tweets, filtering for addresses and ENS names.
   * Pushes exactly ONE candidate per match: the previous version pushed
   * a lower-cased copy AND the original form for mixed-case ENS names,
   * producing duplicate entries (and its `/ens$/i` check was a typo that
   * never matched names ending in "eth").
   */
  cleanTweetsForAddresses(): void {
    for (const tweet of this.tweets) {
      // Remove line-breaks, etc.
      const cleanedText: string = tweet.text.replace(/(\r\n|\n|\r)/gm, "");
      const foundAddress: RegExpMatchArray | null =
        cleanedText.match(addressRegex);
      const foundENS: RegExpMatchArray | null = cleanedText.match(ENSRegex);
      for (const found of [foundAddress, foundENS]) {
        // If match in tweet
        if (found && found.length > 0) {
          const raw = found[0];
          // 0x addresses: quick cleaning to only grab first 42 characters.
          // ENS names: lower-case once to normalize accidental upper-case.
          const addr: string = raw.startsWith("0x")
            ? raw.substring(0, 42)
            : raw.toLowerCase();
          // Push address or ENS name
          this.addresses.push({ addr, tweet: tweet.text });
        }
      }
    }
  }
  /**
   * Convert ENS names to addresses via the ENS subgraph. Candidates that
   * are neither valid addresses nor resolvable ENS names are dropped
   * (with an error log entry).
   */
  async convertENS(): Promise<void> {
    const validAddresses: AddressInfo[] = [];
    const ensCheckAddresses: AddressInfo[] = [];
    // Partition candidates: already-valid addresses vs. ENS names.
    // (for..of instead of a side-effecting .map; arrays typed explicitly —
    // the untyped `[]` was a strict-mode error)
    for (const address of this.addresses) {
      if (isAddress(address.addr.toLowerCase())) {
        validAddresses.push(address);
        this.logger.info(`address-like ${address.addr} valid`);
      } else if (address.addr.includes(".eth")) {
        ensCheckAddresses.push(address);
      } else {
        this.logger.error(`address-like ${address.addr} invalid`);
      }
    }
    // Resolve ENS names in batches of 25, sequentially (keeps request
    // volume to the subgraph endpoint bounded)
    for (const chunkPart of chunk(ensCheckAddresses, 25)) {
      const graphResults = await ensGraphFetch(
        chunkPart.map((address) => address.addr)
      );
      for (const addrPart of chunkPart) {
        const foundEns = graphResults.find(
          (result) => result.ens === addrPart.addr
        );
        if (foundEns) {
          validAddresses.push({
            addr: foundEns.addr,
            ens: foundEns.ens,
            tweet: addrPart.tweet,
          });
          this.logger.info(
            `Found address ${foundEns.ens} from ${addrPart.addr}`
          );
        } else {
          this.logger.error(
            `Could not resolve ${addrPart.addr} -- ${addrPart.tweet.replace(
              "\n",
              " "
            )}`
          );
        }
      }
    }
    this.addresses = validAddresses;
  }
  /**
   * Scrape tweets, find addresses, output batch copyable disperse files
   * @param {function?} scrapedStrings optional callback invoked with the
   *   raw candidates before ENS resolution
   * @returns the resolved address list
   */
  async scrape(scrapedStrings: (addrs: AddressInfo[]) => void = () => {}) {
    // Collect all tweets from thread
    await this.collectTweets();
    this.logger.info(`Collected ${this.tweets.length} total tweets`);
    // Clean tweets, finding addresses and ENS names (synchronous — the
    // previous `await` on it was a no-op)
    this.cleanTweetsForAddresses();
    this.logger.info(
      `Collected ${this.addresses.length} addresses from tweets`
    );
    scrapedStrings(this.addresses);
    await this.convertENS();
    this.logger.info("Converted ENS names to addresses");
    return this.addresses;
  }
}