Skip to content

Commit

Permalink
clemson credit-card data set
Browse files Browse the repository at this point in the history
  • Loading branch information
mfitzi committed Jan 19, 2022
1 parent c456496 commit e43255e
Show file tree
Hide file tree
Showing 10 changed files with 901,129 additions and 0 deletions.
24 changes: 24 additions & 0 deletions datasets/twitchclemson/README
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
Spending-Transparency Data from Clemson University (SC, USA)

- Extracted all records with vendor/payee "American Express" expenses from Clemson reporting for 2020 published at
https://transpend.app.clemson.edu/transparency-spending/vendor/search/

(scraped by hand, month by month and page by page, as the site offers no better export option)

-> file clemson2020.csv (tab separated)

clemson.xlsx
- imported clemson2020.csv
- Deleted all negative amounts (credits), as they cannot be matched to their respective debits
- the graphs show the number of transactions per amount up to USD 500,
once rounded to 1 USD and once rounded to 5 USD.


Exported all transactions with amounts from USD 0 to USD 100, rounding the amounts to whole USD
-> file clemsonDistribution.csv (705 records)

Built "duration" file

Extracted first 1000 amounts in "duration" file into sampledFromDuration.csv (verifyDistribution.awk)

Plotted result in clemson.xlsx (third graph)
88 changes: 88 additions & 0 deletions datasets/twitchclemson/build.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
#!/usr/bin/env node

/* Generate a T2P2 dataset from the "Twitch Gamers Social Network" dataset, with
* transaction amounts drawn from the Clemson price table (clemsonDistribution.csv).
*
* Usage:
* ./build.mjs
* PROACTIVE=0.8 ./build.mjs
*
* Available ENV vars:
*
* DURATION Duration of the simulation, in seconds [default: 3600]
* TPS Target TPS for the simulation (i.e. number of events per step) [default: 250]
* PROACTIVE Pro-active snapshot parameter (i.e. 's') [default: 1]
*/

import fs from "fs";
import path from "path";
import { Writable } from "stream";
import
{ csvWriter
, forEachLine
, pickOne
, someAmount
, someSize
, __dirname
} from "../helpers.mjs";
import
{ buildPriceTable
} from "./myhelpers.mjs";

// Run configuration. DURATION, TPS and PROACTIVE may be overridden through ENV vars;
// an unset or empty variable falls back to the default documented in the header.
const INPUT_FILENAME = path.join(__dirname, "twitchclemson", "source.csv");
const DURATION = Number.parseInt(process.env.DURATION || "3600", 10);
const TARGET_TPS = Number.parseInt(process.env.TPS || "250", 10);
const PROACTIVE_SNAPSHOT = Number(process.env.PROACTIVE || "1");

// The output file is written next to the input file; its name records the run parameters.
const OUTPUT_FILENAME = path.join(
  path.dirname(INPUT_FILENAME),
  `${[
    `duration:${DURATION}`,
    `tps:${TARGET_TPS}`,
    `proActive:${PROACTIVE_SNAPSHOT}`,
  ].join("-")}.csv`
);

const PRICE_TABLE_FILENAME = path.join(__dirname, "twitchclemson", "clemsonDistribution.csv");

// Echo the effective parameters, followed by an empty separator line.
for (const [label, value] of [
  ["DURATION", DURATION],
  ["TPS", TARGET_TPS],
  ["PROACTIVE", PROACTIVE_SNAPSHOT],
]) {
  console.log(`${label}: ${value}`);
}
console.log("");

// Edge list (objects of the form { a, b } holding node ids) and the lookup from
// input label to node id. Both start empty and are filled while the input is read.
const graph = [];
const nodes = new Map();

// Price table
// 1st col: price, 3rd col: number of records with a price at most the given price
const pt = buildPriceTable(PRICE_TABLE_FILENAME);

// Node ids are handed out sequentially, starting at 1, in order of first appearance.
let lastNodeId = 0;

/**
 * Return the numeric id registered for an input label, registering the label
 * under the next free id when it has not been seen before.
 */
function nodeIdFor(label) {
  if (!nodes.has(label)) {
    lastNodeId += 1;
    nodes.set(label, lastNodeId);
  }
  return nodes.get(label);
}

/**
 * Write the header line plus TARGET_TPS "new-tx" events for each of the
 * DURATION slots to the output file. Runs once the input has been fully read.
 */
function generateEvents() {
  console.log(`Processed graph with ${graph.length} edges & ${nodes.size} nodes!`);
  const writer = csvWriter(OUTPUT_FILENAME);
  writer.write("slot,clientId,event,size,amount,recipients");
  process.stdout.write("Generating events");

  for (let slot = 0; slot < DURATION; slot += 1) {
    // Progress indicator: one dot every ten slots.
    if (slot % 10 === 0) {
      process.stdout.write(".");
    }
    for (let n = 0; n < TARGET_TPS; n += 1) {
      const { a, b } = pickOne(graph);
      // Edges carry no direction here: pick the sender side at random.
      const [source, target] = Math.random() > 0.5 ? [a, b] : [b, a];
      const fields = [slot, source, "new-tx", someSize(), pt.getPrice(), target];
      writer.write(fields.join(","));
    }
  }

  process.stdout.write(`\n\nDone → ${OUTPUT_FILENAME}\n`);
  writer.end();
}

// Read the edge list: the first two columns of every line are the edge's endpoints.
fs.createReadStream(INPUT_FILENAME)
  .pipe(forEachLine((cols) => {
    const a = nodeIdFor(cols[0]);
    const b = nodeIdFor(cols[1]);
    graph.push({ a, b });
  }))
  .on('finish', generateEvents);

/**
 * Build a comparator that orders items ascending by the numeric value stored
 * under key (or index) `k`, suitable for `Array.prototype.sort`.
 *
 * @param {string|number} k - Property name or index to compare on.
 * @returns {function(*, *): number} Comparator returning `left[k] - right[k]`.
 */
function asc(k) {
  return function compareAscending(left, right) {
    const difference = left[k] - right[k];
    return difference;
  };
}
Binary file added datasets/twitchclemson/clemson.xlsx
Binary file not shown.
746 changes: 746 additions & 0 deletions datasets/twitchclemson/clemson2020.csv

Large diffs are not rendered by default.

102 changes: 102 additions & 0 deletions datasets/twitchclemson/clemsonDistribution.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
# rounded amount, occurrences, accumulated occurrences up to this amount
0,37,37
1,79,116
2,49,165
3,37,202
4,23,225
5,21,246
6,21,267
7,12,279
8,10,289
9,12,301
10,15,316
11,20,336
12,3,339
13,17,356
14,8,364
15,7,371
16,12,383
17,7,390
18,11,401
19,9,410
20,9,419
21,7,426
22,5,431
23,4,435
24,3,438
25,6,444
26,5,449
27,4,453
28,2,455
29,1,456
30,3,459
31,0,459
32,9,468
33,3,471
34,7,478
35,0,478
36,2,480
37,2,482
38,3,485
39,6,491
40,3,494
41,2,496
42,1,497
43,6,503
44,2,505
45,2,507
46,3,510
47,2,512
48,0,512
49,2,514
50,5,519
51,0,519
52,0,519
53,4,523
54,2,525
55,1,526
56,2,528
57,0,528
58,1,529
59,1,530
60,4,534
61,2,536
62,0,536
63,1,537
64,1,538
65,6,544
66,2,546
67,3,549
68,1,550
69,2,552
70,1,553
71,0,553
72,2,555
73,2,557
74,0,557
75,3,560
76,1,561
77,3,564
78,2,566
79,3,569
80,1,570
81,1,571
82,0,571
83,0,571
84,0,571
85,1,572
86,2,574
87,0,574
88,2,576
89,1,577
90,1,578
91,1,579
92,1,580
93,1,581
94,3,584
95,0,584
96,1,585
97,1,586
98,2,588
99,2,590
100,0,590

0 comments on commit e43255e

Please sign in to comment.