/
data_mining.cpp
160 lines (139 loc) · 7.41 KB
/
data_mining.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
//
// Created by giacomo on 29/01/2020.
//
#include <vector>
#include <csv.h>
#include <ostream>
#include <iostream>
#include <dm/dataset/ClashRoyaleDatasetMatch.h>
#include <fstream>
#include <algorithm>
#include <string>
#include <sstream>
#include <set>
#include <cmath>
#include <dm/DataMiningMetric.h>
#include <dm/RulesFromFrequentItemset.h>
#ifdef _WIN64
#ifndef F_OK
#define F_OK (00)
#endif
#include <iterator> //inserter
#else
extern "C" {
#include <unistd.h>
}
#endif
int main(void) {
// Loading the Clash Royale dataset. Even though we have more than 200 dimensions, we are only interested in a few of them
// This file doesn't fit as it is in GitHub. So, pick the cr_data_1535999786739.zip file from the dataset_clashroyale
// submodule, and unzip it in 'data/cr_data_1535999786739.csv'
std::string path = "data/cr_data_1535999786739.csv";
// File where we can store the previously computed frequent patterns by the FPGrowth algorithm
std::string patterns_file = "data/mined_patterns.txt";
std::set<Pattern<std::string>> patterns;
// Checking if the file exists: if the file doesn't exist, then I need to recompute the frequent itemsets from the database
if (!(access(patterns_file.c_str(), F_OK) != -1)) {
// Setting ^ as an escape characeter, and using all the possible spaces to be trimmed.
io::CSVReader<31, io::trim_chars<' ', '\t'>, io::no_quote_escape<'^'>> clash_royale{path};
// Defining which are the relevant columns to be read, and ignoring the others
clash_royale.read_header(io::ignore_extra_column, "type", "winner", "utctime", "name_P1", "tag_P1", "clan_P1", "startTrophies_P1", "crownsEarned_P1", "human_deck_P1", "strenght_P1_0", "strenght_P1_1", "strenght_P1_2", "strenght_P1_3", "strenght_P1_4", "strenght_P1_5", "strenght_P1_6", "strenght_P1_7", "name_P2", "tag_P2", "clan_P2", "startTrophies_P2", "crownsEarned_P2", "human_deck_P2", "strenght_P2_0", "strenght_P2_1", "strenght_P2_2", "strenght_P2_3", "strenght_P2_4", "strenght_P2_5", "strenght_P2_6", "strenght_P2_7");
// Vector storing all the transactions read from the CSV file
std::vector<Transaction<std::string>> transactions;
{
// Shorthand for a set of transaction sorted in lexicographical order
using DeckSet = std::set<Transaction<std::string>, VTLexic>;
// Separating the decks that wins from the decks that lose some tournaments
DeckSet winning_deck, losing_deck;
// Reading all the elements in the CSV file
std::cout << "Reading and projecting all the battles from the dataset..." << std::endl;
bool continueReading = true;
do {
// Read a line from the file if possible
const auto x = ClashRoyaleDatasetMatch::read(clash_royale);
if (x) { // If it was possible to read the information from the file
winning_deck.emplace(x->getWinner().asFPTransaction());
losing_deck.emplace(x->getLoser().asFPTransaction());
// Load both the winning and the losing decks
} else {
continueReading = false; // Otherwise, fail and stop reading
}
} while (continueReading);
/*
* Now, we want to obtain the set of decks that are always winning: that information will be then fed to the FPGrowth algorithm
*/
std::cout << "Initial number of winning moves: " << winning_deck.size() << std::endl;
std::cout << "Number of losing decks: " << losing_deck.size() << std::endl;
{
DeckSet c;
std::set_difference(std::make_move_iterator(winning_deck.begin()),
std::make_move_iterator(winning_deck.end()),
losing_deck.begin(), losing_deck.end(),
std::inserter(c, c.begin()));
winning_deck.swap(c);
}
std::cout << "Resulting number of always winning decks: " << winning_deck.size() << std::endl;
std::copy(winning_deck.begin(), winning_deck.end(), std::back_inserter(transactions));
}
std::cout << std::endl << std::endl;
size_t minimum_support_threshold = 10000; // The pattern is frequent, and therefore supported by at least 10,000 rounds
std::cout << "Feeding the FPGrowth Algorithm..." << std::endl;
{
std::cout << "a) creating the tree" << std::endl;
const FPTree<std::string> fptree{ transactions, minimum_support_threshold };
std::cout << "b) computing the most frequent patterns" << std::endl;
patterns = fptree_growth( fptree );
}
{
std::cout << "c) Saving all the patterns in the file as a backup" << std::endl;
std::ofstream mined_patterns{patterns_file};
for (const Pattern<std::string>& pattern : patterns) {
mined_patterns << pattern.second << " : ";
for (const std::string& item : pattern.first) {
mined_patterns << item << " ";
}
mined_patterns << std::endl;
}
}
} else {
// ... If the file exists, then:
std::cout << "Patterns have been already mined in a previous computation: loading those from the file!" << std::endl;
// Opening the existing file containing the patterns
std::ifstream mined_patterns{patterns_file};
// The variable containing each line from the file
std::string line;
// Reading all the lines
while (std::getline(mined_patterns, line)) {
std::istringstream iss(line);
uint64_t frequency;
std::string item;
char delimiter;
// The first element in the row is the frequency value and some delimiter.
iss >> frequency >> delimiter;
// Assertion on the delimiter separating the frequency from the items
assert(delimiter = ':');
// Storing all the elements in the pattern
std::set<std::string> pattern;
while (iss >> item) pattern.emplace(item);
// Storing the resulting pair in the set
Pattern<std::string> mined{pattern, frequency};
patterns.emplace(mined);
}
}
/*
* Initializing the scoring functions using the mined patterns from the FPGrowth algorithm
*/
DataMiningMetrics counter{patterns};
size_t count = 0;
// For each frequent itemset, generate a set of possible relevant rules
for (const Pattern<std::string>& pattern : patterns) {
if (pattern.first.size() <= 1) continue; // I cannot extract any significant rule from a singleton!
RulesFromFrequentItemset rffi{counter}; // Generate the top rule
for (auto& result: rffi.generate_hypotheses(pattern)) { // Generate the hypotheses containing a lift greater than one
std::cout << result << std::endl; // Printing each successful outcome
count++;
}
}
std::cout << std::endl << "~ We mined " << patterns.size() << " patterns " << std::endl;
std::cout << "~ from which we generated " << count << " rules!" << std::endl;
}