-
Notifications
You must be signed in to change notification settings - Fork 5
/
config.yml
184 lines (158 loc) · 5.14 KB
/
config.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
# Controls which pipeline steps are executed when starting main.py.
# Each flag switches one step on (true) or off (false).
process:
  scraping: false
  dataPreparation: true
  modelTrainingLIWC: false
  derivePersonalities: false
  modelTrainingGloVe: false
scraping:
  # number of seconds to stream tweet data
  timer: 14400  # 180
  # maximum follower count for eligible users
  # 0 = no limit
  user_max_followers: 10000
  # eligible users need at least this many followers
  user_min_followers: 10
  # minimum tweet count for eligible users (incl. retweets)
  # 0 = no limit
  users_min_tweet_no: 400
  # the number of users to select followers from
  # the API is limited to 15 calls per 15 minutes
  sampling_follower: 45  # 15
  # how many eligible users from the location scraping
  # should be (randomly) included
  sampling_location_users: 200  # 100
  # total sample size of users we try to achieve
  # must be greater than or equal to sampling_location_users
  total_sample_size: 1100
  # determines whether files are written during the process or read
  # reading helps to recreate steps with the same data
  scrapingByLoc:
    writeFile: false
    readFile: true
  followerSelect:
    writeFile: false
    readFile: true
  userSelect:
    writeFile: false
    readFile: true
  # determines if the location of followers should be validated or not
  # NOTE(review): assumed to sit directly under `scraping`
  # (not inside `userSelect`) - confirm against the code reading it
  validateLocation: true
preparationProcess:
  printStatistics: true
  # if this is set to true, the country-specific user file
  # (e.g. 03verifiedGermanytweets.csv) contains only one column with user IDs
  # the program will then retrieve the user objects and corresponding tweets
  # read files should be set to false for this
  hydrateUserID: true
  # list of countries (subset of countries defined under twitter)
  # for these countries we will get profiles from IBM
  # e.g. USA (or NONE, if nothing should be retrieved)
  countriesIBM:
    - 'NONE'
    # - 'USA'
  # determines whether files are written during the process or read
  # reading helps to recreate steps with the same data
  condenseTweets:
    writeFile: true
    readFile: false
  getIBMprofile:
    writeFile: true
    readFile: false
  # for LIWC we always write the files if read is false
  # since it's a separate program
  liwc:
    writeFile: true
    readFile: false
    # NOTE(review): `path` and `fileName` are assumed to belong to `liwc`
    # - confirm against the code reading them
    # relative path for LIWC results
    path: 'data/liwcInput/'
    # will be prefixed with country name
    # e.g.: USAliwcResult.csv
    fileName: 'liwcResult.csv'
modelTraining:
  # defines which target labels should be used in the model training process
  # could be extended, e.g. to use facets as well
  # labels need to exist in Profile
  labelsGlobalList:
    - big5_openness
    - big5_conscientiousness
    - big5_extraversion
    - big5_agreeableness
    - big5_neuroticism
  # print details about training scores etc.
  printDetailResults: true
  # binary files for LIWC
  writePickleFiles: false
  readPickleFiles: false
  # ONNX model for LIWC
  writeONNXModel: false
  readONNXModel: true
  # csv files after LIWC prediction
  writeFile: false
  readFile: true
  # binary files for GloVe
  writePickleFilesG: false
  readPickleFilesG: false
  # ONNX model for GloVe
  writeONNXModelG: false
  readONNXModelG: true
  # calculated GloVe features can be exported or read
  # if read is false, features are calculated on the fly
  writeFeatureFile: false
  readFeatureFile: true
  # plain text file or database file holding the GloVe data
  # only relevant if features are not loaded from file
  # data/glove/glove.db for the database
  # data/glove/glove_vectors.txt for the plain text file
  glove_path: 'data/glove/glove.db'
  # set to true if glove_path points to a database file
  # otherwise false
  glove_database: true
twitter:
  # max tweets per user used for mining
  # (3200 is a max limit given by Twitter)
  # lower limits might be useful due to API rate limits
  # this is the number of tweets excluding retweets
  user_max_tweet_no: 250  # 200
  # if true, prints a message when Twitter's API limits are reached
  wait_on_rate_limit_notify: true
  # remove line breaks from tweet text
  # csv files look odd when they contain new line characters
  # if set to true, tweet texts will be cleaned of those
  remove_new_line: true
  # ignore retweets, since they are not written by the user
  ignore_retweets: true
  # relevant additional tweet attributes
  # (used while retrieving tweets)
  # id_str, created_at, user_id, isRetweet and text are always included
  add_attributes:
    - lang
    - is_quote_status
  # for filtering tweets based on location
  # coordinates are needed
  # we want to get tweets from multiple locations
  # (USA and Germany), hence we provide multiple countries
  # NOTE(review): `coordinates` is assumed to be nested under `twitter`
  # - confirm against the code reading it
  coordinates:
    # USA:
    #   name: 'USA'  # should match Google Places API result
    #   lang: 'en'
    #   langThreshold: 0.8
    #   otherLangThreshold: 0.05
    #   northeast:  # state Maine, USA
    #     lat: 47.459833
    #     lng: -66.885417
    #   southwest:  # state California, USA
    #     lat: 32.528832
    #     lng: -124.482003
    Germany:
      name: 'Germany'
      lang: 'de'
      langThreshold: 0.8
      otherLangThreshold: 0.05
      northeast:  # Mecklenburg-Vorpommern according to Google API
        lat: 54.6847005
        lng: 14.4122569
      southwest:  # Baden-Württemberg according to Google API
        lat: 47.5323664
        lng: 7.511756799999999