-
Notifications
You must be signed in to change notification settings - Fork 0
/
slide.tex
331 lines (287 loc) · 9.49 KB
/
slide.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
\documentclass[presentation]{beamer}
\usepackage{hyperref}
\usepackage{color}
\usepackage{listings}
\usepackage{multicol}
\usepackage{tikz}
\usetikzlibrary{graphs}
\usetikzlibrary{quotes}
\usetheme{Madrid}
\author{Hebi Li, James Nhan}
\title{Automated Project Rater}
\begin{document}
\maketitle
\begin{frame}{Outline}
\tableofcontents
\end{frame}
\section{Problem Definition}
\begin{frame}{Problem Definition}
Given a project, predict its quality using simple, easy-to-obtain
file-level features.
\end{frame}
\begin{frame}{Motivation}
\begin{itemize}
\item Open-source projects vary widely in quality
\item Hard to tell the quality without a proper rating accumulated by
users (like GitHub stars)
\item Useful for companies searching for tools
\item Useful for learners and contributors to select good projects
\end{itemize}
\end{frame}
\begin{frame}{Challenges}
\begin{itemize}
\item No data sets available
\item Complex features are hard to get
\begin{itemize}
\item Control flow features such as program dependence, call graphs,
def-use information, and symbol tables are expensive to get
\item Data stored on a server requires using an API (e.g.\ the GitHub API)
\item Such APIs typically have access limits (e.g.\ the GitHub API allows only
\textbf{30 requests per minute} for authenticated users)
\end{itemize}
\item Simple features might not be able to express the rating
\end{itemize}
\end{frame}
\section{Related Work \& Contribution}
\begin{frame}{Related Work}
Related Work
\begin{itemize}
\item No previous work has been done to rate project quality
\item No such data set is publicly available
\item Some research in MSR is related, e.g.\
OpenHub\footnote{OpenHub: A Scalable Architecture for the Analysis of
Software Quality Attributes, MSR 2014, Farah, Gabriel et al.} and
documentation mining\footnote{On Mining Crowd-based Speech
Documentation, MSR '16, Moslehi, Parisa et al.}. See the related
work section in our paper.
\end{itemize}
Our contribution
\begin{itemize}
\item Publicly available data set for 10000 projects with
15 features each with GitHub star number as ground truth.
\item Analysis of the data set and features using linear regression, SVM
with linear and non-linear kernels, and decision tree approaches
\end{itemize}
\end{frame}
\section{Workflow}
\begin{frame}{Workflow}
% Github Search API ->
% -> downloader -> feature extractor -> analyzer -> predict model
\centering
\begin{tikzpicture}
\graph[grow down sep=1cm, nodes={draw, rounded corners},
edge quotes={font=\footnotesize}] {
githubapi/"GitHub API" -> ["Top 1000 repositories meta data for each language"]
downloader/"Downloader" -> ["Project Files"]
extractor/"Feature Extractor" -> ["Features"]
analyzer/"Analyzer" -> ["Predict Model"]
predictor/"Predictor"
};
% \graph[grow right sep=2cm] {
% github api -> b -> c;
% };
% \node (githubapi) at (0,0) {Github API};
% \node (downloader) at (4,0) {Downloader};
% \node (extractor) at (4,0) {Feature Extractor};
% \node (analyzer) at (6,0) {Analyzer};
% \draw[->] (githubapi) -- (downloader) -- (extractor) -- (analyzer);
\end{tikzpicture}
\end{frame}
\section{Data Collection}
\begin{frame}{Ground Truth}
We use GitHub star number as ground truth.
\end{frame}
\begin{frame}{Feature}
\begin{multicols}{2}
A list of features
\begin{itemize}
\item has download?
\item has issue?
\item has wiki?
\item has page?
\item open issue
\item size
\item test file
\item doc file
\item src file
\item loc
\item comment
\item file depth
\item file count
\item dir branching factor
\item dir count
\item fork count
\end{itemize}
\end{multicols}
\end{frame}
\begin{frame}{Data Collection}
\begin{enumerate}
\item Use GitHub search APIs to query top 1000 projects by star number
\begin{itemize}
\item Languages used: C, Java, JavaScript, shell,
Python, Ruby, PHP
\item The GitHub search API returns at most 1000 results per search
\item We are interested in the top rated projects.
\item The top 1000 projects range roughly from 300
stars up to 10,000--30,000 stars
\item Different languages can
\begin{itemize}
\item Give us overall result regardless of language
\item Show which language works best
\item Compare results across languages
\end{itemize}
\end{itemize}
\item Download projects
\item Extract features
\end{enumerate}
\end{frame}
\begin{frame}{Data Collection Continued}
\begin{enumerate}
\item Use GitHub search APIs to query top 1000 projects by star number
\item Download projects
\begin{itemize}
\item We analyze the features using scripts offline, to avoid the
query rate limit posed by GitHub
\end{itemize}
\item Extract features
\begin{itemize}
\item Features are stored in an SQLite database for querying and updating
\item The final features and the response (ground-truth star number) are
written to a CSV file for analysis
\end{itemize}
\end{enumerate}
\end{frame}
\section{Model Training}
\begin{frame}{Analysis Algorithm}
\begin{itemize}
% \item linear model
\item Support Vector Machine with kernels:
\begin{itemize}
\item Linear
\item Polynomial
\item Radial Basis
\item Sigmoid
\end{itemize}
% \item Decision Tree
\end{itemize}
\end{frame}
\begin{frame}{Analysis Algorithm Continued}
We did not tune the parameters because we lacked precise knowledge
of how to tune them. Model selection turns out to be an issue for
a data set that is new and unfamiliar.
\end{frame}
\section{Result \& Conclusion}
\begin{frame}{Results}
\begin{figure}[ht]
\centering
\includegraphics[height=0.8\textheight]{star-hist.png}
% \caption{Data set visualization}
\end{figure}
\end{frame}
\begin{frame}{Model Prediction Visualization}
\begin{figure}[ht]
\scriptsize
\centering
\includegraphics[width=0.3\textwidth]{images/wholepng/whole-linear.png}
\includegraphics[width=0.3\textwidth]{images/wholepng/whole-polynomial.png}\\
\includegraphics[width=0.3\textwidth]{images/wholepng/whole-radial.png}
\includegraphics[width=0.3\textwidth]{images/wholepng/whole-sigmoid.png}
\caption{Linear, Polynomial, Radial Basis, Sigmoid kernels}
\end{figure}
\end{frame}
\begin{frame}{Model Prediction Visualization}
\begin{figure}[ht]
\scriptsize
\centering
\includegraphics[width=0.3\textwidth]{images/langpng/c-linear.png}
\includegraphics[width=0.3\textwidth]{images/langpng/c-radial.png}\\
\includegraphics[width=0.3\textwidth]{images/langpng/javascript-linear.png}
\includegraphics[width=0.3\textwidth]{images/langpng/javascript-radial.png}\\
\caption{Linear (left) and Radial Basis (right) kernels, for C (top) and JavaScript (bottom)}
\end{figure}
\end{frame}
\begin{frame}{Model Prediction Visualization}
\begin{figure}[ht]
\scriptsize
\centering
\includegraphics[width=0.3\textwidth]{images/langpng/python-linear.png}
\includegraphics[width=0.3\textwidth]{images/langpng/python-radial.png}\\
\includegraphics[width=0.3\textwidth]{images/langpng/ruby-linear.png}
\includegraphics[width=0.3\textwidth]{images/langpng/ruby-radial.png}\\
\caption{Linear (left) and Radial Basis (right) kernels, for Python (top) and Ruby (bottom)}
\end{figure}
\end{frame}
\begin{frame}{Results Continued}
Support Vector Machine with Variety of Models
\begin{table}
\centering
\begin{tabular}{r|r}
Category & Accuracy\\
\hline
2 & 0.83\\
3 & 0.71\\
4 & 0.61\\
5 & 0.54\\
6 & 0.47\\
7 & 0.42\\
8 & 0.38\\
9 & 0.35\\
10 & 0.32\\
\end{tabular}
\caption{Discretization vs.\ Accuracy}
\end{table}
\end{frame}
\begin{frame}{Results Continued}
\begin{table}
\centering
\scriptsize
\begin{tabular}{r|r|r|r|r|r|r|r}
Category & c.csv & php.csv & java.csv & javascript.csv & shell.csv & ruby.csv & python.csv\\
\hline
2 & 0.92 & 0.93 & 0.84 & 0.56 & 0.95 & 0.88 & 0.83\\
3 & 0.85 & 0.87 & 0.7 & 0.39 & 0.92 & 0.78 & 0.74\\
4 & 0.79 & 0.82 & 0.61 & 0.31 & 0.87 & 0.69 & 0.63\\
5 & 0.76 & 0.76 & 0.52 & 0.22 & 0.82 & 0.63 & 0.56\\
6 & 0.72 & 0.71 & 0.47 & 0.17 & 0.8 & 0.58 & 0.51\\
7 & 0.67 & 0.67 & 0.42 & 0.15 & 0.77 & 0.53 & 0.46\\
8 & 0.63 & 0.63 & 0.36 & 0.15 & 0.74 & 0.49 & 0.42\\
9 & 0.6 & 0.59 & 0.33 & 0.11 & 0.71 & 0.46 & 0.39\\
10 & 0.57 & 0.57 & 0.3 & 0.1 & 0.69 & 0.44 & 0.35\\
\end{tabular}
\caption{Comparison across different languages}
\end{table}
\end{frame}
\begin{frame}{Results Continued}
\begin{table}
\centering
\begin{tabular}{r|r|r|r|r}
Category & Linear & Polynomial & Radial & Sigmoid\\
\hline
2 & 0.82 & 0.82 & 0.83 & 0.8\\
3 & 0.71 & 0.69 & 0.71 & 0.67\\
4 & 0.61 & 0.58 & 0.61 & 0.58\\
5 & 0.54 & 0.49 & 0.54 & 0.5\\
6 & 0.47 & 0.43 & 0.47 & 0.45\\
7 & 0.42 & 0.36 & 0.43 & 0.4\\
8 & 0.38 & 0.33 & 0.38 & 0.36\\
9 & 0.35 & 0.29 & 0.35 & 0.33\\
10 & 0.32 & 0.26 & 0.32 & 0.29\\
\end{tabular}
\caption{Different Kernels}
\end{table}
\end{frame}
\begin{frame}{Conclusions \& Future Work}
Our research shows that,
\begin{itemize}
\item The features we use can be used to predict the quality.
\item Without parameter tuning, the different kernels do not differ
much in performance.
\end{itemize}
Future Work
\begin{itemize}
\item More advanced features
\item Resolve bias of data (e.g.\ more projects with fewer stars)
\item Integrate with other research
\end{itemize}
\end{frame}
\end{document}