-
Notifications
You must be signed in to change notification settings - Fork 45
/
wc_website.py
67 lines (50 loc) · 1.77 KB
/
wc_website.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import glob
import matplotlib
import matplotlib.pyplot as plt
import wordcloud
import numpy as np
import PIL
import io
import requests
import random
import re
import nltk
from nltk.corpus import stopwords
# Module-level setup: everything below runs when the file is executed.

# Download the NLTK stop-word corpus before it is read on the next line.
nltk.download('stopwords')
# French stop words, held in a set so membership tests are cheap.
stop_words = set(stopwords.words('french'))

# Every .Rmd file found at any depth under ./temp/course.
list_files = glob.glob("./temp/course/**/*.Rmd", recursive=True)

# Image used as the word-cloud mask. The ``with`` block closes the underlying
# file as soon as the pixel data has been copied into the array.
# NOTE(review): only ``import PIL`` is visible at the top of the file;
# ``PIL.Image`` resolves only if that submodule has already been imported by
# another package -- consider an explicit ``import PIL.Image``.
with PIL.Image.open("./build/python_black.png") as mask_image:
    book_mask = np.array(mask_image)
def read_file(filename):
    """Return the content of a UTF-8 text file as a single string.

    Each line keeps its trailing newline, and consecutive lines are joined
    with one extra space between them.

    Args:
        filename: Path of the file to read.

    Returns:
        The file content as one ``str`` (empty for an empty file).
    """
    # The ``with`` block closes the file; no explicit close() is needed.
    with open(filename, 'r', encoding='utf-8') as f:
        return " ".join(f.readlines())
def grey_color_func(word, font_size, position, orientation, random_state=None,
                    **kwargs):
    """Return a random light-grey HSL colour string.

    The lightness is an integer drawn uniformly from 60 to 100 (inclusive);
    hue and saturation are always 0.

    Args:
        word, font_size, position, orientation: Accepted for the colour
            callback signature; not used.
        random_state: Optional generator exposing ``randint`` (for example a
            ``random.Random``). When given, it is the source of randomness so
            that a seeded caller gets reproducible colours; otherwise the
            module-level ``random`` generator is used.
        **kwargs: Ignored.

    Returns:
        A string of the form ``"hsl(0, 0%, <lightness>%)"``.
    """
    # Honour the caller's generator instead of silently ignoring it.
    rng = random_state if hasattr(random_state, "randint") else random
    return "hsl(0, 0%%, %d%%)" % rng.randint(60, 100)
def make_wordcloud(corpus):
    """Build a grey-toned word cloud from ``corpus``.

    The cloud is drawn inside the module-level ``book_mask``, holds at most
    2000 words, and has a white contour of width 3. After generation the
    words are recoloured with ``grey_color_func``.

    Args:
        corpus: Text to generate the cloud from.

    Returns:
        The generated and recoloured ``wordcloud.WordCloud`` object.
    """
    cloud = wordcloud.WordCloud(
        mask=book_mask,
        max_words=2000,
        margin=10,
        contour_width=3,
        contour_color='white',
    )
    cloud.generate(corpus)
    cloud.recolor(color_func=grey_color_func, random_state=3)
    return cloud
# Matches a ``{{ ... }}`` span that is directly followed by a newline, or one
# that is directly preceded by a newline. ``.`` does not match line breaks
# here, so the ``{{ ... }}`` part of a match always sits on a single line.
# Written as a raw string so the regex escapes are not parsed as (invalid)
# Python string escapes.
_SHORTCODE_RE = re.compile(r"(\{\{).*(\}\}\n)|(\n\{\{).*(\}\})")


def keep_text_within_shortword(shortcode):
    """Remove ``{{ ... }}`` spans that touch a line break.

    A span is removed together with the adjacent newline when it is either
    immediately followed or immediately preceded by one. Spans that sit in
    the middle of a line are left untouched.

    Args:
        shortcode: Text to clean.

    Returns:
        ``shortcode`` with every matching span deleted.
    """
    return _SHORTCODE_RE.sub("", shortcode)
def clean_file(text):
    """Merge several documents into one lower-cased, markup-free string.

    Args:
        text: Iterable of strings; the items are joined with single spaces.

    Returns:
        The merged text, lower-cased, after ``keep_text_within_shortword``
        has been applied, fenced chunks opening with ```{r have been removed,
        and every remaining backtick has been deleted.
    """
    merged = " ".join(text).lower()
    without_shortcodes = keep_text_within_shortword(merged)
    # Drop fenced R chunks; (?s) lets the lazy match run across line breaks.
    without_chunks = re.sub(r'(?s)(```\{r)(.*?)(```)', "", without_shortcodes)
    # Strip whatever backticks are left (inline code, other fences).
    return without_chunks.replace('`', '')
# Build one corpus from every course file, then render it as a word cloud.
list_content = [read_file(fl) for fl in list_files]
corpus = clean_file(text=list_content)

# Split on any whitespace rather than on single spaces only: the text still
# contains newlines, and a word with a newline attached (e.g. "de\n") would
# never compare equal to an entry of ``stop_words``.
corpus = [w for w in corpus.split() if w not in stop_words]
corpus = " ".join(corpus)

fig = plt.figure()
plt.imshow(make_wordcloud(corpus), interpolation='bilinear')
plt.axis("off")
plt.tight_layout()
plt.savefig('./content/home/word.png', bbox_inches='tight', pad_inches=0, dpi=199)
# Release the figure once it has been written to disk.
plt.close(fig)