In [19]:
import csv
import pymongo
from textblob import Word
from pprint import pprint
from collections import Counter
import operator

<h2>讀取等級字典</h2>

In [2]:
# Load the preprocessed vocabulary list as a word -> level dictionary.
# newline='' is what the csv module expects for file objects; without it,
# quoted fields containing line breaks are not parsed correctly.
with open('vocabulary_list_new.csv', mode='r', newline='') as infile:
    reader = csv.reader(infile)
    # Only the first two columns are used. Blank or one-column rows are
    # skipped instead of raising IndexError.
    mydict = {rows[0]: rows[1] for rows in reader if len(rows) >= 2}

In [10]:
# Tally how many dictionary entries carry each level value, then show the tally.
counts = Counter()
counts.update(mydict.values())
print(counts)

Counter({'6': 1090, '5': 1084, '3': 1069, '4': 1067, '2': 1007, '1': 749})


In [47]:
# Connect to the local MongoDB server and bind the three collections of the
# 'MyTest' database that the cells below read from and write to.
connection = pymongo.MongoClient('127.0.0.1:27017')
db = connection['MyTest']
DB_news, DB_word_dict, DB_art_keywords = (
    db[name] for name in ('news', 'word_dict', 'art_keywords')
)

In [52]:
# Only the article's keywords are considered
def define_level(article, mydict):
    """Return the level that most of the article's known keywords belong to.

    Args:
        article: mapping whose ``'keywords'`` entry is a mapping keyed by word.
        mydict: mapping from word to level; levels must be one of '1'..'6',
            otherwise the tally lookup raises KeyError.

    Returns:
        The level string with the highest keyword count. Ties (including the
        case where no keyword is known) go to the lowest level.
    """
    tally = dict.fromkeys(('1', '2', '3', '4', '5', '6'), 0)
    known_levels = (mydict[kw] for kw in article['keywords'].keys() if kw in mydict)
    for level in known_levels:
        tally[level] += 1
    # max() keeps the first maximal key, matching a stable descending sort.
    return max(tally, key=tally.get)

In [53]:
# Compute a level tag for each selected article and store it on the news
# document that has the same _id.
# NOTE(review): the "no level yet" filter runs against art_keywords, but the
# level is written to news. Unless something else copies the field back, the
# filter never narrows on a re-run -- confirm which collection should carry it.
articles = DB_art_keywords.find({'level': {'$exists': False}})
for count, article in enumerate(articles, start=1):
    leveltag = define_level(article, mydict)
    # update_one is enough here: the document that find_one_and_update
    # returned was never used.
    DB_news.update_one({'_id': article['_id']}, {'$set': {'level': leveltag}})
    print(count, ' is updated level tag.')

{'1': 723, '2': 331, '3': 143, '4': 75, '5': 25, '6': 25}


<h2>文章分級方法決定</h2>

https://www.raz-kids.com/main/ViewPage/name/text-leveling-system/

In [None]:
# Only the article's keywords are considered
def method5(article, mydict):
    """Pick the level with the most matching keywords.

    Each key of ``article['keywords']`` that is present in ``mydict`` adds one
    vote to the level ``mydict`` assigns to it. Levels are the strings
    '1'..'6'; any other level value raises KeyError.

    Returns:
        The level string with the most votes; on a tie the lowest level wins.
    """
    votes = {level: 0 for level in ('1', '2', '3', '4', '5', '6')}
    for keyword in article['keywords'].keys():
        if keyword not in mydict:
            continue
        votes[mydict[keyword]] += 1
    # Stable descending sort: equal counts keep their '1'..'6' order.
    ranked = sorted(votes, key=votes.get, reverse=True)
    return ranked[0]

In [50]:
# Count how many art_keywords documents method5 assigns to each level.
articles = DB_art_keywords.find()
l_dict = dict.fromkeys('123456', 0)
for article in articles:
    l_dict[method5(article, mydict)] += 1
print(l_dict)

{'1': 723, '2': 331, '3': 143, '4': 75, '5': 25, '6': 25}


In [21]:
# Weight each matched word by its IDF
def method4(article, DB_word_dict):
    """Return the level whose matched words have the largest summed IDF.

    Every word in ``article['wordset']`` that appears in the module-level
    ``mydict`` is looked up in ``DB_word_dict`` by ``_id``, and its ``'IDF'``
    value is added to the score of the word's level. Words that have no
    ``DB_word_dict`` entry, or whose entry has no ``'IDF'`` field, are skipped;
    previously a missing entry made ``find_one`` return ``None`` and the
    subscript raised TypeError.

    Args:
        article: mapping whose ``'wordset'`` entry is a mapping keyed by word.
        DB_word_dict: object exposing ``find_one(filter)``.

    Returns:
        One of '1'..'6'. Ties (including no scored word) go to the lowest level.
    """
    level_dict = {'1': 0, '2': 0, '3': 0, '4': 0, '5': 0, '6': 0}
    for word in article['wordset'].keys():
        if word not in mydict:
            continue
        fword = DB_word_dict.find_one({'_id': word})
        if fword is None or 'IDF' not in fword:
            # No IDF available for this word: it cannot contribute a weight.
            continue
        level_dict[mydict[word]] += fword['IDF']
    # max() keeps the first maximal key, matching a stable descending sort.
    return max(level_dict, key=level_dict.get)

In [24]:
# Count the method4 level tags over the first 100 documents of the news cursor.
articles = DB_news.find()
l_dict = {str(level): 0 for level in range(1, 7)}
for article in articles[:100]:
    l_dict[method4(article, DB_word_dict)] += 1
print(l_dict)

{'1': 86, '2': 12, '3': 0, '4': 2, '5': 0, '6': 0}


In [25]:
# Weight each matched word by TF (term frequency) * IDF
def method3(article, DB_word_dict):
    """Return the level whose matched words have the largest summed TF * IDF.

    Every word in ``article['wordset']`` that appears in the module-level
    ``mydict`` contributes ``article['wordset'][word]['TF']`` multiplied by
    the ``'IDF'`` of its ``DB_word_dict`` entry (looked up by ``_id``) to the
    score of its level. Words that have no ``DB_word_dict`` entry, or whose
    entry has no ``'IDF'`` field, are skipped; previously a missing entry made
    ``find_one`` return ``None`` and the subscript raised TypeError.

    Args:
        article: mapping whose ``'wordset'`` maps each word to a mapping
            containing ``'TF'``.
        DB_word_dict: object exposing ``find_one(filter)``.

    Returns:
        One of '1'..'6'. Ties (including no scored word) go to the lowest level.
    """
    level_dict = {'1': 0, '2': 0, '3': 0, '4': 0, '5': 0, '6': 0}
    wordset = article['wordset']
    for word in wordset.keys():
        if word not in mydict:
            continue
        fword = DB_word_dict.find_one({'_id': word})
        if fword is None or 'IDF' not in fword:
            # No IDF available for this word: it cannot contribute a weight.
            continue
        level_dict[mydict[word]] += wordset[word]['TF'] * fword['IDF']
    # max() keeps the first maximal key, matching a stable descending sort.
    return max(level_dict, key=level_dict.get)

In [26]:
# Count the method3 level tags over the first 100 documents of the news cursor.
articles = DB_news.find()
l_dict = dict.fromkeys(('1', '2', '3', '4', '5', '6'), 0)
for article in articles[:100]:
    l_dict[method3(article, DB_word_dict)] += 1
print(l_dict)

{'1': 87, '2': 9, '3': 3, '4': 1, '5': 0, '6': 0}


In [27]:
# Weight each matched word by its TF (term frequency)
def method2(article):
    """Return the level whose matched words have the largest summed TF.

    Every word in ``article['wordset']`` that appears in the module-level
    ``mydict`` adds its ``'TF'`` value to the score of the level ``mydict``
    assigns to it.

    Returns:
        One of '1'..'6'. Ties (including no matched word) go to the lowest level.
    """
    wordset = article['wordset']
    weight_by_level = dict.fromkeys('123456', 0)
    for word in wordset.keys():
        if word in mydict:
            weight_by_level[mydict[word]] += wordset[word]['TF']
    # First maximal (level, weight) pair wins, as with a stable descending sort.
    best_level, _ = max(weight_by_level.items(), key=operator.itemgetter(1))
    return best_level

In [28]:
# Count the method2 level tags over the first 100 documents of the news cursor.
articles = DB_news.find()
l_dict = {level: 0 for level in '123456'}
for article in articles[:100]:
    l_dict[method2(article)] += 1
print(l_dict)

{'1': 98, '2': 2, '3': 0, '4': 0, '5': 0, '6': 0}


In [29]:
# Only the share of each level within the wordset is considered
def method1(article):
    """Return the level that most of the article's known words belong to.

    Every word in ``article['wordset']`` that appears in the module-level
    ``mydict`` counts once towards the level ``mydict`` assigns to it.

    Returns:
        One of '1'..'6'. Ties (including no matched word) go to the lowest level.
    """
    hits = {str(n): 0 for n in range(1, 7)}
    for level in (mydict[w] for w in article['wordset'].keys() if w in mydict):
        hits[level] += 1
    # Stable descending sort: equal counts keep their '1'..'6' order.
    ranking = sorted(hits, key=hits.__getitem__, reverse=True)
    return ranking[0]

In [30]:
# Count the method1 level tags over the first 100 documents of the news cursor.
articles = DB_news.find()
l_dict = dict.fromkeys(map(str, range(1, 7)), 0)
for article in articles[:100]:
    l_dict[method1(article)] += 1
print(l_dict)

{'1': 98, '2': 2, '3': 0, '4': 0, '5': 0, '6': 0}


<h2>建立等級字典</h2>

In [None]:
# Read the raw vocabulary list into a dictionary (first column -> second column).
# newline='' is what the csv module expects for file objects; rows with fewer
# than two columns (e.g. blank lines) are skipped instead of raising IndexError.
with open('vocabulary_list.csv', mode='r', newline='') as infile:
    reader = csv.reader(infile)
    mydict = {rows[0]: rows[1] for rows in reader if len(rows) >= 2}

# Load the comma-separated stop words. Entries are stripped and lower-cased so
# that surrounding whitespace or a trailing newline cannot defeat the lookup
# below; empty entries are dropped.
with open('stop-word-list.txt', 'r', encoding='utf8') as mysw:
    swlist = [w.strip().lower() for w in mysw.read().split(',') if w.strip()]
# Set for constant-time membership tests inside the loop.
stop_words = set(swlist)

# Preprocess the dictionary: drop stop words, lower-case, lemmatize as a verb.
# NOTE(review): different entries can collapse to the same lemma; the entry
# read last overwrites the level of the earlier one -- confirm this is intended.
newdict = {}
for key, value in mydict.items():
    if key.lower() not in stop_words:
        finalWord = Word(key.lower()).lemmatize("v")
        newdict[finalWord] = value

# Number of words per level. Printed explicitly because the tally is not the
# last expression of the cell and would otherwise never be displayed.
counts = Counter(newdict.values())
print(counts)

# Write the processed dictionary out as CSV, one (word, level) row per entry.
with open('vocabulary_list_new.csv', mode='w', newline='') as outfile:
    writer = csv.writer(outfile)
    writer.writerows(newdict.items())