# News Source Classification

In [1]:
import pandas as pd
import numpy as np
from pprint import pprint as pp
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import jieba

In [2]:
d = pd.read_csv('/root/files/sql_result.csv')
print(d.columns)

Index(['id', 'author', 'source', 'content', 'feature', 'title', 'url'], dtype='object')


# Only need the x:content and y:source
## first check the label distribution

In [3]:
%matplotlib widget
source_cnt = d['source'].value_counts()
s_cnt = dict(source_cnt)
s_cnt = sorted(s_cnt.items(),key=lambda x:x[1],reverse=True)[:10]
sc = [i[0] for i in s_cnt]
cnt = [i[1] for i in s_cnt]
plt.bar(sc,cnt)
plt.show()

## only take the top 10 lables

In [5]:
content = d.content.tolist()
sc_labels_t = {s:i for i,s in enumerate(sc)}
total = len(d)
fd = d[d['source'].isin(set(sc))]
pp(f'delete {total - len(content)}')
pp(sc_labels_t)
fd.head()

'delete 0'
{'中国台湾网': 7,
 '中国新闻网': 3,
 '中国证券报?中证网': 2,
 '南方日报第01版': 6,
 '参考消息网': 4,
 '央广网': 8,
 '微博': 1,
 '新华社': 0,
 '新华网': 9,
 '环球网': 5}


Unnamed: 0,id,author,source,content,feature,title,url
3,89614,,新华社,这是6月18日在葡萄牙中部大佩德罗冈地区拍摄的被森林大火烧毁的汽车。新华社记者张立云摄\n,"{""type"":""国际新闻"",""site"":""环球"",""commentNum"":""0"",""j...",葡森林火灾造成至少62人死亡 政府宣布进入紧急状态（组图）,http://world.huanqiu.com/hot/2017-06/10866126....
5,89612,张怡,中国证券报?中证网,受到A股被纳入MSCI指数的利好消息刺激，A股市场从周三开始再度上演龙马行情，周四上午金...,"{""type"":""市场"",""site"":""中证网"",""commentNum"":""0"",""jo...",金融股一枝独秀 配置价值犹存,http://www.cs.com.cn/gppd/201706/t20170623_533...
7,89610,申玉彬 整理,中国证券报?中证网,沙漠雄鹰：震荡有利消化套牢筹码\n 周四开盘上证50在银行券商大蓝筹带动下一度涨近2%...,"{""type"":""市场"",""site"":""中证网"",""commentNum"":""0"",""jo...",博友早评：震荡有利消化套牢筹码,http://www.cs.com.cn/gppd/201706/t20170623_533...
9,89608,吴瞬,中国证券报?中证网,6月21日，A股纳入MSCI指数尘埃落定，但当天被寄予厚望的券商股并未扛起反弹大旗。22...,"{""type"":""市场"",""site"":""中证网"",""commentNum"":""0"",""jo...",纳入MSCI指数 A股长期配置价值提升,http://www.cs.com.cn/gppd/201706/t20170623_533...
12,89605,,中国新闻网,中新网6月19日电 据外媒报道，美国底特律一名男子1976年因为一根头发被定谋杀罪，监禁41...,"{""type"":""国际新闻"",""site"":""环球"",""commentNum"":""0"",""j...",因为犯罪现场的一根头发，他坐冤狱41年后终获释,http://world.huanqiu.com/hot/2017-06/10866136....


In [6]:
content = fd.content.tolist()
content = [str(i).replace('\n','').replace('\u3000','') for i in content]
sc_labels = fd.source.apply(lambda x:sc_labels_t[x]).tolist()

data = [[c,s] for c,s in zip(content,sc_labels)]

# tools

In [7]:
def cut(s): return list(jieba.cut(s))

# 使用 concurrent 模块多进程处理数据
由于多进程处理数据无法保证处理顺序，故 x和 y 对应不上，这里将 x 和 y 包装好后进行处理。

In [8]:
%%time
from concurrent import futures
def multi_process(func,d):
    ret = []
    def chunks(arr, n):
        return [arr[i:i+n] for i in range(0, len(arr), n)]
    d = chunks(d,20)
    with futures.ProcessPoolExecutor(max_workers=20) as executor:
        f = [executor.submit(func, item) for item in d]
        for future in futures.as_completed(f):
            ret.append(future.result())
    return sum(ret,[]) 

def cut_content(c):
    return [[cut(i[0]),i[1]] for i in c]

T = multi_process(cut_content,data)
cut_content = [t[0] for t in T]
y = [t[1] for t in T]

Building prefix dict from the default dictionary ...
Building prefix dict from the default dictionary ...
Building prefix dict from the default dictionary ...
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model from cache /tmp/jieba.cache
Loading model from cache /tmp/jieba.cache
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model from cache /tmp/jieba.cache
Building prefix dict from the default dictionary ...
Building prefix dict from the default dictionary ...
Building prefix dict from the default dictionary ...
Building prefix dict from the default dictionary ...
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model from cache /tmp/jieba.cache
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model from cache /tmp/jieba.cache
Loading model from cache /tmp/jieba.cac

CPU times: user 10 s, sys: 3.55 s, total: 13.6 s
Wall time: 15.1 s


# Design feature by content

## TF-IDF

In [9]:
test_cc = [
    ['你','小子','怎么','回事','小子'],
    ['能','行吗','你']
]
def calculate_inverse_document_frequency(cut_sentences):
    idf = {}
    for c in cut_sentences:
        cache = set()
        for k in c:
            if k in cache:
                continue
            idf[k] = idf.get(k,0) + 1
            cache.add(k)
    return {k : np.log((len(cut_sentences)+1) / (v+1)) + 1 for k,v in idf.items()}

def calculate_term_frequency(cut_sentences):
    _tokens = set()
    for c in cut_sentences:
        _tokens.update(c)
    tf = []
    for c in cut_sentences:
        cnt = Counter(c)
        tf.append({k:v for k,v in cnt.items()})
    return list(_tokens),tf


def calculate_tfidf(cut_sentences,norm='l1'):
    def normalize(v):
        e = {
            'l1':1,
            'l2':2
        }.get(norm)
        return [k / sum(map(lambda x:x**e,v)) for k in v]
    idf = calculate_inverse_document_frequency(cut_sentences)
    tokens,tf = calculate_term_frequency(cut_sentences)
    vocab = {v:k for k,v in enumerate(tokens)}
    tfidf = []
    for i in range(len(tf)):
        tfidf.append({k:v * idf[k] for k,v in tf[i].items()})
    return tfidf,vocab

def transfer_to_sparse_matrix(tfidf,vocab):
    # first new zeros matrix
    import scipy.sparse as sparse
    n = np.zeros((len(tfidf),len(vocab)))
    for i,t in enumerate(tfidf):
        for k,v in t.items():
            n[i][vocab[k]] = v
    return  sparse.coo_matrix(n)
 
tfidf,vocab = calculate_tfidf(cut_content)
x = transfer_to_sparse_matrix(tfidf,vocab)

In [10]:
x.shape

(83774, 216071)

In [11]:
from sklearn.linear_model import LogisticRegression
x_train,x_test,y_train,y_test = train_test_split(x,y)
print(x_train.shape)
lr = LogisticRegression(n_jobs=64)
lr.fit(x_train,y_train)
lr.score(x_test,y_test)

(62830, 216071)


  " = {}.".format(effective_n_jobs(self.n_jobs)))


0.9933155080213903

In [12]:
y_pred = lr.predict(x_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     19657
           1       0.98      1.00      0.99       636
           2       0.92      0.88      0.90       133
           3       0.83      0.87      0.85       126
           4       0.94      0.98      0.96        97
           5       0.86      0.79      0.82        70
           6       0.89      0.63      0.73        75
           7       0.95      0.98      0.97        60
           8       0.97      0.57      0.72        51
           9       0.46      0.31      0.37        39

   micro avg       0.99      0.99      0.99     20944
   macro avg       0.88      0.80      0.83     20944
weighted avg       0.99      0.99      0.99     20944



# How sklearn works
if smooth_idf = True,then 
$$
idf_w = log\frac{|D| + 1}{|w| + 1}
$$
if smooth_idf = False, then
$$
idf_w = log\frac{|D| + 1}{|w| + 1}
$$
维基百科中说的平滑 idf 是分子加 1，分母不加 1.这样是有点小问题的，假设词 w 在所有文档中都出现过，那么 idf 就为
$$
idf_w = log\frac{|D|}{|D| + 1}
$$
and then idf becomes negative.
One more thing, sklearn add 1 to all idf. Why ???

sklearn use $e$ as log.

In [14]:
cc = [
    '你 小子 怎么 回事 小子',
    '能 行吗 你'
]
from sklearn.feature_extraction.text import TfidfVectorizer
vector = TfidfVectorizer(smooth_idf=True,norm=None, token_pattern='\\b\\w+\\b') 
tfidf = vector.fit_transform(cc) # 得到结果
wordlist = vector.get_feature_names()#获取词袋模型中的所有词  
print(wordlist)
weightlist = tfidf.toarray()  
print(weightlist)

['你', '回事', '小子', '怎么', '能', '行吗']
[[1.         1.40546511 2.81093022 1.40546511 0.         0.        ]
 [1.         0.         0.         0.         1.40546511 1.40546511]]
